https://dev.swfinstitute.org/profiles/wealth-manager/europe

companies_list 是一个 list of json dicts，包含了公司名称、URL、国家等信息。

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from typing import List, Dict

url = "https://dev.swfinstitute.org/profiles/wealth-manager/europe"

# Define headers to increase request success rate
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

# request url
response = requests.get(url, headers=HEADERS)
soup = BeautifulSoup(response.text, "html.parser")

# 处理 HTML 内容
list_container = soup.find("div", class_="list-group list-group-wrap")
print(list_container)

<div class="list-group list-group-wrap"><a href="/profile/5dabfa3c5295eb340fa1028f"><span><strong class="list-group-item-title">1875 Finance</strong><br/><small>Wealth Manager in Switzerland, Europe</small></span></a><a href="/profile/60862d6732b4e13af14a634d"><span><strong class="list-group-item-title">81 Family Office</strong><br/><small>Wealth Manager in Italy, Europe</small></span></a><a href="/profile/5e39a60ffcbe7e8ca727e542"><span><strong class="list-group-item-title">ACT Asset Management AG</strong><br/><small>Wealth Manager in Switzerland, Europe</small></span></a><a href="/profile/6120a2127931af78554ac0d2"><span><strong class="list-group-item-title">ATTENTIUM AG</strong><br/><small>Wealth Manager in Germany, Europe</small></span></a><a href="/profile/5e39a5d2fcbe7e8ca725bd94"><span><strong class="list-group-item-title">Agami Family Office</strong><br/><small>Wealth Manager in France, Europe</small></span></a><a href="/profile/5e39a2c2fcbe7e8ca713a6c8"><span><strong class="lis

In [2]:
BASE = "https://dev.swfinstitute.org"


def parse_company_urls(container) -> List[Dict[str, str]]:
    """
    container: bs4.Tag | str | None
    Returns: list of dicts, each with keys: company_name, url
    Logic:
    - Only keep anchors linking to '/profile/...'
    - Company name is taken from <strong class="list-group-item-title"> if present, else fallback to anchor text
    """
    if container is None:
        return []

    # 如果传入的是 HTML 字符串，先解析
    if isinstance(container, str):
        _soup = BeautifulSoup(container, "html.parser")
        container = _soup.find("div", class_="list-group list-group-wrap") or _soup

    items: List[Dict[str, str]] = []
    seen = set()
    for a in container.find_all("a", href=True):
        raw_href = a.get("href", "")
        # 只保留 /profile/ 详情页链接
        if "/profile/" not in raw_href:
            continue
        href = urljoin(BASE, raw_href)

        # 优先从 strong.list-group-item-title 取公司名
        title_el = a.select_one("strong.list-group-item-title") or a.find("strong")
        name = (title_el.get_text(strip=True) if title_el else a.get_text(strip=True))
        name = name.strip()
        if not name:
            continue

        if href and href not in seen:
            seen.add(href)
            items.append({
                "company_name": name,
                "url": href,
            })
    return items


companies_list = parse_company_urls(list_container)
print(f"Parsed {len(companies_list)} companies")

# 预览前 5 条
for item in companies_list[:5]:
    print(item)

# 最终输出
# companies_list

Parsed 50 companies
{'company_name': '1875 Finance', 'url': 'https://dev.swfinstitute.org/profile/5dabfa3c5295eb340fa1028f'}
{'company_name': '81 Family Office', 'url': 'https://dev.swfinstitute.org/profile/60862d6732b4e13af14a634d'}
{'company_name': 'ACT Asset Management AG', 'url': 'https://dev.swfinstitute.org/profile/5e39a60ffcbe7e8ca727e542'}
{'company_name': 'ATTENTIUM AG', 'url': 'https://dev.swfinstitute.org/profile/6120a2127931af78554ac0d2'}
{'company_name': 'Agami Family Office', 'url': 'https://dev.swfinstitute.org/profile/5e39a5d2fcbe7e8ca725bd94'}


In [20]:
def fetch_profile_attributes(url: str) -> dict:
    """
    Fetch a profile page and extract attributes (e.g., Phone, Country, City) from the table.
    Returns a dict with available keys.
    """
    # Define headers to increase request success rate
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }
    
    resp = requests.get(url, headers=HEADERS)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    table_container = soup.select_one(
        "#swfiProfileSingle > section:nth-child(2) > div > div:nth-child(2) > div.table-responsive"
    )
    if not table_container:
        return {}

    table = table_container.find("table")
    if not table:
        return {}

    result = {}
    for row in table.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) == 2:
            key = cells[0].get_text(strip=True).replace(":", "")
            value = cells[1].get_text(strip=True)
            if key and value:
                result[key] = value

    return result


# 简单自测
if __name__ == "__main__":
    test_url = "https://dev.swfinstitute.org/profile/5dabfa3c5295eb340fa1028f"
    attrs = fetch_profile_attributes(test_url)
    print({k: attrs.get(k) for k in ("Phone", "Country", "City")})

{'Phone': '41-22-595-18-75', 'Country': 'Switzerland', 'City': 'Geneva'}


In [24]:
# Enrich companies_list with attributes from profile pages (list of dicts only)
# Each item like: {"company_name": str, "url": str, ...}

for company in companies_list:
    if not isinstance(company, dict):
        continue
    name = company.get("company_name") or company.get("name") or ""
    url = company.get("url")
    if not url:
        continue
    try:
        attrs = fetch_profile_attributes(url)
    except Exception:
        attrs = {}
    print(
        f"Company: {name}, Phone: {attrs.get('Phone')}, "
        f"Country: {attrs.get('Country')}, City: {attrs.get('City')}"
    )
    # write back enriched fields; keep original lower-case 'country' if present
    company["Phone"] = attrs.get("Phone")
    company["Country"] = company.get("country") or attrs.get("Country")
    company["City"] = attrs.get("City")
    # store Website if present for later contact page guessing
    if attrs.get("Website"):
        company["Website"] = attrs.get("Website")

Company: 1875 Finance, Phone: 41-22-595-18-75, Country: Switzerland, City: Geneva
Company: 81 Family Office, Phone: 39-0444-546662, Country: Italy, City: Vicenza
Company: 81 Family Office, Phone: 39-0444-546662, Country: Italy, City: Vicenza
Company: ACT Asset Management AG, Phone: 41-43-499-06-50, Country: Switzerland, City: Zurich
Company: ACT Asset Management AG, Phone: 41-43-499-06-50, Country: Switzerland, City: Zurich
Company: ATTENTIUM AG, Phone: 49-541-5805020, Country: Germany, City: Osnabrück
Company: ATTENTIUM AG, Phone: 49-541-5805020, Country: Germany, City: Osnabrück
Company: Agami Family Office, Phone: 33-1-76-74-74-00, Country: France, City: Paris
Company: Agami Family Office, Phone: 33-1-76-74-74-00, Country: France, City: Paris
Company: Allconsult Ackermans Family Office, Phone: None, Country: Germany, City: None
Company: Allconsult Ackermans Family Office, Phone: None, Country: Germany, City: None
Company: Alpha Blue Ocean, Phone: None, Country: France, City: None
Co

In [25]:
# Preview enriched data (first 5 entries)
preview = []
for obj in companies_list[:5]:
    if isinstance(obj, dict):
        preview.append({
            "company_name": obj.get("company_name") or obj.get("name"),
            "url": obj.get("url"),
            "Phone": obj.get("Phone"),
            "Country": obj.get("Country"),
            "City": obj.get("City"),
            "contact_page": obj.get("contact_page"),
        })
preview

[{'company_name': '1875 Finance',
  'url': 'https://dev.swfinstitute.org/profile/5dabfa3c5295eb340fa1028f',
  'Phone': '41-22-595-18-75',
  'Country': 'Switzerland',
  'City': 'Geneva',
  'contact_page': None},
 {'company_name': '81 Family Office',
  'url': 'https://dev.swfinstitute.org/profile/60862d6732b4e13af14a634d',
  'Phone': '39-0444-546662',
  'Country': 'Italy',
  'City': 'Vicenza',
  'contact_page': None},
 {'company_name': 'ACT Asset Management AG',
  'url': 'https://dev.swfinstitute.org/profile/5e39a60ffcbe7e8ca727e542',
  'Phone': '41-43-499-06-50',
  'Country': 'Switzerland',
  'City': 'Zurich',
  'contact_page': None},
 {'company_name': 'ATTENTIUM AG',
  'url': 'https://dev.swfinstitute.org/profile/6120a2127931af78554ac0d2',
  'Phone': '49-541-5805020',
  'Country': 'Germany',
  'City': 'Osnabrück',
  'contact_page': None},
 {'company_name': 'Agami Family Office',
  'url': 'https://dev.swfinstitute.org/profile/5e39a5d2fcbe7e8ca725bd94',
  'Phone': '33-1-76-74-74-00',
  '

In [26]:
from typing import List
from google_search_api import google_search_formattedUrl
from llm import chat_once


def select_contact_page(company: str, n: int = 10) -> str:
    """Use Google search to gather candidates and LLM to pick the best contact page URL.

    Returns an empty string if no candidates found.
    """
    url_candidates: List[str] = google_search_formattedUrl(f"{company} contact page", n=n)
    if not url_candidates:
        return ""

    system_msg = "You are a precise web assistant. Choose the best contact page URL from the provided list."
    user_msg = (
        "Company: " + company + "\n"
        "Pick exactly ONE URL from the candidates that is the company's official contact page (not the homepage).\n"
        "Rules:\n"
        "- Must be one of the candidates.\n"
        "- Prefer URLs with path like /contact, /contact-us, /kontakt, /contacts, /impressum.\n"
        "- Avoid social media or directory sites.\n"
        "- Output ONLY the URL, nothing else.\n\n"
        "Candidates:\n" + "\n".join(url_candidates)
    )

    selected = (chat_once(system_msg, user_msg, temperature=0.0, max_tokens=60) or "").strip()

    # Fallback if LLM output not among candidates
    if selected not in url_candidates:
        contact_keywords = ["contact", "contact-us", "kontakt", "contacts", "impressum"]
        picked = next((u for u in url_candidates if any(k in u.lower() for k in contact_keywords)), None)
        selected = picked or url_candidates[0]

    return selected

In [None]:
# Try to guess an official contact page from the company's Website by probing common paths
import re
from urllib.parse import urlparse, urljoin


def _normalize_base_website(website: str) -> str:
    if not website:
        return ""
    website = website.strip()
    # Prepend scheme if missing
    if not re.match(r"^https?://", website, flags=re.I):
        website = "https://" + website
    parsed = urlparse(website)
    if not parsed.netloc:
        return ""
    base = f"{parsed.scheme}://{parsed.netloc}"
    return base.rstrip("/")


def guess_contact_from_website(website: str, timeout: int = 8) -> str:
    """Given a company's Website, probe common contact paths and return the first valid URL.
    Returns empty string if none found or on errors.
    """
    # Define headers to increase request success rate
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }
    
    base = _normalize_base_website(website)
    if not base:
        return ""

    # Common paths (ordered by likelihood)
    paths = [
        "/contact",
        "/contact-us",
        "/contacts",
        "/kontakt",
        "/en/contact",
        "/en/contacts",
        "/about/contact",
        "/impressum",
    ]

    # Prefer HEAD, fallback to GET if not allowed
    for p in paths:
        candidate = urljoin(base + "/", p.lstrip("/"))
        try:
            r = requests.head(candidate, headers=HEADERS, allow_redirects=True, timeout=timeout)
            # Some sites block HEAD; try GET on 405/403/401
            if r.status_code in (405, 403, 401):
                r = requests.get(candidate, headers=HEADERS, allow_redirects=True, timeout=timeout, stream=True)
            if 200 <= r.status_code < 400:
                return r.url if getattr(r, "url", None) else candidate
        except Exception:
            continue
    return ""

In [28]:
# Enrich companies_list with likely contact page URL using Website guess first, then Google + LLM
for company in companies_list:
    if not isinstance(company, dict):
        continue
    # Skip if already has contact_page
    if company.get("contact_page"):
        continue

    # 1) Try from Website common paths
    website = company.get("Website") or company.get("website") or ""
    contact_url = ""
    if website:
        try:
            contact_url = guess_contact_from_website(website)
        except Exception:
            contact_url = ""

    # 2) Fallback: Google + LLM
    if not contact_url:
        name = company.get("company_name") or company.get("name") or ""
        try:
            contact_url = select_contact_page(name)
        except Exception:
            contact_url = ""

    company["contact_page"] = contact_url

In [29]:
companies_list

[{'company_name': '1875 Finance',
  'url': 'https://dev.swfinstitute.org/profile/5dabfa3c5295eb340fa1028f',
  'Phone': '41-22-595-18-75',
  'Country': 'Switzerland',
  'City': 'Geneva',
  'contact_page': 'https://1875.ch/contact/'},
 {'company_name': '81 Family Office',
  'url': 'https://dev.swfinstitute.org/profile/60862d6732b4e13af14a634d',
  'Phone': '39-0444-546662',
  'Country': 'Italy',
  'City': 'Vicenza',
  'contact_page': 'https://www.leominsterfamilydentists.com/contact-us/'},
 {'company_name': 'ACT Asset Management AG',
  'url': 'https://dev.swfinstitute.org/profile/5e39a60ffcbe7e8ca727e542',
  'Phone': '41-43-499-06-50',
  'Country': 'Switzerland',
  'City': 'Zurich',
  'contact_page': 'https://www.aphis.usda.gov/contact'},
 {'company_name': 'ATTENTIUM AG',
  'url': 'https://dev.swfinstitute.org/profile/6120a2127931af78554ac0d2',
  'Phone': '49-541-5805020',
  'Country': 'Germany',
  'City': 'Osnabrück',
  'contact_page': 'https://www.swfinstitute.org/profile/6120a2127931af

In [35]:
from llm import extract_contact_info

url = "https://1875.ch/contact/"

# Define headers to increase request success rate
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

# requests url with headers
response = requests.get(url, headers=HEADERS)   
# 然后把 response.text 传给 BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")
# 让 llm 提取 联系方式， email，电话。
contact_info = extract_contact_info(soup)

print(contact_info)

# 添加信息，作为 contact_email，contact_phone
company["contact_email"] = contact_info.get("email")
company["contact_phone"] = contact_info.get("phone")

{'email': 'info@1875.ch', 'phone': '+41 22 595 18 75'}


In [36]:
def extract_company_contact_info(company: dict) -> dict:
    """
    Extract contact information for a single company from its contact page.
    
    Args:
        company: Company dict with at least 'contact_page' key
        
    Returns:
        Updated company dict with 'contact_email' and 'contact_phone' fields
    """
    # Define headers to increase request success rate
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }
    
    # Skip if no contact page URL
    contact_page_url = company.get("contact_page")
    if not contact_page_url:
        company["contact_email"] = None
        company["contact_phone"] = None
        return company
    
    try:
        # Fetch the contact page with headers
        response = requests.get(contact_page_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        
        # Parse HTML
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Extract contact info using LLM
        contact_info = extract_contact_info(soup)
        
        # Add extracted info to company dict
        company["contact_email"] = contact_info.get("email")
        company["contact_phone"] = contact_info.get("phone")
        
        print(f"✓ {company.get('company_name', 'Unknown')}: email={contact_info.get('email')}, phone={contact_info.get('phone')}")
        
    except Exception as e:
        print(f"✗ Failed to extract contact info for {company.get('company_name', 'Unknown')}: {str(e)}")
        company["contact_email"] = None
        company["contact_phone"] = None
    
    return company

In [50]:
# Apply contact extraction to all companies
print(f"Starting contact extraction for {len(companies_list)} companies...")
print("-" * 60)

for i, company in enumerate(companies_list, 1):
    if not isinstance(company, dict):
        continue
    
    # Skip if already has contact info
    if company.get("contact_email") is not None or company.get("contact_phone") is not None:
        print(f"[{i}/{len(companies_list)}] Skipping {company.get('company_name', 'Unknown')} - already has contact info")
        continue
    
    print(f"[{i}/{len(companies_list)}] Processing {company.get('company_name', 'Unknown')}...")
    company = extract_company_contact_info(company)
    
    # Add a small delay to be respectful to websites
    import time
    time.sleep(1)

print("-" * 60)
print("Contact extraction completed!")

# Count results
total_companies = len([c for c in companies_list if isinstance(c, dict)])
with_email = len([c for c in companies_list if isinstance(c, dict) and c.get("contact_email")])
with_phone = len([c for c in companies_list if isinstance(c, dict) and c.get("contact_phone")])

print(f"Results: {total_companies} companies processed")
print(f"- {with_email} companies with email addresses")
print(f"- {with_phone} companies with phone numbers")

Starting contact extraction for 50 companies...
------------------------------------------------------------
[1/50] Skipping 1875 Finance - already has contact info
[2/50] Processing 81 Family Office...
✗ Failed to extract contact info for 81 Family Office: 403 Client Error: Forbidden for url: https://www.leominsterfamilydentists.com/contact-us/
✗ Failed to extract contact info for 81 Family Office: 403 Client Error: Forbidden for url: https://www.leominsterfamilydentists.com/contact-us/
[3/50] Processing ACT Asset Management AG...
[3/50] Processing ACT Asset Management AG...
✓ ACT Asset Management AG: email=None, phone=None
✓ ACT Asset Management AG: email=None, phone=None
[4/50] Processing ATTENTIUM AG...
[4/50] Processing ATTENTIUM AG...
✗ Failed to extract contact info for ATTENTIUM AG: 403 Client Error: Forbidden for url: https://www.swfinstitute.org/profile/6120a2127931af78554ac0d2
✗ Failed to extract contact info for ATTENTIUM AG: 403 Client Error: Forbidden for url: https://www

In [38]:
# Preview companies with contact information
print("Companies with contact information (first 10):")
print("=" * 80)

count = 0
for company in companies_list:
    if not isinstance(company, dict):
        continue
    
    email = company.get("contact_email")
    phone = company.get("contact_phone")
    
    if email or phone:
        count += 1
        if count <= 10:
            print(f"\n{count}. {company.get('company_name', 'Unknown')}")
            print(f"   Country: {company.get('Country', 'N/A')}")
            print(f"   Email: {email or 'Not found'}")
            print(f"   Phone: {phone or 'Not found'}")
            print(f"   Contact Page: {company.get('contact_page', 'N/A')}")
        elif count == 11:
            print(f"\n... and {len([c for c in companies_list if isinstance(c, dict) and (c.get('contact_email') or c.get('contact_phone'))]) - 10} more companies with contact info")
            break

print(f"\nTotal companies with contact info: {len([c for c in companies_list if isinstance(c, dict) and (c.get('contact_email') or c.get('contact_phone'))])}")

Companies with contact information (first 10):

1. 1875 Finance
   Country: Switzerland
   Email: info@1875.ch
   Phone: +41 22 595 18 75
   Contact Page: https://1875.ch/contact/

2. Agami Family Office
   Country: France
   Email: Not found
   Phone: +33 (0)1 76 74 74 00
   Contact Page: https://agami.com/

3. Amadeus Capital SA
   Country: Switzerland
   Email: Not found
   Phone: +41 22 808 04 51
   Contact Page: https://www.amadeus.ch/legal

4. Artorius Wealth Management
   Country: United Kingdom
   Email: enquiries@artorius.com
   Phone: +44 (0) 161 711 0730
   Contact Page: https://www.artorius.com/contact

5. Belvedere Asset Management
   Country: Switzerland
   Email: info@belvedere-am.com
   Phone: +41 43 244 77 77
   Contact Page: https://belvedere-am.com/en/career/

6. Black Oak Family Office SA
   Country: Switzerland
   Email: blackoak@blackoak.ch
   Phone: +41 22 322 94 41
   Contact Page: https://blackoak.ch/contact/

7. Brewin Dolphin
   Country: United Kingdom
   Ema

In [39]:
companies_list

[{'company_name': '1875 Finance',
  'url': 'https://dev.swfinstitute.org/profile/5dabfa3c5295eb340fa1028f',
  'Phone': '41-22-595-18-75',
  'Country': 'Switzerland',
  'City': 'Geneva',
  'contact_page': 'https://1875.ch/contact/',
  'contact_email': 'info@1875.ch',
  'contact_phone': '+41 22 595 18 75'},
 {'company_name': '81 Family Office',
  'url': 'https://dev.swfinstitute.org/profile/60862d6732b4e13af14a634d',
  'Phone': '39-0444-546662',
  'Country': 'Italy',
  'City': 'Vicenza',
  'contact_page': 'https://www.leominsterfamilydentists.com/contact-us/',
  'contact_email': None,
  'contact_phone': None},
 {'company_name': 'ACT Asset Management AG',
  'url': 'https://dev.swfinstitute.org/profile/5e39a60ffcbe7e8ca727e542',
  'Phone': '41-43-499-06-50',
  'Country': 'Switzerland',
  'City': 'Zurich',
  'contact_page': 'https://www.aphis.usda.gov/contact',
  'contact_email': None,
  'contact_phone': None},
 {'company_name': 'ATTENTIUM AG',
  'url': 'https://dev.swfinstitute.org/profile

查找 ceo，Geschäftsführers， funder,co-founder

In [43]:
# Reload the module to get the latest changes
import importlib
import google_search_api
importlib.reload(google_search_api)

# Import the management search function
from google_search_api import google_search_manager

# Test with Inpagest A.G.
company_name = "Inpagest A.G."
print(f"Searching for management information for: {company_name}")
print("-" * 60)

# Search for management information (CEO, Geschäftsführer, founder, co-founder)
mgmt_results = google_search_manager(company_name, n=3)
print(f"Found {len(mgmt_results)} management-related search results")

# Display results
for i, result in enumerate(mgmt_results, 1):
    print(f"\nResult {i}:")
    print(f"  Search Query: {result.get('search_query', 'N/A')}")
    print(f"  Title: {result.get('title', 'N/A')}")
    print(f"  Snippet: {result.get('snippet', 'N/A')[:150]}...")
    print(f"  URL: {result.get('link', 'N/A')}")
    
print(f"\n✓ Successfully found management information for {company_name}")

Searching for management information for: Inpagest A.G.
------------------------------------------------------------
Found 6 management-related search results

Result 1:
  Search Query: Inpagest A.G. CEO
  Title: Inpagest A.G. (Inpagest A.G.) - Multi Family Office, Switzerland | SWFI
  Snippet: Inpagest A.G. (Inpagest A.G.) is a Multi Family Office located in Zurich Switzerland, Europe....
  URL: https://www.swfinstitute.org/profile/598cdaa50124e9fd2d05b46b

Result 2:
  Search Query: Inpagest A.G. CEO
  Title: Untitled
  Snippet: ... AG Capital Recovery VI Holdings, L.P.",CAYMAN ISLANDS 8ZHFR0.00462.SF.276 ... Inpagest SA,SWITZERLAND XH55BR.00003.ME.756,Picard Angst Holding AG ...
  URL: https://www.irs.gov/pub/fatca/July2019FFIListFull.csv

Result 3:
  Search Query: Inpagest A.G. CEO
  Title: Delfin Private Office Multi Family Office in United Kingdom/ Europe
  Snippet: Inpagest A.G.. Multi Family Office in Switzerland · Mount Street Growth Capital ... Link to video owner's profile. I

In [44]:
# Test the LLM agent for management information extraction
import importlib
importlib.reload(llm)
from llm import extract_management_info

# Use the search results we got earlier
company_name = "Inpagest A.G."
print(f"Extracting management information for: {company_name}")
print("=" * 60)

# Extract management info using LLM
management_info = extract_management_info(mgmt_results, company_name)

# Display results
print("Extracted Management Information:")
print("-" * 40)
for key, value in management_info.items():
    if key not in ["extraction_source", "total_search_results"]:
        print(f"{key.replace('_', ' ').title()}: {value or 'Not found'}")

print(f"\nExtraction Source: {management_info.get('extraction_source', 'N/A')}")
print(f"Total Search Results Processed: {management_info.get('total_search_results', 0)}")

# Check for errors
if "error" in management_info:
    print(f"\nError: {management_info['error']}")
    if "raw_response" in management_info and management_info["raw_response"]:
        print(f"Raw Response: {management_info['raw_response'][:200]}...")
else:
    print(f"\n✓ Successfully extracted management information for {company_name}")

Extracting management information for: Inpagest A.G.
Extracted Management Information:
----------------------------------------
Company Name: Inpagest A.G.
Ceo: Not found
Founder: Not found
Co Founder: Not found
Managing Director: Not found
Other Executives: Sarah Bosshard, Doris Bianchi

Extraction Source: google_search
Total Search Results Processed: 6

✓ Successfully extracted management information for Inpagest A.G.
Extracted Management Information:
----------------------------------------
Company Name: Inpagest A.G.
Ceo: Not found
Founder: Not found
Co Founder: Not found
Managing Director: Not found
Other Executives: Sarah Bosshard, Doris Bianchi

Extraction Source: google_search
Total Search Results Processed: 6

✓ Successfully extracted management information for Inpagest A.G.


In [45]:
def extract_company_management_info(company: dict) -> dict:
    """
    为单个公司提取管理人员信息。
    
    Args:
        company: 公司字典，包含 'company_name' 键
        
    Returns:
        更新后的公司字典，添加了管理人员信息
    """
    company_name = company.get("company_name", "")
    if not company_name:
        print("⚠️ No company name found, skipping...")
        return company
    
    try:
        # 搜索管理人员信息
        mgmt_search_results = google_search_manager(company_name, n=3)
        
        if not mgmt_search_results:
            print(f"⚠️ No search results found for {company_name}")
            company["management_info"] = {"error": "No search results found"}
            return company
        
        # 使用 LLM 提取管理人员信息
        management_info = extract_management_info(mgmt_search_results, company_name)
        
        # 添加到公司数据中
        company["management_info"] = management_info
        
        # 提取关键信息到顶层
        company["ceo"] = management_info.get("ceo")
        company["founder"] = management_info.get("founder") 
        company["managing_director"] = management_info.get("managing_director")
        
        # 显示结果
        found_roles = []
        if management_info.get("ceo"): found_roles.append("CEO")
        if management_info.get("founder"): found_roles.append("Founder")
        if management_info.get("managing_director"): found_roles.append("Managing Director")
        if management_info.get("other_executives"): found_roles.append("Other Executives")
        
        if found_roles:
            print(f"✓ {company_name}: Found {', '.join(found_roles)}")
        else:
            print(f"○ {company_name}: No specific management roles found")
            
    except Exception as e:
        print(f"✗ Error processing {company_name}: {str(e)}")
        company["management_info"] = {"error": str(e)}
    
    return company

In [46]:
# 测试 extract_company_management_info 函数
print("Testing extract_company_management_info function")
print("=" * 60)

# 创建一个测试公司数据
test_company = {
    "company_name": "Inpagest A.G.",
    "Country": "Switzerland",
    "City": "Zurich"
}

print(f"Testing with company: {test_company['company_name']}")
print(f"Location: {test_company['City']}, {test_company['Country']}")
print("-" * 40)

# 调用函数测试
result_company = extract_company_management_info(test_company)

print("\n" + "=" * 60)
print("Test Results:")
print("-" * 40)

# 显示结果
print(f"Company Name: {result_company.get('company_name', 'N/A')}")
print(f"CEO: {result_company.get('ceo', 'Not found')}")
print(f"Founder: {result_company.get('founder', 'Not found')}")
print(f"Managing Director: {result_company.get('managing_director', 'Not found')}")

# 显示完整的管理信息
if "management_info" in result_company:
    mgmt_info = result_company["management_info"]
    print(f"\nFull Management Info:")
    print(f"- Total Search Results: {mgmt_info.get('total_search_results', 0)}")
    print(f"- Other Executives: {mgmt_info.get('other_executives', 'Not found')}")
    
    if mgmt_info.get("error"):
        print(f"- Error: {mgmt_info['error']}")
    else:
        print("- ✓ Extraction completed successfully")

print(f"\n✓ Function test completed for {test_company['company_name']}")

Testing extract_company_management_info function
Testing with company: Inpagest A.G.
Location: Zurich, Switzerland
----------------------------------------
✓ Inpagest A.G.: Found Other Executives

Test Results:
----------------------------------------
Company Name: Inpagest A.G.
CEO: None
Founder: None
Managing Director: None

Full Management Info:
- Total Search Results: 6
- Other Executives: Sarah Bosshard, Doris Bianchi
- ✓ Extraction completed successfully

✓ Function test completed for Inpagest A.G.
✓ Inpagest A.G.: Found Other Executives

Test Results:
----------------------------------------
Company Name: Inpagest A.G.
CEO: None
Founder: None
Managing Director: None

Full Management Info:
- Total Search Results: 6
- Other Executives: Sarah Bosshard, Doris Bianchi
- ✓ Extraction completed successfully

✓ Function test completed for Inpagest A.G.


In [47]:
# 测试 companies_list 中的真实公司
print("\nTesting with a company from companies_list")
print("=" * 60)

# 找一个还没有管理信息的公司
test_company_from_list = None
for company in companies_list[:5]:  # 检查前5个公司
    if isinstance(company, dict) and not company.get("management_info"):
        test_company_from_list = company
        break

if test_company_from_list:
    print(f"Testing with: {test_company_from_list.get('company_name', 'Unknown')}")
    print(f"Country: {test_company_from_list.get('Country', 'N/A')}")
    print(f"Website: {test_company_from_list.get('Website', 'N/A')}")
    print("-" * 40)
    
    # 测试函数
    result = extract_company_management_info(test_company_from_list.copy())  # 使用副本避免修改原数据
    
    print("\nResults:")
    print(f"CEO: {result.get('ceo', 'Not found')}")
    print(f"Founder: {result.get('founder', 'Not found')}")  
    print(f"Managing Director: {result.get('managing_director', 'Not found')}")
    
    mgmt_info = result.get("management_info", {})
    if mgmt_info.get("other_executives"):
        print(f"Other Executives: {mgmt_info['other_executives']}")
    
    print(f"\n✓ Test completed for {result.get('company_name', 'Unknown')}")
else:
    print("All companies in the first 5 already have management info or are not valid")
    print("Let's check what companies we have:")
    for i, company in enumerate(companies_list[:5], 1):
        if isinstance(company, dict):
            has_mgmt = "✓" if company.get("management_info") else "○"
            print(f"{i}. {has_mgmt} {company.get('company_name', 'Unknown')}")


Testing with a company from companies_list
Testing with: 1875 Finance
Country: Switzerland
Website: N/A
----------------------------------------
✓ 1875 Finance: Found CEO, Managing Director, Other Executives

Results:
CEO: Paul Kohler
Founder: None
Managing Director: Balta
Other Executives: Michel Alain Bizon

✓ Test completed for 1875 Finance


In [52]:
# 为所有公司提取管理人员信息
print(f"Starting management extraction for {len(companies_list)} companies...")
print("=" * 70)

processed_count = 0
found_management_count = 0
total_companies = len([c for c in companies_list if isinstance(c, dict)])

for i, company in enumerate(companies_list, 1):  # 处理全部公司
    if not isinstance(company, dict):
        continue
    
    # 跳过已经有管理信息的公司
    if company.get("management_info"):
        print(f"[{i}/{total_companies}] Skipping {company.get('company_name', 'Unknown')} - already has management info")
        continue
    
    print(f"[{i}/{total_companies}] Processing {company.get('company_name', 'Unknown')}...")
    company = extract_company_management_info(company)
    processed_count += 1
    
    # 检查是否找到了管理人员信息
    mgmt_info = company.get("management_info", {})
    if mgmt_info and not mgmt_info.get("error"):
        if any([mgmt_info.get("ceo"), mgmt_info.get("founder"), mgmt_info.get("managing_director"), mgmt_info.get("other_executives")]):
            found_management_count += 1
    
    # 添加延迟以避免过度请求
    import time
    time.sleep(2)
    
    # 每处理10个公司显示一次进度
    if processed_count % 10 == 0:
        print(f"Progress: {processed_count} companies processed, {found_management_count} with management info found")
    print()

print("=" * 70)
print("Management extraction completed!")
print(f"Results: {processed_count} companies processed")
print(f"- {found_management_count} companies with management information found")

# 显示找到管理人员信息的公司
print("\nCompanies with management information:")
print("-" * 50)
mgmt_count = 0
for company in companies_list:
    if isinstance(company, dict) and company.get("management_info"):
        mgmt_info = company["management_info"]
        if not mgmt_info.get("error"):
            name = company.get("company_name", "Unknown")
            ceo = mgmt_info.get("ceo")
            founder = mgmt_info.get("founder")
            md = mgmt_info.get("managing_director")
            
            if any([ceo, founder, md]):
                mgmt_count += 1
                if mgmt_count <= 20:  # 只显示前20个有管理信息的公司
                    print(f"\n{mgmt_count}. {name}:")
                    if ceo: print(f"  CEO: {ceo}")
                    if founder: print(f"  Founder: {founder}")
                    if md: print(f"  Managing Director: {md}")
                elif mgmt_count == 21:
                    print(f"\n... and {found_management_count - 20} more companies with management info")
                    break

Starting management extraction for 50 companies...
[1/50] Skipping 1875 Finance - already has management info
[2/50] Skipping 81 Family Office - already has management info
[3/50] Skipping ACT Asset Management AG - already has management info
[4/50] Skipping ATTENTIUM AG - already has management info
[5/50] Skipping Agami Family Office - already has management info
[6/50] Skipping Allconsult Ackermans Family Office - already has management info
[7/50] Skipping Alpha Blue Ocean - already has management info
[8/50] Skipping Alvarium Investments - already has management info
[9/50] Skipping Amadeus Capital SA - already has management info
[10/50] Skipping Andersen Charnley Investment Management Ltd - already has management info
[11/50] Processing Apricus Finance SA...
✓ Apricus Finance SA: Found CEO, Managing Director, Other Executives
✓ Apricus Finance SA: Found CEO, Managing Director, Other Executives

[12/50] Processing Artorius Wealth Management...

[12/50] Processing Artorius Wealth 

In [None]:
# 检查处理进度和结果统计
print("Processing Status Check")
print("=" * 50)

total_companies = len([c for c in companies_list if isinstance(c, dict)])
companies_with_mgmt = 0
companies_processed = 0

# 统计各种管理角色
ceo_count = 0
founder_count = 0
md_count = 0
other_exec_count = 0

for company in companies_list:
    if isinstance(company, dict):
        companies_processed += 1
        
        if company.get("management_info"):
            mgmt_info = company["management_info"]
            if not mgmt_info.get("error"):
                has_any_mgmt = False
                
                if mgmt_info.get("ceo"):
                    ceo_count += 1
                    has_any_mgmt = True
                if mgmt_info.get("founder"):
                    founder_count += 1
                    has_any_mgmt = True
                if mgmt_info.get("managing_director"):
                    md_count += 1
                    has_any_mgmt = True
                if mgmt_info.get("other_executives"):
                    other_exec_count += 1
                    has_any_mgmt = True
                
                if has_any_mgmt:
                    companies_with_mgmt += 1

print(f"Total companies: {total_companies}")
print(f"Companies with management info: {companies_with_mgmt}")
print(f"Success rate: {companies_with_mgmt/total_companies*100:.1f}%")
print()
print("Management roles found:")
print(f"- CEO: {ceo_count} companies")
print(f"- Founder: {founder_count} companies") 
print(f"- Managing Director: {md_count} companies")
print(f"- Other Executives: {other_exec_count} companies")

print(f"\nSample companies with management info (first 5):")
print("-" * 40)
sample_count = 0
for company in companies_list:
    if isinstance(company, dict) and company.get("management_info"):
        mgmt_info = company["management_info"]
        if not mgmt_info.get("error") and sample_count < 5:
            name = company.get("company_name", "Unknown")
            ceo = mgmt_info.get("ceo")
            founder = mgmt_info.get("founder")
            md = mgmt_info.get("managing_director")
            
            if any([ceo, founder, md]):
                sample_count += 1
                print(f"\n{sample_count}. {name}")
                if ceo: print(f"   CEO: {ceo}")
                if founder: print(f"   Founder: {founder}")
                if md: print(f"   Managing Director: {md}")
                
        if sample_count >= 5:
            break

In [53]:
# 将 companies_list 的所有公司数据导出到 Excel 文件
import pandas as pd
from datetime import datetime

# 获取所有公司数据
all_companies = []

for company in companies_list:
    if isinstance(company, dict):
        # 创建一个清理后的公司数据字典
        clean_company = {
            'Company Name': company.get('company_name', ''),
            'Country': company.get('Country', ''),
            'City': company.get('City', ''),
            'Phone': company.get('Phone', ''),
            'Website': company.get('Website', ''),
            'Contact Page': company.get('contact_page', ''),
            'Contact Email': company.get('contact_email', ''),
            'Contact Phone': company.get('contact_phone', ''),
            'CEO': company.get('ceo', ''),
            'Founder': company.get('founder', ''),
            'Managing Director': company.get('managing_director', ''),
            'Profile URL': company.get('url', '')
        }
        
        # 处理管理信息
        if company.get('management_info'):
            mgmt_info = company['management_info']
            clean_company['Other Executives'] = mgmt_info.get('other_executives', '')
            clean_company['Management Search Results'] = mgmt_info.get('total_search_results', 0)
        else:
            clean_company['Other Executives'] = ''
            clean_company['Management Search Results'] = 0
        
        all_companies.append(clean_company)

# 创建 DataFrame
df = pd.DataFrame(all_companies)

# 生成文件名（包含时间戳）
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"wealth_managers_europe_{timestamp}.xlsx"

# 导出到 Excel
try:
    df.to_excel(filename, index=False, engine='openpyxl')
    print(f"✓ 成功导出所有公司数据到: {filename}")
    print(f"共导出 {len(df)} 个公司的数据")
    
    # 统计数据概览
    print("\n数据统计:")
    print("=" * 50)
    print(f"总公司数量: {len(df)}")
    print(f"有联系邮箱的公司: {len(df[df['Contact Email'].notna() & (df['Contact Email'] != '')])}")
    print(f"有联系电话的公司: {len(df[df['Contact Phone'].notna() & (df['Contact Phone'] != '')])}")
    print(f"有CEO信息的公司: {len(df[df['CEO'].notna() & (df['CEO'] != '')])}")
    print(f"有创始人信息的公司: {len(df[df['Founder'].notna() & (df['Founder'] != '')])}")
    print(f"有总经理信息的公司: {len(df[df['Managing Director'].notna() & (df['Managing Director'] != '')])}")
    
    # 按国家统计
    country_stats = df['Country'].value_counts()
    print(f"\n按国家分布 (前10):")
    print("-" * 30)
    for country, count in country_stats.head(10).items():
        print(f"{country}: {count}")
    
    # 显示前5个公司的概览
    print(f"\n前5个公司概览:")
    print("-" * 50)
    for i, row in df.head(5).iterrows():
        print(f"{i+1}. {row['Company Name']} ({row['Country']})")
        if row['Contact Email']:
            print(f"   Email: {row['Contact Email']}")
        if row['CEO']:
            print(f"   CEO: {row['CEO']}")
        if row['Founder']:
            print(f"   Founder: {row['Founder']}")
        print()
        
except Exception as e:
    print(f"✗ 导出失败: {str(e)}")
    print("请确保已安装 openpyxl: pip install openpyxl")

✓ 成功导出所有公司数据到: wealth_managers_europe_20250813_074701.xlsx
共导出 50 个公司的数据

数据统计:
总公司数量: 50
有联系邮箱的公司: 14
有联系电话的公司: 18
有CEO信息的公司: 17
有创始人信息的公司: 13
有总经理信息的公司: 18

按国家分布 (前10):
------------------------------
Switzerland: 20
United Kingdom: 14
Germany: 4
Italy: 2
France: 2
Jersey: 1
Portugal: 1
Sweden: 1
Monaco: 1
Norway: 1

前5个公司概览:
--------------------------------------------------
1. 1875 Finance (Switzerland)
   Email: info@1875.ch
   CEO: Paul Kohler

2. 81 Family Office (Italy)

3. ACT Asset Management AG (Switzerland)

4. ATTENTIUM AG (Germany)

5. Agami Family Office (France)



In [2]:
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://agami.com/contact/")
        print(result.markdown)

asyncio.run(main())


RuntimeError: asyncio.run() cannot be called from a running event loop