# Extract entities from Claude Deep Research markdown
This notebook parses the file `22-10-2025_Claude_Deep_Research_Hamburg_CE.md` and extracts entities for later scraping:
- section (e.g., Students, Researchers)
- organization name
- contact person (if present/derivable)
- website URL
It writes the results to `data/claude_deep_research_entities.csv` and prints a quick preview.

In [1]:
import re
import pathlib
import pandas as pd
from urllib.parse import urlparse
from typing import Optional, Tuple

In [2]:
# Paths
ROOT = pathlib.Path('.')
MD_PATH = ROOT / '22-10-2025_Claude_Deep_Research_Hamburg_CE.md'
OUT_DIR = ROOT / 'data'
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_CSV = OUT_DIR / 'claude_deep_research_entities.csv'

assert MD_PATH.exists(), f"Markdown file not found: {MD_PATH}"
print(f'Reading from {MD_PATH}')

Reading from 22-10-2025_Claude_Deep_Research_Hamburg_CE.md


In [3]:
# Load markdown
text = MD_PATH.read_text(encoding='utf-8')
print(f'Loaded {len(text):,} characters')

# Preprocess: normalize line breaks
text = text.replace('\r\n','\n').replace('\r','\n')
lines = [ln.rstrip() for ln in text.split('\n')]
print(f'{len(lines)} lines')

Loaded 17,517 characters
371 lines


In [4]:
URL_RE = re.compile(r'(https?://[^\s)]+)')
SECTION_RE = re.compile(r'^##\s+\*?\d+\.\s*(.+)$|^##\s+\*?\*?(.+?)\*?\*?$')
BULLET_RE = re.compile(r'^\s*[-\*\u2022]\s+(.+)$')
SEP = ' - '

PERSON_TITLES = {'prof.', 'dr.', 'prof', 'dipl.-ing.', 'ing.', 'jr.', 'sr.'}
ORG_HINTS = {'universität','university','tuhh','haw','hcu','hsu','institute','institut','center','zentrum','fraunhofer','max planck','helmholtz','leibniz','stiftung','e.v.','ev','gmbh','ag','kg','se','ug','eG','e.v','e.V.', 'gGmbH'}

def is_person_like(name: str) -> bool:
    s = name.strip()
    low = s.lower()
    if any(t in low for t in PERSON_TITLES):
        return True
    # 2-4 capitalized tokens heuristic
    toks = [t for t in re.split(r'[^A-Za-zÄÖÜäöüß-]+', s) if t]
    caps = sum(1 for t in toks if t[0].isupper())
    return 2 <= caps <= 4 and caps >= max(2, len(toks)//2)

def guess_org_from_text(parts: list[str]) -> Optional[str]:
    # Look from the right-most segment that contains org hints
    for seg in reversed(parts[1:]):
        low = seg.lower()
        if any(h in low for h in ORG_HINTS):
            return seg.strip()
    return None

KNOWN_DOMAIN_ORGS = {
    'uni-hamburg.de': 'Universität Hamburg',
    'tuhh.de': 'Hamburg University of Technology (TUHH)',
    'haw-hamburg.de': 'HAW Hamburg',
    'hcu-hamburg.de': 'HafenCity University Hamburg (HCU)',
    'hiicce.de': 'Hamburg Institute for Innovation, Climate Protection and Circular Economy (HiiCCE)',
    'fraunhofer.de': 'Fraunhofer-Gesellschaft',
    'hereon.de': 'Helmholtz-Zentrum Hereon',
    'desy.de': 'DESY',
    'leibniz-lib.de': 'Leibniz LIB',
    'liv.de': 'Leibniz Institute of Virology (LIV)',
    'hsu-hh.de': 'Helmut Schmidt University (HSU/UniBw H)',
    'newproductioninstitute.de': 'New Production Institute',
    'hamburg.de': 'Freie und Hansestadt Hamburg',
}

def guess_org_from_url(url: str) -> Optional[str]:
    host = urlparse(url).netloc.lower()
    host = host[4:] if host.startswith('www.') else host
    # try exact
    if host in KNOWN_DOMAIN_ORGS:
        return KNOWN_DOMAIN_ORGS[host]
    # try suffix
    for dom, org in KNOWN_DOMAIN_ORGS.items():
        if host.endswith(dom):
            return org
    # otherwise use second-level domain as label
    parts = host.split('.')
    if len(parts) >= 2:
        sld = parts[-2]
        return sld.capitalize()
    return None

In [5]:
def parse_markdown_entities(lines: list[str]) -> pd.DataFrame:
    section = None
    rows = []
    for ln in lines:
        # update section
        msec = SECTION_RE.match(ln)
        if msec:
            sec = msec.group(1) or msec.group(2) or ''
            section = sec.strip('* ').strip()
            continue
        # pick bullet lines with URL
        mb = BULLET_RE.match(ln)
        if not mb:
            continue
        content = mb.group(1).strip()
        um = URL_RE.search(content)
        if not um:
            continue
        url = um.group(1).rstrip(').,;')
        left = content[:um.start()].strip()
        # split by ' - ' segments
        parts = [p.strip() for p in left.split(SEP) if p.strip()]
        contact_person: Optional[str] = None
        org_name: Optional[str] = None
        if not parts:
            # no text before URL -> use host as org
            org_name = guess_org_from_url(url) or urlparse(url).netloc
        else:
            first = parts[0]
            if is_person_like(first):
                contact_person = first
                org_name = guess_org_from_text(parts) or guess_org_from_url(url) or first
            else:
                # assume first part is organization name
                org_name = first
                # if there is a segment that looks like a person, pick it
                for seg in parts[1:]:
                    if is_person_like(seg):
                        contact_person = seg
                        break
        rows.append({
            'section': section or '',
            'organization': org_name or '',
            'contact_person': contact_person or '',
            'website_url': url,
            'source_line': ln.strip(),
        })
    df = pd.DataFrame(rows)
    # Clean and dedupe
    df['organization'] = df['organization'].str.strip()
    df['contact_person'] = df['contact_person'].str.strip()
    df['website_url'] = df['website_url'].str.strip()
    df = df.drop_duplicates(subset=['organization','website_url']).reset_index(drop=True)
    return df

In [6]:
# Run extraction
entities_df = parse_markdown_entities(lines)
print(f'Extracted {len(entities_df)} entities')
display(entities_df.head(20))

# Save
entities_df.to_csv(OUT_CSV, index=False)
print(f'Saved to {OUT_CSV}')

# Quick counts by section
print(entities_df['section'].value_counts().to_string())

Extracted 202 entities


Unnamed: 0,section,organization,contact_person,website_url,source_line
0,1. STUDENTS,Freie und Hansestadt Hamburg,Students for Future Hamburg -,https://studentsforfuture-hamburg.de,- Students for Future Hamburg - https://studen...
1,1. STUDENTS,Universität Hamburg,Green Office Uni Hamburg -,https://www.uni-hamburg.de/nachhaltigkeit/gree...,- Green Office Uni Hamburg - https://www.uni-h...
2,1. STUDENTS,Asta-uhh,AStA Uni Hamburg,https://www.asta-uhh.de/en/0-your-asta/1-conta...,- AStA Uni Hamburg - Sustainability and Ecolog...
3,1. STUDENTS,Hamburg University of Technology (TUHH),AStA TUHH,https://www.asta.tuhh.de/en/members/sustainabi...,- AStA TUHH - Sustainability Department - http...
4,1. STUDENTS,HAW Hamburg,Green Office HAW Hamburg -,https://www.haw-hamburg.de/en/ftz-nk/green-off...,- Green Office HAW Hamburg - https://www.haw-h...
5,1. STUDENTS,REAP Student Council (FSR) at HCU Hamburg -,,https://www.hcu-hamburg.de/master/reap/,- REAP Student Council (FSR) at HCU Hamburg - ...
6,1. STUDENTS,Fab City Hamburg Student Involvement -,,https://www.fabcity.hamburg,- Fab City Hamburg Student Involvement - https...
7,1. STUDENTS,Circular Economy Club (CEC) Hamburg -,,https://www.circulareconomyclub.com/club/hamburg/,- Circular Economy Club (CEC) Hamburg - https:...
8,2. RESEARCHERS,Universität Hamburg,Prof. Dr. Fenna Blomsma,https://www.wiso.uni-hamburg.de/fachbereich-so...,- Prof. Dr. Fenna Blomsma - Junior Professor f...
9,2. RESEARCHERS,Research Group Circular Resource Engineering a...,Prof. Dr.-Ing. Kerstin Kuchta,https://www.tuhh.de/crem,- Prof. Dr.-Ing. Kerstin Kuchta - Research Gro...


Saved to data/claude_deep_research_entities.csv
section
6. INDUSTRY PARTNERS                        28
7. STARTUPS AND ENTREPRENEURS               25
4. RESEARCH INSTITUTES                      22
5. NON-GOVERNMENTAL ORGANIZATIONS           20
14. KNOWLEDGE AND INNOVATION COMMUNITIES    19
11. CITIZEN ASSOCIATIONS                    18
2. RESEARCHERS                              15
9. POLICY MAKERS                            11
12. MEDIA AND COMMUNICATION PARTNERS        10
13. FUNDING BODIES                           9
1. STUDENTS                                  8
3. HIGHER EDUCATION INSTITUTIONS             8
10. END-USERS                                5
8. PUBLIC AUTHORITIES                        4
