# Extract entities from Gemini DOCX tables
This notebook parses the file `22-10-2025_Gemini_Hamburg Circular Economy Ecosystem Actors.docx` and extracts entities for later scraping:
- section (from surrounding headings)
- organization name
- contact person (if present)
- website URL
It writes the results to `data/gemini_deep_research_entities.csv` and prints a quick preview.

In [1]:
# Install/import dependencies
import sys, subprocess, importlib, pathlib, re
from typing import Optional, Tuple, Iterable, Dict, List
import pandas as pd
from urllib.parse import urlparse

def ensure_package(pkg: str, import_name: Optional[str] = None):
    try:
        return importlib.import_module(import_name or pkg)
    except Exception:
        print(f"Installing {pkg}…")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg])
        return importlib.import_module(import_name or pkg)

docx = ensure_package('python-docx', 'docx')
Document = docx.Document
try:
    # optional fallback for merged cells parsing
    docx2python = importlib.import_module('docx2python')
except Exception:
    docx2python = None
    # Uncomment to install if needed
    # docx2python = ensure_package('docx2python')

Installing python-docx…
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.2.0


In [2]:
# Paths and setup
ROOT = pathlib.Path('.')
DOCX_PATH = ROOT / '22-10-2025_Gemini_Hamburg Circular Economy Ecosystem Actors.docx'
OUT_DIR = ROOT / 'data'
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_CSV = OUT_DIR / 'gemini_deep_research_entities.csv'

assert DOCX_PATH.exists(), f"DOCX file not found: {DOCX_PATH}"
print(f'Reading from {DOCX_PATH}')

Reading from 22-10-2025_Gemini_Hamburg Circular Economy Ecosystem Actors.docx


In [8]:
# Helpers
URL_RE = re.compile(r'(https?://[^\s)]+)')
DOMAIN_RE = re.compile(r'\b(?:[a-z0-9][a-z0-9.-]*\.[a-z]{2,})(?:/[^\s]*)?', re.I)

def clean(s: str | None) -> str:
    if not s: return ''
    return re.sub(r'\s+', ' ', s).strip()

def normalize_url(u: str | None) -> str:
    if not u: return ''
    u = u.strip().strip(').,;')
    # remove trailing whitespace-number artifacts like " 3  2  2"
    u = re.sub(r'\s+\d+(?:\s+\d+)*$', '', u)
    if not re.match(r'^https?://', u):
        u = 'https://' + u
    # drop obvious trackers
    u = re.sub(r'([?&](utm_[^=&]+|gclid|fbclid)=[^&]*)', '', u)
    u = u.replace('?&', '?').rstrip('?&')
    return u

def extract_first_url(text: str | None) -> Optional[str]:
    if not text:
        return None
    m = URL_RE.search(text)
    if m:
        return normalize_url(m.group(1))
    m2 = DOMAIN_RE.search(text)
    if m2:
        return normalize_url(m2.group(0))
    return None

def is_person_like(name: str) -> bool:
    if not name: return False
    s = name.strip()
    low = s.lower()
    if any(t in low for t in ['prof', 'dr.', 'dr ', 'dipl.-', 'ing.', 'jr.', 'sr.','prof.']):
        return True
    toks = [t for t in re.split(r'[^A-Za-zÄÖÜäöüß-]+', s) if t]
    caps = sum(1 for t in toks if t and t[0].isupper())
    return caps >= 2 and len(toks) <= 5

def guess_org_from_url(url: str) -> str:
    host = urlparse(url).netloc.lower().replace('www.','')
    if not host:
        return ''
    parts = host.split('.')
    if len(parts) >= 2:
        sld = parts[-2]
        return sld.capitalize()
    return host.capitalize()

HEADER_SYNONYMS: Dict[str,str] = {
    'organization':'organization','organisation':'organization','name':'organization','entity':'organization','actor':'organization','company':'organization','institution':'organization',
    'contact':'contact_person','contact person':'contact_person','contact name':'contact_person','person':'contact_person','ansprechpartner':'contact_person','ansprechpartnerin':'contact_person',
    'website':'website_url','webseite':'website_url','url':'website_url','link':'website_url','homepage':'website_url','web':'website_url','www':'website_url'
}

def norm_header(h: str) -> str:
    h = clean(h).lower()
    h = re.sub(r'[^a-z0-9\s]', ' ', h)
    h = re.sub(r'\s+', ' ', h).strip()
    return HEADER_SYNONYMS.get(h, h)

In [9]:
# Iterate DOCX in document order to capture sections and tables
from docx.document import Document as _Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph

def iter_block_items(parent):
    if isinstance(parent, _Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("Unknown parent type")
    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)

def heading_level(p: Paragraph) -> int | None:
    try:
        name = p.style.name or ''
    except Exception:
        name = ''
    m = re.match(r'Heading\s*(\d+)', name)
    if m:
        return int(m.group(1))
    # also accept German 'Überschrift'
    m2 = re.match(r'Überschrift\s*(\d+)', name)
    if m2:
        return int(m2.group(1))
    return None

def cell_text(cell) -> str:
    # python-docx cell.text already joins runs with newlines; we normalize whitespace
    return clean(cell.text)

def table_to_matrix(tbl: Table) -> list[list[str]]:
    rows = []
    for r in tbl.rows:
        rows.append([cell_text(c) for c in r.cells])
    return rows

def find_header_row(mat: List[List[str]]) -> Tuple[int, List[str]]:
    # default to first non-empty row
    for idx, row in enumerate(mat):
        non_empty = sum(1 for x in row if clean(x))
        if non_empty >= max(1, len(row)//2):
            headers = [norm_header(h) for h in row]
            return idx, headers
    return 0, [norm_header(h) for h in (mat[0] if mat else [])]

def extract_url_from_row(row: List[str]) -> Optional[str]:
    for cell in row:
        u = extract_first_url(cell)
        if u:
            return u
    return None

In [10]:
def parse_docx_entities(path: pathlib.Path) -> pd.DataFrame:
    doc = Document(str(path))
    current_section = ''
    all_rows = []
    tbl_idx = -1
    for block in iter_block_items(doc):
        if isinstance(block, Paragraph):
            lvl = heading_level(block)
            if lvl is not None and lvl <= 3:
                txt = clean(block.text)
                if txt:
                    current_section = txt
        else:  # Table
            tbl_idx += 1
            mat = table_to_matrix(block)
            if not mat:
                continue
            hdr_idx, headers = find_header_row(mat)
            # map columns
            col_map: Dict[str,int] = {}
            for j, h in enumerate(headers):
                if h in ('organization','contact_person','website_url'):
                    col_map[h] = j
            data_start = hdr_idx + 1
            for i in range(data_start, len(mat)):
                row = mat[i]
                org = ''
                contact = ''
                url = ''
                if 'organization' in col_map and col_map['organization'] < len(row):
                    org = clean(row[col_map['organization']])
                if 'contact_person' in col_map and col_map['contact_person'] < len(row):
                    contact = clean(row[col_map['contact_person']])
                if 'website_url' in col_map and col_map['website_url'] < len(row):
                    url = extract_first_url(row[col_map['website_url']]) or ''
                if not url:
                    url = extract_url_from_row(row) or ''
                # if no explicit org, try to infer from other cells or url
                if not org:
                    # pick the longest non-url cell
                    cand_cells = [c for c in row if c and not URL_RE.search(c)]
                    cand_cells.sort(key=lambda x: len(x), reverse=True)
                    org = clean(cand_cells[0]) if cand_cells else ''
                # contact heuristic
                if not contact:
                    for c in row:
                        if is_person_like(c):
                            contact = clean(c)
                            break
                # if still no org, use host from url
                if not org and url:
                    org = guess_org_from_url(url)
                if url:
                    url = normalize_url(url)
                if not any([org, contact, url]):
                    continue
                all_rows.append({
                    'section': current_section,
                    'organization': org,
                    'contact_person': contact,
                    'website_url': url,
                    'source_table_index': tbl_idx,
                    'source_row_index': i,
                })
    df = pd.DataFrame(all_rows)
    if df.empty:
        return df
    # cleanup and dedupe
    for col in ['section','organization','contact_person','website_url']:
        df[col] = df[col].astype(str).map(clean)
    df['website_url'] = df['website_url'].map(normalize_url)
    # Drop rows with clearly invalid URLs (just numbers etc.)
    df = df[~df['website_url'].str.match(r'^https?://\s*$')]
    df = df.drop_duplicates(subset=['organization','website_url']).reset_index(drop=True)
    return df

In [6]:
# Load and inventory tables
doc = Document(str(DOCX_PATH))
num_tables = len(doc.tables)
summary = [(i, len(t.rows), len(t.columns)) for i, t in enumerate(doc.tables)]
print(f"Tables: {num_tables}")
print("index  rows  cols")
for i, r, c in summary[:20]:
    print(f"{i:>5}  {r:>4}  {c:>4}")
if num_tables > 20:
    print("… (truncated)")

Tables: 14
index  rows  cols
    0     9     2
    1     7     2
    2     6     2
    3    10     2
    4    11     2
    5    11     2
    6     8     2
    7     5     2
    8     4     2
    9     2     2
   10     5     2
   11     7     2
   12     6     2
   13     7     2


In [11]:
# Run extraction and save
entities = parse_docx_entities(DOCX_PATH)
print(f"Extracted {len(entities)} entities")
if not entities.empty:
    display(entities.head(20))
    entities.to_csv(OUT_CSV, index=False)
    print(f"Saved to {OUT_CSV}")
    print(entities['section'].value_counts().to_string())
else:
    print('No entities found. Please review table structure or adjust header mapping.')

Extracted 84 entities


Unnamed: 0,section,organization,contact_person,website_url,source_table_index,source_row_index
0,Students,AStA TUHH (Department for Sustainability),AStA TUHH (Department for Sustainability),https://www.asta.tuhh.de/mitglieder/nachhaltig...,0,1
1,Students,AStA University of Hamburg (Department for Sus...,,https://www.bwl.uni-hamburg.de/en/transfer/kar...,0,2
2,Students,AStA HAW Hamburg,AStA HAW Hamburg,https://www.haw-hamburg.de/studium/campusleben...,0,3
3,Students,AStA HCU Hamburg,AStA HCU Hamburg,https://asta-hcu.de/projekte-und-initiativen/,0,4
4,Students,Green Office University of Hamburg,Green Office University of Hamburg,https://www.bwl.uni-hamburg.de/en/transfer/kar...,0,5
5,Students,Green Office HAW Hamburg,Green Office HAW Hamburg,https://www.haw-hamburg.de/ftz-nk/green-office/,0,6
6,Students,Impact Week: Sustainability (TUHH),Impact Week: Sustainability (TUHH),https://www.tuhh.de/uif/en/projects,0,7
7,Students,AI for Sustainable Development Competition (HA...,,https://www.haw-hamburg.de/detail/news/news/sh...,0,8
8,Researchers,Prof. Dr. Fenna Blomsma (University of Hamburg),Prof. Dr. Fenna Blomsma (University of Hamburg),https://www.climate-kic.org/spotlight-initiati...,1,1
9,Researchers,Longwitz Research Group (RG Longwitz) (Univers...,,https://www.chemie.uni-hamburg.de/en/institute...,1,2


Saved to data/gemini_deep_research_entities.csv
section
Non-Governmental Organizations          10
Industry Partners                       10
Research Institutes                      9
Students                                 8
Startups and Entrepreneurs               7
Researchers                              6
Media and Communication Partners         6
Knowledge and Innovation Communities     6
Higher Education Institutions            5
Funding Bodies                           5
Public Authorities                       4
Citizen Associations                     4
Policy Makers                            3
End-Users                                1
