In [1]:
import pandas as pd

In [2]:
df_ce_oa_institution_hamburg = pd.read_csv('circular-economy-openalex-institution-hamburg.csv')
df_ce_patents_applicant_hamburg = pd.read_csv('circular-economy-patents-applicant-hamburg.csv')
df_ce_patents_inventor_hamburg = pd.read_csv('circular-economy-patents-inventor-hamburg.csv')
df_ce_patents_owner_hamburg = pd.read_csv('circular-economy-patents-owner-hamburg.csv')

In [4]:
df_ce_patents_owner_hamburg.head()

Unnamed: 0,#,Jurisdiction,Kind,Display Key,Lens ID,Publication Date,Publication Year,Application Number,Application Date,Priority Numbers,...,Sequence Count,CPC Classifications,IPCR Classifications,US Classifications,NPL Citation Count,NPL Resolved Citation Count,NPL Resolved Lens ID(s),NPL Resolved External ID(s),NPL Citations,Legal Status
0,1,US,B2,US 7276808 B2,044-230-643-354-954,2007-10-02,2007,US 56916406 A,2006-02-22,DE 10339438 A;;EP 2004009486 W,...,0,E02D27/42;;E04H2012/006;;F05B2240/40;;F05B2240...,F03D11/04,290/55;;290/44;;415/4.2,0,0,,,,EXPIRED
1,2,US,A1,US 2006/0267348 A1,016-328-013-942-102,2006-11-30,2006,US 56916406 A,2006-02-22,DE 10339438 A;;EP 2004009486 W,...,0,E02D27/42;;E04H2012/006;;F05B2240/40;;F05B2240...,F03D9/00;;F03D11/04;;H02P9/04,290/55,0,0,,,,EXPIRED
2,3,US,A1,US 2013/0306793 A1,156-946-737-768-407,2013-11-21,2013,US 201313954240 A,2013-07-30,US 201313954240 A;;DE 102011009806 A;;EP 20120...,...,0,B64C1/00;;B64C2001/0027;;B64D11/00;;B64D11/00;...,B64D11/00;;B64C1/22,244/118.1,1,0,,,"Uline Shipping Supplies, 12/25/2008, https://w...",DISCONTINUED
3,4,US,A1,US 2005/0061914 A1,189-246-065-546-00X,2005-03-24,2005,US 93476604 A,2004-09-07,DE 10211437 A;;EP 0301386 W,...,0,B64D11/0696;;B60N2/01575;;B64D11/06;;B64D11/06...,B60N2/06;;B60N2/015;;B64D11/06,244/118.5,0,0,,,,EXPIRED
4,5,US,B2,US 7232094 B2,046-724-247-193-24X,2007-06-19,2007,US 93476604 A,2004-09-07,DE 10211437 A;;EP 0301386 W,...,0,B64D11/0696;;B60N2/01575;;B64D11/06;;B64D11/06...,B60N2/06;;B64D11/06;;B60N2/015,244/118.6;;297/217.3;;297/248;;297/257,0,0,,,,EXPIRED


# Paper Data

In [18]:
# Affiliation enrichment setup: OpenAlex helpers, caching, Hamburg detection, Crunchbase supplement

# Config
OPENALEX_BASE = "https://api.openalex.org/works"
OPENALEX_INSTITUTIONS = "https://api.openalex.org/institutions"
CONTACT_EMAIL = "juergen.thiesen@tuhh.de"  # used in User-Agent per OpenAlex best practices
CACHE_DIR = "data/openalex_cache"
INST_CACHE_DIR = os.path.join(CACHE_DIR, "institutions")
ENRICHED_OUT_DIR = "data/enriched"

# Imports
import os, re, json, time, hashlib
import requests
import pandas as pd
from typing import List, Dict, Any, Optional

# Ensure dirs
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(INST_CACHE_DIR, exist_ok=True)
os.makedirs(ENRICHED_OUT_DIR, exist_ok=True)

# Requests session
session = requests.Session()
session.headers.update({
    "User-Agent": f"ai-innoscence-wp4/1.0 (mailto:{CONTACT_EMAIL})"
})

# ---------- Work-level helpers (by DOI) ----------

def _doi_norm(doi: str) -> str:
    return (doi or "").strip().lower().replace("https://doi.org/", "")


def _work_cache_path(doi: str) -> str:
    key = hashlib.sha256(_doi_norm(doi).encode("utf-8")).hexdigest()
    return os.path.join(CACHE_DIR, f"work_{key}.json")


def _load_cached_work(doi: str) -> Optional[dict]:
    p = _work_cache_path(doi)
    if os.path.exists(p):
        try:
            with open(p, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            return None
    return None


def _save_cached_work(doi: str, payload: dict) -> None:
    p = _work_cache_path(doi)
    with open(p, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False)


def fetch_work_by_doi(doi: str, max_retries: int = 3, backoff: float = 1.5) -> Optional[dict]:
    doi = _doi_norm(doi)
    if not doi:
        return None
    cached = _load_cached_work(doi)
    if cached is not None:
        return cached
    url = f"{OPENALEX_BASE}/https://doi.org/{doi}"
    for attempt in range(max_retries):
        try:
            r = session.get(url, timeout=20)
            if r.status_code == 200:
                data = r.json()
                _save_cached_work(doi, data)
                return data
            if r.status_code == 404:
                _save_cached_work(doi, {"__not_found__": True})
                return None
            if r.status_code in (429, 502, 503, 504):
                time.sleep(backoff ** (attempt + 1))
                continue
            _save_cached_work(doi, {"__error__": r.status_code})
            return None
        except requests.RequestException:
            time.sleep(backoff ** (attempt + 1))
    return None


def extract_authorships(work: dict) -> List[Dict[str, Any]]:
    rows = []
    if not work or not isinstance(work, dict):
        return rows
    for auth in work.get("authorships", []) or []:
        author = auth.get("author", {}) or {}
        institutions = auth.get("institutions", []) or []
        if institutions:
            for inst in institutions:
                rows.append({
                    "work_id": work.get("id"),
                    "work_doi": _doi_norm((work.get("doi") or "").replace("https://doi.org/", "")),
                    "work_title": work.get("title"),
                    "author_id": author.get("id"),
                    "author_display_name": author.get("display_name"),
                    "institution_id": inst.get("id"),
                    "institution_display_name": inst.get("display_name"),
                    "institution_country_code": inst.get("country_code"),
                    "institution_ror": inst.get("ror"),
                    "raw_affiliation_string": auth.get("raw_affiliation_string"),
                    "is_corresponding": bool(auth.get("is_corresponding")),
                    "author_position": auth.get("author_position"),
                })
        else:
            rows.append({
                "work_id": work.get("id"),
                "work_doi": _doi_norm((work.get("doi") or "").replace("https://doi.org/", "")),
                "work_title": work.get("title"),
                "author_id": author.get("id"),
                "author_display_name": author.get("display_name"),
                "institution_id": None,
                "institution_display_name": None,
                "institution_country_code": None,
                "institution_ror": None,
                "raw_affiliation_string": auth.get("raw_affiliation_string"),
                "is_corresponding": bool(auth.get("is_corresponding")),
                "author_position": auth.get("author_position"),
            })
    return rows

# ---------- Hamburg detection ----------
HAMBURG_PATTERNS = [
    re.compile(r"\bhamburg\b", re.I),
    re.compile(r"\bhamburg[-\s]?harburg\b", re.I),
    re.compile(r"\buniversit[aä]t hamburg\b|\buniversity of hamburg\b|\buhh\b", re.I),
    re.compile(r"\btechnische universität hamburg\b|\btechnical university of hamburg\b|\btuhh\b", re.I),
    re.compile(r"\bhamburg university of applied sciences\b|\bhaw hamburg\b", re.I),
    re.compile(r"\bhelmholtz (center|zentrum).{0,30}hamburg\b", re.I),
    re.compile(r"\bdesy\b|\bdeutsches elektronen[-\s]?synchrotron\b", re.I),
    re.compile(r"\bmax[-\s]?planck.{0,30}hamburg\b", re.I),
    re.compile(r"\buke\b|\b(universit[aä]ts)?klinikum hamburg[-\s]?eppendorf\b", re.I),
    re.compile(r"\bhafencity university\b|\bhafencity universit[aä]t\b|\bhcu\b", re.I),
    re.compile(r"\bhelmut schmidt university\b|\buniversit[aä]t der bundeswehr hamburg\b|\bhsu\b", re.I),
    re.compile(r"\bhamburg school of business administration\b|\bhsba\b", re.I),
]

def is_hamburg_related(name: Optional[str], raw_affil: Optional[str]) -> bool:
    txt = " ".join([str(x) for x in [name, raw_affil] if x]).lower()
    if not txt.strip():
        return False
    return any(p.search(txt) for p in HAMBURG_PATTERNS)

# ---------- Institutions helpers ----------

def _inst_cache_path(inst_id: str) -> str:
    key = hashlib.sha256((inst_id or "").encode("utf-8")).hexdigest()
    return os.path.join(INST_CACHE_DIR, f"inst_{key}.json")


def _load_cached_inst(inst_id: str) -> Optional[dict]:
    p = _inst_cache_path(inst_id)
    if os.path.exists(p):
        try:
            with open(p, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            return None
    return None


def _save_cached_inst(inst_id: str, payload: dict) -> None:
    p = _inst_cache_path(inst_id)
    with open(p, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False)


def fetch_institution(inst_id: str, max_retries: int = 3, backoff: float = 1.5) -> Optional[dict]:
    if not inst_id:
        return None
    cached = _load_cached_inst(inst_id)
    if cached is not None:
        return cached
    suffix = inst_id.split("/")[-1] if inst_id.startswith("http") else inst_id
    url = f"{OPENALEX_INSTITUTIONS}/{suffix}"
    for attempt in range(max_retries):
        try:
            r = session.get(url, timeout=20)
            if r.status_code == 200:
                data = r.json()
                _save_cached_inst(inst_id, data)
                return data
            if r.status_code == 404:
                _save_cached_inst(inst_id, {"__not_found__": True})
                return None
            if r.status_code in (429, 502, 503, 504):
                time.sleep(backoff ** (attempt + 1))
                continue
            _save_cached_inst(inst_id, {"__error__": r.status_code})
            return None
        except requests.RequestException:
            time.sleep(backoff ** (attempt + 1))
    return None

# ---------- Name normalization & Crunchbase supplement ----------

def normalize_name(s: Optional[str]) -> str:
    if not isinstance(s, str):
        return ""
    s = s.lower().strip()
    s = re.sub(r"[^a-z0-9]+", " ", s)
    s = re.sub(r"\b(university|universitaet|universit[aä]t|hochschule|gmbh|ag|ggmbh|ev|e v|institute|institut|college|school|center|centre)\b", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# ---------- Orchestration helpers ----------

def build_authorships_from_dois(df: pd.DataFrame, doi_col: str = "DOI", limit: Optional[int] = None, sleep_between: float = 0.0) -> pd.DataFrame:
    dois = (
        df[doi_col].dropna().astype(str).map(_doi_norm).replace("", pd.NA).dropna().unique().tolist()
    )
    if limit is not None:
        dois = dois[:limit]
    rows = []
    for i, doi in enumerate(dois, start=1):
        work = fetch_work_by_doi(doi)
        rows.extend(extract_authorships(work) if work else [{
            "work_id": None, "work_doi": doi, "work_title": None,
            "author_id": None, "author_display_name": None,
            "institution_id": None, "institution_display_name": None,
            "institution_country_code": None, "institution_ror": None,
            "raw_affiliation_string": None, "is_corresponding": None,
            "author_position": None,
        }])
        if sleep_between > 0:
            time.sleep(sleep_between)
        if i % 50 == 0:
            print(f"Processed {i} DOIs...")
    auth_df = pd.DataFrame(rows)
    if not auth_df.empty:
        auth_df["is_hamburg_institution"] = auth_df.apply(
            lambda r: is_hamburg_related(r.get("institution_display_name"), r.get("raw_affiliation_string")), axis=1
        )
    return auth_df


def add_institution_homepages(auth_df: pd.DataFrame) -> pd.DataFrame:
    if auth_df.empty or "institution_id" not in auth_df.columns:
        return auth_df
    uniq_ids = auth_df["institution_id"].dropna().astype(str).unique().tolist()
    inst_rows = []
    for i, inst_id in enumerate(uniq_ids, start=1):
        data = fetch_institution(inst_id)
        if data:
            inst_rows.append({
                "institution_id": inst_id,
                "inst_display_name_api": data.get("display_name"),
                "inst_homepage_url": (data.get("homepage_url") or "").strip() or None,
                "inst_country_code": data.get("country_code"),
                "inst_ror": data.get("ror"),
                "inst_lineage": ", ".join(data.get("lineage", []) if isinstance(data.get("lineage"), list) else []),
                "inst_type": data.get("type"),
            })
        if i % 50 == 0:
            print(f"Resolved {i} institutions...")
    inst_df = pd.DataFrame(inst_rows)
    return auth_df.merge(inst_df, on="institution_id", how="left") if not inst_df.empty else auth_df


def supplement_homepages_from_crunchbase(auth_df: pd.DataFrame, cb_csv_path: str = "crunchbase_companies_DEU_Hamburg.csv") -> pd.DataFrame:
    try:
        df_cb = pd.read_csv(cb_csv_path)
    except FileNotFoundError:
        print("Crunchbase CSV not found; skipping supplemental websites.")
        return auth_df
    name_cols = [c for c in df_cb.columns if c.lower() in ("name", "company", "organization")]
    url_cols = [c for c in df_cb.columns if "website" in c.lower() or c.lower() == "url"]
    if not (name_cols and url_cols):
        print("Crunchbase: could not find expected name/website columns; skipping supplemental match.")
        return auth_df
    df_cb = df_cb[[name_cols[0], url_cols[0]]].rename(columns={name_cols[0]: "cb_name", url_cols[0]: "cb_website"})
    df_cb["name_norm"] = df_cb["cb_name"].apply(normalize_name)
    tmp = auth_df.copy()
    inst_name_series = tmp["institution_display_name"].fillna(tmp.get("inst_display_name_api"))
    tmp["name_norm"] = inst_name_series.apply(normalize_name)
    tmp = tmp.merge(df_cb[["name_norm", "cb_website"]].dropna().drop_duplicates("name_norm"), on="name_norm", how="left")
    tmp["inst_homepage_url"] = tmp["inst_homepage_url"].where(
        tmp["inst_homepage_url"].notna() & (tmp["inst_homepage_url"].str.len() > 0), tmp["cb_website"]
    )
    return tmp.drop(columns=["name_norm", "cb_website"], errors="ignore")


def aggregate_and_merge(df_src: pd.DataFrame, auth_df: pd.DataFrame) -> pd.DataFrame:
    if auth_df.empty:
        return df_src.copy()
    agg = auth_df.groupby("work_doi").agg(
        authors_total=("author_id", lambda s: int(s.notna().sum())),
        institutions_total=("institution_id", lambda s: int(s.notna().nunique())),
        has_hamburg_affil=("is_hamburg_institution", "any"),
        hamburg_affil_count=("is_hamburg_institution", "sum"),
    ).reset_index()
    df_aug = df_src.copy()
    df_aug["doi_norm"] = df_aug["DOI"].astype(str).map(_doi_norm)
    merged = df_aug.merge(agg, how="left", left_on="doi_norm", right_on="work_doi")
    merged["has_hamburg_affil"] = merged["has_hamburg_affil"].fillna(False)
    return merged


def save_artifacts(auth_df: pd.DataFrame, merged: Optional[pd.DataFrame] = None):
    # authorships with websites
    p_parquet = os.path.join(ENRICHED_OUT_DIR, "openalex_authorships_with_websites.parquet")
    p_csv = os.path.join(ENRICHED_OUT_DIR, "openalex_authorships_with_websites.csv")
    try:
        auth_df.to_parquet(p_parquet, index=False)
        print(f"Saved: {p_parquet}")
    except Exception:
        auth_df.to_csv(p_csv, index=False)
        print(f"Saved: {p_csv}")

    # institution directory
    inst_directory = (auth_df[[
        "institution_id", "institution_display_name", "inst_display_name_api", "inst_homepage_url", "inst_country_code", "inst_ror"
    ]].drop_duplicates().sort_values(["institution_display_name", "inst_display_name_api"]))
    inst_dir_csv = os.path.join(ENRICHED_OUT_DIR, "institution_directory_with_websites.csv")
    inst_directory.to_csv(inst_dir_csv, index=False)

    if merged is not None:
        m_parquet = os.path.join(ENRICHED_OUT_DIR, "df_oa_with_hamburg_flags.parquet")
        m_csv = os.path.join(ENRICHED_OUT_DIR, "df_oa_with_hamburg_flags.csv")
        try:
            merged.to_parquet(m_parquet, index=False)
            print(f"Saved: {m_parquet}")
        except Exception:
            merged.to_csv(m_csv, index=False)
            print(f"Saved: {m_csv}")

# End of setup

In [19]:
# Affiliation enrichment run: build authorships, resolve websites, aggregate, save

# 1) Build authorships from DOIs
authorships_df = build_authorships_from_dois(df_ce_oa_institution_hamburg, doi_col="DOI", limit=None, sleep_between=0.0)

# 2) Resolve institution homepages from OpenAlex
authorships_enriched_df = add_institution_homepages(authorships_df)

# 3) Supplement missing websites from Crunchbase (optional, if CSV present)
authorships_enriched_df = supplement_homepages_from_crunchbase(authorships_enriched_df, cb_csv_path="crunchbase_companies_DEU_Hamburg.csv")

# 4) Aggregate and merge back to the original dataframe
merged = aggregate_and_merge(df_ce_oa_institution_hamburg, authorships_enriched_df)

# 5) Save artifacts
save_artifacts(authorships_enriched_df, merged)

# Preview
{
    "authorships_rows": len(authorships_enriched_df),
    "institutions_with_websites": int(authorships_enriched_df["inst_homepage_url"].notna().sum()),
    "merged_rows": len(merged),
}

Processed 50 DOIs...
Processed 100 DOIs...
Processed 150 DOIs...
Processed 200 DOIs...
Processed 250 DOIs...
Processed 200 DOIs...
Processed 250 DOIs...
Resolved 50 institutions...
Resolved 50 institutions...
Resolved 100 institutions...
Resolved 100 institutions...
Resolved 150 institutions...
Resolved 150 institutions...
Resolved 200 institutions...
Resolved 200 institutions...
Resolved 250 institutions...
Resolved 250 institutions...
Resolved 300 institutions...
Resolved 300 institutions...
Resolved 350 institutions...
Resolved 350 institutions...
Resolved 400 institutions...
Resolved 400 institutions...
Resolved 450 institutions...
Resolved 450 institutions...
Resolved 500 institutions...
Resolved 500 institutions...
Resolved 550 institutions...
Resolved 550 institutions...
Resolved 600 institutions...
Resolved 600 institutions...
Resolved 650 institutions...
Resolved 650 institutions...
Resolved 700 institutions...
Resolved 700 institutions...
Resolved 750 institutions...
Resolved

  df_cb = pd.read_csv(cb_csv_path)


Crunchbase: could not find expected name/website columns; skipping supplemental match.
Saved: data/enriched/openalex_authorships_with_websites.parquet
Saved: data/enriched/df_oa_with_hamburg_flags.parquet


{'authorships_rows': 34509,
 'institutions_with_websites': 28382,
 'merged_rows': 278}