### Semantic Processing Pipeline

#### 0. Environment Setup
- Resolve `BASE_DIR` to the folder containing this notebook.
- Define input/output roots:
  - `INPUT_ROOT = output-issues-combined-cleaned` (from the previous notebook)
  - `OUTPUT_ROOT = output-semantic` and `CACHE_DIR = output-semantic/cache`
- Create missing folders and print helpful diagnostics if inputs are absent.

In [132]:
# --- Environment & imports (consolidated) -------------------------------------
# Optional installs (uncomment if needed)
# %pip install -q spacy geopy requests-cache rapidfuzz tqdm pandas
# %pip install -q "spacy-transformers>=1.2.5"     # if you want en_core_web_trf
# python -m spacy download en_core_web_sm         # run once if 'sm' is missing
# python -m spacy download en_core_web_trf        # run once if 'trf' is missing

from pathlib import Path
import os, re, json, time, math
import pandas as pd
from tqdm.auto import tqdm

# If previous cell didn't define these, set safe defaults
try:
    BASE_DIR
except NameError:
    BASE_DIR = Path.cwd()
try:
    OUTPUT_ROOT
except NameError:
    OUTPUT_ROOT = BASE_DIR / "output-semantic"
try:
    CACHE_DIR
except NameError:
    CACHE_DIR = OUTPUT_ROOT / "cache"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# HTTP caching
try:
    import requests_cache
    requests_cache.install_cache(str(CACHE_DIR / "http_cache"), expire_after=60*60*24*14)
    print("[ok] requests-cache enabled")
except Exception as e:
    print(f"[warn] requests-cache not active: {e}")

# spaCy: prefer transformer, fall back to small model
import spacy
def _load_spacy():
    for name in ("en_core_web_trf", "en_core_web_sm"):
        try:
            return spacy.load(name), name
        except Exception:
            pass
    raise RuntimeError(
        "No spaCy English model installed.\n"
        "Install one of:\n"
        "  python -m spacy download en_core_web_sm\n"
        "  python -m spacy download en_core_web_trf"
    )
nlp, NER_MODEL = _load_spacy()
print(f"[ok] spaCy model: {NER_MODEL}")

# Geopy (Nominatim) with rate limiting
try:
    from geopy.geocoders import Nominatim
    from geopy.extra.rate_limiter import RateLimiter
    USER_EMAIL = os.getenv("NOMINATIM_EMAIL", "example@example.com")
    geolocator = Nominatim(user_agent=f"pi-semantic/0.1 ({USER_EMAIL})", timeout=15)
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.0, swallow_exceptions=True)
    print("[ok] geopy + Nominatim ready")
except Exception as e:
    geolocator = None
    geocode = None
    print(f"[warn] geopy not available: {e}")

# Fuzzy matching
try:
    from rapidfuzz import fuzz
    print("[ok] rapidfuzz ready")
except Exception as e:
    fuzz = None
    print(f"[warn] rapidfuzz not available: {e}")

tqdm.pandas(disable=False)


[ok] requests-cache enabled
[ok] spaCy model: en_core_web_trf
[ok] geopy + Nominatim ready
[ok] rapidfuzz ready


In [133]:
# # --- 0. Setup & configuration --------------------------------------------------
# from pathlib import Path
# import os, re, json, gzip, time, math, glob, random, textwrap, itertools
# from collections import defaultdict, Counter

# import pandas as pd
# from tqdm import tqdm
# tqdm.pandas(disable=False)

# # Resolve this notebook's directory (works in Jupyter and if exported to .py)
# if "__file__" in globals():                      # running as a script
#     BASE_DIR = Path(__file__).resolve().parent
# else:                                            # running in Jupyter
#     BASE_DIR = Path.cwd()

# # Inputs/outputs
# # - INPUT_ROOT: combined issues from the previous notebook
# # - OUTPUT_ROOT: fresh working area for all semantic outputs (and cache)
# INPUT_ROOT  = BASE_DIR / "output-issues-combined-cleaned"
# OUTPUT_ROOT = BASE_DIR / "output-semantic"
# CACHE_DIR   = OUTPUT_ROOT / "cache"

# # Create outputs if missing
# for p in (OUTPUT_ROOT, CACHE_DIR):
#     p.mkdir(parents=True, exist_ok=True)

# # Sanity check: help the user if inputs are missing
# if not INPUT_ROOT.exists():
#     print(f"[warn] INPUT_ROOT does not exist: {INPUT_ROOT}")
#     print("       Did you run the previous notebook to produce combined issues?")
# else:
#     print(f"[ok] Found INPUT_ROOT: {INPUT_ROOT}")

# print("BASE_DIR   :", BASE_DIR)
# print("OUTPUT_ROOT:", OUTPUT_ROOT)
# print("CACHE_DIR  :", CACHE_DIR)


### 1 Environment (installs)
This step is done further down the notebook, as the versions in the code below were incompatible. 

In [134]:
# Light stack; British English (1940s–50s) will use general English models.
# %pip -q install spacy spacy-transformers transformers requests requests-cache geopy rapidfuzz

# # spaCy model (transformer is accurate; small corpus so speed is OK)
# !python -m spacy download en_core_web_trf


#### 1. Load combined JSONs → normalized rows
- Recursively scan `output-issues-combined-cleaned` for `*-cleaned.json`.
- Accept various shapes: `{"articles":[...]}`, list, or object.
- Normalize into a DataFrame with: `issue_id`, `id`, `page`, `year`, `title`, `text`, `source_file`.
- Keep the original record under `orig` for later optional fields (author/date/etc.).

In [135]:
import json, re
from pathlib import Path
import pandas as pd

ISSUE_FILE_RX = re.compile(r"^(?P<issue>.+?)-cleaned\.json$", re.IGNORECASE)
YEAR_RX       = re.compile(r"\b(19|20)\d{2}\b")

TITLE_KEYS = ("title","headline","heading")
TEXT_KEYS  = ("content","body","text","article","main_text","raw_text")

def iter_records_from_obj(obj):
    """Yield article dicts from various container shapes."""
    if isinstance(obj, list):
        for it in obj:
            yield it
    elif isinstance(obj, dict):
        if "articles" in obj and isinstance(obj["articles"], list):
            for it in obj["articles"]:
                yield it
        else:
            yield obj
    else:
        yield {"raw_text": str(obj)}

def read_any_json(path: Path):
    """Read JSON or JSONL reliably; yield dict-like article records."""
    txt = path.read_text(encoding="utf-8").strip()
    # Try full JSON first
    try:
        obj = json.loads(txt)
        for rec in iter_records_from_obj(obj):
            yield rec
        return
    except json.JSONDecodeError:
        pass
    # Fallback: JSONL
    for line in txt.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            obj = {"raw_text": line}
        yield from iter_records_from_obj(obj)

def pick_title_text(d: dict):
    """Extract (title, body) from a single article record."""
    title = next((d[k].strip() for k in TITLE_KEYS
                  if isinstance(d.get(k), str) and d[k].strip()), None)
    body  = next((d[k].strip() for k in TEXT_KEYS
                  if isinstance(d.get(k), str) and d[k].strip()), None)
    if body is None and any(k in d for k in ("paragraphs","content_blocks")):
        body = json.dumps({k: d.get(k) for k in ("paragraphs","content_blocks")}, ensure_ascii=False)
    return title, (body or "")

def issue_id_from_path(p: Path) -> str:
    """Use filename stem minus '-cleaned' as the issue id (works for both hyphen/underscore page schemes)."""
    m = ISSUE_FILE_RX.match(p.name)
    return m.group("issue") if m else p.stem.replace("-cleaned", "")

def year_from_issue_or_meta(issue_id: str, article: dict):
    """Try to infer a year from the filename or article metadata."""
    # 1) from issue_id (e.g., 'Pi-Newspaper-1948-Vol2')
    m = YEAR_RX.search(issue_id)
    if m:
        return int(m.group(0))
    # 2) from date-like fields (very light heuristic)
    for k in ("date","pub_date","published","issue_date"):
        v = article.get(k)
        if isinstance(v, str):
            m = YEAR_RX.search(v)
            if m:
                return int(m.group(0))
    return None

def collect_articles(root: Path) -> pd.DataFrame:
    rows = []
    files = sorted(root.rglob("*-cleaned.json"))
    if not files:
        print(f"[warn] No '*-cleaned.json' files found under: {root}")
    for f in files:
        issue_id = issue_id_from_path(f)
        for i, art in enumerate(read_any_json(f), start=1):
            title, body = pick_title_text(art)
            art_id = art.get("id", i)
            page   = art.get("page")
            year   = year_from_issue_or_meta(issue_id, art)
            rows.append({
                "issue_id": issue_id,
                "source_file": f.name,
                "source_path": str(f),
                "unit_index": i,
                "id": art_id,
                "page": page,
                "year": year,
                "title": title,
                "text": body,
                "orig": art,  # keep original for optional fields (author/date/etc.)
            })
    df = pd.DataFrame(rows)
    print(f"[ok] Loaded {len(df)} articles from {len(files)} issue file(s).")
    return df

df = collect_articles(INPUT_ROOT)

# sanity: make sure text is not an articles-wrapper
wrappers = df["text"].astype(str).str.startswith('{"articles"').sum()
print(f"[sanity] Wrapper-like rows in text: {wrappers} (should be 0)")
df.head(10)


[ok] Loaded 3 articles from 1 issue file(s).
[sanity] Wrapper-like rows in text: 0 (should be 0)


Unnamed: 0,issue_id,source_file,source_path,unit_index,id,page,year,title,text,orig
0,Pi-Newspaper-1979,Pi-Newspaper-1979-cleaned.json,/Users/stepanyan/Documents/UCL/GitHub-Projects...,1,1,1,1979,HOW SAFE ARE OUR HALLS?,Recent months have seen a spate of thefts at I...,"{'id': 1, 'title': 'HOW SAFE ARE OUR HALLS?', ..."
1,Pi-Newspaper-1979,Pi-Newspaper-1979-cleaned.json,/Users/stepanyan/Documents/UCL/GitHub-Projects...,2,2,1,1979,UCL IN SPACE?,"Skylab II, the second orbital space laboratory...","{'id': 2, 'title': 'UCL IN SPACE?', 'author': ..."
2,Pi-Newspaper-1979,Pi-Newspaper-1979-cleaned.json,/Users/stepanyan/Documents/UCL/GitHub-Projects...,3,3,1,1979,KING'S v NUS,A nationwide campaign for the reform of NUS ma...,"{'id': 3, 'title': 'KING'S v NUS', 'author': N..."


### 2. Light OCR cleanup (conservative)
- Unicode normalization, strip control chars (preserve whitespace).
- Curly → straight quotes; ligature expansion; odd spaces → normal spaces.
- Normalize dashes; optional de-hyphenation across line breaks (text only).
- Create `title_norm` (no de-hyphenation) and `text_norm` (with de-hyphenation).
- Print quick change counts for sanity.

In [136]:
def clean_ocr(t: str) -> str:
    # soft hyphen at EOL, hard hyphenation joins, normalize quotes & spaces
    t = t.replace("\u00ad\n", "")
    t = re.sub(r"(\w)-\n(\w)", r"\1\2", t)
    t = re.sub(r"[“”]", '"', t).replace("’", "'")
    t = re.sub(r"[ \t]+\n", "\n", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()

df["text_clean"] = df["text"].map(clean_ocr)
df["title_clean"] = df["title"].map(lambda s: clean_ocr(s) if isinstance(s,str) else None)
df["char_len"] = df["text_clean"].str.len()
df[["year","title_clean","char_len"]].head(5)


Unnamed: 0,year,title_clean,char_len
0,1979,HOW SAFE ARE OUR HALLS?,2289
1,1979,UCL IN SPACE?,1683
2,1979,KING'S v NUS,636


### 4. Geocoding


In [137]:
# %pip install -U typing_extensions
# %load_ext autoreload
# %autoreload 2

In [138]:
# %pip install -U "pydantic>=2.7" "spacy>=3.7.4"
# %pip install -U "typing_extensions>=4.12"
# %pip install -U "spacy>=3.7.2" "pydantic>=2,<3" spacy-transformers transformers torch
# # (Re)install an English model:
# !python -m spacy download en_core_web_trf


# import sys, importlib.metadata as im

# print("Python:", sys.version)
# print("Kernel executable:", sys.executable)
# for pkg in ("spacy", "pydantic", "pydantic-core", "thinc"):
#     try:
#         print(f"{pkg}:", im.version(pkg))
#     except im.PackageNotFoundError:
#         print(f"{pkg}: not installed")


### 4A — Extract location mentions (NER + heuristics)
- Use spaCy NER on `title_norm` + `text_norm` to collect `GPE`/`LOC`/`FAC`.
- Optional regex add-ons (e.g., UK postcodes, street names).
- Deduplicate mentions per article (case/punct-insensitive), preserving order.
- Store:
  - `loc_mentions`: detailed dicts `{text, label, start, end, source}`
  - `loc_texts`: unique mention strings for quick inspection
  
RUN TIME - ~14 mins. 

In [139]:
# ---- Step 4A: Extract all location mentions across the full text ----
import re, json
from collections import Counter

# Ensure an English spaCy pipeline is loaded (we'll try transformer; fall back to small)
try:
    nlp
except NameError:
    import spacy
    try:
        nlp = spacy.load("en_core_web_trf")
    except Exception:
        nlp = spacy.load("en_core_web_sm")

# Simple UK/EN street patterns (extend as needed)
STREET_SUFFIXES = r"(Street|St\.|Road|Rd\.|Avenue|Ave\.|Lane|Ln\.|Close|Way|Square|Sq\.|Terrace|Terr\.|Place|Pl\.|Drive|Dr\.|Crescent|Cresc\.|Court|Ct\.|Row|Quay|Embankment|Parade)"
STREET_RE = re.compile(rf"\b([A-Z][a-zA-Z'-]+(?:\s+[A-Z][a-zA-Z'-]+)*)\s+{STREET_SUFFIXES}\b")

def extract_location_mentions(text: str):
    """
    Returns a list of candidate location mentions with offsets and a coarse 'kind'.
    Kinds: 'street', 'place' (GPE/LOC), 'facility' (FAC), 'country' (if resolved later).
    """
    out = []
    if not isinstance(text, str) or not text.strip():
        return out

    # 1) NER for GPE/LOC/FAC across entire text
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ in ("GPE","LOC","FAC"):
            out.append({
                "text": ent.text.strip(".,;:()[]\"' "),
                "start": ent.start_char,
                "end": ent.end_char,
                "kind": "place" if ent.label_ in ("GPE","LOC") else "facility",
                "source": "ner"
            })

    # 2) Street patterns (regex), capture overlaps that spaCy might miss
    for m in STREET_RE.finditer(text):
        span = m.group(0).strip(".,;:()[]\"' ")
        out.append({
            "text": span,
            "start": m.start(),
            "end": m.end(),
            "kind": "street",
            "source": "regex"
        })

    # 3) Normalize & deduplicate near-duplicates (case-insensitive)
    # Keep first occurrence offsets
    seen = set()
    deduped = []
    for m in out:
        key = (m["text"].lower(), m["kind"])
        if key not in seen:
            seen.add(key)
            deduped.append(m)

    return deduped

# Apply to all rows
df["location_mentions"] = df["text_clean"].map(extract_location_mentions)

# Quick peek
df[["title_clean","location_mentions"]].head(10)


Unnamed: 0,title_clean,location_mentions
0,HOW SAFE ARE OUR HALLS?,"[{'text': 'Max Rayne House', 'start': 65, 'end..."
1,UCL IN SPACE?,"[{'text': 'Skylab II', 'start': 0, 'end': 9, '..."
2,KING'S v NUS,[]


### 4B. Geocode → structured locations (cache & rate limiting)
- Geocode unique mentions with Nominatim (via `geopy`), cached on disk + HTTP cache.
- Map OSM address to fields: `street`, `city`, `state`, `country`, `postcode`, `coordinates`, `location_notes`.
- Preserve provenance (`geo` block) for QA.
- Assemble per-article `locations_structured` (dedup by key fields).

RUN TIME ~35 mins. 



In [140]:
# ---- Step 4B: Geocode to structured fields with cache & rate limiter ----
import requests, requests_cache, json, time
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from pathlib import Path

# Persistent HTTP cache (2 weeks)
CACHE_DIR.mkdir(parents=True, exist_ok=True)
requests_cache.install_cache(str(CACHE_DIR / "http_cache"), expire_after=60*60*24*14)

# Local JSONL geocode cache
GEOCODE_CACHE = CACHE_DIR / "all_locations_geocode_cache.jsonl"
_geo_cache = {}
if GEOCODE_CACHE.exists():
    with open(GEOCODE_CACHE, "r", encoding="utf-8") as f:
        for line in f:
            rec = json.loads(line)
            _geo_cache[(rec["query"].lower(), rec.get("country_bias"))] = rec

def _cache_geocode_write(key, rec):
    _geo_cache[key] = rec
    with open(GEOCODE_CACHE, "a", encoding="utf-8") as f:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

# Single shared Nominatim client with rate limiting
_geolocator = Nominatim(user_agent="pi-newswire-alllocs", timeout=15)
_geocode = RateLimiter(_geolocator.geocode, min_delay_seconds=1.0, swallow_exceptions=True)

def normalize_address(addr: dict) -> dict:
    """
    Map OSM address fields to our target schema:
      street, council, city, state, region, country, postcode
    (We keep both 'state' and 'region' for compatibility.)
    """
    if not addr:
        return {}

    def pick(*keys):
        for k in keys:
            v = addr.get(k)
            if v:
                return v
        return None

    street   = pick("road", "pedestrian", "residential", "footway", "path", "neighbourhood", "street", "cycleway")
    council  = pick("borough", "municipality", "city_district", "district", "county", "local_authority", "state_district")
    city     = pick("city", "town", "village", "hamlet", "suburb")
    state    = pick("state", "region", "province")
    country  = pick("country")
    postcode = pick("postcode")

    return {
        "street": street,
        "council": council,
        "city": city,
        "state": state,          # new explicit field
        "region": state,         # keep old name for compatibility
        "country": country,
        "postcode": postcode,    # new
    }

def geocode_text(query: str, country_bias: str|None=None) -> dict|None:
    """
    Geocode a text mention via Nominatim, returning normalized address + lat/lon.
    country_bias: e.g., 'gb' to bias to the UK for ambiguous street names.
    """
    if not query or not query.strip():
        return None
    key = (query.lower(), country_bias)
    if key in _geo_cache:
        return _geo_cache[key]

    try:
        loc = _geocode(query, addressdetails=True, country_codes=country_bias)
    except Exception:
        loc = None

    if not loc:
        rec = {"query": query, "country_bias": country_bias, "ok": False}
        _cache_geocode_write(key, rec)
        return rec

    addr = normalize_address(getattr(loc, "raw", {}).get("address", {}))
    rec = {
        "query": query,
        "country_bias": country_bias,
        "ok": True,
        "lat": loc.latitude,
        "lon": loc.longitude,
        "address": addr,
        "display_name": loc.address,
        "source": "nominatim"
    }
    _cache_geocode_write(key, rec)
    return rec

def geocode_mentions_for_row(mentions, prefer_uk_streets=True):
    """
    For each mention, geocode and return a list of resolved locations.

    Adds:
      - label: original mention text (e.g., "Max Rayne House")
      - type:  NER label (FAC/GPE/LOC) if available, else 'kind'
      - street/state/postcode
      - coordinates: [lat, lon]
      - location_notes: same as display_name

    Keeps existing fields for compatibility (mention, kind, lat, lon, display_name, geo_source, country_bias, council/region/city/country).
    """
    if not mentions:
        return []

    results = []
    seen_keys = set()

    for m in mentions:
        if not isinstance(m, dict):
            continue

        q = (m.get("text") or "").strip()
        if not q:
            continue

        kind = m.get("kind", "place")
        ner_type = m.get("label") or kind   # prefer FAC/GPE/LOC if present
        bias = "gb" if (kind == "street" and prefer_uk_streets) else None

        rec = geocode_text(q, country_bias=bias)
        if not rec or not rec.get("ok"):
            continue

        addr = rec.get("address", {}) or {}
        lat, lon = rec.get("lat"), rec.get("lon")

        norm = {
            # NEW fields you asked for
            "label": q,                           # human label (the mention text)
            "type": ner_type,                     # FAC/GPE/LOC if present
            "street": addr.get("street"),
            "city": addr.get("city"),
            "state": addr.get("state") or addr.get("region"),
            "country": addr.get("country"),
            "postcode": addr.get("postcode"),
            "coordinates": [lat, lon] if (lat is not None and lon is not None) else None,
            "location_notes": rec.get("display_name"),

            # Existing fields preserved (don’t break later cells)
            "mention": q,
            "kind": kind,
            "council": addr.get("council"),
            "region": addr.get("region"),
            "lat": lat,
            "lon": lon,
            "display_name": rec.get("display_name"),
            "geo_source": rec.get("source"),
            "country_bias": rec.get("country_bias"),
        }

        # Stable dedup key (avoid dupes across same resolved point & label/type)
        dkey = (
            round(lat, 6) if lat is not None else None,
            round(lon, 6) if lon is not None else None,
            norm["label"], norm["type"], norm["street"], norm["city"], norm["state"], norm["country"]
        )
        if dkey in seen_keys:
            continue
        seen_keys.add(dkey)
        results.append(norm)

    return results

# Apply to all rows
df["resolved_locations"] = df["location_mentions"].map(geocode_mentions_for_row)

# Inspect a few resolved examples
cols = ["title_clean","resolved_locations"]
df[cols].head(5)


Unnamed: 0,title_clean,resolved_locations
0,HOW SAFE ARE OUR HALLS?,"[{'label': 'Max Rayne House', 'type': 'facilit..."
1,UCL IN SPACE?,"[{'label': 'Skylab II', 'type': 'facility', 's..."
2,KING'S v NUS,[]


#### 4C. Build `locations_mentioned` (normalized, deduplicated)

- Convert each row’s `resolved_locations` (from 4B) into a compact Newswire-ready list.
- For every resolved place, construct a record with:
  - `label` (mention text), `type` (NER label, e.g., `FAC`/`GPE`/`LOC`)
  - `street`, `city`, `state` (falls back to `region`), `country`, optional `postcode`
  - `coordinates` as `[lat, lon]` (only if both present)
  - `location_notes` (prefer enriched note; fallback to geocoder `display_name`)
- **Deduplicate** per article using the key:  
  `(label, type, street, city, state, country, round(lat,6), round(lon,6))`  
  (Keeps the first occurrence; rounding avoids float-noise duplicates.)
- Output column: `df["locations_mentioned"]` — ordered, clean list per article for downst

In [141]:
# ---- Build locations_mentioned (one list per article) ----
from typing import List, Dict, Any

def _to_locations_mentioned(resolved_list: List[Dict[str, Any]] | None) -> List[Dict[str, Any]]:
    """
    Map resolved_locations -> target Newswire schema with enrichments:
      {
        label, type, street, city, state, country,
        coordinates [lat, lon], location_notes, (optional) postcode
      }
    Deduplicate by (label, type, street, city, state, country, lat, lon).
    """
    out: List[Dict[str, Any]] = []
    seen = set()
    if not isinstance(resolved_list, list):
        return out

    for r in resolved_list:
        if not isinstance(r, dict):
            continue

        label   = (r.get("label") or r.get("mention") or "").strip() or None
        typ     = r.get("type") or r.get("kind") or None
        street  = r.get("street") or None
        city    = r.get("city") or None
        state   = r.get("state") or r.get("region") or None
        country = r.get("country") or None
        postcode = r.get("postcode") or None

        lat = r.get("lat"); lon = r.get("lon")
        coords = [float(lat), float(lon)] if (lat is not None and lon is not None) else None

        notes = r.get("location_notes") or r.get("display_name") or ""

        # Dedup key
        key = (
            label, typ, street, city, state, country,
            round(float(lat), 6) if lat is not None else None,
            round(float(lon), 6) if lon is not None else None,
        )
        if key in seen:
            continue
        seen.add(key)

        rec = {
            "label": label,
            "type": typ,
            "street": street,
            "city": city,
            "state": state,
            "country": country,
            "coordinates": coords,
            "location_notes": notes,
        }
        if postcode is not None:
            rec["postcode"] = postcode

        out.append(rec)

    return out

df["locations_mentioned"] = df["resolved_locations"].map(_to_locations_mentioned)

# Quick sanity check
df[["locations_mentioned"]].head(3)


Unnamed: 0,locations_mentioned
0,"[{'label': 'Max Rayne House', 'type': 'facilit..."
1,"[{'label': 'Skylab II', 'type': 'facility', 's..."
2,[]


#### 5. People NER + disambiguation (context/year-aware)
- Collect PERSON mentions (2+ tokens) from NER.
- For each mention, link to Wikidata using:
  - Search → batch fetch facts (birth/death years, occupations) → score with name similarity.
  - Context boost (theme hits in article) and time penalty (publication year).
- Cache keyed by mention + context (theme + decade) to avoid repeated lookups.
- Save a single `people_mentioned` list: each item has `mention`, `wikidata_id` (or null),

RUN TIME ~14 mins.

In [142]:
import spacy
nlp = spacy.load("en_core_web_trf")

def ner_bio(text: str):
    doc = nlp(text)
    words = [t.text for t in doc]
    labels = ["O"] * len(words)
    for ent in doc.ents:
        print(f"Entity: {ent.text} ({ent.label_}) at {ent.start_char}-{ent.end_char}")
        labels[ent.start] = f"B-{ent.label_}"
        for i in range(ent.start+1, ent.end):
            labels[i] = f"I-{ent.label_}"
    ents = [(e.text, e.label_) for e in doc.ents]
    return words, labels, ents

# Store compactly; full arrays go to final record
df["_ner"] = df["text_clean"].progress_map(ner_bio)

  0%|          | 0/3 [00:00<?, ?it/s]

Entity: Recent months (DATE) at 0-13
Entity: Ifor Evans Hall (ORG) at 45-60
Entity: Max Rayne House (FAC) at 65-80
Entity: Ifor Evans (FAC) at 361-371
Entity: Max Rayne House (FAC) at 376-391
Entity: 500 (CARDINAL) at 416-419
Entity: next year (DATE) at 499-508
Entity: UC (ORG) at 544-546
Entity: Ramsey Hall (FAC) at 664-675
Entity: only two (CARDINAL) at 792-800
Entity: the past five years (DATE) at 832-851
Entity: Pi (ORG) at 854-856
Entity: Ifor (ORG) at 888-892
Entity: Max (ORG) at 897-900
Entity: Ifor Evans' (ORG) at 1067-1078
Entity: John Andrews (PERSON) at 1091-1103
Entity: the day (DATE) at 1204-1211
Entity: Pi (PERSON) at 1709-1711
Entity: One (CARDINAL) at 1869-1872
Entity: a couple of minutes (TIME) at 2064-2083
Entity: John Andrews (PERSON) at 2203-2215
Entity: Skylab II (FAC) at 0-9
Entity: second (ORDINAL) at 15-21
Entity: the early '80s (DATE) at 82-96
Entity: UCL (ORG) at 111-114
Entity: Keith Strong (PERSON) at 275-287
Entity: 27 (DATE) at 289-291
Entity: the Mullard 

#### 6. Build Newswire-like JSONL
- Emit one JSON line per article with normalized fields:
  - ids (`issue_id`, `art_id`, `page`, `year`), `title`, `text`
  - `locations_mentioned` (enriched objects with `label`, `type`, `street`, `city`, `state`, `country`, `postcode`, `coordinates`, `location_notes`)
  - `people_mentioned` (all PERSON mentions; `wikidata_id` null if unresolved)
  - `source_file`
- Designed for downstream indexing/search and reproducibility.

RUN TIME 1.3 secs.

In [143]:
# ---- Step 6: People linking (cache keyed by mention; cache *record* uses canonical 'name') ----
import json, re, requests
from rapidfuzz import fuzz

PEOPLE_CACHE = CACHE_DIR / "people_link_cache.jsonl"

def _norm_key(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "")).strip()

# In-memory cache, looked up by the *mention* text (normalized)
_people_store: dict[str, dict | None] = {}

# Fresh start is fine if you deleted the cache file; loader is tolerant anyway
if PEOPLE_CACHE.exists():
    with open(PEOPLE_CACHE, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except Exception:
                continue
            if not isinstance(rec, dict):
                continue
            # Prefer original mention to be the key if available; fall back safely
            key = rec.get("query") or rec.get("name") or rec.get("person_name")
            if key:
                _people_store[_norm_key(key)] = rec

def _cache_person_write(mention: str, resolved: dict | None):
    """
    Write one JSONL record per mention:
      - If resolved: include canonical 'name' (Wikidata label) and the original 'query' (mention).
      - If unresolved: write a minimal record with 'name': None.
    """
    PEOPLE_CACHE.parent.mkdir(parents=True, exist_ok=True)
    key = _norm_key(mention)

    if resolved is None:
        rec = {"query": mention, "name": None}
        _people_store[key] = rec
    else:
        # Ensure canonical 'name' present; keep mention in 'query'
        canon = resolved.get("person_name") or resolved.get("name") or mention
        rec = {
            "query": mention,
            "name": canon,                 # canonical label
            **resolved
        }
        _people_store[key] = rec

    with open(PEOPLE_CACHE, "a", encoding="utf-8") as f:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

def wb_search(name: str, limit=5):
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "language": "en",
        "uselang": "en",
        "type": "item",
        "search": name,
        "limit": limit,
        "format": "json",
    }
    r = requests.get(url, params=params, headers={"User-Agent": "pi-newswire-pipeline"})
    if not r.ok:
        return []
    return r.json().get("search", [])

def is_human_qid(qid: str) -> bool:
    # SPARQL: instance of human (Q5)
    sparql = f"""
    ASK {{ wd:{qid} wdt:P31 wd:Q5 . }}
    """
    url = "https://query.wikidata.org/sparql"
    r = requests.get(url, params={"format": "json", "query": sparql}, headers={"User-Agent": "pi-newswire-pipeline"})
    if not r.ok:
        return False
    try:
        return bool(r.json().get("boolean", False))
    except Exception:
        return False

def link_person(mention: str, context: str | None = None):
    """
    Resolve a PERSON mention to Wikidata.
    Cache lookup is by *mention* (normalized).
    Cache record stores the canonical 'name' (Wikidata label) and keeps 'query'=mention.
    Returns a dict like {"wikidata_id": "Qxx", "person_name": "..."} or None.
    """
    key = _norm_key(mention)
    if key in _people_store:
        return _people_store[key] if _people_store[key].get("name") else None

    cands = wb_search(mention, limit=7)
    best = None
    best_score = -1

    for c in cands:
        qid = c.get("id")
        if not qid:
            continue
        label = c.get("label") or ""
        desc  = c.get("description") or ""
        score = fuzz.token_set_ratio(mention, label)
        if score > best_score:
            best = {"wikidata_id": qid, "person_name": label, "desc": desc}
            best_score = score

    # Validate human & write cache
    if best and best.get("wikidata_id") and is_human_qid(best["wikidata_id"]):
        out = {
            "wikidata_id": best["wikidata_id"],
            "person_name": best["person_name"]
            # Optionally enrich later: gender, occupations, etc.
        }
        _cache_person_write(mention, out)
        return out

    _cache_person_write(mention, None)
    return None

def people_from_ner(ents):
    # Two-token+ names only (avoid bare surnames); adjust if you want
    return sorted({text for (text, label) in ents if label == "PERSON" and len(text.split()) >= 2})

# Quick smoke test on first row:
sample_words, sample_labels, sample_ents = df["_ner"].iloc[0]
people = people_from_ner(sample_ents)
people[:10]


['John Andrews']

In [144]:
# Previous version of transformers did not work. The suggested solution was to update it. 
# %pip install -U "transformers>=4.41" "huggingface_hub>=0.23" "safetensors>=0.4.3" "torch>=2.1,<3"
# CHECK again BEFORE USING

##### Installing Topic Classification Models Locally

This section downloads pre-trained topic classification models from Dell Research Harvard for newspaper content analysis. These models are specifically trained to identify different topics in news articles.

###### Prerequisites

Run these commands in your terminal (outside of Jupyter) to set up the Hugging Face CLI and download the models:

###### Step 1: Install Hugging Face CLI

```bash
pip install -U "huggingface_hub[cli]" hf_transfer
```

###### Step 2: Enable Faster Downloads (Optional)

```bash
export HF_HUB_ENABLE_HF_TRANSFER=1
```

###### Step 3: Create Local Model Directory

```bash
MODEL_HOME="$HOME/hf-models/newswire"
mkdir -p "$MODEL_HOME"
```

###### Step 4: Download Topic Classification Models

```bash
for m in \
  dell-research-harvard/topic-antitrust \
  dell-research-harvard/topic-civil_rights \
  dell-research-harvard/topic-crime \
  dell-research-harvard/topic-govt_regulation \
  dell-research-harvard/topic-labor_movement \
  dell-research-harvard/topic-politics \
  dell-research-harvard/topic-protests \
  dell-research-harvard/topic-sport \
  dell-research-harvard/topic-fire \
  dell-research-harvard/topic-weather \
  dell-research-harvard/topic-obits
do
  name="${m##*/}"
  huggingface-cli download "$m" --local-dir "$MODEL_HOME/$name" --local-dir-use-symlinks False
done
```

###### Notes

- Models will be saved to `~/hf-models/newswire/` by default
- Each model is approximately 400-500MB
- Download time depends on your internet connection
- Models are stored without symlinks for better portability

#### 7. Topic tagging with ALL Newswire heads (local-first, safe, cached) ----

RUN TIME ~long?

In [145]:
# ---- Step 7: Topic tagging with ALL Newswire heads (local-first, safe, cached) ----
from transformers import pipeline
from pathlib import Path
import json, time

# 0) Ensure text column is strings
df["text_clean"] = df["text_clean"].fillna("").astype(str)

# 1) Point to where you stored the models (EDIT THIS to your path if different)
MODEL_HOME = Path("newswire-topic-models")

# 2) All Newswire topic heads
TOPIC_MODELS = {
    "antitrust":        "dell-research-harvard/topic-antitrust",
    "civil_rights":     "dell-research-harvard/topic-civil_rights",
    "crime":            "dell-research-harvard/topic-crime",
    "govt_regulation":  "dell-research-harvard/topic-govt_regulation",
    "labor_movement":   "dell-research-harvard/topic-labor_movement",
    "politics":         "dell-research-harvard/topic-politics",
    "protests":         "dell-research-harvard/topic-protests",
    "sport":            "dell-research-harvard/topic-sport",
    "fire":             "dell-research-harvard/topic-fire",
    "weather":          "dell-research-harvard/topic-weather",
    "obits":            "dell-research-harvard/topic-obits",
}

# Helper: prefer local folder if present
def resolve_model_id(mid: str) -> str:
    local = MODEL_HOME / mid.split("/", 1)[1]
    return str(local) if local.exists() else mid

# If you want to force strictly-offline (error if model missing locally), set:
STRICT_OFFLINE = False  # True -> raise if local folder missing

DEVICE    = -1      # -1 = CPU; set 0 for CUDA-GPU, or None to auto-pick (MPS on Apple Silicon)
BATCH_SIZE= 8
TRUNC     = 4000
THRESH    = 0.5

_topic_pipes = {}
def get_pipe(model_id: str):
    model_ref = resolve_model_id(model_id)
    if STRICT_OFFLINE and not Path(model_ref).exists():
        raise FileNotFoundError(f"Local model not found: {model_ref}")
    if model_ref in _topic_pipes:
        return _topic_pipes[model_ref]
    try:
        _topic_pipes[model_ref] = pipeline("text-classification", model=model_ref, device=DEVICE)
    except Exception as e:
        print(f"[warn] Could not load {model_ref}: {type(e).__name__}: {e}")
        _topic_pipes[model_ref] = None
    return _topic_pipes[model_ref]

def _to_binary_label(hf_output, threshold=THRESH):
    if not hf_output:
        return 0
    rec = hf_output[0] if isinstance(hf_output, list) else hf_output
    label = str(rec.get("label","")).upper()
    score = float(rec.get("score", 0.0))
    is_pos = ("1" in label) or ("POS" in label) or ("YES" in label)
    return 1 if (is_pos and score >= threshold) else 0

def score_one_model(model_id: str, texts: list[str]) -> list[int]:
    clf = get_pipe(model_id)
    if clf is None:
        return [0]*len(texts)
    # retry once on transient errors
    try:
        preds = clf([ (t or "")[:TRUNC] for t in texts ], batch_size=BATCH_SIZE, truncation=True)
    except Exception:
        time.sleep(1.0)
        preds = clf([ (t or "")[:TRUNC] for t in texts ], batch_size=BATCH_SIZE, truncation=True)
    return [_to_binary_label(p) for p in preds]

# Cache predictions so re-runs are quick
CACHE_PATH = Path(OUTPUT_ROOT) / "cache" / "topic_preds.jsonl"
CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
_cache = {}
if CACHE_PATH.exists():
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        for line in f:
            j = json.loads(line)
            _cache[(j["source_file"], int(j["unit_index"]))] = j["topics"]

def _write_cache_row(key, topics_dict):
    with open(CACHE_PATH, "a", encoding="utf-8") as f:
        f.write(json.dumps({"source_file": key[0], "unit_index": key[1], "topics": topics_dict}) + "\n")

# Mini-batch over rows to reduce per-model calls
B = 32
pending_keys, pending_texts = [], []
topics_all = []

def flush_pending():
    global pending_keys, pending_texts, topics_all
    if not pending_texts:
        return
    batch = {t: score_one_model(mid, pending_texts) for t, mid in TOPIC_MODELS.items()}
    for i, key in enumerate(pending_keys):
        row_topics = {t: int(batch[t][i]) for t in TOPIC_MODELS}
        topics_all.append(row_topics)
        _write_cache_row(key, row_topics)
    pending_keys.clear(); pending_texts.clear()

for _, row in df.iterrows():
    key = (row["source_file"], int(row["unit_index"]))
    if key in _cache:
        topics_all.append(_cache[key])
        continue
    pending_keys.append(key); pending_texts.append(row["text_clean"])
    if len(pending_texts) >= B:
        flush_pending()
flush_pending()

df["topics"] = topics_all
for t in TOPIC_MODELS:
    df[t] = df["topics"].map(lambda d: int(d.get(t, 0)))

# Quick tally so you can see it worked
display(df[[*TOPIC_MODELS.keys()]].sum())


Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu


antitrust          0
civil_rights       0
crime              0
govt_regulation    0
labor_movement     1
politics           0
protests           0
sport              0
fire               0
weather            0
obits              0
dtype: int64

#### 8. Build Newswire-like records & export Issue-bucketed JSONL
- Read the Newswire JSONL and extract a tidy table:
  - `year`, `source_file`, `issue`, `art_id`, `article`, `page`
  - `label`, `type`, `street`, `city`, `state`, `country`, `postcode`
  - `longitude`, `latitude`, `location_notes`
- Save to `locations_mentioned.csv` for spreadsheet/curation workflows.

In [146]:
def to_newswire_like(row):
    words, labels, ents = row["_ner"]
    ppl   = people_from_ner(ents)
    linked = [link_person(p, row["text_clean"]) for p in ppl]
    linked = [x for x in linked if x]

    return {
        "year": int(row["year"]),
        "dates": [],
        "article": row["text_clean"],
        "byline": "",
        "newspaper_metadata": [],
        "source_file": row.get("source_file"),  # full path or filename from Step 2
        "art_id": row.get("id"),                # map DF 'id' -> JSONL 'art_id'
        "page"

        # topics (present if you ran Step 7)
        "antitrust":        int(row.get("antitrust", 0)),
        "civil_rights":     int(row.get("civil_rights", 0)),
        "crime":            int(row.get("crime", 0)),
        "govt_regulation":  int(row.get("govt_regulation", 0)),
        "labor_movement":   int(row.get("labor_movement", 0)),
        "politics":         int(row.get("politics", 0)),
        "protests":         int(row.get("protests", 0)),
        "sport":            int(row.get("sport", 0)),
        "fire":             int(row.get("fire", 0)),
        "weather":          int(row.get("weather", 0)),
        "obits":            int(row.get("obits", 0)),

        # NER (BIO)
        "ner_words":  words,
        "ner_labels": labels,

        # New: all resolved locations
        "locations_mentioned": row.get("locations_mentioned", []),

        # Optional: keep for audit (not required in the final JSON)
        # "resolved_locations": row.get("resolved_locations", []),

        # People
        "people_mentioned": linked,
    }


out_dir = Path(OUTPUT_ROOT) / "pi_newswire_like"
out_dir.mkdir(parents=True, exist_ok=True)

def write_year_buckets(frame: pd.DataFrame):
    for y, g in frame.groupby("year"):
        outp = out_dir / f"{y}_pi_newswire_like.jsonl"
        with open(outp, "w", encoding="utf-8") as f:
            for _, row in tqdm(g.iterrows(), total=len(g), desc=f"Writing {y}"):
                f.write(json.dumps(to_newswire_like(row), ensure_ascii=False) + "\n")
        print("Wrote", outp)

write_year_buckets(df)


Writing 1979:   0%|          | 0/3 [00:00<?, ?it/s]

Wrote /Users/stepanyan/Documents/UCL/GitHub-Projects/Newspaper-Semantic-Enrichment/2-semantic-processing/output-semantic/pi_newswire_like/1979_pi_newswire_like.jsonl


In [147]:
# --- Step 8B: Export issue-based JSONL (preserve year folders + filename) ---
from pathlib import Path

out_dir_issue = OUTPUT_ROOT / "newswire-format-issues"
out_dir_issue.mkdir(parents=True, exist_ok=True)

def to_newswire_like(row):
    words, labels, ents = row["_ner"]
    ppl   = people_from_ner(ents)
    linked = [link_person(p, row["text_clean"]) for p in ppl]
    linked = [x for x in linked if x]

    return {
        "year": int(row["year"]),
        "dates": [],
        "article": row["text_clean"],
        "byline": "",
        "newspaper_metadata": [],
        "source_file": row.get("source_file"),  # full path or filename from Step 2
        "art_id": row.get("id"),                # map DF 'id' -> JSONL 'art_id'
        "page"

        # topics (present if you ran Step 7)
        "antitrust":        int(row.get("antitrust", 0)),
        "civil_rights":     int(row.get("civil_rights", 0)),
        "crime":            int(row.get("crime", 0)),
        "govt_regulation":  int(row.get("govt_regulation", 0)),
        "labor_movement":   int(row.get("labor_movement", 0)),
        "politics":         int(row.get("politics", 0)),
        "protests":         int(row.get("protests", 0)),
        "sport":            int(row.get("sport", 0)),
        "fire":             int(row.get("fire", 0)),
        "weather":          int(row.get("weather", 0)),
        "obits":            int(row.get("obits", 0)),

        # NER (BIO)
        "ner_words":  words,
        "ner_labels": labels,

        # New: all resolved locations
        "locations_mentioned": row.get("locations_mentioned", []),

        # Optional: keep for audit (not required in the final JSON)
        # "resolved_locations": row.get("resolved_locations", []),

        # People
        "people_mentioned": linked,
    }

def write_issue_buckets(frame: pd.DataFrame):
    # Keep stable ordering within each issue
    sort_cols = [c for c in ["year", "source_file", "unit_index"] if c in frame.columns]
    gframe = frame.sort_values(sort_cols, kind="stable")

    # Group by issue (source_file) within each year
    for (y, src), g in gframe.groupby(["year", "source_file"], sort=False):
        year_dir = out_dir_issue / str(int(y))
        year_dir.mkdir(parents=True, exist_ok=True)

        # Keep original filename but change extension to .jsonl
        fname = Path(src).with_suffix(".jsonl").name
        outp = year_dir / fname

        with open(outp, "w", encoding="utf-8") as f:
            for _, row in tqdm(g.iterrows(), total=len(g), desc=f"Writing {fname}"):
                f.write(json.dumps(to_newswire_like(row), ensure_ascii=False) + "\n")

        print("Wrote", outp)

write_issue_buckets(df)


Writing Pi-Newspaper-1979-cleaned.jsonl:   0%|          | 0/3 [00:00<?, ?it/s]

Wrote /Users/stepanyan/Documents/UCL/GitHub-Projects/Newspaper-Semantic-Enrichment/2-semantic-processing/output-semantic/newswire-format-issues/1979/Pi-Newspaper-1979-cleaned.jsonl


#### 9. Per-issue CSV summary with Topics, NERs, Geolocations. (Optional)


In [148]:
import re
from pathlib import Path
from collections import Counter
import pandas as pd

# -------- Helper: issue id from filename --------
ISSUE_RX = re.compile(r"^pi_(.+?)_(19|20)\d\d$")  # captures between 'pi_' and '_YYYY'
def issue_id_from_path(p: str) -> str:
    stem = Path(p).stem  # e.g., 'pi_vol_2_3_1948-cleaned' -> 'pi_vol_2_3_1948-cleaned'
    stem = stem.replace("-cleaned","")
    m = ISSUE_RX.match(stem)
    if m:
        return m.group(1)  # e.g., '1', 'vol_2_3', '19_first_edition', 'rag'
    # fallback: remove prefix 'pi_' and year suffix
    s = stem
    if s.startswith("pi_"): s = s[3:]
    s = re.sub(r"_(19|20)\d\d$", "", s)
    return s

df["issue_id"] = df["source_file"].map(issue_id_from_path)

# -------- Topics per issue --------
topic_cols_all = [
    "antitrust","civil_rights","crime","govt_regulation","labor_movement",
    "politics","protests","sport","fire","weather","obits"
]
topic_cols = [c for c in topic_cols_all if c in df.columns]

def topics_for_group(g: pd.DataFrame):
    present = [t for t in topic_cols if g[t].sum() > 0]
    return present, len(present)

# -------- NERs per issue (top 25 by frequency) --------
ALLOWED_NER = {"PERSON","ORG","GPE","LOC","FAC","EVENT","WORK_OF_ART","LAW","NORP"}
def ners_for_group(g: pd.DataFrame, topk=25):
    cnt = Counter()
    for item in g["_ner"]:
        if not isinstance(item, (list, tuple)) or len(item) < 3:
            continue
        _, _, ents = item
        for text, label in ents:
            if label in ALLOWED_NER:
                norm = text.strip().strip('.,;:"\'()[]')
                if norm:
                    cnt[norm] += 1
    n_unique = len(cnt)
    listed = [f"{k} ({v})" for k,v in cnt.most_common(topk)]
    return listed, n_unique

# -------- Geolocations per issue (top 20) --------
def geos_for_group(g: pd.DataFrame, topk=20):
    cnt = Counter()
    for L in g.get("resolved_locations", []):
        if not isinstance(L, list):
            continue
        for r in L:
            parts = [r.get("street"), r.get("council"), r.get("city"), r.get("region"), r.get("country")]
            name = ", ".join([p for p in parts if p])
            if not name:
                name = r.get("display_name") or ""
            name = name.strip().strip(",")
            if name:
                cnt[name] += 1
    n_unique = len(cnt)
    listed = [f"{k} ({v})" for k,v in cnt.most_common(topk)]
    return listed, n_unique

# -------- People Mentioned per issue (top 20) --------
def people_for_group(g: pd.DataFrame, topk=20):
    cnt = Counter()

    # Prefer the explicit column if it exists; otherwise derive from NER
    if "people_mentioned" in g.columns:
        series = g["people_mentioned"]
    else:
        # Fallback: derive from NER spans (expects people_from_ner and _ner available)
        def _from_ner(item):
            if not isinstance(item, (list, tuple)) or len(item) < 3:
                return []
            _, _, ents = item
            try:
                names = people_from_ner(ents)  # returns list of strings
            except NameError:
                names = [text for (text, label) in ents if label == "PERSON" and len(text.split()) >= 2]
            return [{"person_name": n} for n in names]
        series = g["_ner"].map(_from_ner) if "_ner" in g.columns else pd.Series([[]]*len(g), index=g.index)

    for L in series:
        if not isinstance(L, list):
            continue
        for r in L:
            # r may be a dict from linker, or a bare string if upstream changed
            if isinstance(r, dict):
                name = r.get("person_name") or r.get("name") or ""
            else:
                name = str(r) if r is not None else ""
            name = re.sub(r"\s+", " ", name).strip().strip('.,;:"\'()[]')
            if name:
                cnt[name] += 1

    n_unique = len(cnt)
    listed = [f"{k} ({v})" for k, v in cnt.most_common(topk)]
    return listed, n_unique
    

# -------- Build summary table --------
rows = []
for (year, issue), g in df.groupby(["year","issue_id"], sort=True):
    topics_list, topics_n = topics_for_group(g)
    ners_list, ners_n     = ners_for_group(g)
    geos_list, geos_n     = geos_for_group(g)
    people_list, people_n = people_for_group(g)
    rows.append({
        "Year": year,
        "Issue": issue,
        "List of Topics": ", ".join(topics_list),
        "Number of Topics": topics_n,
        "List of NERs (top 25)": "; ".join(ners_list),
        "Number of NERs (unique)": ners_n,
        "List of Geolocations (top 20)": "; ".join(geos_list),
        "Number of Geolocations (unique)": geos_n,
        "List of People Mentioned (top 20)": "; ".join(people_list),
        "Number of People Mentioned (unique)": people_n,
        "Articles in Issue": len(g),
    })

summary = pd.DataFrame(rows).sort_values(["Year","Issue"]).reset_index(drop=True)
summary
summary.to_csv("pi_newswire_output_summary.csv", index=False, encoding="utf-8")  

#### 10. Convert JSONL output to CSV for Wikidata upload
##### 10 a. Extract People Mentioned
This section iterates through the files in outpu-semantic->newswire-format-issues and creates a single file


In [None]:
# --- Step 9: Extract people from JSONL into a CSV (basename + issue) ---
import json
import pandas as pd
from pathlib import Path

people_rows = []

# Path to Step 8 JSONL exports
jsonl_dir = Path(OUTPUT_ROOT) / "newswire-format-issues"
CSV_DIR   =  OUTPUT_ROOT/"csv-exports"
CSV_DIR.mkdir(parents=True, exist_ok=True)

out_csv   = Path(CSV_DIR) / "people_mentioned.csv"


def make_issue(name: str | None) -> str | None:
    """From a filename (no dirs), strip '-cleaned.jsonl' (preferred), else '-cleaned.json',
    else just remove .jsonl/.json. Return None if name is falsy."""
    if not name:
        return None
    s = name
    for suf in ("-cleaned.jsonl", "-cleaned.json", ".jsonl", ".json"):
        if s.endswith(suf):
            s = s[: -len(suf)]
            break
    return s

for f in sorted(jsonl_dir.glob("*.jsonl")):
    with open(f, "r", encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            rec = json.loads(line)

            year        = rec.get("year")
            sf_full     = rec.get("source_file")
            sf_name     = Path(sf_full).name if sf_full else None  # [1] basename only
            issue_name  = make_issue(sf_name)                      # [2] derived issue

            art_id      = rec.get("art_id")
            page        = rec.get("page")

            # Expecting a list of dicts in "people_mentioned"
            for p in rec.get("people_mentioned", []):
                # Defensive parsing: handle dicts or plain strings
                if isinstance(p, dict):
                    name        = p.get("name")
                    wikidata_id = p.get("wikidata_id")
                    person_name = p.get("person_name", name)
                else:
                    # fallback if it's just a string
                    name        = str(p)
                    wikidata_id = None
                    person_name = str(p)

                people_rows.append({
                    "year": year,
                    "source_file": sf_name,  # basename only
                    "issue": issue_name,     # new column
                    "art_id": art_id,
                    "page": page,
                    "name": name,
                    "wikidata_id": wikidata_id,
                    "person_name": person_name,
                })

# Convert to DataFrame
people_df = pd.DataFrame(people_rows)

# Save to CSV
people_df.to_csv(out_csv, index=False, encoding="utf-8")
print(f"Wrote {len(people_df)} rows to {out_csv}")

people_df.head(10)


Wrote 0 rows to /Users/stepanyan/Documents/UCL/GitHub-Projects/Newspaper-Semantic-Enrichment/2-semantic-processing/output-semantic/csv-exports/locations_mentioned.csv


#### 10. Convert JSONL output to CSV for Wikidata upload
##### 10 b. Extract Places Mentioned
This section iterates through the files in output-semantic->csv-
exports and creates a single file


In [None]:
# --- Step 10: Extract locations from JSONL into a CSV (with components) ---
import json
import pandas as pd
from pathlib import Path

# ---- Inputs/outputs
jsonl_dir = Path(OUTPUT_ROOT) / "newswire-format-issues"
CSV_DIR   =  OUTPUT_ROOT/"csv-exports"
CSV_DIR.mkdir(parents=True, exist_ok=True)

out_csv   = Path(CSV_DIR) / "locations_mentioned.csv"

# ---- Helpers
def make_issue(name: str | None) -> str | None:
    """From a filename (no dirs), strip '-cleaned.jsonl' (preferred), else '-cleaned.json',
    else just remove .jsonl/.json. Return None if name is falsy."""
    if not name:
        return None
    s = Path(name).name
    for suf in ("-cleaned.jsonl", "-cleaned.json", ".jsonl", ".json"):
        if s.endswith(suf):
            return s[: -len(suf)]
    return Path(s).stem

def to_float(x):
    try:
        return float(x)
    except Exception:
        return None

def parse_location(loc) -> dict:
    """
    Normalize a location entry (dict or str) to:
    label, type, street, city, state, country, postcode, longitude, latitude, location_notes
    Handles our structured output (with 'coordinates') and common Nominatim shapes.
    """
    # Defaults
    out = {
        "label": None,
        "type": None,
        "street": None,
        "city": None,
        "state": None,
        "country": None,
        "postcode": None,
        "longitude": None,
        "latitude": None,
        "location_notes": None,
    }

    # If it's a plain string, store it as a note
    if not isinstance(loc, dict):
        out["location_notes"] = str(loc)
        return out

    d = loc

    # 0) New enriched fields from upstream
    out["label"]   = (d.get("label") or d.get("mention") or None)
    out["type"]    = (d.get("type")  or d.get("kind")    or None)
    out["street"]  = d.get("street")
    out["postcode"] = d.get("postcode")

    # 1) Direct city/state/country fields (with fallbacks)
    out["city"]    = d.get("city")    or d.get("town") or d.get("village") or d.get("municipality")
    out["state"]   = d.get("state")   or d.get("region") or d.get("province")
    out["country"] = d.get("country")

    # 2) Coordinates:
    # Prefer unified 'coordinates' = [lat, lon] if present, else fall back to lat/lon keys.
    coords = d.get("coordinates")
    if isinstance(coords, (list, tuple)) and len(coords) >= 2:
        out["latitude"]  = to_float(coords[0])
        out["longitude"] = to_float(coords[1])
    else:
        lon = d.get("longitude", d.get("lon", d.get("lng")))
        lat = d.get("latitude",  d.get("lat"))
        # GeoJSON-ish geometry fallback
        if lon is None or lat is None:
            geom = d.get("geometry") or d.get("geo") or d.get("point")
            if isinstance(geom, dict):
                if "coordinates" in geom and isinstance(geom["coordinates"], (list, tuple)) and len(geom["coordinates"]) >= 2:
                    lon = geom["coordinates"][0]
                    lat = geom["coordinates"][1]
                else:
                    lon = lon or geom.get("lon") or geom.get("lng")
                    lat = lat or geom.get("lat")
        out["longitude"] = to_float(lon)
        out["latitude"]  = to_float(lat)

    # 3) Nominatim-style address fallback (fills missing street/city/state/country/postcode)
    addr = d.get("address")
    if isinstance(addr, dict):
        out["street"]   = out["street"]   or addr.get("road") or addr.get("pedestrian") or addr.get("residential") or addr.get("street")
        out["city"]     = out["city"]     or addr.get("city") or addr.get("town") or addr.get("village") or addr.get("hamlet") or addr.get("suburb")
        out["state"]    = out["state"]    or addr.get("state") or addr.get("region") or addr.get("province")
        out["country"]  = out["country"]  or addr.get("country")
        out["postcode"] = out["postcode"] or addr.get("postcode")

    # 4) Notes / display name / provenance
    # Prefer explicit 'location_notes' if present; otherwise collect a few fields.
    if d.get("location_notes"):
        out["location_notes"] = d["location_notes"]
    else:
        parts = []
        for key in ("display_name", "source", "confidence"):
            v = d.get(key)
            if v is not None and str(v).strip():
                parts.append(f"{key}={v}")
        out["location_notes"] = "; ".join(parts) if parts else None

    return out

# ---- Main extraction
rows = []

for f in sorted(jsonl_dir.glob("*.jsonl")):
    with open(f, "r", encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            rec = json.loads(line)

            year        = rec.get("year")
            sf_full     = rec.get("source_file")
            sf_name     = Path(sf_full).name if sf_full else None
            issue_name  = make_issue(sf_name)

            art_id      = rec.get("art_id")
            page        = rec.get("page")
            article     = rec.get("article") or rec.get("text")  # be tolerant

            # locations may be saved under a few different keys; try them in order
            loc_list = (
                rec.get("locations_mentioned")
                or rec.get("_locations_structured")
                or rec.get("resolved_locations")
                or rec.get("locations")
                or []
            )
            if not isinstance(loc_list, list):
                loc_list = [loc_list]

            for loc in loc_list:
                norm = parse_location(loc)
                rows.append({
                    "year": year,
                    "source_file": sf_name,    # basename only
                    "issue": issue_name,       # derived issue name
                    "art_id": art_id,
                    "article": article,
                    "page": page,
                    # NEW enriched fields
                    "label": norm["label"],
                    "type": norm["type"],
                    "street": norm["street"],
                    "postcode": norm["postcode"],
                    # Existing fields
                    "city": norm["city"],
                    "state": norm["state"],
                    "country": norm["country"],
                    "longitude": norm["longitude"],
                    "latitude": norm["latitude"],
                    "location_notes": norm["location_notes"],
                })

# ---- DataFrame + save
loc_df = pd.DataFrame(rows)
out_csv.parent.mkdir(parents=True, exist_ok=True)
loc_df.to_csv(out_csv, index=False, encoding="utf-8")
print(f"Wrote {len(loc_df)} rows to {out_csv}")

loc_df.head(10)


Wrote 0 rows to /Users/stepanyan/Documents/UCL/GitHub-Projects/Newspaper-Semantic-Enrichment/2-semantic-processing/output-semantic/csv-exports/locations_mentioned.csv
