#### Pre-Processing Pipeline

This notebook processes OCR JSON files from newspaper digitization projects in two stages:

##### Stage 1: Text Cleaning
- Normalizes Unicode characters (curly quotes → straight quotes)  
- Collapses dotted abbreviations (U.C.L. → UCL)
- Preserves original JSON structure
- Outputs files with `-cleaned.json` suffix

##### How it works
- Recursively finds all `.json` files (excluding already cleaned ones)
- Normalizes quote characters and abbreviations  
- Preserves complete JSON structure
- Reports processing statistics and any errors

**Output**: Creates a mirrored directory structure under `output-pages-cleaned/` containing the cleaned files.

In [32]:
# --- Clean page JSONs, save cleaned pages, then (optionally) combine per issue --- #
from pathlib import Path
import json, re

# Anchor every data folder to the directory of this notebook/script.
# A .py script defines __file__; a Jupyter kernel does not, so fall back to the CWD.
BASE_DIR = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()

# Folder layout (all directly under BASE_DIR)
INPUT_ROOT = BASE_DIR / "input-pages"                                # source pages
OUTPUT_PAGES_ROOT = BASE_DIR / "output-pages-cleaned"                # cleaned pages
OUTPUT_ISSUES_ROOT = BASE_DIR / "output-issues-combined-cleaned"     # combined issues

# Make sure both output roots exist before anything is written
for _root in (OUTPUT_PAGES_ROOT, OUTPUT_ISSUES_ROOT):
    _root.mkdir(parents=True, exist_ok=True)


# Replicate the input folder tree (root included) under OUTPUT_PAGES_ROOT.
# The directory list is fully built first, then the mirrored folders are created.
_input_dirs = [INPUT_ROOT]
_input_dirs.extend(p for p in INPUT_ROOT.rglob("*") if p.is_dir())
for _src_dir in _input_dirs:
    _mirrored = OUTPUT_PAGES_ROOT / _src_dir.relative_to(INPUT_ROOT)
    _mirrored.mkdir(parents=True, exist_ok=True)


# --- Text cleaning helpers ----------------------------------------------------
def normalize_quotes(s: str) -> str:
    """Return ``s`` with curly quotation marks replaced by straight ASCII quotes.

    U+2018 / U+2019 become ``'`` and U+201C / U+201D become ``"``.
    All other characters are left untouched.
    """
    curly_to_straight = {
        0x2018: "'",   # left single quotation mark
        0x2019: "'",   # right single quotation mark
        0x201C: '"',   # left double quotation mark
        0x201D: '"',   # right double quotation mark
    }
    return s.translate(curly_to_straight)

# Collapse dotted abbreviations like U.C.L., U.S.A., incl. spaced variants U. S. A. → UCL/USA
#
# _abbr_with_space: single letters separated by dots, with optional whitespace on
#   EITHER side of each dot (so "U. S. A." and "U .S .A ." both match).
#   - (?![A-Za-z]) stops the final letter from being the first letter of a longer
#     word, so "J. Smith" is left alone instead of becoming "JSmith".
#   - (?:\s*\.)? consumes trailing whitespace only when a closing dot follows it;
#     otherwise the whitespace stays in the text ("U.S.A troops" keeps its space).
# _abbr_no_space: the strict no-whitespace form, kept as a second pass.
_abbr_with_space = re.compile(r'\b(?:[A-Za-z]\s*\.\s*)+[A-Za-z](?![A-Za-z])(?:\s*\.)?')
_abbr_no_space   = re.compile(r'\b(?:[A-Za-z]\.){1,}[A-Za-z]\.?')
def collapse_dotted_abbreviations(s: str) -> str:
    """Collapse dotted single-letter abbreviations into their bare letters.

    Examples: ``"U.C.L."`` -> ``"UCL"``, ``"U. S. A."`` -> ``"USA"``,
    ``"e.g."`` -> ``"eg"``.

    Args:
        s: Text to process.

    Returns:
        ``s`` with every matched abbreviation replaced by its letters only
        (dots and the whitespace between the letters are dropped).

    Note:
        The closing dot of an abbreviation is removed as well, so a sentence-ending
        period directly after an abbreviation is lost. Two adjacent single-letter
        tokens separated by ``". "`` (e.g. ``"A. I"``) are also merged, because
        they are indistinguishable from a spaced abbreviation.
    """
    def repl(m):  # keep letters only
        return ''.join(ch for ch in m.group(0) if ch.isalpha())
    s = _abbr_with_space.sub(repl, s)
    s = _abbr_no_space.sub(repl, s)
    return s

def clean_string(s: str) -> str:
    """Apply the full text-cleaning chain to one string.

    Quotes are normalized first; dotted abbreviations are collapsed on the result.
    """
    return collapse_dotted_abbreviations(normalize_quotes(s))

def clean_json(obj):
    """Return a cleaned copy of a JSON-like value, keeping its shape intact.

    Dicts and lists are rebuilt recursively, string values go through
    ``clean_string``, and every other value is returned as-is.
    Dictionary keys are copied unchanged; only values are cleaned.
    """
    if isinstance(obj, dict):
        cleaned_map = {}
        for key, value in obj.items():
            cleaned_map[key] = clean_json(value)
        return cleaned_map
    if isinstance(obj, list):
        return list(map(clean_json, obj))
    if isinstance(obj, str):
        return clean_string(obj)
    return obj

# --- Start the process of Cleaning JSON files for all the pages ------------------------------------------------------------
# processed: files cleaned and written; skipped: inputs that already carry the
# "-cleaned.json" suffix; errors: (path, message) pairs for files that failed.
processed, skipped, errors = 0, 0, []

# sorted() makes the processing (and error-report) order deterministic across runs.
for f in sorted(INPUT_ROOT.rglob("*.json")):
    # Skip any files that look already cleaned, but count them so the summary is honest
    if f.name.endswith("-cleaned.json"):
        skipped += 1
        continue
    rel = f.relative_to(INPUT_ROOT)
    out_dir = OUTPUT_PAGES_ROOT / rel.parent
    out_path = out_dir / (f.stem + "-cleaned.json")

    # Each stage records its failure and moves on, so one bad file never stops the batch.
    try:
        with open(f, "r", encoding="utf-8") as fh:
            data = json.load(fh)
    except Exception as e:
        errors.append((str(f), f"read: {e}"))
        continue

    try:
        cleaned = clean_json(data)
    except Exception as e:
        errors.append((str(f), f"clean: {e}"))
        continue

    try:
        # Do not rely on an earlier mirroring step: make sure the target folder exists now.
        out_dir.mkdir(parents=True, exist_ok=True)
        with open(out_path, "w", encoding="utf-8") as oh:
            json.dump(cleaned, oh, ensure_ascii=False, indent=2)
        processed += 1
    except Exception as e:
        errors.append((str(out_path), f"write: {e}"))

print(f"Done. Processed: {processed} file(s). Skipped: {skipped}. Errors: {len(errors)}.")
if errors:
    for path, msg in errors[:10]:  # show up to first 10 errors
        print(f"- {path} -> {msg}")
    if len(errors) > 10:
        print(f"... and {len(errors)-10} more")
print(f"Cleaned files are under: {OUTPUT_PAGES_ROOT}")


Done. Processed: 3 file(s). Errors: 0.
Cleaned files are under: /Users/stepanyan/Documents/UCL/GitHub-Projects/Newspaper-Semantic-Enrichment/1-pre-processing/output-pages-cleaned


##### Stage 2: Page Combination

- Groups pages by issue using filename patterns (`base_001-cleaned.json`, `base_002-cleaned.json` → `base-cleaned.json`)
- Adds sequential article IDs and page numbers
- Creates consolidated JSON files with consistent structure
- Maintains directory hierarchy


##### How it works
**Input Pattern**: Files like `pi_vol_7_5_1951_001-cleaned.json`, `pi_vol_7_5_1951_002-cleaned.json`  
**Output**: Single file `pi_vol_7_5_1951-cleaned.json` containing all articles with:
- Sequential article IDs (1, 2, 3...)
- Source page numbers for each article
- Consistent `{"articles": [...]}` structure

**Processing Summary**: Shows pages combined, article counts, and output paths for verification.

In [33]:
# Combine multi-page OCR JSONs into one file per issue.
# Jupyter-friendly: hard-coded paths, no CLI args.

import os, re, json
from json import JSONDecodeError
from collections import defaultdict
from typing import List, Tuple, Union, Any

# ---- EDIT THESE TWO LINES IF YOUR PATHS CHANGE ----
# Stage 2 reads the cleaned pages and writes the combined issues.
# NOTE: this rebinds INPUT_ROOT; from here on it refers to the cleaned-pages folder.
INPUT_ROOT  = OUTPUT_PAGES_ROOT
OUTPUT_ROOT = OUTPUT_ISSUES_ROOT
# ---------------------------------------------------

# NOTE: assert_paths() calls os.makedirs(OUTPUT_ROOT) regardless of DRY_RUN.
OVERWRITE = False      # set True to overwrite existing combined files
DRY_RUN   = False      # set True to preview without writing

# Matches multi-page files like <base>_001-cleaned.json
# The page number must be exactly three digits; <base> is matched lazily.
PAGE_FILE_RE    = re.compile(r"^(?P<base>.+?)_(?P<page>\d{3})-cleaned\.json$")
# Matches single-page cleaned file like <base>-cleaned.json (no _NNN)
# Any "-cleaned.json" name that PAGE_FILE_RE rejects is handled by this pattern.
SINGLE_FILE_RE  = re.compile(r"^(?P<base>.+?)-cleaned\.json$")

def assert_paths():
    """Validate the configured roots and create OUTPUT_ROOT if it is missing.

    Reads the module-level INPUT_ROOT and OUTPUT_ROOT.

    Raises:
        FileNotFoundError: INPUT_ROOT is not an existing directory.
        ValueError: OUTPUT_ROOT is the same folder as INPUT_ROOT or lies inside it.
            Combined files written there would be picked up as input pages on the
            next run.
    """
    if not os.path.isdir(INPUT_ROOT):
        raise FileNotFoundError(f"INPUT_ROOT not found: {INPUT_ROOT}")
    # Prevent accidental recursion if OUTPUT_ROOT is (or is inside) INPUT_ROOT.
    # realpath resolves symlinks and ".." segments; normcase makes the comparison
    # case-insensitive on platforms where paths are.
    in_real = os.path.normcase(os.path.realpath(INPUT_ROOT))
    out_real = os.path.normcase(os.path.realpath(OUTPUT_ROOT))
    try:
        # commonpath == in_real covers both "identical" and "nested below".
        overlaps = os.path.commonpath([in_real, out_real]) == in_real
    except ValueError:
        # Raised e.g. for paths on different drives; they cannot overlap.
        overlaps = False
    if overlaps:
        raise ValueError("OUTPUT_ROOT must not be inside INPUT_ROOT. Choose a sibling or separate folder.")
    os.makedirs(OUTPUT_ROOT, exist_ok=True)

def mirror_dir_structure(in_root: str, out_root: str, dry_run: bool = False):
    """Recreate the directory tree of ``in_root`` (directories only) under ``out_root``.

    Args:
        in_root: Root of the tree to walk.
        out_root: Root under which the same relative directories are created.
        dry_run: When True the tree is still walked but nothing is created.
    """
    for current_dir, _subdirs, _files in os.walk(in_root):
        relative = os.path.relpath(current_dir, start=in_root)
        # The root itself has relative path "." and maps straight onto out_root.
        if relative == os.curdir:
            destination = out_root
        else:
            destination = os.path.join(out_root, relative)
        if dry_run:
            continue
        os.makedirs(destination, exist_ok=True)

def find_issue_groups(root: str):
    """
    Walk ``root`` and group cleaned page files by issue.

    Returns a mapping
        (dirpath, base) -> [(page_num, abs_path, dirpath, filename, base), ...]
    with each list sorted by page number.

    Two filename shapes are recognised:
      • Multi-page:  <base>_NNN-cleaned.json  (NNN = 001, 002, …)
      • Single-page: <base>-cleaned.json      (recorded as page 1)

    When a directory holds both shapes for the same base, only the multi-page
    files are kept.
    """
    groups = defaultdict(list)
    multi_keys = set()  # (dirpath, base) pairs that own at least one _NNN file

    for dirpath, _, files in os.walk(root):
        cleaned_names = [name for name in files if name.endswith("-cleaned.json")]
        single_candidates = []

        # Multi-page files are registered first so they take precedence below.
        for name in cleaned_names:
            paged = PAGE_FILE_RE.match(name)
            if paged is None:
                single = SINGLE_FILE_RE.match(name)
                if single is not None:
                    single_candidates.append((name, single.group("base")))
                continue
            base = paged.group("base")
            key = (dirpath, base)
            multi_keys.add(key)
            groups[key].append(
                (int(paged.group("page")), os.path.join(dirpath, name), dirpath, name, base)
            )

        # Singletons are only accepted when no multi-page set claimed the same base.
        for name, base in single_candidates:
            key = (dirpath, base)
            if key not in multi_keys:
                groups[key].append((1, os.path.join(dirpath, name), dirpath, name, base))

    # Order every group's entries by page number
    for entries in groups.values():
        entries.sort(key=lambda entry: entry[0])
    return groups

def load_json_or_text(path: str):
    """Read ``path`` as UTF-8 and return ``(payload, kind)``.

    ``kind`` is ``"json"`` when the content parses as JSON (payload is the parsed
    value) and ``"text"`` otherwise (payload is the raw file content).
    """
    with open(path, "r", encoding="utf-8") as fh:
        raw = fh.read()
    try:
        parsed = json.loads(raw)
    except JSONDecodeError:
        return raw, "text"
    return parsed, "json"

def _extract_articles(data: Any) -> list:
    """
    Normalize a single page payload into a list of articles:
      - {"articles": [...]} -> [...]
      - [...]                -> [...]
      - {...}                -> [ {...} ]
      - other                -> []
    """
    if isinstance(data, dict):
        if "articles" in data:
            arts = data["articles"]
            return arts if isinstance(arts, list) else [arts]
        return [data]
    if isinstance(data, list):
        return data
    return []

def combine_pages(paged_files: List[Tuple[int, str]]) -> Tuple[Union[dict, str], str]:
    """
    Merge the pages of one issue and return ``(combined, mode)``.

      - mode == 'json': every page parsed as JSON; ``combined`` is
        {"articles": [...]} where each article carries its source 'page' and a
        sequential 'id' starting at 1.
      - mode == 'text': at least one page was not JSON; ``combined`` is the pages
        joined by newlines (JSON pages re-serialized, text pages verbatim).
    """
    # Load every page before deciding on the output mode.
    loaded = [(page, *load_json_or_text(path)) for page, path in paged_files]

    if any(kind != "json" for _, _, kind in loaded):
        # Fallback: no article normalization is possible, so emit one line per page.
        as_text = "\n".join(
            json.dumps(payload, ensure_ascii=False) if kind == "json" else str(payload)
            for _, payload, kind in loaded
        )
        return as_text, "text"

    articles = []
    for page, payload, _ in loaded:
        for raw in _extract_articles(payload):
            # Copy dict articles; wrap anything else so it can be annotated.
            record = dict(raw) if isinstance(raw, dict) else {"_raw": raw}
            record["page"] = page
            record["id"] = len(articles) + 1  # running 1-based id across all pages
            articles.append(record)

    return {"articles": articles}, "json"

def ensure_out_dir(in_root: str, dirpath: str, out_root: str) -> str:
    """Create (if needed) and return the folder under ``out_root`` that mirrors ``dirpath``.

    ``dirpath`` is taken relative to ``in_root``; ``in_root`` itself maps to ``out_root``.
    """
    relative = os.path.relpath(dirpath, start=in_root)
    if relative == os.curdir:
        destination = out_root
    else:
        destination = os.path.join(out_root, relative)
    os.makedirs(destination, exist_ok=True)
    return destination

# ---- run ----
# Combine every (directory, base) group into one "<base>-cleaned.json" under OUTPUT_ROOT.
print(f"[INPUT ROOT]  {INPUT_ROOT}")
print(f"[OUTPUT ROOT] {OUTPUT_ROOT}")

assert_paths()
mirror_dir_structure(INPUT_ROOT, OUTPUT_ROOT, dry_run=DRY_RUN)

groups = find_issue_groups(INPUT_ROOT)
if not groups:
    print("No page groups found under INPUT_ROOT.")
else:
    total_written = 0
    for (dirpath, base), entries in groups.items():
        pages = [e[0] for e in entries]
        file_paths = [e[1] for e in entries]

        if DRY_RUN:
            # A preview must not touch the filesystem: ensure_out_dir() would create
            # the folder, so only compute where the file would go.
            rel_dir = os.path.relpath(dirpath, start=INPUT_ROOT)
            out_dir = os.path.join(OUTPUT_ROOT, rel_dir) if rel_dir != os.curdir else OUTPUT_ROOT
        else:
            out_dir = ensure_out_dir(INPUT_ROOT, dirpath, OUTPUT_ROOT)
        out_filename = f"{base}-cleaned.json"  # remove _NNN (or same name for the singleton)
        out_path = os.path.join(out_dir, out_filename)

        # Existing outputs are left alone unless OVERWRITE is set.
        if os.path.exists(out_path) and not OVERWRITE:
            print(f"[SKIP] exists: {os.path.relpath(out_path, start=OUTPUT_ROOT)}")
            continue

        paged_files = list(zip(pages, file_paths))  # (page, path)
        combined, mode = combine_pages(paged_files)

        rel_out = os.path.relpath(out_path, start=OUTPUT_ROOT)
        print(f"[WRITE] {rel_out}  pages={pages}  mode={mode}  articles={len(combined['articles']) if mode=='json' else 'N/A'}")

        if not DRY_RUN:
            with open(out_path, "w", encoding="utf-8") as fh:
                if mode == "json":
                    json.dump(combined, fh, ensure_ascii=False, indent=2)
                    fh.write("\n")
                else:
                    # Text fallback: combined is already a newline-joined string.
                    fh.write(combined)
        # Counted in dry runs too; the summary below then says "Would write".
        total_written += 1

    print(f"Done. {'Would write' if DRY_RUN else 'Wrote'} {total_written} file(s).")


[INPUT ROOT]  /Users/stepanyan/Documents/UCL/GitHub-Projects/Newspaper-Semantic-Enrichment/1-pre-processing/output-pages-cleaned
[OUTPUT ROOT] /Users/stepanyan/Documents/UCL/GitHub-Projects/Newspaper-Semantic-Enrichment/1-pre-processing/output-issues-combined-cleaned
[WRITE] Pi-Newspaper-1978-cleaned.json  pages=[1, 2]  mode=json  articles=6
[WRITE] Pi-Newspaper-1979-cleaned.json  pages=[1]  mode=json  articles=3
Done. Wrote 2 file(s).
