In [1]:
import pdfplumber
from pathlib import Path

# Relative path of the PDF handed to extract_text_pdfplumber() further down in this cell.
PDF_PATH = "2024damagescompendium.pdf"

def extract_text_pdfplumber(path):
    """Return one text string per page of the PDF at ``path``.

    A page for which pdfplumber yields no text is reported with a warning on
    stdout and stored as an empty string, so the returned list always has one
    entry per page (index 0 corresponds to page 1).
    """
    page_texts = []
    with pdfplumber.open(path) as pdf:
        for i, page in enumerate(pdf.pages):
            extracted = page.extract_text() or ""
            if extracted == "":
                # Nothing extractable: warn, but keep the slot so indices stay aligned.
                print(f"[warn] Page {i+1} had no extractable text (may be scanned).")
            page_texts.append(extracted)
    return page_texts

# Smoke test: extract every page, then show the page count and the first
# 1000 characters of page 1.
pages = extract_text_pdfplumber(PDF_PATH)

print(f"Extracted {len(pages)} pages")
print("Sample page text:", pages[0][:1000], sep="\n")


Extracted 667 pages
Sample page text:
COMPENDIUM OF DAMAGES AWARDED IN
PERSONAL INJURY ACTIONS ACROSS ONTARIO
JANUARY 1999 - OCTOBER 2024
THE HONOURABLE JUSTICE JAYE HOOPER
AND THE HONOURABLE JAMES B. CHADWICK, Q.C.
ANDREW CLARKE (University of Ottawa Common Law Student)
In Acknowledgment of the Contributions of
STEPHEN BLAIR (Sessional Professor – University of Ottawa)
SARAH SAAD (Associate – Low Murchison Radnoff LLP)
LIAM CARDILL (Partner – MBC Law)
ETHAN ZAVARELLA, HANEEN FAISAL, BENJAMIN ISAAK, PHILIP BYUN, PARISA KHAZRA, ALEX DIGIOVANNI,
CAMERON FYNNEY, JORDANA KROFT, CALEB TIMMERMANN, GWENDOLEN BOYLE, JACK KENT, ANTONIO
GIAMBERARDINO, KATARINA GERMANI, JULIAN COSENTINO, STEPHEN LAJEUNESSE, DENNIS MYERS, DAVID
TURNER, SEAN BAWDEN, LESLIE KIRK, CHRISTINA PARKES, MÉLANIE SICOTTE, DAWN SEARLE, ALEXANDRA
SCHORAH, SEETHA L. RAMANATHAN, MICHELLE LUTFY AND JACK KENT


In [6]:
#!/usr/bin/env python3
"""
parse_compendium_hierarchical.py

Produces:
  - compendium_parsed_hier.json   (single hierarchical JSON object, pretty-printed)
  - compendium_parsed.json        (newline-delimited JSON: one case per line)
  - compendium_parsed_preview.csv (CSV preview)
  (all three are written relative to the current working directory)

Assumptions & approach:
  - Uses pdfplumber only (install via pip).
  - Reads the Table of Contents area (searching the first 12 pages, plus up to 5 continuation pages) to build section names + printed page numbers.
  - Maps section names to actual PDF page indices by searching for the section heading text in extracted pages.
  - Splits each section into blocks using the appearance of the "table header" row or blank-line heuristics.
  - Extracts fields with conservative regexes and then structures records hierarchically under sections.
  - Breaks multi-injury rows into multiple injury entries nested under the same case.
"""

from pathlib import Path
import re
import json
import csv
import pdfplumber
from typing import List, Dict, Optional, Tuple

# Input PDF and output file names. All are plain relative-path strings:
# PDF_PATH is passed to extract_pages_text(), the OUT_* names are opened for
# writing by write_outputs().
PDF_PATH = "2024damagescompendium.pdf"
OUT_HIER_JSON = "compendium_parsed_hier.json"   # pretty-printed hierarchical JSON
OUT_NDJSON = "compendium_parsed.json"           # one JSON object per case per line
OUT_CSV = "compendium_parsed_preview.csv"       # flat CSV preview

# ---------- Utilities for extraction/parsing ----------

def extract_pages_text(path: Path) -> List[str]:
    """Return the pdfplumber-extracted text of every page, in document order.

    Pages that yield no text are represented by an empty string, so list
    index ``k`` always corresponds to page ``k + 1``.
    """
    with pdfplumber.open(path) as pdf:
        return [(page.extract_text() or "") for page in pdf.pages]

# A TOC entry line ends in a page number that is set off from the title either
# by a dot leader ("HEAD ........ 12") or by a wide gap ("HEAD      12").
_TOC_ENTRY_LINE_RE = re.compile(r'^.+?(?:\.{3,}[ \t]*|[ \t]{2,})\d{1,4}[ \t\r]*$', re.M)

def find_toc_block(pages: List[str], toc_anchor="TABLE OF CONTENTS", search_pages=12) -> Optional[Tuple[int,str]]:
    """Locate the table of contents and return ``(page_index, toc_text)``.

    The first ``search_pages`` pages are scanned (case-insensitively) for
    ``toc_anchor``. Starting from the page that contains it, up to five
    following pages are appended (joined with a blank line) for as long as
    each one still looks like a TOC page.

    A page "looks like TOC" when it contains a run of ten or more dots, or at
    least one line shaped like a TOC entry (title, dot leader or wide gap,
    trailing page number). The mere presence of a small number anywhere on
    the page is deliberately NOT sufficient: nearly every content page
    contains one, which would make the continuation check meaningless.

    Returns ``None`` when the anchor is not found.
    """
    anchor = toc_anchor.lower()
    for i in range(min(search_pages, len(pages))):
        first_page = pages[i]
        if not first_page or anchor not in first_page.lower():
            continue
        combined = first_page
        # The TOC may span several pages; stop at the first non-TOC page.
        for j in range(i + 1, min(len(pages), i + 6)):
            following = pages[j] or ""
            if re.search(r'\.{10,}', following) or _TOC_ENTRY_LINE_RE.search(following):
                combined += "\n\n" + following
            else:
                break
        return i, combined
    return None

def parse_toc_lines(toc_text: str) -> List[Tuple[str, Optional[int]]]:
    """
    Turn raw TOC text into a list of (section_name, printed_page_number).

    A line is accepted when it is either
      * a title followed by a dot leader (or, failing that, a gap of two or
        more whitespace characters) and a 1-4 digit page number, or
      * an all-caps line of at most six words (kept with page number None).
    Names are de-duplicated case-insensitively; the first occurrence wins and
    the original order is preserved.
    """
    dot_leader = re.compile(r'^(?P<section>.+?)\s+\.{3,}\s*(?P<pnum>\d{1,4})\s*$')
    wide_gap = re.compile(r'^(?P<section>.+?)\s{2,}(?P<pnum>\d{1,4})\s*$')

    # dict keyed by lower-cased name: insertion order gives "first wins" dedupe
    entries = {}
    for raw_line in toc_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        hit = dot_leader.match(line) or wide_gap.match(line)
        if hit:
            title = re.sub(r'\s+', ' ', hit.group('section')).strip()
            page_no = int(hit.group('pnum'))
        elif line.isupper() and len(line.split()) <= 6:
            # heading-like line without a page number on the same line
            title, page_no = line, None
        else:
            continue
        entries.setdefault(title.lower(), (title, page_no))
    return list(entries.values())

def map_sections_to_pages(sections: List[Tuple[str,Optional[int]]], pages: List[str]) -> List[Dict]:
    """
    Resolve each TOC section to the index of the page where it starts.

    Lookup order for every (name, printed_page) pair:
      1. the first page containing a line that, after normalisation, is
         exactly the section name (i.e. a standalone heading);
      2. the first page containing the name as a whole word/phrase, ignoring
         lines that look like TOC entries (dot leader or wide gap followed by
         a page number) so that the table of contents itself does not shadow
         the real heading;
      3. ``printed_page - 1`` when that is a valid index;
      4. 0 as a last resort.

    Normalisation lower-cases the text, turns every run of characters other
    than a-z / 0-9 into a single space and trims the result. Each page is
    normalised once up front instead of once per section.

    Returns a list of dicts {'name', 'printed_page', 'page_index'} sorted by
    page_index.
    """
    def _norm(text: str) -> str:
        return re.sub(r'[^a-z0-9]+', ' ', text.lower()).strip()

    toc_entry_tail = re.compile(r'(?:\.{3,}\s*|\s{2,})\d{1,4}\s*$')

    # Pre-compute, per page, the normalised non-TOC lines (as a set for the
    # heading test) and their space-joined text (for the whole-word test).
    page_line_sets = []
    page_texts = []
    for ptxt in pages:
        kept = []
        for line in (ptxt or "").splitlines():
            if toc_entry_tail.search(line):
                continue
            normalised = _norm(line)
            if normalised:
                kept.append(normalised)
        page_line_sets.append(set(kept))
        page_texts.append(" ".join(kept))

    n_pages = len(pages)
    mapped = []
    for name, printed in sections:
        key = _norm(name)
        found_index = None
        if key:
            # 1) standalone heading line
            found_index = next(
                (i for i, lines in enumerate(page_line_sets) if key in lines), None
            )
            # 2) whole-word occurrence anywhere in the (non-TOC) page text
            if found_index is None:
                as_word = re.compile(r'(?<![a-z0-9])' + re.escape(key) + r'(?![a-z0-9])')
                found_index = next(
                    (i for i, text in enumerate(page_texts) if as_word.search(text)), None
                )
        # 3) printed page number (1-based) as a direct index
        if found_index is None and printed is not None:
            candidate = printed - 1
            if 0 <= candidate < n_pages:
                found_index = candidate
        # 4) document fallback
        if found_index is None:
            found_index = 0
        mapped.append({'name': name, 'printed_page': printed, 'page_index': found_index})

    mapped.sort(key=lambda entry: entry['page_index'])
    return mapped

# ---------- Block splitting and field extraction ----------

TABLE_HEADER_RE = re.compile(r'Plaintiff\s+Defendant\s+Year\s+Citation\s+Court\s+Judge\s+Sex', re.I)

def split_section_into_blocks(section_pages_text: List[str]) -> List[str]:
    """
    Cut the text of one section into candidate per-case blocks.

    The pages are joined with a blank line. Two strategies follow:

    * Table header present (TABLE_HEADER_RE matches anywhere): the text is
      split on blank lines; a paragraph whose first line looks like a heading
      (capital letter followed by name-like characters) opens a new block and
      any other paragraph is attached to the current block. Blocks of 60
      characters or fewer (after stripping) are discarded.
    * No table header: stripped, non-empty paragraphs are accumulated and a
      block is emitted each time the accumulated text exceeds 300 characters;
      a trailing remainder becomes the final block.
    """
    section_text = "\n\n".join(section_pages_text)

    if TABLE_HEADER_RE.search(section_text) is None:
        # Size-based chunking of blank-line separated paragraphs.
        chunks = []
        pending = []
        for piece in section_text.split('\n\n'):
            piece = piece.strip()
            if not piece:
                continue
            pending.append(piece)
            if len("\n".join(pending)) > 300:
                chunks.append("\n".join(pending))
                pending = []
        if pending:
            chunks.append("\n".join(pending))
        return chunks

    # Heading-driven grouping of paragraphs.
    heading_like = re.compile(r'^[A-Z][A-Za-z0-9\-\&\.\' ]{2,50}(:|,|\sJ\.|\sS\.C\.J\.|\sS\.C\.J:)?')
    groups = []
    for paragraph in re.split(r'\n\s*\n', section_text):
        opener = paragraph.splitlines()[0].strip() if paragraph.strip() else ''
        if heading_like.match(opener) or not groups:
            # a heading starts a new group; so does the very first paragraph
            groups.append([paragraph])
        else:
            groups[-1].append(paragraph)

    candidates = ("\n\n".join(group).strip() for group in groups)
    # drop fragments too short to be a real case entry
    return [text for text in candidates if len(text) > 60]

INJURY_KEYWORDS = [
    "brain","skull","head","neck","spine","back","whiplash","arm","wrist","hand","elbow",
    "shoulder","hip","knee","leg","ankle","foot","eye","ear","tooth","dental","scar",
    "burn","laceration","concussion","paraplegia","quadriplegia","nerve","psych", "depression"
]

# Keywords that are word stems rather than complete words. They are matched as
# a prefix ("psych" -> "psychological", "psychiatric"); all other keywords
# must match as whole words so that e.g. "ear" does not fire on "year".
_INJURY_STEM_KEYWORDS = frozenset({"psych"})

# A dollar amount: digits with optional thousands groups and optional cents.
# Requiring three digits after each comma keeps a trailing punctuation comma
# ("$50,000, plus ...") out of the captured value.
_MONEY_PATTERN = r'\$\d+(?:,\d{3})*(?:\.\d{2})?'
_MONEY_RE = re.compile(_MONEY_PATTERN)


def _injury_keyword_pattern(kw: str) -> str:
    """Regex source matching ``kw`` as a whole word, or as a prefix for stem keywords."""
    tail = r'\w*' if kw in _INJURY_STEM_KEYWORDS else r'\b'
    return r'\b' + re.escape(kw) + tail


def extract_fields_from_block(block: str) -> Dict:
    """
    Conservative field extraction producing a structured dict for a case block.

    Populated fields:
      case_name        first non-empty line of the block (max 240 chars)
      year             first 19xx/20xx token in the block
      values           every dollar amount found in the block, in order
      general_damages  amount directly following a "General Damage(s)" label
      non_pecuniary    first amount within 120 characters after "Non-Pecuniary"
      comments         text after the first "Comments" label, else the last
                       paragraph longer than 40 characters, else ''
      injuries         list of {'text', 'regions', 'values'} dicts, one per
                       comment sentence mentioning an injury keyword; if the
                       comments mention none, keyword-centred snippets (up to
                       80 characters either side) from the whole block
      raw              the unmodified block

    The keys citation, court, judge, sex, age and other_damages are present
    in the result for schema stability but are not extracted here and remain
    None.
    """
    out = {
        'case_name': None,
        'year': None,
        'citation': None,
        'court': None,
        'judge': None,
        'sex': None,
        'age': None,
        'values': [],
        'non_pecuniary': None,
        'general_damages': None,
        'other_damages': None,
        'comments': None,
        'injuries': [],
        'raw': block
    }

    # Year: first 4-digit year within plausible range
    year_match = re.search(r'\b(19|20)\d{2}\b', block)
    if year_match:
        out['year'] = year_match.group(0)

    # Candidate case name: the first non-empty line is usually the case title
    lines = [l.strip() for l in block.splitlines() if l.strip()]
    if lines:
        out['case_name'] = lines[0][:240]

    # All $ amounts found
    out['values'] = _MONEY_RE.findall(block)

    # General / Non-Pecuniary
    general = re.search(r'General\s*Damages?\s*[:\-]?\s*(' + _MONEY_PATTERN + r')', block, re.I)
    if general:
        out['general_damages'] = general.group(1)
    # Lazy window: take the FIRST amount after the label, not the last one
    # that happens to start within the 120-character window.
    non_pec = re.search(r'Non[- ]?Pecuniary[\s\S]{0,120}?(' + _MONEY_PATTERN + r')', block, re.I)
    if non_pec:
        out['non_pecuniary'] = non_pec.group(1)

    # Comments: after 'Comments' label or last long paragraph
    if 'Comments' in block:
        out['comments'] = block.split('Comments', 1)[1].strip()
    else:
        paras = [p for p in re.split(r'\n\s*\n', block) if len(p.strip()) > 40]
        out['comments'] = paras[-1].strip() if paras else ''

    keyword_res = [(kw, re.compile(_injury_keyword_pattern(kw), re.I)) for kw in INJURY_KEYWORDS]

    # Detect injuries: sentences/clauses of the comments that contain a keyword
    sentences = re.split(r'[;\.\n]\s+', out['comments']) if out['comments'] else []
    injuries_found = [
        s.strip() for s in sentences
        if any(pattern.search(s) for _, pattern in keyword_res)
    ]

    # If none found in comments, search entire block and capture snippets
    if not injuries_found:
        for kw in INJURY_KEYWORDS:
            snippet_re = re.compile(r'(.{0,80}' + _injury_keyword_pattern(kw) + r'.{0,80})', re.I)
            for m in snippet_re.finditer(block):
                injuries_found.append(m.group(1).strip())
        # dedupe while keeping first-seen order
        injuries_found = list(dict.fromkeys(injuries_found))

    # Convert each injury snippet to a structured dict
    out['injuries'] = [
        {
            'text': inj,
            'regions': sorted({kw for kw, pattern in keyword_res if pattern.search(inj)}),
            'values': _MONEY_RE.findall(inj),
        }
        for inj in injuries_found
    ]

    return out

# ---------- High-level orchestration ----------

def build_hierarchical_json(pages_text: List[str]) -> Dict:
    """
    Build hierarchical JSON object:
    {
      'title': <first non-empty line of page 1, else the PDF file name>,
      'source_pdf_path': <PDF_PATH as a string>,
      'sections': [
         {
           'name': 'HEAD',
           'printed_page': 1,
           'page_index': 3,
           'cases': [ {case dict}, ... ]
         }, ...
      ]
    }

    Steps: locate and parse the table of contents (falling back to short
    all-caps lines from the first 50 pages when no TOC is found), map each
    section to a page index, treat the pages from one section's index up to
    the next section's index as that section's text, split it into blocks and
    parse each block into a case dict.
    """
    title = None
    if pages_text and pages_text[0]:
        # first non-empty line on first page
        first_lines = [l.strip() for l in pages_text[0].splitlines() if l.strip()]
        if first_lines:
            title = first_lines[0]

    # 1) Find TOC and parse
    toc_info = find_toc_block(pages_text, toc_anchor="TABLE OF CONTENTS", search_pages=12)
    if toc_info:
        _, toc_text = toc_info
        sections = parse_toc_lines(toc_text)
    else:
        # If no TOC, fall back to top-level all-caps headings found in the first ~50 pages,
        # keeping only the first occurrence of each (case-insensitive).
        sections = []
        seen = set()
        for i in range(min(50, len(pages_text))):
            for line in (pages_text[i] or "").splitlines():
                ln = line.strip()
                if ln.isupper() and len(ln) > 3 and len(ln.split()) <= 6:
                    key = ln.lower()
                    if key not in seen:
                        seen.add(key)
                        sections.append((ln, None))

    # 2) Map to actual page indices (result is sorted by page_index)
    mapped = map_sections_to_pages(sections, pages_text)
    for sec in mapped:
        sec['cases'] = []

    # 3) Determine page ranges for each mapped section; the last section runs
    #    to the end of the document
    for i, sec in enumerate(mapped):
        start = sec['page_index']
        end = mapped[i+1]['page_index'] if i+1 < len(mapped) else len(pages_text)
        # collect page texts for this section
        sec_pages = [pages_text[p] or "" for p in range(start, end)]
        # 4) split section into blocks, 5) parse blocks into structured cases
        for b in split_section_into_blocks(sec_pages):
            # case-level duplicate detection may be applied downstream
            sec['cases'].append(extract_fields_from_block(b))

    # 6) Build top-level JSON.
    # PDF_PATH is a plain string, so it has to be wrapped in Path to obtain
    # the file name (a bare ``PDF_PATH.name`` raises AttributeError whenever
    # no title line was found).
    result = {
        'title': title or Path(PDF_PATH).name,
        'source_pdf_path': str(PDF_PATH),
        'sections': mapped
    }
    return result

# ---------- Write outputs ----------

def dedupe_cases_in_sections(hier: Dict) -> Dict:
    """
    Deduplicate cases with identical raw text within each section.

    Two cases are duplicates when their 'raw' text is equal after stripping
    surrounding whitespace. The whole text is compared: comparing only a
    short prefix would discard distinct cases that merely share a long common
    opening such as a repeated table header. The first occurrence is kept and
    order is preserved.

    ``hier`` is modified in place (each section's 'cases' list is replaced)
    and also returned.
    """
    for sec in hier.get('sections', []):
        seen = set()
        unique_cases = []
        for case in sec.get('cases', []):
            # a missing or None 'raw' is treated as empty text
            key = (case.get('raw') or '').strip()
            if key in seen:
                continue
            seen.add(key)
            unique_cases.append(case)
        sec['cases'] = unique_cases
    return hier

def write_outputs(hier: Dict):
    """Write the parsed hierarchy to the three output files.

    * OUT_HIER_JSON - the whole ``hier`` object as indented JSON.
    * OUT_NDJSON    - one JSON object per case per line; each case dict is
                      extended with '_section', '_section_printed_page' and
                      '_section_page_index' taken from its section.
    * OUT_CSV       - a flat preview with one row per case; injury texts are
                      joined with " | " and the sorted distinct regions with "; ".
    """
    sections = hier.get('sections', [])

    # pretty hierarchical JSON
    with open(OUT_HIER_JSON, 'w', encoding='utf-8') as hier_file:
        json.dump(hier, hier_file, ensure_ascii=False, indent=2)

    # NDJSON (one case per line) plus section metadata
    with open(OUT_NDJSON, 'w', encoding='utf-8') as ndjson_file:
        for sec in sections:
            for case in sec.get('cases', []):
                record = {
                    **case,  # shallow copy of the case fields
                    '_section': sec['name'],
                    '_section_printed_page': sec.get('printed_page'),
                    '_section_page_index': sec.get('page_index'),
                }
                ndjson_file.write(json.dumps(record, ensure_ascii=False) + '\n')

    # CSV preview
    header = ['_section','case_name','year','general_damages','non_pecuniary','values','injuries_text','injury_regions']
    with open(OUT_CSV, 'w', newline='', encoding='utf-8') as csvf:
        writer = csv.writer(csvf)
        writer.writerow(header)
        for sec in sections:
            for c in sec.get('cases', []):
                injuries = c.get('injuries') or []
                distinct_regions = {
                    region
                    for entry in injuries
                    for region in entry.get('regions', [])
                }
                writer.writerow([
                    sec['name'],
                    c.get('case_name'),
                    c.get('year'),
                    c.get('general_damages'),
                    c.get('non_pecuniary'),
                    "; ".join(c.get('values',[])),
                    " | ".join(entry['text'] for entry in injuries),
                    "; ".join(sorted(distinct_regions)),
                ])

# ---------- Main ----------



# Pipeline driver: extract -> build hierarchy -> dedupe -> write files.
# `pages` and `hier` are left in the notebook namespace for inspection.
print("Extracting pages...")
pages = extract_pages_text(PDF_PATH)
print(f"Extracted {len(pages)} pages.")

print("Building hierarchical JSON (TOC detection + mapping)")
hier = build_hierarchical_json(pages)

print("Deduplicating obvious duplicates within sections...")
hier = dedupe_cases_in_sections(hier)

print("Writing outputs...")
write_outputs(hier)

# Final summary, one item per line.
print(
    "Done.",
    f"Hierarchical JSON: {OUT_HIER_JSON}",
    f"NDJSON (per-case): {OUT_NDJSON}",
    f"CSV preview: {OUT_CSV}",
    sep="\n",
)



Extracting pages...
Extracted 667 pages.
Building hierarchical JSON (TOC detection + mapping)
Deduplicating obvious duplicates within sections...
Writing outputs...
Done.
Hierarchical JSON: compendium_parsed_hier.json
NDJSON (per-case): compendium_parsed.json
CSV preview: compendium_parsed_preview.csv
