# Ontario Damages Compendium - Data Extraction & Embedding

This notebook extracts case data from the 2024 Damages Compendium PDF and generates embeddings for semantic similarity search.

## 1. Setup and Imports

In [None]:
import camelot
import pandas as pd
import json
from pathlib import Path
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import re
import warnings
warnings.filterwarnings('ignore')

# Configuration
PDF_PATH = "2024damagescompendium.pdf"
OUTPUT_JSON = "data/damages_with_embeddings.json"
RAW_CSV = "data/damages_raw.csv"

# Create the output directories. They are derived from the configured paths
# (rather than a hard-coded "data") so that changing OUTPUT_JSON or RAW_CSV
# cannot leave the later write steps pointing at a folder that does not exist.
for _target in (OUTPUT_JSON, RAW_CSV):
    Path(_target).parent.mkdir(parents=True, exist_ok=True)

print("✅ Imports complete")

## 2. Load Embedding Model

We use `sentence-transformers/all-MiniLM-L6-v2` - a lightweight but effective model for semantic similarity.
- 384-dimensional embeddings
- Fast inference
- Good balance of speed and accuracy

In [None]:
# Load the sentence-embedding model once; later cells reuse `model`.
print("📥 Loading embedding model...")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print("✅ Model loaded successfully")

## 3. Extract Tables from PDF

Extract all tables using Camelot with the 'lattice' flavor (best for tables with clear borders).

In [None]:
print("📄 Extracting tables from PDF...")
print(f"   PDF: {PDF_PATH}")

# Extract tables from every page with the 'lattice' flavor.
# strip_text="\n" asks Camelot to drop newline characters from cell text.
tables = camelot.read_pdf(
    PDF_PATH,
    pages="all",
    flavor="lattice",
    strip_text="\n",
)

print(f"✅ Extracted {len(tables)} tables")

# Combine all tables into a single frame with a fresh 0..n-1 index.
raw_frames = [t.df for t in tables]
if not raw_frames:
    # pd.concat would fail here with an opaque "No objects to concatenate";
    # raise the same exception type with a message that names the real cause.
    raise ValueError(
        f"No tables were extracted from {PDF_PATH!r}; "
        "check that the PDF contains bordered tables the 'lattice' flavor can detect."
    )
raw = pd.concat(raw_frames, ignore_index=True)

print(f"   Total rows: {len(raw)}")

# Save raw extraction so the untouched Camelot output can be inspected later
raw.to_csv(RAW_CSV, index=False)
print(f"   Saved raw data to {RAW_CSV}")

## 4. Clean and Normalize Data

Remove empty rows and exact duplicate rows (such as table headers repeated from page to page) so that later steps work from a clean, consistent table.

In [None]:
print("🧽 Cleaning and normalizing data...")

# Remove empty rows: a row is empty when every cell, rendered as text, is
# blank or whitespace-only. The mask is built column-wise instead of calling
# a Python lambda once per row.
blank_cells = raw.astype(str).apply(lambda col: col.str.strip()).eq("")
raw = raw[~blank_cells.all(axis=1)]

# Remove duplicate rows (keeps the first occurrence of each distinct row)
raw = raw.drop_duplicates()

print(f"✅ Cleaned data: {len(raw)} rows remaining")

## 5. Parse Cases and Extract Structured Data

Parse each row into structured case data, detecting region headers and case details.

In [None]:
def clean_currency(value):
    """Extract a numeric value from a currency string.

    Dollar signs, thousands separators (commas) and surrounding whitespace
    are removed before conversion.

    Args:
        value: A currency-like value such as "$12,500", a plain number, or a
            missing value (None / NaN).

    Returns:
        The amount as a float, or None when the value is missing or the
        remaining text is not a valid number.
    """
    if pd.isna(value):
        return None
    cleaned = str(value).replace("$", "").replace(",", "").strip()
    try:
        return float(cleaned)
    except ValueError:
        # Only a failed numeric conversion means "no amount"; a bare except
        # here would also swallow KeyboardInterrupt and SystemExit.
        return None

def is_section_header(row):
    """Return True when the row looks like a body-region section header.

    A header row has at most two non-blank cells whose combined text is
    entirely upper case and shorter than 100 characters.
    """
    # Stringify and trim every cell once, keeping only the non-blank ones.
    cells = [text for text in (str(v).strip() for v in row) if text]
    if len(cells) > 2:
        return False
    joined = " ".join(cells)
    return joined.isupper() and len(joined) < 100

print("📊 Parsing cases...")

# Compiled once here rather than being looked up again for every row.
_YEAR_RE = re.compile(r'\b(19|20)\d{2}\b')   # first standalone 19xx / 20xx
_DOLLAR_RE = re.compile(r'\$[\d,]+')         # "$" followed by digits/commas

cases = []
current_region = "UNKNOWN"  # region assigned to rows seen before any header

for _idx, row in raw.iterrows():
    row_values = [str(c).strip() for c in row.tolist()]

    # A section header switches the region for all following rows and is
    # not itself a case.
    if is_section_header(row_values):
        current_region = " ".join(v for v in row_values if v)
        continue

    # Skip if row is too short (less than 3 columns with data); the literal
    # "nan" is what a missing cell becomes after str().
    non_empty = [v for v in row_values if v and v != "nan"]
    if len(non_empty) < 3:
        continue

    # Build summary text from all fields
    summary_text = " ".join(non_empty)

    # Year: first four-digit 19xx/20xx token anywhere in the row text.
    year_match = _YEAR_RE.search(summary_text)

    # Damages: first dollar amount in the row text (None if it cannot be parsed).
    dollar_match = _DOLLAR_RE.search(summary_text)

    # Structured fields (adjust indices based on actual PDF structure).
    # row_values always has at least three entries at this point, so index 0
    # is safe; it may still be an empty string.
    cases.append({
        "region": current_region,
        "raw_fields": row_values,
        "summary_text": summary_text,
        "case_name": row_values[0],
        "year": int(year_match.group(0)) if year_match else None,
        "court": None,
        "damages": clean_currency(dollar_match.group(0)) if dollar_match else None,
    })

# Sorted so the sample below is the same on every run (set order is arbitrary).
regions = sorted({c["region"] for c in cases})

print(f"✅ Parsed {len(cases)} cases")
print(f"   Regions found: {len(regions)}")
print(f"   Sample regions: {regions[:5]}")

## 6. Generate Embeddings

Create semantic embeddings for each case to enable similarity search.

In [None]:
print("🧠 Generating embeddings...")
print("   This may take 1-3 minutes depending on dataset size...")

# Combine region and summary for better semantic matching
texts_for_embedding = [f"{case['region']} {case['summary_text']}" for case in cases]

# Encode all texts in one batched call instead of one model call per case;
# the model handles batching internally and shows its own progress bar.
# Skipped when there is nothing to encode.
vectors = model.encode(texts_for_embedding, show_progress_bar=True) if texts_for_embedding else []

# Store each vector as a plain list so the case dicts stay JSON-serialisable.
for case, vector in zip(cases, vectors):
    case["embedding"] = vector.tolist()

print("✅ Embeddings generated")

## 7. Save Processed Data

Save the final dataset with embeddings as JSON.

In [None]:
print(f"💾 Saving processed data to {OUTPUT_JSON}...")

# Write every case (including its embedding list) as one JSON array.
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(cases, f, indent=2)

print("✅ Data saved successfully")
print("\n📊 Summary:")
print(f"   Total cases: {len(cases)}")
print(f"   Regions: {len(set(c['region'] for c in cases))}")
# Compare against None so that a parsed amount of $0 still counts as found.
print(f"   Cases with damages: {sum(1 for c in cases if c['damages'] is not None)}")
print(f"   Cases with year: {sum(1 for c in cases if c['year'] is not None)}")
print("\n✅ Ready to use in Streamlit app!")

## 8. Quick Data Inspection

View a sample of the extracted cases.

In [None]:
# Display the first three parsed cases as a quick sanity check
print("\n📋 Sample Cases:")
print("=" * 80)

for i, case in enumerate(cases[:3], 1):
    damages = case['damages']
    print(f"\nCase {i}:")
    print(f"  Region: {case['region']}")
    print(f"  Case Name: {case['case_name']}")
    print(f"  Year: {case['year']}")
    # Compare against None so that a parsed amount of $0 is shown as "$0"
    # instead of being reported as missing.
    print(f"  Damages: ${damages:,.0f}" if damages is not None else "  Damages: Not found")
    print(f"  Summary: {case['summary_text'][:150]}...")
    print(f"  Embedding dimension: {len(case['embedding'])}")
    print("-" * 80)

## Next Steps

1. Run the Streamlit app: `streamlit run streamlit_app.py`
2. Test the search functionality with various injury descriptions
3. Refine region mappings in `region_map.json` if needed
4. Add custom SVG body diagrams in the `assets/` folder