In [11]:
# ─── 1) Install required packages (only needs to run once) ────────────────────
import sys
import subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", "beautifulsoup4", "lxml", "pandas"])

# ─── 2) Imports ───────────────────────────────────────────────────────────────
import re
import pandas as pd
from bs4 import BeautifulSoup

# ─── 3) Read & parse the HTML file ────────────────────────────────────────────
with open("female-detainee-cases.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "lxml")
links = soup.body.find_all("a")

# ─── 4) Filter only case‑style links (those whose href begins with digits) ────
case_links = []
for a in links:
    href = a.get("href", "").strip()
    if re.match(r"^\d+", href):
        case_links.append((href, a.get_text(strip=True)))

# ─── 5) Extract case_no, raw text, and cleaned description ───────────────────
records = []
for href, text in case_links:
    case_no = int(re.match(r"^(\d+)", href).group(1))
    # strip "Case NNNN" prefix from link text
    desc = re.sub(r"^Case\s*\d+\s*", "", text, flags=re.IGNORECASE)
    # collapse multiple spaces
    desc = " ".join(desc.split())
    records.append({
        "case_no": case_no,
        "href": href,
        "raw_text": text,
        "description": desc
    })

# ─── 6) Classify each link as primary / appeal / transcript_page ───────────────
for rec in records:
    href = rec["href"].lower()
    txt  = rec["raw_text"].lower()
    if "appeal" in href or "appeal" in txt:
        rec["type"] = "appeal"
    elif re.search(r"[ab]\.html$", href):
        rec["type"] = "transcript_page"
    else:
        rec["type"] = "primary"

# ─── 7) Build DataFrame & run basic QC ────────────────────────────────────────
df = pd.DataFrame(records)
print("Link types count:\n", df["type"].value_counts(), "\n")
dups = df[df.duplicated(subset=["case_no", "type"], keep=False)]
if not dups.empty:
    print("⚠️ Duplicate case_no+type combos found:\n", dups, "\n")
else:
    print("No duplicate case_no+type combinations.\n")

print("Sample rows:")
print(df.head(), "\n")

# ─── 8) Export to CSV ─────────────────────────────────────────────────────────
out_path = "female_detainee_cases.csv"
df.to_csv(out_path, index=False)
print(f"✅ Exported {len(df)} records to {out_path}")


Link types count:
 type
primary            133
appeal               2
transcript_page      2
Name: count, dtype: int64 

⚠️ Duplicate case_no+type combos found:
     case_no         href                                           raw_text  \
52     5062   5062a.html                     Case 5062 Quock Ah Sip records   
53     5062   5062b.html     Case 5062 Quock Ah Sip Testimony pages 1 to 22   
54     5062  5062b2.html    Case 5062 Quock Ah Sip Testimony pages 23 to 49   
55     5062  5062b3.html    Case 5062 Quock Ah Sip Testimony pages 49 to 72   
56     5062  5062b4.html    Case 5062 Quock Ah Sip Testimony pages 72 to 99   
57     5062  5062b5.html  Case 5062 Quock Ah Sip Testimony pages 100 to END   

                                description             type  
52                     Quock Ah Sip records  transcript_page  
53     Quock Ah Sip Testimony pages 1 to 22  transcript_page  
54    Quock Ah Sip Testimony pages 23 to 49          primary  
55    Quock Ah Sip Testimony pag