In [21]:
import os
import json
import re
import requests
import PyPDF2
import pdfplumber
from io import BytesIO
from typing import Tuple, Optional

# ---- Helper: fallback title generator ----
def generate_title(url: str) -> str:
    try:
        filename = os.path.basename(url)
        title = (
            requests.utils.unquote(filename)
            .replace(".pdf", "")
            .replace("-", " ")
            .replace("_", " ")
            .title()
        )
        return title or "AI Governance Document"
    except Exception:
        return "AI Governance Document"


# ---- Helper: download a file ----
def download_file(url: str) -> bytes:
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    return resp.content

# ---- Helper: clean extracted titles ----
def clean_title(text: str) -> str:
    # Remove leading page numbers like "1 Title", "2 - Title"
    cleaned = re.sub(r"^\d+\s*[-–.]?\s*", "", text)
    return cleaned.strip()

# ---- Extract PDF metadata ----
def extract_pdf_title(pdf_bytes: bytes) -> Tuple[Optional[str], str]:
    """
    Extracts a PDF title using multiple methods:
    - metadata (PyPDF2)
    - font size detection (pdfplumber)
    - fallback text parsing (PyPDF2)

    Returns:
        (title, source_flag)
        source_flag = 'metadata' | 'font-size' | 'text-fallback' | 'none'
    """
    # Step 1: Try PyPDF2 metadata
    try:
        pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
        if pdf_reader.metadata and "/Title" in pdf_reader.metadata:
            meta_title = pdf_reader.metadata["/Title"]
            if meta_title and not meta_title.lower().startswith("microsoft word"):
                return clean_title(meta_title.strip()), "metadata"
    except Exception:
        pass

    # Step 2: Try pdfplumber font-size detection
    try:
        with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
            page = pdf.pages[0]
            words = page.extract_words(extra_attrs=["size"])

            if words:
                # Find largest font size
                max_size = max(w["size"] for w in words)
                title_candidates = [w["text"] for w in words if abs(w["size"] - max_size) < 0.5]

                if title_candidates:
                    raw_title = " ".join(title_candidates)
                    return clean_title(raw_title), "font-size"
    except Exception:
        pass

    # Step 3: Fallback to PyPDF2 text extraction (first 1–2 lines)
    try:
        pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
        if pdf_reader.pages:
            first_page = pdf_reader.pages[0]
            text = first_page.extract_text() or ""
            lines = [line.strip() for line in text.split("\n") if line.strip()]

            candidates = []
            for line in lines[:10]:  # only look at first 10 lines
                if len(line) < 5:  # skip short stuff
                    continue
                if re.match(r"^\d+$", line):  # skip page numbers
                    continue
                candidates.append(line)
                if len(candidates) >= 2:
                    break

            if candidates:
                raw_title = " ".join(candidates)
                # Cap titles to 20 words
                words = raw_title.split()
                if len(words) > 20:
                    raw_title = " ".join(words[:20])
                return clean_title(raw_title), "text-fallback"

            if lines:
                return clean_title(lines[0]), "text-fallback"
    except Exception:
        pass

    # Nothing worked
    return None, "none"





# ---- Main: process documents ----
def process_documents(docs: list[dict]):
    fallback_docs = []

    for doc in docs:
        url = doc.get("URL")
        title = None
        used_fallback = False

        try:
            pdf_bytes = download_file(url)
            title = extract_pdf_title(pdf_bytes)
        except Exception as e:
            print(f"⚠️ Skipping {url}: {e}")

        if not title:
            title = generate_title(url)
            used_fallback = True
            fallback_docs.append({**doc, "Title": title, "UsedFallback": used_fallback})

        doc["Title"] = title
        doc["UsedFallback"] = used_fallback

        print(f"Processed: {title} {'(fallback used)' if used_fallback else ''}")

    return docs, fallback_docs







In [10]:
# ---- Example usage ----
with open("extract_5_rows_json_format.json", "r", encoding="utf-8") as f:
    docs = json.load(f)

all_docs, fallback_docs = process_documents(docs)

# Save enriched JSON with all docs
with open("5_documents_with_titles.json", "w", encoding="utf-8") as f:
    json.dump(all_docs, f, indent=2, ensure_ascii=False)

# Save subset where fallback was used
with open("missing_titles_test.json", "w", encoding="utf-8") as f:
    json.dump(fallback_docs, f, indent=2, ensure_ascii=False)

print(f"\n✅ Processing complete. {len(fallback_docs)} documents need manual title review.")

Processed: Identity, Credential, and Access Management (ICAM) Reference Architecture 
Processed: Patient Safety and Artificial Intelligence: Considerations for Key Groups — GenAI Developers 
Processed: Assuring the safety of AI-based clinical decision support systems: a case study of the AI Clinician for sepsis treatment 
Processed: DEVELOPMENT OF AN ONLINE TRAINING MODULE FOR CLINICIANS ON ALGORITHMIC BIAS IN HEALTHCARE 
Processed: Supporting the Health Care Workforce: Lessons Following the COVID-19 Pandemic 

✅ Processing complete. 0 documents need manual title review.


In [22]:
# ---- Example usage ----
with open("extract_10_rows_json_format.json", "r", encoding="utf-8") as f:
    docs = json.load(f)

all_docs, fallback_docs = process_documents(docs)

# Save enriched JSON with all docs
with open("10_documents_with_titles.json", "w", encoding="utf-8") as f:
    json.dump(all_docs, f, indent=2, ensure_ascii=False)

# Save subset where fallback was used
with open("missing_titles_test.json", "w", encoding="utf-8") as f:
    json.dump(fallback_docs, f, indent=2, ensure_ascii=False)

print(f"\n✅ Processing complete. {len(fallback_docs)} documents need manual title review.")

⚠️ Skipping https://opwdd.ny.gov/system/files/documents/2020/01/cco-policy-manual-master_acc_1.pdf: 403 Client Error: Forbidden for url: https://opwdd.ny.gov/system/files/documents/2020/01/cco-policy-manual-master_acc_1.pdf
Processed: Cco Policy Manual Master Acc 1 (fallback used)
Processed: ('Final Report of Reference Committee B', 'metadata') 
⚠️ Skipping https://www.chfs.ky.gov/agencies/os/oig/Kentucky%20Regulations%20and%20Statutes/Telehealth%20Terminology%20Glossary%20FINAL%20-%20July%202022.pdf: 403 Client Error: Forbidden for url: https://www.chfs.ky.gov/agencies/os/oig/Kentucky%20Regulations%20and%20Statutes/Telehealth%20Terminology%20Glossary%20FINAL%20-%20July%202022.pdf
Processed: Telehealth Terminology Glossary Final   July 2022 (fallback used)
Processed: ('MMS Infosession-Artificial Intelligence (AI) in Action: Generating Claims about Measure Properties', 'metadata') 
Processed: ('EXECUTIVE DIRECTION Government Relations and Special Project – Tracey DeShields State Policy 