# Codebook parser smoke tests
Run the codebook parser against multiple sample codebooks in this repo (including Opportunity Atlas table 2).

Prerequisites:
- `OPENAI_API_KEY` set in your environment.
- `camelot-py` runtime deps available (Ghostscript/Poppler on Windows).
- Run from the repo root or from `examples/notebooks`, so the setup cell can locate the `examples/codebooks` directory.


In [None]:
from pathlib import Path
import os
import sys
import pandas as pd
from IPython.display import display
from openai import OpenAI

# Find the repository root: probe the working directory and its two nearest
# ancestors, in that order, for an examples/codebooks folder.
CWD = Path.cwd().resolve()
for _candidate in (CWD, CWD.parent, CWD.parent.parent):
    if (_candidate / "examples" / "codebooks").exists():
        PROJECT_ROOT = _candidate
        break
else:
    # None of the probed directories contains examples/codebooks.
    raise RuntimeError("Run this notebook from the repo root or within examples/notebooks.")

# Sample codebooks and datasets both live under examples/.
EXAMPLES_DIR = PROJECT_ROOT / "examples"
CODEBOOK_DIR = EXAMPLES_DIR / "codebooks"
DATASETS_DIR = EXAMPLES_DIR / "datasets"

# Put src/ at the front of sys.path (only once) so project imports resolve.
SRC_DIR = PROJECT_ROOT / "src"
_src_entry = str(SRC_DIR)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

from extendddg.parsing.codebook import CodebookParser

# Read the key from the environment (see Prerequisites) instead of embedding a
# literal in the notebook: no secret ends up committed, and the guard below
# can actually fire when the variable is missing or empty.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set OPENAI_API_KEY before running.")

# The parser built here is the one the parsing cells call.
client = OpenAI(api_key=api_key)
parser = CodebookParser(client=client, model_name="gpt-4o-mini")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Smoke-test inputs. Each case is a dict with a display "name", the "codebook"
# path to parse, and a companion "dataset" path (None when no dataset is
# passed to the parser for that codebook).
TEST_CASES = [
    {"name": label, "codebook": codebook_file, "dataset": dataset_file}
    for label, codebook_file, dataset_file in (
        (
            "RLS codebook (CSV)",
            CODEBOOK_DIR / "rls_codebook.csv",
            DATASETS_DIR / "rls_dataset_example.csv",
        ),
        (
            "RLS codebook sample (CSV)",
            CODEBOOK_DIR / "rls_codebook_sample.csv",
            DATASETS_DIR / "rls_dataset_example.csv",
        ),
        (
            "ABJK declarations (PDF)",
            CODEBOOK_DIR / "abjk_codebook.pdf",
            DATASETS_DIR / "Replication Data for The Declaration of Independents" / "ABJK_Declarations.csv",
        ),
        (
            "Opportunity Atlas table 2 (PDF)",
            CODEBOOK_DIR / "opportunity_atlas_table2_codebook.pdf",
            None,
        ),
        (
            "Opportunity Atlas table 2 (dataset copy, PDF)",
            DATASETS_DIR / "Opportunity Atlas" / "table_2_codebook.pdf",
            None,
        ),
        (
            "Opportunity Atlas table 9 (PDF)",
            DATASETS_DIR / "Opportunity Atlas" / "table_9_codebook.pdf",
            None,
        ),
        (
            "Mobility Report Cards table 1 (PDF)",
            DATASETS_DIR / "Mobility Report Cards" / "table_1_codebook.pdf",
            None,
        ),
        (
            "Mobility Report Cards table 10 (PDF)",
            DATASETS_DIR / "Mobility Report Cards" / "table_10_codebook.pdf",
            None,
        ),
        (
            "CDC Teen Immunization Survey (PDF)",
            DATASETS_DIR / "CDC Teen Immunization Survey" / "2023_codebook.pdf",
            None,
        ),
    )
]


In [3]:
def load_dataset(path: Path | None):
    if path is None:
        return None
    if not path.exists():
        raise FileNotFoundError(f"Dataset not found: {path}")
    return pd.read_csv(path)


In [4]:
# Parse every configured codebook; successful parses are stored in `results`
# keyed by case name so later cells can summarize and export them.
results = {}

for case in TEST_CASES:
    name = case["name"]
    codebook_path = case["codebook"]
    dataset_path = case["dataset"]

    print(f"--- {name} ---")
    if not codebook_path.exists():
        print(f"Skipping: codebook not found at {codebook_path}")
        continue

    try:
        # load_dataset already returns None for a None path. Loading inside
        # the try means a missing or unreadable dataset is reported for this
        # case only instead of aborting the remaining cases.
        dataset_df = load_dataset(dataset_path)
        parsed_df = parser.parse_codebook(str(codebook_path), dataset_df)
        results[name] = parsed_df
        print(f"Rows: {len(parsed_df)} | Columns: {list(parsed_df.columns)}")
        display(parsed_df.head())
    except Exception as exc:
        # Smoke-test boundary: report and move on. The exception type is
        # included because some exceptions carry an empty message.
        print(f"Failed: {type(exc).__name__}: {exc}")


--- RLS codebook (CSV) ---
Rows: 654 | Columns: ['variable_name', 'description']


Unnamed: 0,variable_name,description
0,P_SUID,Unique ID
2,YEAR_FROM_CODATE,Year in which survey was completed
4,REGION,Census region (variable is only available in r...
8,HAPPY,"Generally, how happy are you with your life th..."
12,SATIS_A,Would you say your health in general is excell...


--- RLS codebook sample (CSV) ---
Rows: 3 | Columns: ['variable_name', 'description']


Unnamed: 0,variable_name,description
0,P_SUID,Unique ID
2,YEAR_FROM_CODATE,Year in which survey was completed
4,REGION,Census region (variable is only available in r...


--- ABJK declarations (PDF) ---
Rows: 13 | Columns: ['variable_name', 'description']


Unnamed: 0,variable_name,description
13,caseid,Original ANES case identifier
14,unique.id,Unique case identifier (analysis
16,year,Survey year
27,weight.combined,Combined survey weight (full sample)
28,mode,Survey mode


--- Opportunity Atlas table 2 (PDF) ---
Rows: 10 | Columns: ['variable_name', 'description', 'variable_type']


Unnamed: 0,variable_name,description,variable_type
0,Variable,Description,Type
1,state,Two-digit state 2010 FIPS code,Num
2,county,Three-digit county 2010 FIPS code,Num
3,cz,Commuting zone identifier (1990 definitions),Num
4,czname,Commuting zone name,String


--- Opportunity Atlas table 2 (dataset copy, PDF) ---
Rows: 10 | Columns: ['variable_name', 'description', 'variable_type']


Unnamed: 0,variable_name,description,variable_type
0,Variable,Description,Type
1,state,Two-digit state 2010 FIPS code,Num
2,county,Three-digit county 2010 FIPS code,Num
3,cz,Commuting zone identifier (1990 definitions),Num
4,czname,Commuting zone name,String


--- Opportunity Atlas table 9 (PDF) ---
Rows: 25 | Columns: ['variable_name', 'description']


Unnamed: 0,variable_name,description
0,Variable,Description
1,state,Two-digit state 2010 FIPS code
2,county,Three-digit county 2010 FIPS code
3,tract,Six-digit tract 2010 FIPS code
4,cz,Five-digit 1990 commuter zone code


--- Mobility Report Cards table 1 (PDF) ---
Rows: 16 | Columns: ['variable_name', 'description']


Unnamed: 0,variable_name,description
0,Stata Variable Name,Description
1,super_opeid,Institution OPEID / Cluster ID when combining ...
2,name,Name of college (or college group)
3,czname,Commuting zone (analogous to metro area) in wh...
4,state,State in which college is located


--- Mobility Report Cards table 10 (PDF) ---
Rows: 43 | Columns: ['variable_name', 'description']


Unnamed: 0,variable_name,description
0,Variable,Description
1,super_opeid,Institution OPEID / Cluster ID when co...
2,name,Name of Institution / Super-OPEID Cluster
3,region,Census region 1 = Northeast 2 = Midwest ...
4,state,State


--- CDC Teen Immunization Survey (PDF) ---


KeyboardInterrupt: 

In [5]:
# Re-run a single case on its own; change the index to pick a different entry.
case = TEST_CASES[0]  # e.g., RLS codebook (CSV)
single_dataset = load_dataset(case["dataset"])
parsed_df = parser.parse_codebook(str(case["codebook"]), single_dataset)
row_count, column_names = len(parsed_df), list(parsed_df.columns)
print(f"Rows: {row_count} | Columns: {column_names}")
display(parsed_df.head())


Rows: 654 | Columns: ['variable_name', 'description']


Unnamed: 0,variable_name,description
0,P_SUID,Unique ID
2,YEAR_FROM_CODATE,Year in which survey was completed
4,REGION,Census region (variable is only available in r...
8,HAPPY,"Generally, how happy are you with your life th..."
12,SATIS_A,Would you say your health in general is excell...


In [6]:
# One overview row per successfully parsed codebook: its size, its columns,
# and up to five variable names (empty when there is no variable_name column).
summary = [
    {
        "name": case_name,
        "rows": len(frame),
        "columns": list(frame.columns),
        "sample_variables": (
            frame["variable_name"].head(5).tolist() if "variable_name" in frame else []
        ),
    }
    for case_name, frame in results.items()
]
pd.DataFrame(summary)


Unnamed: 0,name,rows,columns,sample_variables
0,RLS codebook (CSV),654,"[variable_name, description]","[P_SUID, YEAR_FROM_CODATE, REGION, HAPPY, SATI..."
1,RLS codebook sample (CSV),3,"[variable_name, description]","[P_SUID, YEAR_FROM_CODATE, REGION]"
2,ABJK declarations (PDF),13,"[variable_name, description]","[caseid, unique.id, year, weight.combined, mode]"
3,Opportunity Atlas table 2 (PDF),10,"[variable_name, description, variable_type]","[Variable, state, county, cz, czname]"
4,"Opportunity Atlas table 2 (dataset copy, PDF)",10,"[variable_name, description, variable_type]","[Variable, state, county, cz, czname]"
5,Opportunity Atlas table 9 (PDF),25,"[variable_name, description]","[Variable, state, county, tract, cz]"
6,Mobility Report Cards table 1 (PDF),16,"[variable_name, description]","[Stata Variable Name, super_opeid, name, cznam..."
7,Mobility Report Cards table 10 (PDF),43,"[variable_name, description]","[Variable, super_opeid, name, region, state]"


In [7]:
# Write each parsed codebook to examples/notebooks/outputs/<case name>.csv.
out_dir = EXAMPLES_DIR / "notebooks" / "outputs"
# parents=True so the export also works when intermediate folders are missing.
out_dir.mkdir(parents=True, exist_ok=True)

# Filename sanitizer: spaces and slashes become underscores; parentheses and
# commas are dropped, so a name like "... (dataset copy, PDF)" no longer
# leaves a comma in the filename.
_filename_table = str.maketrans({" ": "_", "/": "_", "(": None, ")": None, ",": None})
for name, df in results.items():
    safe = name.translate(_filename_table)
    df.to_csv(out_dir / f"{safe}.csv", index=False)
