# Variable-level alignment and extended metrics (Opportunity Atlas table 2)
Run alignment plus semantic consistency, coverage, and length-normalized completeness on a non-RLS codebook.

Prereqs:
- Set `OPENAI_API_KEY` in your environment.
- Run from repo root or `examples/notebooks`.
- Camelot deps installed for PDF parsing (Ghostscript/Poppler).


In [None]:
from pathlib import Path
import os
import sys
import pandas as pd
from IPython.display import display
from openai import OpenAI

CWD = Path.cwd().resolve()
if (CWD / "examples" / "codebooks").exists():
    PROJECT_ROOT = CWD
elif (CWD.parent / "examples" / "codebooks").exists():
    PROJECT_ROOT = CWD.parent
elif (CWD.parent.parent / "examples" / "codebooks").exists():
    PROJECT_ROOT = CWD.parent.parent
else:
    raise RuntimeError("Run this notebook from the repo root or within examples/notebooks.")

SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

# Import evaluation helpers after sys.path update
from extendddg.evaluation import (
    evaluate_variable_alignment,
    summarize_variable_support,
    semantic_consistency_check,
    coverage_of_key_dimensions,
    length_normalized_completeness,
    redundancy_verbosity_metrics,
    unique_fact_ratio,
    helpfulness_usefulness_score,
    specificity_vagueness_score,
    semantic_quality_composite,
)
from extendddg.parsing.codebook import CodebookParser

EXAMPLES_DIR = PROJECT_ROOT / "examples"
CODEBOOK_DIR = EXAMPLES_DIR / "codebooks"

api_key = "your-api-key"
if not api_key:
    raise RuntimeError("Set OPENAI_API_KEY before running.")

client = OpenAI(api_key=api_key)
parser = CodebookParser(client=client, model_name="gpt-4o-mini")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Parse Opportunity Atlas table 2 codebook (PDF)
codebook_path = CODEBOOK_DIR / 'opportunity_atlas_table2_codebook.pdf'
if not codebook_path.exists():
    raise FileNotFoundError(f'Missing codebook PDF: {codebook_path}')

codebook_df = parser.parse_codebook(str(codebook_path), dataset_df=None)
codebook_df.head()


Unnamed: 0,variable_name,description,variable_type
0,Variable,Description,Type
1,state,Two-digit state 2010 FIPS code,Num
2,county,Three-digit county 2010 FIPS code,Num
3,cz,Commuting zone identifier (1990 definitions),Num
4,czname,Commuting zone name,String


In [3]:
# Sample description covering key variables
description = (
    'The dataset reports county and commuting-zone identifiers (state and county FIPS, cz, czname) and '
    'mobility outcomes for children whose parents were at the 25th percentile of the national income '
    'distribution. It includes mean household income rank at ages 31-37 (kfr_*), incarceration rates for '
    '1978-1983 cohorts (jail_*), and counts of children under 18 below the median by race and gender. '
    'Standard errors are provided alongside each outcome.'
)
print(description)


The dataset reports county and commuting-zone identifiers (state and county FIPS, cz, czname) and mobility outcomes for children whose parents were at the 25th percentile of the national income distribution. It includes mean household income rank at ages 31-37 (kfr_*), incarceration rates for 1978-1983 cohorts (jail_*), and counts of children under 18 below the median by race and gender. Standard errors are provided alongside each outcome.


In [4]:
# Variable-level alignment
var_result = evaluate_variable_alignment(
    description=description,
    codebook=codebook_df,
    client=client,
    model_name='gpt-4o-mini',
    sample_size=10,
)
print('Support rate:', var_result['support_rate'])
display(pd.DataFrame(var_result['results']))

# Summarize variable support (no domain map provided)
summary = summarize_variable_support(var_result['results'], domain_map=None, top_n=5)
print('Support counts:', summary['support_counts'])
print('Top unsupported:')
display(pd.DataFrame(summary['top_unsupported']))


Support rate: 0.7


Unnamed: 0,variable,label,supported,rationale
0,jail_[race]_[gender]_p25_se,,yes,The dataset description mentions incarceration...
1,czname,,yes,The dataset description mentions commuting-zon...
2,jail_[race]_[gender]_p25,,yes,The dataset description mentions incarceration...
3,county,,no,The dataset description does not provide any s...
4,state,,no,The dataset description does not mention the '...
5,[race]_[gender]_count,,yes,The dataset description mentions counts of chi...
6,cz,,yes,The dataset description mentions commuting-zon...
7,Variable,,no,The dataset description does not provide any s...
8,kfr_[race]_[gender]_p25_se,,yes,The dataset description mentions mean househol...
9,kfr_[race]_[gender]_p25,,yes,The dataset description mentions mean househol...


Support counts: {'yes': 7, 'no': 3, 'maybe': 0}
Top unsupported:


Unnamed: 0,variable,label,supported,rationale
0,county,,no,The dataset description does not provide any s...
1,state,,no,The dataset description does not mention the '...
2,Variable,,no,The dataset description does not provide any s...


In [5]:
# Coverage of key dimensions
coverage = coverage_of_key_dimensions(description)
print('Coverage score:', coverage['coverage_score'])
display(pd.DataFrame.from_dict(coverage['coverage'], orient='index'))


Coverage score: 0.5


Unnamed: 0,covered,hits
population,True,[children]
time_period,True,[cohort]
geography,True,"[state, county, cz]"
methodology_sampling,False,[]
weighting_imputation,False,[]
variable_definitions,False,[]


In [6]:
# Length-normalized completeness demo (pretend raw completeness=8 on 0-10 scale)
raw_completeness = 8.0
lnc = length_normalized_completeness(raw_completeness, description, words_per_unit=100)
print('Raw completeness:', raw_completeness)
print('Length-normalized completeness:', lnc)


Raw completeness: 8.0
Length-normalized completeness: 8.0


In [7]:
# Semantic consistency using OpenAI embeddings (text-embedding-3-small)
def openai_embed(texts):
    if isinstance(texts, str):
        texts = [texts]
    resp = client.embeddings.create(model='text-embedding-3-small', input=list(texts))
    return [item.embedding for item in resp.data]

sem_result = semantic_consistency_check(
    description=description,
    codebook=codebook_df,
    embedder=openai_embed,
    threshold=0.35,
)
print('Average similarity:', sem_result['average_similarity'])
print('Flagged (low similarity):')
display(pd.DataFrame(sem_result['flagged']))


Average similarity: 0.47591057121753694
Flagged (low similarity):


Unnamed: 0,variable,max_similarity
0,Variable,0.231047
1,state,0.347371


In [8]:
# Redundancy and uniqueness
redundancy = redundancy_verbosity_metrics(description)
print('Repeated fact rate:', redundancy['repeated_fact_rate'])
print('Bigram redundancy:', redundancy['bigram_redundancy'])
print('Avg sentence length:', redundancy['avg_sentence_length'])

unique_ratio = unique_fact_ratio(description)
print('Unique fact ratio:', unique_ratio)


Repeated fact rate: 0.0
Bigram redundancy: 0.0
Avg sentence length: 23.0
Unique fact ratio: 1.0


In [9]:
# Helpfulness/usefulness and specificity (LLM-graded)
intended_use = 'Policy analyst assessing county-level mobility data for program design'
helpfulness = helpfulness_usefulness_score(
    description=description,
    intended_use=intended_use,
    client=client,
    model_name='gpt-4o-mini',
)
print('Usefulness score:', helpfulness['usefulness_score'])
print('Usefulness rationale:', helpfulness['rationale'])

specificity = specificity_vagueness_score(
    description=description,
    client=client,
    model_name='gpt-4o-mini',
)
print('Specificity score:', specificity['specificity_score'])
print('Specificity justification:', specificity['justification'])


Usefulness score: 4
Usefulness rationale: The dataset provides specific county-level mobility outcomes relevant to policy analysts, including income ranks and incarceration rates, which are crucial for program design. However, it could benefit from more context on how these metrics directly relate to mobility and program effectiveness.
Specificity score: 5
Specificity justification: The description includes specific identifiers such as 'state and county FIPS' and detailed metrics like 'mean household income rank at ages 31-37' and 'incarceration rates for 1978-1983 cohorts'.


In [10]:
# Semantic quality composite (reference vs. generated)
reference_description = (
    'County and commuting-zone identifiers with mobility outcomes for children of parents at the 25th ' 
    'percentile of income; includes kfr_* income ranks, incarceration rates for 1978-83 cohorts, race/sex ' 
    'child counts below the median, and standard errors for each metric.'
)
semantic_quality = semantic_quality_composite(
    generated_description=description,
    reference_description=reference_description,
    embedder=openai_embed,
)
print('BERT F1:', semantic_quality['bert_f1'])
print('Cosine similarity:', semantic_quality['cosine_similarity'])
print('Composite quality:', semantic_quality['composite_quality'])


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


BERT F1: 0.7650284767150879
Cosine similarity: 0.8734161257743835
Composite quality: 0.8192223012447357
