# PBP → Benefits Translator (LangChain + Azure OpenAI)

This notebook uses **LangChain** with your **Azure OpenAI** deployment to infer and validate
a column mapping from `parse_pbp_files.csv` (parsed PBP) into the target schema defined by `benefits_ifp2.csv`.

**Highlights**
- LLM-assisted mapping proposal (JSON) built from your source/target headers and an instruction prompt with explicit matching rules.
- Validation & auto-repair of the LLM's mapping output.
- Fallback to deterministic matching (synonyms + fuzzy) if the LLM is unavailable or its output cannot be parsed.
- Numeric/percent coercions for cost-sharing fields.
- Outputs `benefits_translated_llm.csv` aligned to the target schema.


In [None]:
# If running locally: ensure these are installed in your environment
# pip install langchain langchain-openai pandas python-dotenv


In [None]:
# ---- Configuration: Azure OpenAI via environment variables ----
import os
from pathlib import Path

# Required environment variables (example names):
#   AZURE_OPENAI_API_KEY       = "..."
#   AZURE_OPENAI_ENDPOINT      = "https://<your-resource>.openai.azure.com/"
#   AZURE_OPENAI_API_VERSION   = "2024-02-15-preview"
#   AZURE_OPENAI_CHAT_DEPLOYMENT = "gpt-4o-mini"      # your chat deployment name
#   AZURE_OPENAI_EMBED_DEPLOYMENT = "text-embedding-3-large"  # optional

# Collect the names of required settings that are unset or empty, so the
# notebook can warn early instead of failing later at client creation.
missing = list(filter(
    lambda name: not os.getenv(name),
    (
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_API_VERSION",
        "AZURE_OPENAI_CHAT_DEPLOYMENT",
    ),
))

if missing:
    # Warn only; execution continues without these settings.
    print("⚠️ Missing Azure OpenAI env vars:", missing)
    print("   You can copy /mnt/data/azure_openai_example.env to .env and set your values.")


In [None]:
# ---- Imports ----
import re, json
import pandas as pd
from difflib import get_close_matches
from pathlib import Path

from langchain_openai import AzureChatOpenAI
from langchain.schema import SystemMessage, HumanMessage


In [None]:
# ---- Paths ----
# Every input and the generated output live under one data directory.
BASE = Path("/mnt/data")
PARSED_PBP = BASE.joinpath("parse_pbp_files.csv")        # source rows read by the pipeline
TARGET_BENCHMARK = BASE.joinpath("benefits_ifp2.csv")    # its header row defines the target columns
OUTPUT = BASE.joinpath("benefits_translated_llm.csv")    # CSV written by the pipeline


In [None]:
# ---- Utilities ----
def normalize_name(s: str) -> str:
    """Turn a header into a lowercase, underscore-separated lookup key.

    Falsy input (``None`` or ``""``) yields ``""``. Surrounding whitespace is
    trimmed, every run of characters outside ``a-z0-9`` becomes a single
    underscore, and underscores at either end are removed.
    """
    key = (s or "").strip().lower()
    # First pass replaces separator runs, second pass collapses repeated "_".
    for pattern in (r"[^a-z0-9]+", r"_+"):
        key = re.sub(pattern, "_", key)
    return key.strip("_")

def money_to_number(x):
    """Coerce a money- or percent-formatted cell to a float.

    Rules, applied in order:
      * missing values (per ``pd.isna``) are returned unchanged;
      * blank / whitespace-only text becomes ``None``;
      * thousands separators (``,``) are removed;
      * text ending in ``%`` is parsed and divided by 100 (``"20%"`` -> ``0.2``);
      * a leading ``$`` is dropped before parsing.

    Anything that still cannot be parsed as a float is returned exactly as it
    was passed in, so non-numeric cells (e.g. ``"N/A"``) survive untouched.

    Args:
        x: A scalar cell value (string, number, ``None`` or NaN).

    Returns:
        A ``float`` on success, ``None`` for blank text, otherwise ``x``.
    """
    if pd.isna(x):
        return x
    s = str(x).strip()
    if s == "":
        return None
    s = s.replace(",", "")
    if s.endswith("%"):
        try:
            return float(s.rstrip("%")) / 100.0
        except ValueError:
            # Return the caller's original value, not the comma-stripped copy,
            # so unparseable cells are never silently rewritten.
            return x
    if s.startswith("$"):
        s = s[1:]
    try:
        return float(s)
    except ValueError:
        return x

def pick_best(source_cols, target_col, candidates_cache):
    """Return the closest fuzzy match for ``target_col`` among ``source_cols``.

    Results are memoized in ``candidates_cache`` keyed by ``target_col`` only,
    so a cached answer (including a cached ``None``) is returned even if a
    different ``source_cols`` is passed later.

    Args:
        source_cols: Candidate names to match against.
        target_col: Name to look up.
        candidates_cache: Mutable dict used as the memo; updated in place.

    Returns:
        The best match at similarity cutoff 0.72, or ``None`` if none qualifies.
    """
    if target_col not in candidates_cache:
        close = get_close_matches(target_col, source_cols, n=1, cutoff=0.72)
        candidates_cache[target_col] = next(iter(close), None)
    return candidates_cache[target_col]


In [None]:
# ---- Deterministic rules (synonyms & numeric-like) ----
# Synonym table for the deterministic fallback.
# Keys are normalized target column names; each value lists normalized source
# column names to try, in order, when the target name itself is not present
# in the source. The first candidate found among the source headers wins.
SYNONYMS = {
    "plan_id": ["contract_plan_id","plan_id","contract_plan","pbp_id","contract_pbpid"],
    "segment_id": ["segment_id","seg_id"],
    "org_marketing_name": ["organization_marketing_name","org_marketing_name","parent_organization"],
    "plan_marketing_name": ["plan_marketing_name","plan_name","marketing_name","plan_marketing"],
    "plan_type": ["plan_type","org_type","product_type"],
    "county": ["county","service_county","service_area_county"],
    "state": ["state","service_state","state_code"],
    "zip": ["zip","zipcode","postal_code"],
    "effective_year": ["year","benefit_year","effective_year"],
    "medical_deductible": ["medical_deductible","deductible_medical","in_network_deductible"],
    "drug_deductible": ["drug_deductible","pharmacy_deductible","rx_deductible"],
    "moop_in_network": ["moop","in_network_moop","max_oop_in_network","oop_max"],
    "pcp_copay": ["pcp_copay","primary_care_copay","primary_care_visit_copay","pcp_visit_copay"],
    "specialist_copay": ["specialist_copay","specialist_visit_copay","spec_copay"],
    "er_copay": ["er_copay","emergency_room_copay","emergency_care_copay"],
    "urgent_care_copay": ["urgent_care_copay","urgent_care_visit_copay"],
    "inpatient_facility_per_stay": ["inpatient_per_stay","inpatient_facility_per_stay","inpatient_hospital_copay_per_stay"],
    "outpatient_surgery_copay": ["outpatient_surgery_copay","ambulatory_surgery_copay","outpatient_facility_copay"],
    "tier1_generic": ["tier1_generic_copay","generic_copay","pref_generic_copay"],
    "tier2_pref_brand": ["tier2_preferred_brand_copay","preferred_brand_copay","pref_brand_copay"],
    "tier3_nonpref_brand": ["tier3_nonpreferred_brand_copay","nonpreferred_brand_copay","nonpref_brand_copay"],
    "tier4_specialty": ["tier4_specialty_copay","specialty_copay"],
    "dental_coverage": ["dental_benefit","dental","comprehensive_dental"],
    "vision_coverage": ["vision_benefit","vision"],
    "hearing_coverage": ["hearing_benefit","hearing"],
}

# Normalized target column names whose values are passed through
# money_to_number() when the output frame is built (deductibles, out-of-pocket
# maximum, copays and drug-tier amounts).
NUMERIC_LIKE = {
    "medical_deductible","drug_deductible","moop_in_network",
    "pcp_copay","specialist_copay","er_copay","urgent_care_copay",
    "inpatient_facility_per_stay","outpatient_surgery_copay",
    "tier1_generic","tier2_pref_brand","tier3_nonpref_brand","tier4_specialty",
}


In [None]:
# ---- LLM mapping proposal ----
def _parse_mapping_text(txt):
    """Extract and sanitize a target -> source mapping from raw LLM text.

    The outermost ``{...}`` span is parsed as JSON, which tolerates code
    fences or stray prose around the object. Values that are not non-empty
    strings (numbers, lists, nested objects, blanks) are replaced with
    ``None`` so callers can treat every value as "a column name or nothing".

    Raises:
        ValueError: if the text is not valid JSON (``json.JSONDecodeError`` is
            a ``ValueError`` subclass) or the top-level value is not an object.
    """
    txt = txt.strip()
    start = txt.find("{")
    end = txt.rfind("}")
    if start != -1 and end > start:
        txt = txt[start:end + 1]
    mapping = json.loads(txt)
    if not isinstance(mapping, dict):
        raise ValueError("LLM output is not a dict")
    cleaned = {}
    for target, source in mapping.items():
        if isinstance(source, str) and source.strip():
            cleaned[target] = source.strip()
        else:
            cleaned[target] = None
    return cleaned


def propose_mapping_with_llm(source_cols, target_cols, model=None):
    """Ask the LLM for a JSON mapping ``{"target_col": "source_col" | null}``.

    Args:
        source_cols: Source headers, interpolated into the prompt as given.
        target_cols: Target headers, interpolated into the prompt as given.
        model: Chat model exposing ``invoke(messages)``; ``None`` disables the
            LLM step.

    Returns:
        A dict whose values are each a non-empty string or ``None``, or
        ``None`` when no model was supplied or the call/parsing failed (the
        caller is expected to fall back to deterministic matching).
    """
    if model is None:
        return None  # signal to caller to use fallback

    system_msg = SystemMessage(content=(
        "You are a data integration assistant for Medicare PBP → benefits mapping. "
        "Given source and target headers, return ONLY a compact JSON mapping "
        "from target to source. Use null if no clear match. Do not explain."
    ))

    human_msg = HumanMessage(content=f"""
Create a JSON mapping from target → source.
Target columns: {target_cols}
Source columns: {source_cols}

Rules:
- Prefer exact semantic matches (PBP/PUF conventions).
- If multiple candidates exist, pick the most specific.
- If no good match, set value to null.
Return JSON only.
""")

    try:
        resp = model.invoke([system_msg, human_msg])
        return _parse_mapping_text(resp.content)
    except Exception as e:
        # Deliberately broad: any network, auth, or parsing problem should
        # degrade to the deterministic fallback rather than abort the run.
        print("LLM mapping error → falling back. Reason:", e)
        return None


In [None]:
# ---- Pipeline: load, map, transform ----
# Load the parsed PBP rows (source) and the benchmark file; only the
# benchmark's header row is used, to define the target schema.
parsed_pbp = pd.read_csv(PARSED_PBP)
target_benchmark = pd.read_csv(TARGET_BENCHMARK)

target_cols = list(target_benchmark.columns)
target_norm = [normalize_name(c) for c in target_cols]

source_cols = list(parsed_pbp.columns)
source_norm = [normalize_name(c) for c in source_cols]
# Normalized header -> original header. If two source headers normalize to
# the same key, the later one wins.
norm_to_source = dict(zip(source_norm, source_cols))

# Instantiate LLM if env is ready
llm = None
try:
    if all(os.getenv(k) for k in ["AZURE_OPENAI_API_KEY","AZURE_OPENAI_ENDPOINT","AZURE_OPENAI_API_VERSION","AZURE_OPENAI_CHAT_DEPLOYMENT"]):
        llm = AzureChatOpenAI(
            azure_deployment=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT"),
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
            temperature=0
        )
except Exception as e:
    print("Failed to initialize AzureChatOpenAI → using fallback only. Reason:", e)

# Ask LLM for a mapping using original headers (returns None when llm is None
# or the call fails, which routes every column through the fallback below).
llm_mapping = propose_mapping_with_llm(source_cols, target_cols, model=llm)

# Memo for fuzzy lookups; defined before the resolver that reads it.
candidates_cache = {}


def resolve_with_synonyms_or_fuzzy(tnorm):
    """Resolve a normalized target name to an original source header.

    Tries, in order: an exact normalized match, the SYNONYMS candidates, and
    finally a fuzzy match via pick_best(). Returns None if nothing matches.
    """
    if tnorm in norm_to_source:
        return norm_to_source[tnorm]
    for candidate in SYNONYMS.get(tnorm, ()):
        if candidate in norm_to_source:
            return norm_to_source[candidate]
    best_norm = pick_best(source_norm, tnorm, candidates_cache)
    if best_norm and best_norm in norm_to_source:
        return norm_to_source[best_norm]
    return None


# Build resolved sources: normalized target name -> original source header.
resolved_sources = {}
for tcol, tnorm in zip(target_cols, target_norm):
    proposed = llm_mapping.get(tcol) if llm_mapping else None
    src_final = None
    # Only trust string proposals; other JSON value types (lists, numbers)
    # would break the membership test and the normalization below.
    if isinstance(proposed, str) and proposed:
        if proposed in parsed_pbp.columns:
            src_final = proposed
        else:
            src_final = norm_to_source.get(normalize_name(proposed))
    if not src_final:
        src_final = resolve_with_synonyms_or_fuzzy(tnorm)
    if src_final:
        resolved_sources[tnorm] = src_final

# Assemble the output on the source's row index so the row count is preserved
# even when no target column resolves; unresolved targets stay empty.
translated = {}
for tcol, tnorm in zip(target_cols, target_norm):
    src = resolved_sources.get(tnorm)
    if src is None:
        continue
    series = parsed_pbp[src]
    translated[tcol] = series.apply(money_to_number) if tnorm in NUMERIC_LIKE else series

out = pd.DataFrame(translated, index=parsed_pbp.index, columns=target_cols)

if "plan_type" in target_norm:
    tname = target_cols[target_norm.index("plan_type")]
    plan_type_raw = out[tname]
    plan_type_clean = (
        plan_type_raw.astype(str).str.upper().str.replace("MEDICARE ", "", regex=False)
    )
    # Keep missing values missing; astype(str) alone would write the literal
    # text "NAN"/"NONE" into the CSV.
    out[tname] = plan_type_clean.where(plan_type_raw.notna())

out.to_csv(OUTPUT, index=False)
print(f"✅ Wrote: {OUTPUT}")
out.head(10)
