# ✅ LLM Processor Test Notebook
This notebook contains the full, updated `llm_processor_dbconn.py` code in a single cell, followed by example inputs and a test run of the pipeline.

In [None]:
# ================================================================
#   FULL llm_processor_dbconn.py — DROP-IN NOTEBOOK VERSION
#   Complete, single-cell, self-contained, PlanCode fully added
#   Includes:
#     ✔ Ray support
#     ✔ should_use_ray()
#     ✔ Manhattan Life 2-row header handling
#     ✔ PlanCode extraction + merging
#     ✔ LLM rule generation
#     ✔ Transformation engine
#     ✔ match_llm_output_to_raw_counts
# ================================================================

import warnings
warnings.filterwarnings("ignore")

import os, re, json, hashlib, time, math
from pathlib import Path
from typing import Dict, Any, List, Optional, Callable

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta

# ---------- LangChain Azure OpenAI ----------
try:
    from langchain_openai import AzureChatOpenAI
except ImportError:
    # Fall back to the legacy import location when the langchain_openai
    # package is not installed. Only ImportError is caught so that any other
    # failure raised while importing is not silently hidden.
    from langchain.chat_models import AzureChatOpenAI

# ---------- Perf Config ----------
# Ray switch read from the environment: "auto" (default), "on" or "off".
ENABLE_RAY = os.environ.get("ENABLE_RAY", "auto")
# Number of chunks the input frame is split into when Ray is used.
RAY_PARTITIONS = int(os.environ.get("RAY_PARTITIONS", "8"))
# In "auto" mode Ray is used only at or above this many rows.
RAY_MIN_ROWS_TO_USE = int(os.environ.get("RAY_MIN_ROWS_TO_USE", "30000"))

# Working directory at import time, plus an "outbound" folder created below it.
BASE_DIR = Path.cwd().resolve()
OUT_DIR = BASE_DIR.joinpath("outbound").resolve()
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- Columns (PlanCode added) ----------
# Output columns, in the order they are written to the result CSV.
FINAL_COLUMNS = [
    # policy / policy holder
    "PolicyNO",
    "PHFirst",
    "PHLast",
    "Status",
    "Issuer",
    "State",
    # product
    "ProductType",
    "PlanName",
    "PlanCode",
    # dates and schedule
    "SubmittedDate",
    "EffectiveDate",
    "TermDate",
    "Paysched",
    # payment
    "PayCode",
    "WritingAgentID",
    "Premium",
    "CommPrem",
    "TranDate",
    "CommReceived",
    "PTD",
    "NoPayMon",
    "Membercount",
]

# Names of the rule operations a compiled spec may use.
ALLOWED_OPS = [
    "copy",
    "const",
    "date_mmddyyyy",
    "date_plus_1m_mmddyyyy",
    "name_first_from_full",
    "name_last_from_full",
    "money",
    "membercount_from_commission",
    "blank",
]

# System message sent with every rule-generation request (leading and
# trailing newlines are part of the prompt text).
SYSTEM_PROMPT = "\nReturn strict JSON only.\n"

# ---------- Utility ----------
def _norm_key(s):
    """Return ``s`` as a lower-cased string with every non a-z / 0-9 character removed."""
    lowered = str(s).lower()
    return re.sub(r"[^a-z0-9]", "", lowered)
def _sig_from_cols(cols):
    """Return a 12-character SHA-1 hex prefix identifying the column list ``cols``."""
    joined = "||".join(cols)
    digest = hashlib.sha1(joined.encode()).hexdigest()
    return digest[:12]
def _build_header_index(cols):
    """Map the normalised form of each header in ``cols`` back to the header itself.

    When two headers normalise to the same key, the later one wins.
    """
    index = {}
    for header in cols:
        index[_norm_key(header)] = header
    return index

# ---------- Loader ----------
# Per-carrier loader selection: "two_header" files carry a two-row header,
# "csv" files a single header row.
CARRIERS = {
    "Manhattan Life": {
        "loader": "two_header",
    },
    "Molina": {
        "loader": "csv",
    },
    "Ameritas": {
        "loader": "csv",
    },
}

def _fast_read_header(path, loader):
    """Return the column names of ``path`` without loading the data rows.

    For ``loader == "csv"`` the single header row is returned as-is. For any
    other loader the first two rows are treated as a two-level header: a blank
    top cell inherits the nearest non-blank top cell to its left, the two
    levels are joined, and every character outside ``[A-Za-z0-9_]`` becomes
    ``_``.
    """
    if loader != "csv":
        # two_header
        head = pd.read_csv(path, header=None, nrows=2, dtype=str).fillna("")
        upper = head.iloc[0].tolist()
        lower = head.iloc[1].tolist()
        names = []
        group = ""
        for top_cell, sub_cell in zip(upper, lower):
            top_cell = str(top_cell).strip()
            sub_cell = str(sub_cell).strip()
            # Carry the last seen group label across blank top cells.
            group = top_cell or group
            label = f"{group} {sub_cell}".strip() if sub_cell else group
            names.append(re.sub(r"[^A-Za-z0-9_]", "_", label.replace(" ", "_")))
        return names

    frame = pd.read_csv(path, nrows=0, dtype=str)
    return list(frame.columns)

def _read_csv_usecols(path, usecols, loader):
    """Load the data rows of ``path`` as strings, limited to ``usecols``.

    Args:
        path: CSV file to read.
        usecols: Column names to keep. An empty or ``None`` selection keeps
            every column, for both loaders.
        loader: ``"csv"`` for a single header row; anything else is read as a
            two-row header file.

    Returns:
        A DataFrame of strings with missing cells replaced by ``""``.

    Raises:
        ValueError: If a two-row header file has fewer than two rows. With the
            ``"csv"`` loader pandas also raises ``ValueError`` for requested
            columns that are absent; the two-row loader silently skips them.
    """
    if loader == "csv":
        # An empty selection means "everything"; handing pandas an empty list
        # would select no columns at all.
        return pd.read_csv(path, dtype=str, usecols=usecols or None).fillna("")

    raw = pd.read_csv(path, header=None, dtype=str).fillna("")
    if len(raw) < 2:
        raise ValueError(f"{path}: expected two header rows, found {len(raw)}")

    names = []
    group = ""
    for top_cell, sub_cell in zip(raw.iloc[0].tolist(), raw.iloc[1].tolist()):
        top_cell = str(top_cell).strip()
        sub_cell = str(sub_cell).strip()
        # A blank top cell belongs to the group started by an earlier column.
        if top_cell:
            group = top_cell
        label = f"{group} {sub_cell}".strip() if sub_cell else group
        names.append(re.sub(r"[^A-Za-z0-9_]", "_", label.replace(" ", "_")))

    data = raw.iloc[2:].reset_index(drop=True)
    data.columns = names
    if usecols:
        data = data[[c for c in usecols if c in data.columns]]
    return data

# ---------- LLM ----------
def build_llm():
    """Create the Azure OpenAI chat client from ``AZURE_OPENAI_*`` environment variables.

    Each setting falls back to the default shown below when its variable is
    unset; temperature is fixed at 0.
    """
    env = os.getenv
    settings = {
        "azure_endpoint": env("AZURE_OPENAI_ENDPOINT", "https://YOUR-ENDPOINT"),
        "api_key": env("AZURE_OPENAI_API_KEY", ""),
        "api_version": env("AZURE_OPENAI_API_VERSION", "2024-12-01-preview"),
        "azure_deployment": env("AZURE_OPENAI_DEPLOYMENT_NAME", "gpt-4o-mini"),
        "temperature": 0,
    }
    return AzureChatOpenAI(**settings)

def llm_generate_rule_spec(headers, prompt_path, rules_path):
    """Ask the LLM for a rule spec that maps raw headers onto the required fields.

    Args:
        headers: Raw column names of the input file.
        prompt_path: ``Path`` to a narrative prompt file; skipped if missing.
        rules_path: ``Path`` to an extra-rules file; skipped if missing.

    Returns:
        The decoded JSON object returned by the model.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        ValueError: If the reply is valid JSON but not an object.
    """
    llm = build_llm()
    payload = {
        "RequiredFields": FINAL_COLUMNS + ["PID"],
        "RawHeaders": headers,
        "RulesNarrative": prompt_path.read_text() if prompt_path.exists() else "",
        "ExtraPrompt": rules_path.read_text() if rules_path.exists() else "",
        "OutputFormat": "Return strict JSON.",
    }
    resp = llm.invoke([
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": json.dumps(payload)},
    ])

    text = resp.content
    if isinstance(text, str):
        # Chat models frequently wrap JSON in a Markdown code fence even when
        # told to return bare JSON; unwrap it before decoding.
        fenced = re.match(
            r"\s*```(?:json)?\s*(.*?)\s*```\s*$", text, re.DOTALL | re.IGNORECASE
        )
        if fenced:
            text = fenced.group(1)

    spec = json.loads(text)
    if not isinstance(spec, dict):
        # Downstream code iterates spec.items(); fail here with a clear message.
        raise ValueError(
            f"LLM rule spec must be a JSON object, got {type(spec).__name__}"
        )
    return spec

# ---------- Spec Binding ----------
# Normalised field name -> canonical spelling, for every output column plus "PID".
CANON = {_norm_key(name): name for name in [*FINAL_COLUMNS, "PID"]}

def canonicalize_spec(spec):
    """Return a copy of ``spec`` keyed by canonical field names.

    Keys are matched to the canonical names through their normalised form;
    unrecognised keys are kept unchanged. Every output column absent from the
    result is given a ``{"op": "blank"}`` rule.
    """
    canonical = {CANON.get(_norm_key(key), key): rule for key, rule in spec.items()}
    for column in FINAL_COLUMNS:
        canonical.setdefault(column, {"op": "blank"})
    return canonical

def needs_source(op):
    """Return True when ``op`` is a rule operation that reads an input column."""
    source_ops = (
        "copy",
        "date_mmddyyyy",
        "date_plus_1m_mmddyyyy",
        "name_first_from_full",
        "name_last_from_full",
        "money",
        "membercount_from_commission",
    )
    return op in source_ops

def bind_sources_to_headers(headers, spec):
    """Point each rule's ``source`` at an actual header where one can be matched.

    A source already present in ``headers`` is left alone. Otherwise it is
    compared through its normalised form, and on a match the rule dict is
    updated in place with the real header name. Rules that need no source, or
    whose source cannot be matched, pass through unchanged.

    Returns a new dict that holds the same rule objects as ``spec``.
    """
    lookup = _build_header_index(headers)
    bound = {}
    for target, rule in spec.items():
        needs = needs_source(rule.get("op"))
        bound[target] = rule
        if not needs:
            continue
        source = rule.get("source", "")
        if source in headers:
            continue
        key = _norm_key(source)
        if key in lookup:
            rule["source"] = lookup[key]
    return bound

def promote_pid_to_ptd(spec):
    """Reuse the ``PID`` rule for ``PTD`` when ``PTD`` is missing or blank.

    ``spec`` is modified in place and returned; after promotion both keys
    refer to the same rule object.
    """
    if "PID" not in spec:
        return spec
    if "PTD" in spec and spec["PTD"].get("op") != "blank":
        return spec
    spec["PTD"] = spec["PID"]
    return spec

def collect_usecols(spec):
    """Return the distinct source columns referenced by the rules in ``spec``.

    Only rules whose operation reads an input column contribute. Empty or
    missing sources are skipped. Columns come back in first-seen order, so the
    result is the same on every run (set iteration order is not).
    """
    sources = (
        rule.get("source")
        for rule in spec.values()
        if needs_source(rule.get("op", ""))
    )
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(src for src in sources if src))

# ---------- Transforms ----------
def _to_mmddyyyy(s):
    """Format a Series of date-like values as ``MM/DD/YYYY``; unparseable values become ``""``."""
    parsed = pd.to_datetime(s, errors="coerce")
    formatted = parsed.dt.strftime("%m/%d/%Y")
    return formatted.fillna("")

def _add_one_month_mmddyyyy(s):
    """Shift a Series of date-like values forward one calendar month, as ``MM/DD/YYYY``.

    Unparseable values become ``""``. A day that does not exist in the target
    month is clamped to that month's last day (Jan 31 -> Feb 28/29).
    """
    dt = pd.to_datetime(s, errors="coerce")
    # pd.DateOffset is the pandas-native month shift; adding a dateutil
    # relativedelta to a datetime Series raises TypeError.
    shifted = dt + pd.DateOffset(months=1)
    return shifted.dt.strftime("%m/%d/%Y").fillna("")

def _name_first_last(series):
    """Split full names on whitespace and return ``(first_token, last_token)`` Series.

    Missing or empty names give ``""`` in both results; a single-word name is
    returned as both first and last.
    """
    tokens = series.fillna("").astype(str).str.split()
    first = tokens.str[0].fillna("")
    last = tokens.str[-1].fillna("")
    return first, last

def apply_rules(df, spec):
    """Build the output frame by applying one rule per output column.

    Args:
        df: Input rows; must contain every column named as a rule ``source``.
        spec: Mapping of output column -> rule dict with an ``op`` key and,
            depending on the op, ``source`` or ``value``.

    Returns:
        A DataFrame with exactly ``FINAL_COLUMNS``, indexed like ``df``.
        Columns with no rule, a ``blank`` rule, or an unrecognised op are
        filled with ``""``.

    Raises:
        KeyError: If a rule refers to a source column missing from ``df``, or
            a ``const`` rule has no ``value``.
    """
    out = {}
    for tgt in FINAL_COLUMNS:
        rule = spec.get(tgt, {"op": "blank"})
        op = rule.get("op")
        if op == "copy":
            out[tgt] = df[rule["source"]]
        elif op == "const":
            out[tgt] = rule["value"]
        elif op == "date_mmddyyyy":
            out[tgt] = _to_mmddyyyy(df[rule["source"]])
        elif op == "date_plus_1m_mmddyyyy":
            out[tgt] = _add_one_month_mmddyyyy(df[rule["source"]])
        elif op == "name_first_from_full":
            out[tgt] = _name_first_last(df[rule["source"]])[0]
        elif op == "name_last_from_full":
            out[tgt] = _name_first_last(df[rule["source"]])[1]
        elif op == "money":
            out[tgt] = df[rule["source"]].astype(str)
        elif op == "membercount_from_commission":
            # A "-" anywhere in the commission text marks the row as "-1".
            s = df[rule["source"]].astype(str)
            out[tgt] = np.where(s.str.contains("-"), "-1", "1")
        else:
            out[tgt] = ""
    # Passing the index lets scalar-only specs (all const/blank) broadcast to
    # the input's row count instead of raising "must pass an index".
    return pd.DataFrame(out, index=df.index)

# ---------- Ray ----------
def should_use_ray(n):
    """Decide whether a frame of ``n`` rows should be processed with Ray.

    ``ENABLE_RAY`` forces the answer when it is "on" or "off" (compared
    case-insensitively, ignoring surrounding whitespace). Any other value is
    treated as "auto": Ray is used once ``n`` reaches ``RAY_MIN_ROWS_TO_USE``.
    """
    # Normalise so values such as "ON" or " off " are not mistaken for auto.
    mode = str(ENABLE_RAY).strip().lower()
    if mode == "on":
        return True
    if mode == "off":
        return False
    return n >= RAY_MIN_ROWS_TO_USE

def apply_rules_parallel(df, spec):
    """Apply ``spec`` to ``df`` in row chunks executed as Ray tasks.

    The frame is cut into at most ``RAY_PARTITIONS`` contiguous, non-empty
    chunks; each chunk goes through ``apply_rules`` and the results are
    concatenated in the original row order. An empty frame is handled locally
    without starting Ray.
    """
    import ray

    total = len(df)
    if total == 0:
        return apply_rules(df, spec)

    if not ray.is_initialized():
        ray.init(ignore_reinit_error=True)
    spec_ref = ray.put(spec)

    @ray.remote
    def worker(chunk, shared_spec):
        # Ray resolves an ObjectRef passed as a top-level task argument before
        # the task body runs, so shared_spec is already the spec dict here;
        # calling ray.get on it again would fail.
        return apply_rules(chunk, shared_spec)

    # Never create more chunks than rows, so no task receives an empty frame.
    n_parts = max(1, min(RAY_PARTITIONS, total))
    bounds = np.linspace(0, total, n_parts + 1, dtype=int)
    parts = [df.iloc[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]
    futures = [worker.remote(part, spec_ref) for part in parts]
    return pd.concat(ray.get(futures))

# ---------- Manhattan Life ----------
def extract_manhattan_policy_plan_from_csv(path, log):
    """Read policy-number / plan-code pairs from a two-row-header CSV.

    The two header rows are flattened to ``<top>_<bottom>`` names with every
    character outside ``[A-Za-z0-9_]`` replaced by ``_``. The plan-code column
    is the first whose name contains both "plan" and "code"; the policy column
    is the first whose name contains "policy" or "case_number".

    Args:
        path: CSV file to read.
        log: Logging callable; accepted for interface compatibility and not
            used here.

    Returns:
        A DataFrame with columns ``PolicyNumber``, ``PlanCode`` (upper-cased)
        and ``PolicyNo`` (same values as ``PolicyNumber``), all stripped
        strings, without rows whose policy number is empty.

    Raises:
        ValueError: If no policy or plan-code column can be found.
    """
    # dtype=str keeps identifiers verbatim; type inference would strip leading
    # zeros and render "123" as "123.0" when the column also has blanks.
    df = pd.read_csv(path, header=[0, 1], dtype=str).fillna("")
    flat = []
    for top, bottom in df.columns:
        name = f"{str(top).strip()} {str(bottom).strip()}".strip()
        flat.append(re.sub(r"[^A-Za-z0-9_]", "_", name.replace(" ", "_")))
    df.columns = flat

    pc = [c for c in df.columns if "plan" in c.lower() and "code" in c.lower()]
    po = [c for c in df.columns if "policy" in c.lower() or "case_number" in c.lower()]
    if not pc or not po:
        raise ValueError("Could not locate PlanCode or PolicyNumber")

    out = df[[po[0], pc[0]]].copy()
    out.columns = ["PolicyNumber", "PlanCode"]
    out["PolicyNumber"] = out["PolicyNumber"].astype(str).str.strip()
    out["PlanCode"] = out["PlanCode"].astype(str).str.strip().str.upper()
    out["PolicyNo"] = out["PolicyNumber"]
    return out[out["PolicyNumber"] != ""]

def match_llm_output_to_raw_counts(raw, llm):
    """Return rows of ``llm`` so each policy appears as often as it does in ``raw``.

    For every ``PolicyNo`` in ``raw`` (most frequent first), the matching
    ``llm`` rows are truncated to the raw count, or repeated cyclically when
    there are too few. Policies with no ``llm`` rows are skipped.

    Args:
        raw: Frame with a ``PolicyNo`` column defining the target counts.
        llm: Frame with a ``PolicyNo`` column supplying the rows.

    Returns:
        The selected rows concatenated, or an empty frame with ``llm``'s
        columns when no policy matches.
    """
    wanted = raw["PolicyNo"].value_counts()
    # Group once rather than scanning the whole frame for every policy.
    by_policy = {key: grp for key, grp in llm.groupby("PolicyNo", sort=False)}

    pieces = []
    for policy, count in wanted.items():
        block = by_policy.get(policy)
        if block is None:
            continue
        count = int(count)
        if len(block) < count:
            repeats = -(-count // len(block))  # ceiling division
            block = pd.concat([block] * repeats)
        pieces.append(block.head(count))

    if not pieces:
        # pd.concat([]) raises; return an empty frame of the same shape.
        return llm.iloc[0:0]
    return pd.concat(pieces)

# DB stubs (you will replace with real code)
def get_manhattan_mapping(Load_task_id, company_issuer_id, server, database, log):
    """Stub for the database lookup: returns an empty mapping frame.

    All arguments are ignored. The result has the columns ``PolicyNo``,
    ``PlanName`` and ``ProductType`` and no rows.
    """
    mapping_columns = ["PolicyNo", "PlanName", "ProductType"]
    return pd.DataFrame(columns=mapping_columns)

def stg_plan_mapping_min(df, server, database, log):
    """Stub for the staging write: returns the number of rows in ``df`` and does nothing else."""
    row_count = len(df)
    return row_count

# ---------- Pipeline ----------
def run_llm_pipeline(
    issuer, paycode, trandate, load_task_id, company_issuer_id,
    csv_path, template_dir, output_csv_name,
    server_name, database_name,
    log=print):
    """Transform a carrier CSV into the standard output layout and write it.

    Steps: read the header, load (or generate and cache) the rule spec for
    that header signature, read only the needed columns, apply the rules
    (through Ray for large inputs), stamp TranDate / PayCode / Issuer, enrich
    Manhattan Life rows with plan codes from the raw file, then write the CSV
    next to the input file.

    Args:
        issuer: Carrier name; selects the loader and the template file names.
        paycode: Value written to the ``PayCode`` column.
        trandate: Value written to the ``TranDate`` column.
        load_task_id, company_issuer_id, server_name, database_name: Passed
            through to ``get_manhattan_mapping``.
        csv_path: Input CSV file.
        template_dir: Directory holding the prompt / rules files and the
            compiled-spec cache; created if a new spec has to be saved.
        output_csv_name: Output file name without the ``.csv`` extension.
        log: Callable used for progress messages.

    Returns:
        ``Path`` of the written CSV.
    """
    loader = CARRIERS.get(issuer, {}).get("loader", "csv")
    headers = _fast_read_header(csv_path, loader)
    sig = _sig_from_cols(headers)

    template_dir = Path(template_dir)
    prompt_path = template_dir / f"{issuer}_prompt.txt"
    rules_path = template_dir / f"{issuer}_rules.json"
    compiled_path = template_dir / f"{issuer}_compiled_rules_{sig}.json"

    if compiled_path.exists():
        spec = json.loads(compiled_path.read_text())
    else:
        spec = llm_generate_rule_spec(headers, prompt_path, rules_path)
        spec = canonicalize_spec(spec)
        spec = bind_sources_to_headers(headers, spec)
        spec = promote_pid_to_ptd(spec)
        # The template directory may not exist yet on a first run.
        compiled_path.parent.mkdir(parents=True, exist_ok=True)
        compiled_path.write_text(json.dumps(spec, indent=2))

    usecols = collect_usecols(spec)
    df = _read_csv_usecols(csv_path, usecols, loader)

    if should_use_ray(len(df)):
        out = apply_rules_parallel(df, spec)
    else:
        out = apply_rules(df, spec)

    out["TranDate"] = trandate
    out["PayCode"] = paycode
    out["Issuer"] = issuer
    if "PlanCode" not in out:
        out["PlanCode"] = ""

    if issuer == "Manhattan Life":
        log("Enriching Manhattan Life…")
        raw_link = extract_manhattan_policy_plan_from_csv(csv_path, log)
        # NOTE(review): the mapping is fetched but not merged into the output;
        # presumably PlanName / ProductType enrichment is still to be wired
        # in — confirm. The call is kept so behaviour is unchanged.
        map_df = get_manhattan_mapping(
            load_task_id, company_issuer_id, server_name, database_name, log
        )

        # The output column is spelled "PolicyNO" while the raw extract uses
        # "PolicyNo"; add a stripped join key under the raw spelling.
        out["PolicyNo"] = out["PolicyNO"].astype(str).str.strip()

        # One plan code per policy keeps the merge many-to-one. A many-to-many
        # merge multiplies rows, and trimming the result back with head(n)
        # would then repeat the first row and lose the others.
        plan_lookup = raw_link.loc[
            raw_link["PlanCode"] != "", ["PolicyNo", "PlanCode"]
        ].drop_duplicates("PolicyNo")

        merged = out.merge(
            plan_lookup, on="PolicyNo", how="left", suffixes=("_x", "_y")
        )
        # Blank rule output is "" rather than NaN, so mask blanks first;
        # otherwise fillna would never take the plan code from the raw file.
        rule_plan = merged["PlanCode_x"].mask(
            merged["PlanCode_x"].astype(str).str.strip() == ""
        )
        merged["PlanCode"] = rule_plan.fillna(merged["PlanCode_y"]).fillna("")
        merged = merged.drop(columns=["PlanCode_x", "PlanCode_y"])

        out = match_llm_output_to_raw_counts(raw_link, merged)
        # Drop the helper key and restore the standard column order.
        out = out.drop(columns=["PolicyNo"]).reindex(columns=FINAL_COLUMNS)

    out_file = Path(csv_path).parent / f"{output_csv_name}.csv"
    out.to_csv(out_file, index=False)
    log(f"Done → {out_file}")
    return out_file


# ===================================================
#   EXAMPLE INPUTS FOR TESTING
# ===================================================
issuer = "Manhattan Life"
paycode = "Default"
trandate = "2025-11-12"
load_task_id = "11497"
company_issuer_id = "3205"

# Paths are relative to the current working directory. They are built with
# pathlib so the separators are correct on every operating system; a literal
# such as ".\inbound\raw.csv" only resolves as intended on Windows.
csv_path = Path("inbound") / "raw.csv"     # ✔ update
template_dir = Path("carrier_prompts")     # ✔ update

server_name = "NGCS"
database_name = "NGCS"

# ===================================================
#   RUN PIPELINE
# ===================================================
out = run_llm_pipeline(
    issuer=issuer,
    paycode=paycode,
    trandate=trandate,
    load_task_id=load_task_id,
    company_issuer_id=company_issuer_id,
    csv_path=csv_path,
    template_dir=template_dir,
    output_csv_name="manhattan_life_test_output",
    server_name=server_name,
    database_name=database_name,
)

# Last expression of the cell: displays the path of the written CSV.
out
