In [6]:
import pandas as pd
from langchain.chat_models import AzureChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.schema import SystemMessage, HumanMessage
from langchain_core.output_parsers import JsonOutputParser
import json
from typing import List, Dict, Any, Iterable
import os, json, time, math, re
import random

In [5]:
from itertools import islice
from typing import Iterable, Tuple, Any

def batched(iterable: Iterable[Any], n: int) -> Iterable[Tuple[Any, ...]]:
    """Yield successive tuples of up to ``n`` items taken from ``iterable``.

    Every tuple has exactly ``n`` items except possibly the last one, which
    holds whatever remains. Nothing is yielded for an empty input.
    """
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            # Source exhausted (or n == 0): stop without yielding an empty tuple.
            return
        yield chunk


In [None]:
## Load and flatten multi-row header
def flatten_two_header_csv(path: str) -> pd.DataFrame:
    """Read a CSV whose first two rows form a two-level header and flatten it.

    The top header row is forward-filled (a blank cell inherits the nearest
    non-blank cell to its left), then combined with the second header row into
    a single name per column. Whitespace runs become ``_`` and ``/`` / ``.``
    are replaced by ``_``. A column with both header cells blank is called
    ``unnamed``.

    Flattened names are made unique: the first occurrence keeps its name and
    later collisions get ``_2``, ``_3``, ... appended. Without this, selecting
    a duplicated name returns a DataFrame instead of a Series and the
    empty-column filter below fails.

    Args:
        path: Location of the CSV file (anything ``pd.read_csv`` accepts).

    Returns:
        A DataFrame of the data rows (row 3 onwards) with string values,
        missing cells as ``""``, and columns that are blank in every data row
        removed.

    Raises:
        IndexError: If the file has fewer than two rows.
    """
    raw = pd.read_csv(path, header=None, dtype=str).fillna("")

    top_row = raw.iloc[0].tolist()
    bottom_row = raw.iloc[1].tolist()

    # Forward-fill the top header so merged/spanning headers apply to every
    # column they cover.
    filled_top = []
    last_seen = ""
    for cell in top_row:
        cell = str(cell).strip()
        if cell:
            last_seen = cell
        filled_top.append(last_seen)

    cols = []
    used = set()
    for upper, lower in zip(filled_top, bottom_row):
        upper = str(upper).strip()
        lower = str(lower).strip()
        if not upper and not lower:
            name = "unnamed"
        elif not upper:
            name = lower
        elif not lower:
            name = upper
        else:
            name = f"{upper} {lower}"

        name = re.sub(r"\s+", " ", name)
        name = name.replace("/", "_").replace(".", "_").strip()
        name = re.sub(r"\s+", "_", name)

        # De-duplicate: keep the first occurrence untouched, suffix the rest.
        unique_name = name
        counter = 2
        while unique_name in used:
            unique_name = f"{name}_{counter}"
            counter += 1
        used.add(unique_name)
        cols.append(unique_name)

    df = raw.iloc[2:].reset_index(drop=True)
    df.columns = cols

    # Drop columns whose every data cell is blank after stripping.
    keep = [c for c in df.columns if not df[c].astype(str).str.strip().eq("").all()]
    return df[keep]


In [None]:
# Ordered list of output column names.
# NOTE(review): not referenced in the visible cells — confirm which step consumes it.
FINAL_COLUMNS = [
    "PolicyNo",
    "PHFirst",
    "PHLast",
    "Status",
    "Issuer",
    "State",
    "ProductType",
    "PlanName",
    "SubmittedDate",
    "EffectiveDate",
    "TermDate",
    "PaySched",
    "PayCode",
    "WritingAgentID",
    "Premium",
    "CommPrem",
    "TranDate",
    "CommReceived",
    "PTD",
    "NoPayMon",
    "Membercount",
]

# Batching / retry tuning knobs.
# NOTE(review): presumably used by the LLM batching code; units of BASE_SLEEP
# (seconds?) are not shown here — confirm against the consumer.
BATCH_SIZE = 50
MAX_CONCURRENCY = 4
MAX_RETRIES = 5
BASE_SLEEP = 1.5


In [7]:
# Load LLM
# Credentials are read from the environment so that no secret lives in the
# notebook. Set AZURE_OPENAI_API_KEY (required) and optionally
# AZURE_OPENAI_ENDPOINT before running this cell.
# NOTE: the API key that was previously committed in this cell must be treated
# as leaked and rotated.
llm = AzureChatOpenAI(
    azure_endpoint=os.environ.get(
        "AZURE_OPENAI_ENDPOINT",
        "https://joshuemueu1yp-swedencentral.cognitiveservices.azure.com/",
    ),
    # Raises KeyError when the variable is missing, which is preferable to
    # sending requests with an empty key.
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-12-01-preview",
    azure_deployment="gpt-5-mini_g",
    temperature=1,
)

# Repeats the constructor's temperature on the bound runnable; kept so `llm`
# remains the same kind of object the rest of the notebook was written against.
llm = llm.bind(temperature=1)


  llm = AzureChatOpenAI(


In [9]:
from langchain.schema import SystemMessage, HumanMessage

# Connectivity smoke test: send one system and one user message through `llm`
# and print the text of the reply.
system_msg = SystemMessage(content="You are a helpful assistant.")
user_msg = HumanMessage(content="Write a haiku about testing Azure OpenAI.")
messages = [system_msg, user_msg]

response = llm.invoke(messages)
print(response.content)


APIConnectionError: Connection error.

In [10]:
import os
from openai import AzureOpenAI


# Same connectivity check using the raw OpenAI SDK instead of LangChain.
# The key is read from the environment (AZURE_OPENAI_API_KEY, required); the
# key that was previously committed in this cell must be treated as leaked and
# rotated.
client = AzureOpenAI(
    azure_endpoint=os.environ.get(
        "AZURE_OPENAI_ENDPOINT",
        "https://joshuemueu1yp-swedencentral.cognitiveservices.azure.com/",
    ),
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-02-15-preview",
)

# IMPORTANT: 'model' must be your DEPLOYMENT name
# Defaults to the deployment the LangChain client in this notebook uses;
# override with AZURE_OPENAI_DEPLOYMENT.
deployment_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-5-mini_g")
resp = client.chat.completions.create(
    model=deployment_name,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user",   "content": "Say 'ok' if this works."}
    ]
)
print(resp.choices[0].message.content)


APIConnectionError: Connection error.

In [None]:
from manhattan_mapping import get_manhattan_mapping
import pandas as pd
import json, time
from pathlib import Path

# Example log function for notebook output
def log(message: str) -> None:
    """Print ``message`` to standard output, prefixed with ``[LOG] ``."""
    line = "[LOG] {}".format(message)
    print(line)


In [None]:
# Input file and template directory for this run.
csv_path = "/mnt/input/sample_file.csv"
template_dir = "/mnt/templates/"

# Run identifiers: the issuer selects the template files, and all three values
# are written onto the output frame later in the notebook.
issuer = "Manhattan Life"
paycode = "PAY123"
trandate = "2025-11-04"


In [None]:
# Issuer-specific template files; a missing file is only reported, not fatal.
prompt_path = Path(template_dir).joinpath(f"{issuer}_prompt.txt")
rules_path = Path(template_dir).joinpath(f"{issuer}_rules.json")

for label, candidate in (("prompt", prompt_path), ("rules", rules_path)):
    if candidate.exists():
        continue
    log(f"NOTE: {label} file not found, continuing: {candidate}")


In [None]:
# Read the input's header, derive a signature from it, and build the path of
# the compiled-rules file for this issuer + signature.
headers = _fast_read_header(csv_path, "csv")
sig = _sig_from_cols(headers)
compiled_name = f"{issuer}_compiled_rules__{sig}.json"
compiled_path = Path(template_dir).joinpath(compiled_name)


In [None]:
# Reuse the compiled rules file when it already exists; otherwise generate the
# spec with the LLM, post-process it, and save it for later runs.
if not compiled_path.exists():
    log("[Rules] Generating with LLM…")
    raw_spec = canonicalize_spec_keys(
        llm_generate_rule_spec(headers, prompt_path, rules_path)
    )
    bound_spec = promote_pid_to_ptd(bind_sources_to_headers(headers, raw_spec))
    serialized = json.dumps(bound_spec, ensure_ascii=False, indent=2)
    compiled_path.write_text(serialized, encoding="utf-8")
    log(f"[Rules] Compiled & saved → {compiled_path.name}")
else:
    cached_text = compiled_path.read_text(encoding="utf-8")
    bound_spec = json.loads(cached_text)
    log(f"[Rules] Loaded cached compiled rules → {compiled_path.name}")


In [None]:
# Load only the columns the bound spec refers to; a falsy selection is passed
# to the reader as None.
usecols = collect_usecols(bound_spec)
selected_cols = usecols or None
df = _read_csv_usecols(csv_path, selected_cols, "csv")
log(f"[IO] Rows loaded: {len(df):,} | usecols={len(usecols)} | loader=csv")


In [None]:
# Pick the execution mode from the row count, then apply the bound rules.
use_ray = should_use_ray(len(df))
if not use_ray:
    out_df = apply_rules(df, bound_spec)
else:
    log("[Exec] Using Ray parallel mode…")
    out_df = apply_rules_parallel(df, bound_spec)


In [None]:
# Stamp the run-level values onto every output row (same order as before).
for column, value in (("TranDate", trandate), ("PayCode", paycode), ("Issuer", issuer)):
    out_df[column] = value


In [None]:
# Manhattan Life only: overwrite ProductType / PlanName on the output with
# values looked up by PlanCode from the SQL mapping. Best-effort: any failure
# is logged as a warning and the output is left as it was.
if issuer == "Manhattan Life":
    log("[INFO] Manhattan Life detected — retrieving SQL mapping.")
    try:
        # NOTE(review): load_task_id / company_issuer_id are hard-coded here;
        # confirm they are correct for every file this notebook processes.
        map_df = get_manhattan_mapping(
            load_task_id=13449,
            company_issuer_id=2204,
            log=log
        )

        if not map_df.empty and "PlanCode" in df.columns:
            # Normalise the key FIRST and de-duplicate on the normalised value.
            # De-duplicating the raw column and stripping afterwards could leave
            # two rows with the same stripped key ("A1" vs "A1 "), and
            # Series.map raises on a lookup Series with a non-unique index.
            map_df = map_df.copy()
            map_df.index = map_df["PlanCode"].astype(str).str.strip()
            map_df = map_df[~map_df.index.duplicated(keep="first")]

            src_key = df["PlanCode"].astype(str).str.strip()
            # NOTE(review): PolicyNumber is written into ProductType below —
            # this mirrors the existing behaviour; confirm it is the intended
            # source column.
            mapped_policy = src_key.map(map_df["PolicyNumber"]).fillna("")
            mapped_name   = src_key.map(map_df["ProductName"]).fillna("")

            # Make sure both target columns exist before merging into them.
            out_df["ProductType"] = out_df.get("ProductType", "")
            out_df["PlanName"] = out_df.get("PlanName", "")

            # Keep the existing value wherever the lookup produced nothing.
            out_df["ProductType"] = mapped_policy.where(mapped_policy.ne(""), out_df["ProductType"])
            out_df["PlanName"]    = mapped_name.where(mapped_name.ne(""), out_df["PlanName"])
            updated_rows = int(mapped_policy.ne("").sum())
            log(f"[ManhattanLife] Updated {updated_rows} ProductType/PlanName rows.")
        else:
            log("[ManhattanLife] No PlanCode column or mapping data — skipped.")
    except Exception as e:
        log(f"[WARN] Manhattan Life enrichment failed: {e}")


In [None]:
# Write the result to OUT_DIR as Parquet (when requested) or CSV.
OUT_DIR.mkdir(parents=True, exist_ok=True)
out_base = OUT_DIR / f"{issuer}_{sig}"

# Append the extension to the full file name. Path.with_suffix would replace
# everything after the last "." in the name, truncating it whenever the issuer
# or signature contains a dot.
csv_out = out_base.with_name(f"{out_base.name}.csv")
parquet_out = out_base.with_name(f"{out_base.name}.parquet")

if OUT_FORMAT.lower() == "parquet":
    out_path = parquet_out
    try:
        comp = None if PARQUET_COMPRESSION.lower() == "none" else PARQUET_COMPRESSION
        out_df.to_parquet(out_path, index=False, compression=comp)
    except Exception as e:
        # Still fall back to CSV, but say why Parquet was not produced instead
        # of switching formats silently.
        log(f"[WARN] Parquet write failed ({e}); falling back to CSV.")
        out_path = csv_out
        out_df.to_csv(out_path, index=False)
else:
    out_path = csv_out
    out_df.to_csv(out_path, index=False)

log(f"✅ Completed → {out_path.as_posix()}")
