
# 🛡️ HomeShield AI — POC (Jupyter Notebook)

This notebook demonstrates an end-to-end **Proof of Concept** for the **AI Claims Copilot with Coverage Validation**:

- Ingest synthetic **policy documents** into **Pinecone**
- Ask **coverage questions** with citations (RAG)
- Submit a **claim** in natural language → extract fields → validate coverage (RAG) → return decision with citations

> **Prereqs**: You must have Azure OpenAI deployments and a Pinecone API key.  
> **Data**: Point to your local `policies_docs/policies` folder (300 TXT files) and `homeshield_sample_data/customers.csv`.


In [None]:

# If running in a fresh environment, uncomment this (%pip installs into the kernel's own environment):
# %pip install fastapi uvicorn openai==1.* pinecone-client pandas numpy python-dotenv tqdm


## 1) Configuration

In [None]:

import os, json, re, uuid, glob
from typing import List, Dict
import pandas as pd
from pathlib import Path

# ---- UPDATE THESE PATHS FOR YOUR MACHINE ----
# Raw strings keep the Windows backslashes literal.
POLICY_DIR = r"C:\Users\kalva\AI_Projects\HomeShield_AI\policies_docs\policies"
CUSTOMERS_CSV = r"C:\Users\kalva\AI_Projects\HomeShield_AI\homeshield_sample_data\customers.csv"

# ---- ENV VARS (set here or via your OS/.env) ----
# setdefault only fills in a variable that is not already present, so any
# value already set in the process environment takes precedence.
_ENV_DEFAULTS = {
    "AZURE_OPENAI_ENDPOINT": "https://<your-endpoint>",
    "AZURE_OPENAI_API_KEY": "<your-azure-openai-key>",
    "AZURE_OPENAI_API_VERSION": "2024-06-01",
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT": "text-embedding-3-small",
    "AZURE_OPENAI_CHAT_DEPLOYMENT": "gpt-4o",
    "PINECONE_API_KEY": "<your-pinecone-key>",
    "PINECONE_INDEX": "homeshield-policies",
    "PINECONE_REGION": "us-east-1",
}
for _env_name, _env_default in _ENV_DEFAULTS.items():
    os.environ.setdefault(_env_name, _env_default)

# Sanity checks: report whether each configured location exists on disk.
for _label, _location in (("POLICY_DIR", POLICY_DIR), ("CUSTOMERS_CSV", CUSTOMERS_CSV)):
    print(f"{_label} exists:", Path(_location).exists())


## 2) Initialize Clients (Azure OpenAI + Pinecone)

In [None]:

from openai import AzureOpenAI
from pinecone import Pinecone, ServerlessSpec

# Connection settings are read from environment variables; the two without a
# fallback raise KeyError immediately if they are missing.
AZURE_OPENAI_ENDPOINT = os.environ["AZURE_OPENAI_ENDPOINT"]
AZURE_OPENAI_API_KEY  = os.environ["AZURE_OPENAI_API_KEY"]
AZURE_OPENAI_API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION","2024-06-01")
EMBED_MODEL = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT","text-embedding-3-small")
CHAT_MODEL  = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT","gpt-4o")

client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)

PC_API_KEY = os.environ["PINECONE_API_KEY"]
PINECONE_INDEX = os.environ.get("PINECONE_INDEX","homeshield-policies")
PINECONE_REGION = os.environ.get("PINECONE_REGION","us-east-1")

pc = Pinecone(api_key=PC_API_KEY)

# Index dimension is guessed from the embedding deployment name:
# text-embedding-3-small and text-embedding-ada-002 produce 1536-d vectors,
# anything else is assumed to be text-embedding-3-large (3072-d).
# NOTE(review): Azure deployment names are free-form; if yours does not
# contain the model name, set DIM explicitly.
_embed_name = EMBED_MODEL.lower()
DIM = 1536 if ("small" in _embed_name or "ada" in _embed_name) else 3072

# Create index if missing
if PINECONE_INDEX not in [i["name"] for i in pc.list_indexes()]:
    pc.create_index(
        name=PINECONE_INDEX,
        dimension=DIM,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region=PINECONE_REGION),
    )
index = pc.Index(PINECONE_INDEX)
print("Index ready:", PINECONE_INDEX)


## 3) Helpers: Chunking, Embedding, Retrieval (MMR)

In [None]:

import re
from typing import List, Dict
from tqdm import tqdm

def embed_batch(texts: List[str]) -> List[List[float]]:
    """Embed a batch of strings with the configured embedding deployment.

    Args:
        texts: Strings to embed in a single request.

    Returns:
        One embedding vector per entry of the response's ``data`` list, in the
        order the response provides them. An empty list when ``texts`` is empty.
    """
    if not texts:
        # Nothing to embed: avoid sending an empty input list to the service.
        return []
    resp = client.embeddings.create(model=EMBED_MODEL, input=texts)
    return [d.embedding for d in resp.data]

def sentence_chunks(text: str, max_words=180, overlap=40) -> List[str]:
    """Split text into sentence-aligned chunks of roughly ``max_words`` words.

    Sentences are detected by whitespace following ``.``, ``!`` or ``?``.
    Sentences are accumulated until adding the next one would exceed
    ``max_words``; the chunk is then emitted and the next chunk is seeded with
    the last ``overlap`` words of the emitted one.

    Args:
        text: Raw document text.
        max_words: Soft word budget per chunk. A single sentence longer than
            this is kept whole rather than split.
        overlap: Number of trailing words carried into the next chunk.
            Zero or a negative value disables the overlap.

    Returns:
        Non-empty, stripped chunk strings in document order.
    """
    sents = re.split(r'(?<=[.!?])\s+', text)
    out, cur, n = [], [], 0
    for s in sents:
        w = len(s.split())
        if n + w > max_words and cur:
            chunk = " ".join(cur)
            out.append(chunk)
            # A plain [-overlap:] slice with overlap == 0 would select the
            # WHOLE chunk, so the no-overlap case is handled explicitly.
            tail = chunk.split()[-overlap:] if overlap > 0 else []
            cur = ([" ".join(tail)] if tail else []) + [s]
            n = len(tail) + w
        else:
            cur.append(s)
            n += w
    if cur:
        out.append(" ".join(cur))
    return [c.strip() for c in out if c.strip()]

def mmr(query_vec, cand_vecs, k=8, lam=0.5):
    """Select up to ``k`` candidate indices by Maximal Marginal Relevance.

    Each step picks the remaining candidate that maximises
    ``lam * cos(candidate, query) - (1 - lam) * max cos(candidate, chosen)``.

    Args:
        query_vec: Query embedding (1-D sequence of floats).
        cand_vecs: Candidate embeddings, one row per candidate.
        k: Maximum number of candidates to select.
        lam: Trade-off between relevance (1.0) and diversity (0.0).

    Returns:
        Indices into ``cand_vecs`` in selection order. The length is
        ``min(k, len(cand_vecs))``, or 0 when there are no candidates.
    """
    import numpy as np

    n_cand = len(cand_vecs)
    if n_cand == 0 or k <= 0:
        return []

    q = np.array(query_vec)
    C = np.array(cand_vecs)
    norms = np.linalg.norm(C, axis=1)  # computed once, reused in the loop
    sim = C @ q / ((norms * np.linalg.norm(q)) + 1e-8)

    # Fix the target up front: the pool of remaining candidates shrinks on
    # every pick, so bounding the loop by its current size stops too early.
    target = min(k, n_cand)
    chosen, cand = [], list(range(n_cand))
    while len(chosen) < target:
        best, best_score = None, -1e9
        for i in cand:
            # Redundancy = highest cosine similarity to anything already picked.
            div = 0 if not chosen else max(
                (C[i] @ C[j]) / ((norms[i] * norms[j]) + 1e-8)
                for j in chosen
            )
            score = lam * sim[i] - (1 - lam) * div
            if score > best_score:
                best, best_score = i, score
        chosen.append(best)
        cand.remove(best)
    return chosen

def retrieve(query: str, filters: Dict, k=8):
    """Retrieve up to ``k`` diverse policy chunks for a query.

    Embeds the query, fetches ``max(20, 2*k)`` nearest neighbours from the
    ``policies`` namespace restricted by ``filters``, then re-ranks them with
    MMR.

    Args:
        query: Natural-language query text.
        filters: Metadata filter passed to the index query.
        k: Maximum number of chunks to return.

    Returns:
        A ``(hits, query_vector)`` tuple. ``hits`` is a list of dicts with
        keys ``content``, ``source`` and ``page``; it is empty when nothing
        matches. The tuple shape is the same in both cases so callers can
        always unpack two values.
    """
    qv = embed_batch([query])[0]
    res = index.query(
        vector=qv,
        top_k=max(20, k*2),
        include_metadata=True,
        include_values=True,  # MMR needs the stored vectors, not just ids/scores
        filter=filters,
        namespace="policies",
    )
    matches = res.matches or []
    if not matches:
        return [], qv
    vecs = [m.values for m in matches]
    keep = mmr(qv, vecs, k=min(k, len(matches)))
    picked = [matches[i] for i in keep]
    hits = [{
        "content": m.metadata.get("content",""),
        "source":  m.metadata.get("source",""),
        "page":    m.metadata.get("page", None)
    } for m in picked]
    return hits, qv


## 4) Ingest policies into Pinecone

In [None]:

# Collect every policy TXT file in a stable (sorted) order and fail fast if none are found.
policy_glob = str(Path(POLICY_DIR) / "*.txt")
files = sorted(glob.glob(policy_glob))
assert files, f"No TXT files found under {POLICY_DIR}"

def parse_meta(fname: str) -> Dict:
    """Derive policy metadata from a file name.

    The file's base name (extension removed) must consist of exactly four
    underscore-separated parts: ``<prefix>_<plan>_<state>_<year>``. The
    prefix is ignored.

    Args:
        fname: Path or file name of a policy document.

    Returns:
        ``{"plan": str, "state": str, "effective_year": int}``.

    Raises:
        ValueError: If the name does not have four parts or the year is not
            an integer.
    """
    # splitext removes only the real extension, whatever its letter case
    # (glob matching is case-insensitive on Windows, so ".TXT" can show up).
    base, _ext = os.path.splitext(os.path.basename(fname))
    parts = base.split("_")
    if len(parts) != 4:
        raise ValueError(
            f"Unexpected policy filename {fname!r}: "
            "expected <prefix>_<plan>_<state>_<year>.txt"
        )
    _, plan, state, year = parts
    return {"plan": plan, "state": state, "effective_year": int(year)}

# Chunk, embed and upsert every policy file into the "policies" namespace.
BATCH = 64  # chunks embedded and upserted per request
total = 0
for fpath in tqdm(files, desc="Upserting policies"):
    meta = parse_meta(fpath)
    source = os.path.basename(fpath)
    text = Path(fpath).read_text(encoding="utf-8")
    chunks = sentence_chunks(text)
    if not chunks:
        continue
    for i in range(0, len(chunks), BATCH):
        batch = chunks[i:i+BATCH]
        vecs = embed_batch(batch)
        points = []
        for j, (ch, v) in enumerate(zip(batch, vecs)):
            # Deterministic id (file name + chunk position): re-running this
            # cell overwrites the same records instead of adding duplicates.
            # NOTE(review): if chunking parameters change so a file yields
            # fewer chunks, stale higher-numbered ids remain in the index.
            chunk_id = f"{source}-{i+j:04d}"
            points.append({
                "id": chunk_id,
                "values": v,
                "metadata": {
                    "plan": meta["plan"],
                    "state": meta["state"],
                    "effective_year": meta["effective_year"],
                    "source": source,
                    # Synthetic page number: every 5 consecutive chunks count as one page.
                    "page": (i+j)//5 + 1,
                    "section": "policy",
                    "content": ch,
                    "chunk_id": chunk_id
                }
            })
        index.upsert(vectors=points, namespace="policies")
        total += len(points)
print("Total chunks upserted:", total)


## 5) Coverage Q&A — ask with plan/state/year filters

In [None]:

# Ask a coverage question restricted to one plan/state/year and print the model's JSON answer.
question = "Does my plan cover HVAC compressor?"
filters = {"plan": "Gold", "state": "PA", "effective_year": 2025}

ctx, qv = retrieve(question, filters, k=8)
print("Context hits:", len(ctx))
# Number each chunk so the retrieved passages are clearly delimited in the prompt.
ctx_text = "\n\n---\n\n".join(f"[{i}] {c['source']} p.{c['page']}\n{c['content']}" for i,c in enumerate(ctx))

ANSWER_SYS = (
    "You are HomeShield AI. Answer ONLY using the provided context from policy documents. "
    'If unsupported, say you are unsure. Return JSON: '
    '{"answer": string, "citations": [{"source": string, "page": number, "quote": string}]}.'
)

prompt = f"Question: {question}\n\nContext:\n{ctx_text}"
resp = client.chat.completions.create(
    # Use the deployment resolved when the clients were initialised rather
    # than re-reading the environment in every cell.
    model=CHAT_MODEL,
    messages=[{"role":"system","content":ANSWER_SYS},{"role":"user","content":prompt}],
    temperature=0
)
print(resp.choices[0].message.content)


## 6) Claim submission — extract → validate coverage

In [None]:

# Use the first customer in the CSV as the demo claimant.
customers = pd.read_csv(CUSTOMERS_CSV)
first_customer = customers.iloc[0]
customer_id = first_customer["customer_id"]
plan = first_customer["plan"]
state = first_customer["state"]
# The policy year is taken from the first four characters of the effective date.
effective_date_text = str(first_customer["effective_date"])
year = int(effective_date_text[:4])

message = "My AC stopped cooling yesterday, thermostat shows E1."

# Ask the chat model to turn the free-text claim into structured fields.
EXTRACT_SYS = (
    "Extract claim fields from text. Return STRICT JSON with keys: "
    '{"appliance": string, "issue": string, "failure_date": string | null}. If date not found, use null.'
)
extraction = client.chat.completions.create(
    model=CHAT_MODEL,
    messages=[{"role":"system","content":EXTRACT_SYS},{"role":"user","content":message}],
    temperature=0
).choices[0].message.content

try:
    extraction_json = json.loads(extraction)
except json.JSONDecodeError:
    # The model sometimes wraps its JSON in a ```json ... ``` fence; strip the
    # fence and retry. Only JSON parse errors are handled here, so unrelated
    # failures (and KeyboardInterrupt) are no longer swallowed.
    cleaned = re.sub(r"^```json|```$", "", extraction.strip(), flags=re.MULTILINE)
    extraction_json = json.loads(cleaned)

# Build the retrieval query from the extracted fields. `or ""` also covers
# keys that are present but null, which would otherwise render as "None" and
# defeat the fallback to the raw message.
appliance = extraction_json.get("appliance") or ""
issue = extraction_json.get("issue") or ""
q_for_ret = f"{appliance} {issue}".strip() or message
ctx, _ = retrieve(q_for_ret, {"plan": plan, "state": state, "effective_year": year}, k=8)
ctx_text = "\n\n---\n\n".join(f"[{i}] {c['source']} p.{c['page']}\n{c['content']}" for i,c in enumerate(ctx))

REASON_SYS = (
    "You are a coverage validator. Use ONLY the provided context. "
    'Return JSON: {"decision": "covered" | "partial" | "denied" | "ambiguous", '
    '"reasons": [string], "citations": [{"source": string, "page": number, "quote": string}]}.'
)
reason_user = (
    f"Customer plan: {plan}, {state}, {year}\n"
    f"Appliance: {extraction_json.get('appliance')}\n"
    f"Issue: {extraction_json.get('issue')}\n"
    f"Failure date: {extraction_json.get('failure_date')}\n\n"
    f"Context:\n{ctx_text}"
)
decision = client.chat.completions.create(
    # Use the deployment resolved when the clients were initialised.
    model=CHAT_MODEL,
    messages=[{"role":"system","content":REASON_SYS},{"role":"user","content":reason_user}],
    temperature=0
).choices[0].message.content

print("Extraction ->", extraction_json)
print("Decision ->", decision)


## 7) (Optional) Quick evaluation on a few questions

In [None]:

# The evaluation pairs live next to customers.csv, so derive the location from
# the configured CSV path instead of hard-coding a second absolute path.
gold_path = Path(CUSTOMERS_CSV).parent / "evaluation_pairs.jsonl"
if gold_path.exists():
    # Skip blank lines (e.g. a trailing newline) so json.loads does not fail on them.
    gold_lines = [l for l in gold_path.read_text(encoding="utf-8").splitlines() if l.strip()]
    rows = [json.loads(l) for l in gold_lines[:5]]
    for r in rows:
        q = r["question"]
        eval_meta = r["metadata"]
        # Dedicated names keep the claim cell's plan/state/year intact.
        # effective_year is stored as an int at ingest time, so coerce to match.
        eval_filters = {
            "plan": eval_meta["plan"],
            "state": eval_meta["state"],
            "effective_year": int(eval_meta["year"]),
        }
        ctx, _ = retrieve(q, eval_filters, k=8)
        ctx_text = "\n\n---\n\n".join(f"[{i}] {c['source']} p.{c['page']}\n{c['content']}" for i,c in enumerate(ctx))
        prompt = f"Question: {q}\n\nContext:\n{ctx_text}"
        out = client.chat.completions.create(
            model=CHAT_MODEL,
            messages=[{"role":"system","content":"Answer only from context, return plain text."},
                      {"role":"user","content":prompt}],
            temperature=0
        ).choices[0].message.content
        print("\nQ:", q, "\nModel:", out, "\nExpected:", r["expected_answer"][:120], "...")
else:
    print("evaluation_pairs.jsonl not found — skipping.")
