In [1]:
pip install python-dotenv pandas

Collecting python-dotenv
  Using cached python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting pandas
  Using cached pandas-2.3.1-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.22.4 (from pandas)
  Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Using cached pandas-2.3.1-cp310-cp310-win_amd64.whl (11.3 MB)
Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl (12.9 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, python-dotenv, numpy, pandas

   ---------------------------------------- 0/5 [pytz]
   ---------------------------------------- 0/5 [pytz]
   ----------------

In [11]:
import os, json, re, uuid, glob
from pathlib import Path
from typing import List, Dict

import pandas as pd
from dotenv import load_dotenv

# ---- 1. Load Environment Variables ----
# Point directly to your .env file (absolute path)
env_path = Path(r"C:\Users\kalva\AI_Projects\HomeShield_AI") / ".env"
load_dotenv(env_path)

# ---- 2. Required Variables Check ----
required_vars = [
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_KEY", 
    "PINECONE_API_KEY"
]

missing = [var for var in required_vars if not os.getenv(var)]
if missing:
    raise RuntimeError(f"Missing env vars: {missing}")

# ---- 3. Path Configuration ----
POLICY_DIR = Path(r"C:\Users\kalva\AI_Projects\HomeShield_AI\policies_docs")
CUSTOMERS_CSV = Path(r"C:\Users\kalva\AI_Projects\HomeShield_AI\homeshield_sample_data\customers.csv")
EVAL_PAIRS = Path(r"C:\Users\kalva\AI_Projects\HomeShield_AI\homeshield_sample_data\evaluation_pairs.jsonl")

# ---- 4. Validation ----
print("Azure endpoint configured:", bool(os.getenv("AZURE_OPENAI_ENDPOINT")))
print("Policy directory exists:", POLICY_DIR.exists())
print("Customer data exists:", CUSTOMERS_CSV.exists())

Azure endpoint configured: True
Policy directory exists: True
Customer data exists: True


In [3]:
pip install openai pinecone

Collecting openai
  Downloading openai-1.99.9-py3-none-any.whl.metadata (29 kB)
Collecting pinecone
  Using cached pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Using cached anyio-4.10.0-py3-none-any.whl.metadata (4.0 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.10.0-cp310-cp310-win_amd64.whl.metadata (5.3 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Using cached pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting sniffio (from openai)
  Using cached sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting tqdm>4 (from openai)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting idna>=2.8 (from anyio<5,>=3.5.0->openai)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)

In [12]:
import os
from openai import AzureOpenAI
from pinecone import Pinecone

# ----- Secure Configuration -----
def get_env_var(name, default=None):
    """Safely get environment variable with validation"""
    value = os.environ.get(name, default)
    if value is None and default is None:
        raise ValueError(f"Missing required environment variable: {name}")
    return value

# Azure OpenAI Config
AZURE_OPENAI_ENDPOINT = get_env_var("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = get_env_var("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_API_VER = get_env_var("AZURE_OPENAI_API_VERSION")
EMBED_MODEL = get_env_var("AZURE_OPENAI_EMBEDDING_DEPLOYMENT") 
CHAT_MODEL = get_env_var("AZURE_OPENAI_CHAT_DEPLOYMENT")

# Pinecone Config
PC_API_KEY = get_env_var("PINECONE_API_KEY")
PINECONE_INDEX = get_env_var("PINECONE_INDEX")
PINECONE_REGION = get_env_var("PINECONE_REGION")

# ----- Client Initialization -----
# Azure OpenAI Client
oai_client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VER,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)

# Pinecone Client
pc = Pinecone(api_key=PC_API_KEY)

# ----- Index Verification -----
print(f"Connecting to existing Pinecone index: {PINECONE_INDEX}")
index = pc.Index(PINECONE_INDEX)

# Verify index configuration matches your expectations
index_stats = index.describe_index_stats()
print("\nIndex Configuration:")
print(f"- Dimensions: {index_stats.dimension}")
print(f"- Metric: {index_stats.metric}")
print(f"- Vector Count: {index_stats.total_vector_count}")
print(f"- Embedding Model: {EMBED_MODEL}")

# Safety check for embedding dimension
expected_dim = 1536  # text-embedding-3-small uses 1536 dimensions
if index_stats.dimension != expected_dim:
    print(f"\nWARNING: Index dimension ({index_stats.dimension}) doesn't match expected ({expected_dim})")
    print("You may need to recreate your index with the correct dimensions")
else:
    print("\nIndex validation successful - ready for operations!")

Connecting to existing Pinecone index: homeshield-policies

Index Configuration:
- Dimensions: 1536
- Metric: cosine
- Vector Count: 0
- Embedding Model: text-embedding-3-small

Index validation successful - ready for operations!


In [4]:
pip install langchain-openai langchain-pinecone

Collecting langchain-openai
  Downloading langchain_openai-0.3.30-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-pinecone
  Downloading langchain_pinecone-0.2.11-py3-none-any.whl.metadata (6.1 kB)
Collecting langchain-core<1.0.0,>=0.3.74 (from langchain-openai)
  Downloading langchain_core-0.3.74-py3-none-any.whl.metadata (5.8 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.11.0-cp310-cp310-win_amd64.whl.metadata (6.9 kB)
Collecting langsmith>=0.3.45 (from langchain-core<1.0.0,>=0.3.74->langchain-openai)
  Downloading langsmith-0.4.14-py3-none-any.whl.metadata (14 kB)
Collecting tenacity!=8.4.0,<10.0.0,>=8.1.0 (from langchain-core<1.0.0,>=0.3.74->langchain-openai)
  Using cached tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<1.0.0,>=0.3.74->langchain-openai)
  Using cached jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting PyYAML>=5.3 (from langchain-core<1.0.0,>=0.3.74->lang

In [13]:
# Block 3 — LangChain + Azure embeddings + Pinecone VectorStore

from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
import os

# Azure LLM + Embeddings (deployment names come from your .env)
llm = AzureChatOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"],
    temperature=0,
)

embeddings = AzureOpenAIEmbeddings(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"],
)

# Ensure Pinecone index exists with dim=1536 (Azure text-embedding-3-small)
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
INDEX_NAME   = os.environ["PINECONE_INDEX"]
REGION       = os.environ["PINECONE_REGION"]
DIM          = 1536  # Azure text-embedding-3-small

if INDEX_NAME not in [i["name"] for i in pc.list_indexes()]:
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIM,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region=REGION),
    )

# LangChain VectorStore handle (keep namespace stable)
NAMESPACE = "policies"
vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings, namespace=NAMESPACE)

print("LangChain ready → index:", INDEX_NAME, "| namespace:", NAMESPACE)


LangChain ready → index: homeshield-policies | namespace: policies


In [9]:
# Run this in a Jupyter notebook cell
%pip install langchain-community langchain-core langchain-text-splitters pinecone-client

Collecting langchain-community
  Using cached langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-text-splitters
  Downloading langchain_text_splitters-0.3.9-py3-none-any.whl.metadata (1.9 kB)
Collecting pinecone-client
  Using cached pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Collecting langchain<1.0.0,>=0.3.26 (from langchain-community)
  Downloading langchain-0.3.27-py3-none-any.whl.metadata (7.8 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain-community)
  Downloading sqlalchemy-2.0.43-cp310-cp310-win_amd64.whl.metadata (9.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Using cached pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Using cached httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Col

In [29]:
# Adapted “simple strategy” for our project (TXT + Azure + metadata + namespace)
# with namespace wipe + 'text' metadata for LangChain compatibility

import os, time
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings
from pinecone import Pinecone

# --- Load env if needed (keeps your existing .env setup) ---
# load_dotenv(Path(r"C:\Users\kalva\AI_Projects\HomeShield_AI") / ".env")

# --- 1) Load TXT policies ---
raw_docs = []
for p in POLICY_DIR.glob("*.txt"):
    raw_docs.extend(TextLoader(str(p), encoding="utf-8").load())
print("Loaded TXT files:", len(raw_docs))

# --- 2) Chunk ---
splitter = RecursiveCharacterTextSplitter(chunk_size=900, chunk_overlap=120, add_start_index=True)
chunks = splitter.split_documents(raw_docs)
print("Chunks:", len(chunks))

# --- 3) Attach metadata from filename: LHG_<Plan>_<STATE>_<YEAR>.txt ---
def parse_meta_from_filename(path_str: str):
    stem = Path(path_str).stem
    parts = stem.split("_")
    if len(parts) >= 4 and parts[0] == "LHG":
        _, plan, state, year = parts[:4]
        try:
            return {"plan": plan, "state": state, "effective_year": int(year)}
        except:
            return {}
    return {}

for i, d in enumerate(chunks):
    src = d.metadata.get("source", "")
    d.metadata.update(parse_meta_from_filename(src))
    d.metadata.setdefault("source", Path(src).name or "unknown.txt")
    d.metadata.setdefault("page", (i // 5) + 1)
    d.metadata.setdefault("section", "policy")
    d.metadata.setdefault("chunk_id", f"{d.metadata['source']}-{i:04d}")

# --- 4) Embeddings (Azure, 1536-dim) ---
emb = AzureOpenAIEmbeddings(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"],
)

# --- 5) Pinecone client & index handle ---
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index(INDEX_NAME)


# --- 6) Upsert with throttling (avoid 429s) & include 'text' in metadata ---
BATCH = 64
BASE_SLEEP = 3.0
MAX_RETRIES = 6

def embed_texts(texts):
    # LangChain embeddings returns list of vectors
    return emb.embed_documents(texts)

def upsert_batch(batch, start_id):
    texts = [d.page_content for d in batch]
    vecs = embed_texts(texts)
    vectors = []
    for j, d in enumerate(batch):
        vectors.append({
            "id": f"hs-{start_id + j:08d}",
            "values": vecs[j],
            "metadata": {
                **d.metadata,
                "text": d.page_content,   # <-- critical: store text so LangChain can rebuild Documents
            }
        })
    index.upsert(vectors=vectors, namespace=NAMESPACE)

def add_batch_with_backoff(batch, start_id):
    delay = BASE_SLEEP
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            upsert_batch(batch, start_id)
            return
        except Exception as e:
            msg = str(e)
            is_429 = ("429" in msg) or ("Too Many Requests" in msg) or ("rate" in msg.lower())
            if not is_429 or attempt == MAX_RETRIES:
                print(f"[ERROR] Upsert failed (attempt {attempt}): {e}")
                raise
            # exponential backoff with cap
            sleep_for = min(delay * (1.8 ** (attempt - 1)), 30.0)
            print(f"[WARN] 429 rate limit; retrying in {sleep_for:.1f}s (attempt {attempt}/{MAX_RETRIES})")
            time.sleep(sleep_for)

total = len(chunks)
for i in tqdm(range(0, total, BATCH), desc="Upserting"):
    batch = chunks[i:i + BATCH]
    add_batch_with_backoff(batch, start_id=i)
    time.sleep(BASE_SLEEP)  # steady pacing
    if (i // BATCH) % 10 == 0:
        print(f"Progress: {min(i + BATCH, total)}/{total}")

print(f"✅ Completed upsert to Pinecone index='{INDEX_NAME}', namespace='{NAMESPACE}'.")


Loaded TXT files: 300
Chunks: 4500


Upserting:   0%|          | 0/71 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:   1%|▏         | 1/71 [00:05<06:02,  5.18s/it]

Progress: 64/4500


INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:   3%|▎         | 2/71 [00:09<05:24,  4.71s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:   4%|▍         | 3/71 [00:13<05:10,  4.56s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:   6%|▌         | 4/71 [00:18<05:05,  4.55s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:   7%|▋         | 5/71 [00:22<04:55,  4.48s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-

Progress: 704/4500


INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  17%|█▋        | 12/71 [00:54<04:19,  4.40s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  18%|█▊        | 13/71 [00:58<04:15,  4.41s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  20%|█▉        | 14/71 [01:03<04:11,  4.42s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  21%|██        | 15/71 [01:07<04:06,  4.40s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedd

Progress: 1344/4500


INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  31%|███       | 22/71 [01:38<03:40,  4.49s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  32%|███▏      | 23/71 [01:43<03:36,  4.51s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  34%|███▍      | 24/71 [01:47<03:32,  4.52s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  35%|███▌      | 25/71 [01:52<03:26,  4.49s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedd

Progress: 1984/4500


INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  45%|████▌     | 32/71 [02:22<02:50,  4.38s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  46%|████▋     | 33/71 [02:27<02:46,  4.38s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  48%|████▊     | 34/71 [02:31<02:42,  4.39s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  49%|████▉     | 35/71 [02:36<02:39,  4.44s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedd

Progress: 2624/4500


INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  59%|█████▉    | 42/71 [03:07<02:09,  4.45s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  61%|██████    | 43/71 [03:11<02:04,  4.44s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  62%|██████▏   | 44/71 [03:16<01:59,  4.44s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  63%|██████▎   | 45/71 [03:20<01:57,  4.52s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedd

Progress: 3264/4500


INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  73%|███████▎  | 52/71 [03:51<01:23,  4.41s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  75%|███████▍  | 53/71 [03:56<01:19,  4.41s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  76%|███████▌  | 54/71 [04:00<01:14,  4.37s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  77%|███████▋  | 55/71 [04:04<01:10,  4.38s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedd

Progress: 3904/4500


INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  87%|████████▋ | 62/71 [04:35<00:39,  4.43s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  89%|████████▊ | 63/71 [04:40<00:35,  4.45s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  90%|█████████ | 64/71 [04:44<00:31,  4.46s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
Upserting:  92%|█████████▏| 65/71 [04:49<00:27,  4.55s/it]INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedd

Progress: 4500/4500
✅ Completed upsert to Pinecone index='homeshield-policies', namespace='policies'.





In [17]:
# Bind LangChain to the same Pinecone index/namespace you just upserted into
from langchain_pinecone import PineconeVectorStore

vectorstore = PineconeVectorStore(
    index_name=INDEX_NAME,
    embedding=emb,           # <-- reuse your AzureOpenAIEmbeddings instance
    namespace=NAMESPACE      # "policies"
)
print("VectorStore ready:", INDEX_NAME, "| ns:", NAMESPACE)


VectorStore ready: homeshield-policies | ns: policies


In [25]:
import sys, importlib.metadata as md

def v(pkg):
    try:
        return md.version(pkg)
    except md.PackageNotFoundError:
        return "not installed"

print("python:", sys.version.split()[0])
print("langchain:", v("langchain"))
print("langchain-openai:", v("langchain-openai"))
print("langchain-community:", v("langchain-community"))
print("langchain-pinecone:", v("langchain-pinecone"))
print("pinecone-client:", v("pinecone-client"))
print("openai:", v("openai"))


python: 3.10.18
langchain: 0.3.27
langchain-openai: 0.3.30
langchain-community: 0.3.27
langchain-pinecone: 0.2.11
pinecone-client: 6.0.0
openai: 1.99.9


In [30]:
# 100% safe MMR using VectorStore API directly (no compressors involved)

def retrieve_mmr(question: str, plan: str, state: str, year: int, k: int = 8, fetch_k: int = 24, lambda_mult: float = 0.5):
    """
    Uses vectorstore.max_marginal_relevance_search to get diverse chunks.
    Applies strict metadata filter so we only search the correct policy set.
    """
    return vectorstore.max_marginal_relevance_search(
        question,
        k=k,
        fetch_k=fetch_k,
        lambda_mult=lambda_mult,
        filter={"plan": plan, "state": state, "effective_year": year},
    )

def format_context(docs):
    lines = []
    for i, d in enumerate(docs):
        src  = d.metadata.get("source", "")
        page = d.metadata.get("page", "")
        lines.append(f"[{i}] {src} p.{page}\n{d.page_content}")
    return "\n\n---\n\n".join(lines)

# smoke test
_test = retrieve_mmr("HVAC compressor coverage", "Gold", "PA", 2025)
print("MMR hits:", len(_test))
print(_test[0].metadata, _test[0].page_content[:160])


INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"


MMR hits: 8
{'chunk_id': 'C:\\Users\\kalva\\AI_Projects\\HomeShield_AI\\policies_docs\\LHG_Gold_PA_2025.txt-1126', 'effective_year': 2025.0, 'page': 226.0, 'plan': 'Gold', 'section': 'policy', 'source': 'C:\\Users\\kalva\\AI_Projects\\HomeShield_AI\\policies_docs\\LHG_Gold_PA_2025.txt', 'start_index': 478.0, 'state': 'PA'} SECTION – HVAC
- Compressor is covered for mechanical and electrical failures under the Gold plan, subject to exclusions listed below. Pre-existing conditions a


In [35]:
docs = vectorstore.max_marginal_relevance_search(
    "HVAC compressor coverage",
    k=8, fetch_k=50, lambda_mult=0.5,
    filter={"plan": "Gold", "state": "PA", "effective_year": 2025},
)
print("hits:", len(docs))
if docs:
    print(docs[0].metadata, "\n", docs[0].page_content[:500])

INFO:httpx:HTTP Request: POST https://homeshield.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"


hits: 8
{'chunk_id': 'C:\\Users\\kalva\\AI_Projects\\HomeShield_AI\\policies_docs\\LHG_Gold_PA_2025.txt-1126', 'effective_year': 2025.0, 'page': 226.0, 'plan': 'Gold', 'section': 'policy', 'source': 'C:\\Users\\kalva\\AI_Projects\\HomeShield_AI\\policies_docs\\LHG_Gold_PA_2025.txt', 'start_index': 478.0, 'state': 'PA'} 
 SECTION – HVAC
- Compressor is covered for mechanical and electrical failures under the Gold plan, subject to exclusions listed below. Pre-existing conditions and improper installation are excluded. Wear items (e.g., filters, belts) are excluded unless otherwise stated.
- Evaporator Coil is covered for mechanical and electrical failures under the Gold plan, subject to exclusions listed below. Pre-existing conditions and improper installation are excluded. Wear items (e.g., filters, belts) are ex
