In [28]:
!pip install -q google-genai langchain langchain-community faiss-cpu pdfminer.six pillow pandas

import os
import json
import re
from pathlib import Path
from typing import Dict, Any, List
import pandas as pd
from PIL import Image

from google import genai
from google.genai import types

from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pdfminer.high_level import extract_text

# ---------------------------
# Gemini Client Setup
# ---------------------------
# The API key is read from the environment instead of being written into the
# notebook: a hardcoded value ends up in version control, and assigning a
# placeholder to os.environ would overwrite a real key exported by the user.
# Set it before starting the kernel (e.g. `export GOOGLE_API_KEY=...`) or in a
# private, uncommitted cell with `%env GOOGLE_API_KEY=...`.
API_KEY = os.environ.get("GOOGLE_API_KEY")
assert API_KEY, "GOOGLE_API_KEY env var is not set"
client = genai.Client(api_key=API_KEY)

# Choose your Gemini model
GEMINI_MODEL = "gemini-2.5-flash"  # or "gemini-1.5-pro", "gemini-1.5-flash"

# ---------------------------
# Config
# ---------------------------
DOC_PATH = "BeginnerGuide_howtodress.pdf"  # guide PDF that is indexed for retrieval
INDEX_DIR = "faiss_index"                  # directory where the FAISS index is saved/loaded

In [29]:
# LangChain-compatible embedding adapter backed by the Gemini embedding endpoint.
class GeminiEmbeddings(Embeddings):
    """Embed text through a google-genai client, one request per text.

    Args:
        client: Object exposing ``models.embed_content(model=..., contents=...)``.
        model: Name of the embedding model passed to every request.
    """

    def __init__(self, client, model="text-embedding-004"):
        self.client = client
        self.model = model

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one vector per entry of ``texts``, in the same order."""
        return [
            self.client.models.embed_content(
                model=self.model,
                contents=single_text,
            ).embeddings[0].values
            for single_text in texts
        ]

    def embed_query(self, text: str) -> List[float]:
        """Return the vector for a single query string."""
        response = self.client.models.embed_content(model=self.model, contents=text)
        # Only the first embedding of the response is used.
        return response.embeddings[0].values

# Shared embedding instance used when building and loading the vector store.
embeddings = GeminiEmbeddings(client)

# ---------------------------
# PDF -> Documents
# ---------------------------
def load_pdf_as_documents(path: str) -> List[Document]:
    """Extract text from a PDF and wrap each non-empty page in a ``Document``.

    The extracted text is split on form-feed characters, which this function
    treats as page boundaries. Pages are numbered from 1 by their position in
    that split, and pages with no text are skipped.

    Args:
        path: Path of the PDF file; also stored as the ``source`` metadata.

    Returns:
        One ``Document`` per non-empty page, with ``source`` and ``page`` metadata.
    """
    raw = extract_text(path) or ""
    docs = []
    # Number pages BEFORE dropping blank ones: filtering first would shift the
    # page number (and therefore every citation) of all pages that follow an
    # empty or image-only page.
    for page_no, page in enumerate(raw.split("\f"), start=1):
        text = page.strip()
        if not text:
            continue
        docs.append(Document(page_content=text, metadata={"source": path, "page": page_no}))
    return docs

# ---------------------------
# Chunking
# ---------------------------
def chunk_docs(docs: List[Document]) -> List[Document]:
    """Split documents into chunks of up to 800 characters with 120 overlap.

    Separators are tried in order: paragraph break, line break, period,
    space, and finally the empty string.
    """
    splitter_options = {
        "chunk_size": 800,
        "chunk_overlap": 120,
        "separators": ["\n\n", "\n", ".", " ", ""],
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

# ---------------------------
# Build / Load Vector Store
# ---------------------------
def get_vectorstore(chunks: List[Document]) -> FAISS:
    """Return the FAISS store, loading it from ``INDEX_DIR`` or building it.

    If ``INDEX_DIR`` exists the saved index is loaded and ``chunks`` is not
    used. Otherwise the index is built from ``chunks`` with the module-level
    ``embeddings`` and saved to ``INDEX_DIR``.
    """
    if not Path(INDEX_DIR).exists():
        store = FAISS.from_documents(chunks, embeddings)
        store.save_local(INDEX_DIR)
        return store

    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # serialized data from disk; only point INDEX_DIR at an index this
    # notebook created itself.
    return FAISS.load_local(INDEX_DIR, embeddings, allow_dangerous_deserialization=True)

# ---------------------------
# Retrieval
# ---------------------------
def retrieve_docs(db: FAISS, query: str, k: int = 5) -> List[Document]:
    """Retrieve ``k`` chunks for ``query`` using an MMR retriever.

    Args:
        db: Vector store to search.
        query: Natural-language query.
        k: Number of documents to return.

    Returns:
        The documents returned by the retriever.
    """
    retriever = db.as_retriever(
        search_type="mmr",
        # The k results are chosen from a pool of fetch_k candidates, so the
        # pool must never be smaller than k (it was fixed at 20 before).
        search_kwargs={"k": k, "fetch_k": max(20, k), "lambda_mult": 0.5},
    )
    # `invoke` is the supported retriever entry point; `get_relevant_documents`
    # is deprecated in langchain-core.
    return retriever.invoke(query)

def format_context(docs: List[Document], max_chars_per_chunk: int = 900) -> str:
    """Render documents as ``[p.N] text`` blocks separated by blank lines.

    Newlines inside each chunk become spaces, and text longer than
    ``max_chars_per_chunk`` is cut at that length and suffixed with an ellipsis.
    A missing page number is shown as ``?``.
    """
    def render(doc):
        page = doc.metadata.get("page", "?")
        flat = doc.page_content.strip().replace("\n", " ")
        clipped = flat if len(flat) <= max_chars_per_chunk else flat[:max_chars_per_chunk] + "…"
        return f"[p.{page}] {clipped}"

    return "\n\n".join(render(doc) for doc in docs)

def format_citations(docs: List[Document]) -> str:
    """Build a ``Sources: p.1, p.2`` line from the documents' page metadata.

    Page values are converted to ``int``, de-duplicated and sorted. Documents
    without a page (or with ``None``) are ignored; if none remain, a fixed
    "no page markers" line is returned.
    """
    raw_pages = (doc.metadata.get("page", None) for doc in docs)
    unique_pages = sorted({int(page) for page in raw_pages if page is not None})
    if unique_pages:
        return "Sources: " + ", ".join(f"p.{page}" for page in unique_pages)
    return "Sources: (no page markers)"



In [31]:
# ---------------------------
# Gemini Generation Functions
# ---------------------------
# Default system instruction for free-text Q&A answers.
SYSTEM_PROMPT = (
    "You are a friendly, practical fashion assistant. "
    "Use ONLY the provided context to answer. If the context is missing something, "
    "say what is known from the context and then clearly mark any extra as 'General guidance'. "
    "Give actionable steps and at least one ready-to-wear outfit formula. "
    "Keep the tone concise and reassuring."
)

def generate_with_gemini(prompt: str, system_prompt: str = SYSTEM_PROMPT) -> str:
    """Generate text with the configured Gemini model.

    Args:
        prompt: User-side prompt sent as the request contents.
        system_prompt: System instruction for the request.

    Returns:
        The response text with surrounding whitespace removed, or an empty
        string when the response carries no text.
    """
    response = client.models.generate_content(
        model=GEMINI_MODEL,
        contents=prompt,
        config=types.GenerateContentConfig(
            system_instruction=system_prompt,
            temperature=0.0,
            max_output_tokens=500,
        )
    )
    # response.text may be None when the response has no text part (for
    # example a blocked reply); calling .strip() on it would raise
    # AttributeError, so fall back to an empty string.
    return (response.text or "").strip()

# ---------------------------
# End-to-end QA
# ---------------------------
def answer_question(query: str) -> str:
    """Answer ``query`` from the guide PDF using retrieval plus Gemini.

    Loads and chunks ``DOC_PATH``, obtains the vector store, retrieves the
    top 5 chunks, and asks the model for a grounded answer. If the answer has
    no ``Sources:`` text, a citation line built from the retrieved chunks is
    appended.
    """
    db = get_vectorstore(chunk_docs(load_pdf_as_documents(DOC_PATH)))
    top_docs = retrieve_docs(db, query, k=5)
    context = format_context(top_docs)

    prompt_sections = [
        f"User question:\n{query}",
        f"Context (excerpts with page numbers):\n{context}",
        "Now write the best possible answer grounded in the context. "
        "Finish with a 'Sources:' line listing the page numbers you used like p.3, p.5.",
    ]
    answer = generate_with_gemini("\n\n".join(prompt_sections))

    if "Sources:" in answer:
        return answer
    # The model left out its sources line: add one from the retrieved chunks.
    return answer + "\n\n" + format_citations(top_docs)

# ================== Closet Management ==================
def load_closet(path: str) -> pd.DataFrame:
    """Load a closet inventory from a ``.csv`` or ``.json`` file.

    Column names are stripped and lower-cased. The columns ``object``,
    ``color``, ``pattern`` and ``fabric`` are required. An ``id`` column is
    created from the row index (as strings) when absent, and a ``desc`` column
    with a compact one-line description of each item is added for prompting.

    Args:
        path: File path; the extension (case-insensitive) selects the parser.

    Returns:
        The normalized DataFrame including ``id`` and ``desc``.

    Raises:
        ValueError: If the extension is unsupported or a required column is missing.
    """
    lowered = path.lower()
    if lowered.endswith(".csv"):
        df = pd.read_csv(path)
    elif lowered.endswith(".json"):
        # Context manager so the handle is closed deterministically; JSON
        # files are UTF-8, so do not depend on the platform default encoding.
        with open(path, encoding="utf-8") as fh:
            df = pd.DataFrame(json.load(fh))
    else:
        raise ValueError("Closet must be .csv or .json")

    df.columns = [c.strip().lower() for c in df.columns]

    # Required columns: object, color, pattern, fabric
    for col in ["object", "color", "pattern", "fabric"]:
        if col not in df.columns:
            raise ValueError(f"Closet missing column: {col}")

    # Create ID from index if not present
    if "id" not in df.columns:
        df["id"] = df.index.astype(str)

    # Compact description for prompting. Built from the columns directly
    # rather than a row-wise apply, which does not yield a single column for
    # an empty closet.
    df["desc"] = [
        f'{item_id}: {obj} (color={color}, pattern={pattern}, fabric={fabric})'
        for item_id, obj, color, pattern, fabric in zip(
            df["id"], df["object"], df["color"], df["pattern"], df["fabric"]
        )
    ]
    return df

def closet_lines(df: pd.DataFrame, max_items: int = 120) -> List[str]:
    """Return the ``desc`` values of the first ``max_items`` closet rows."""
    return df.head(max_items)["desc"].tolist()

# ---------------------------
# JSON Parsing
# ---------------------------
def extract_json(text: str):
    """Extract and parse the JSON object contained in a model response.

    Parsing happens in two stages:

    1. After removing code fences, the outermost ``{...}`` span is parsed
       as-is. Well-formed JSON is therefore returned untouched.
    2. Only if that fails, best-effort repairs are applied (smart quotes,
       comments, Python literals, trailing commas, single quotes) and the
       result is parsed again.

    The repairs are purely textual and also affect text inside string values,
    which is why they are never applied to output that already parses.

    Args:
        text: Raw model output.

    Returns:
        The parsed JSON value.

    Raises:
        ValueError: If no ``{...}`` span is found after cleaning.
        json.JSONDecodeError: If the span cannot be parsed even after repairs.
    """
    # Remove code fences
    text = re.sub(r"^```[\w-]*\s*|```$", "", text.strip(), flags=re.MULTILINE)

    # Stage 1: strict parse before any lossy rewriting. Running the repairs
    # first would turn "None of..." into "null of...", cut string values at
    # "//", and convert curly quotes inside strings into unescaped quotes.
    strict = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if strict:
        try:
            return json.loads(strict.group(0))
        except json.JSONDecodeError:
            pass  # malformed: fall through to the lenient repairs below

    # Stage 2: lenient repairs.
    # Normalize smart quotes
    text = (text
        .replace("\u201c", '"').replace("\u201d", '"')
        .replace("\u2018", "'").replace("\u2019", "'"))

    # Strip comments (block comments, whole-line //, then trailing // that is
    # not part of an http:// or https:// URL)
    text = re.sub(r"/\*.*?\*/", "", text, flags=re.DOTALL)
    text = re.sub(r"(?m)^\s*//.*$", "", text)
    text = re.sub(r"(?m)(?<!https:)(?<!http:)//.*$", "", text)

    # Find JSON block (first "{" through last "}")
    m = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if not m:
        raise ValueError("No JSON object found in model output.")
    snippet = m.group(0)

    # Replace Python literals
    snippet = re.sub(r"\bNone\b", "null", snippet)
    snippet = re.sub(r"\bTrue\b", "true", snippet)
    snippet = re.sub(r"\bFalse\b", "false", snippet)

    # Remove trailing commas; repeated so nested closers are handled
    for _ in range(3):
        snippet = re.sub(r",\s*([\]}])", r"\1", snippet)

    try:
        return json.loads(snippet)
    except json.JSONDecodeError:
        # Try to fix single quotes
        def safe_double_quotes(s):
            # 'key': -> "key":  (only right after "{" or ",")
            s = re.sub(r"(?<=\{|,)\s*'([^']+)'\s*:", r'"\1":', s)
            # : 'value' -> : "value"
            s = re.sub(r':\s*\'([^\'\\]*(?:\\.[^\'\\]*)*)\'', r': "\1"', s)
            return s
        coerced = safe_double_quotes(snippet)
        return json.loads(coerced)

# ---------------------------
# Validation
# ---------------------------
def validate_selection(sel: Dict[str, Any], closet_df: pd.DataFrame) -> Dict[str, Any]:
    """Drop unknown item IDs from a selection and flag whether it is wearable.

    ``sel`` is modified in place and also returned. A slot value that is not
    ``None`` and is not a string ID present in ``closet_df["id"]`` is reset to
    ``None``; ``accessories`` is reduced to the known string IDs (or ``[]`` if
    it is not a list). ``_status`` becomes ``"ok"`` when shoes plus either a
    dress or a top-and-bottom pair are present, otherwise ``"insufficient"``.
    """
    known_ids = set(closet_df["id"].astype(str))

    def is_known(candidate):
        return isinstance(candidate, str) and candidate in known_ids

    for slot in ("dress", "top", "bottom", "outerwear", "shoes", "bag"):
        current = sel.get(slot)
        if current is None or is_known(current):
            continue
        sel[slot] = None

    accessories = sel.get("accessories", [])
    if not isinstance(accessories, list):
        sel["accessories"] = []
    else:
        sel["accessories"] = [item for item in accessories if is_known(item)]

    if bool(sel.get("shoes")) and (
        bool(sel.get("dress")) or (bool(sel.get("top")) and bool(sel.get("bottom")))
    ):
        sel["_status"] = "ok"
    else:
        sel["_status"] = "insufficient"
    return sel

# ---------------------------
# Outfit Generation with Gemini
# ---------------------------
# Output contract appended to the end of the outfit prompt built in
# llm_outfit_from_closet. The slot names under "selected" are the same ones
# checked by validate_selection and printed by render_outfit.
SCHEMA_STR = r"""
Return ONLY JSON (no code fences, no prose) matching exactly:
{
  "deduced_event": "short string",
  "constraints": ["short bullets inferred from context"],
  "selected": {
    "dress": "ID or null",
    "top": "ID or null",
    "bottom": "ID or null",
    "outerwear": "ID or null",
    "shoes": "ID or null",
    "bag": "ID or null",
    "accessories": ["IDs"]
  },
  "explanation": "Under 6 sentences explaining why this works, grounded in context.",
  "citations": ["p.7","p.8"]
}
Do not include any comments or Markdown.
"""

# System instruction passed to the model for outfit generation (JSON-only output).
OUTFIT_SYSTEM_PROMPT = (
    "You are a fashion assistant. Infer appropriate styling rules from the provided context excerpts "
    "(with page numbers) and assemble an outfit using ONLY closet item IDs. Do NOT invent items. "
    "If you cannot complete a full outfit, still return valid JSON with nulls where needed. "
    "Absolutely no text outside the JSON object. No comments. No code fences."
)

def llm_outfit_from_closet(user_query: str, closet_df: pd.DataFrame,
                           doc_path: str, k: int = 6, max_closet_items: int = 120) -> Dict[str, Any]:
    """Ask Gemini to assemble an outfit from the closet, grounded in the guide.

    Args:
        user_query: The user's request; also used as the retrieval query.
        closet_df: Closet inventory with ``id`` and ``desc`` columns.
        doc_path: Path of the guide PDF to load and chunk.
        k: Number of guide chunks to retrieve.
        max_closet_items: Maximum number of closet rows listed in the prompt.

    Returns:
        The parsed model JSON. ``selected`` is guaranteed to be a dict holding
        every slot key, validated against the closet IDs, and ``citations``
        falls back to the retrieved page numbers when the model gave none.

    Raises:
        ValueError: If the model output contains no JSON object (this includes
            a response with no text at all).
    """
    # Retrieve guide context via RAG
    docs = load_pdf_as_documents(doc_path)
    chunks = chunk_docs(docs)
    db = get_vectorstore(chunks)
    top_docs = retrieve_docs(db, user_query, k=k)

    context = format_context(top_docs, max_chars_per_chunk=700)
    citations = format_citations(top_docs)
    closet_inv = "\n".join(closet_lines(closet_df, max_closet_items))

    user_prompt = (
        f"User query:\n{user_query}\n\n"
        f"Context (excerpts with page numbers):\n{context}\n\n"
        f"Citations for reference: {citations}\n\n"
        "Closet inventory (use ONLY these item IDs):\n"
        f"{closet_inv}\n\n"
        "Instructions:\n"
        "- Infer the event/style constraints ONLY from the context above.\n"
        "- Build an outfit using only closet item IDs. If needed, a dress can stand alone (no top/bottom).\n"
        "- Keep total colors cohesive (2–3) only if context suggests; otherwise prioritize context guidance.\n"
        "- Do not repeat the full closet; do not add commentary outside JSON.\n"
        + SCHEMA_STR
    )

    response = client.models.generate_content(
        model=GEMINI_MODEL,
        contents=user_prompt,
        config=types.GenerateContentConfig(
            system_instruction=OUTFIT_SYSTEM_PROMPT,
            temperature=0.2,
            max_output_tokens=400,
        )
    )

    # response.text may be None when the response has no text part; treat
    # that as empty output so extract_json reports a clear ValueError instead
    # of this line failing with AttributeError.
    raw_out = (response.text or "").strip()
    data = extract_json(raw_out)

    # Safety: ensure keys exist. The model may return "selected": null (or a
    # string), on which .setdefault would fail, so replace any non-dict value.
    selected = data.get("selected")
    if not isinstance(selected, dict):
        selected = {}
    data["selected"] = selected
    for slot in ["dress","top","bottom","outerwear","shoes","bag","accessories"]:
        selected.setdefault(slot, None if slot != "accessories" else [])

    # Validate IDs
    data["selected"] = validate_selection(data["selected"], closet_df)

    # Ensure citations present
    if not data.get("citations"):
        data["citations"] = re.findall(r"p\.\d+", citations) or []

    return data

# ---------------------------
# Pretty Printer
# ---------------------------
def render_outfit(data: Dict[str, Any], closet_df: pd.DataFrame) -> str:
    """Format an outfit result as human-readable text.

    Lists each filled slot and accessory with its closet details, then the
    explanation, the sources (when present), and a note if the selection was
    marked ``insufficient``.
    """
    sel = data["selected"]

    def describe(item_id):
        """Return "ID — object (...)" for a known ID, the bare ID otherwise."""
        if not item_id:
            return ""
        matches = closet_df[closet_df["id"].astype(str) == str(item_id)]
        if matches.empty:
            return item_id
        first = matches.iloc[0]
        return f'{item_id} — {first["object"]} (color: {first["color"]}, pattern: {first["pattern"]}, fabric: {first["fabric"]})'

    output = [
        f"{slot.capitalize()}: {describe(sel[slot])}"
        for slot in ("dress", "top", "bottom", "outerwear", "shoes", "bag")
        if sel.get(slot)
    ]
    output.extend(f"Accessory: {describe(extra)}" for extra in sel.get("accessories", []))

    explanation = (data.get("explanation") or "").strip()
    output.append("\nWhy this works:\n" + explanation)

    cited = data.get("citations")
    if cited:
        output.append("Sources: " + ", ".join(cited))

    if sel.get("_status") == "insufficient":
        output.append("\nNote: Could not complete a full outfit from the closet.")

    return "\n".join(output)

# ================== Example Run ==================
# End-to-end demo: writes a sample closet CSV to the working directory, runs
# the free-text Q&A path, then the structured outfit path for the same query.
# Both paths call the Gemini API and read DOC_PATH, so the PDF and a valid API
# key must be available.
if __name__ == "__main__":
    # Create sample closet with new column structure
    # (id prefixes: T=tops, B=bottoms, D=dresses, O=outerwear, S=shoes, G=bags, A=accessories)
    sample_csv = """id,object,color,pattern,fabric
T1,Black crewneck knit,black,solid,wool
T2,Dark navy blouse,navy,solid,silk
T3,White oxford shirt,white,solid,cotton
T4,Striped Breton tee,navy/white,stripe,cotton
T5,Charcoal merino turtleneck,charcoal,solid,wool
T6,Ivory silk cami,ivory,solid,silk
B1,Black linen pants,black,solid,linen
B2,Charcoal wool trousers,charcoal,solid,wool
B3,Dark indigo straight jeans,dark indigo,solid,denim
B4,Black wide-leg trousers,black,solid,wool blend
B5,Navy A-line midi skirt,navy,solid,wool
D1,Deep green midi dress,deep green,solid,viscose
D2,Black column dress,black,solid,crepe
O1,Camel trench coat,camel,solid,cotton
O2,Navy unstructured blazer,navy,solid,wool
O3,Black wool overcoat,black,solid,wool
O4,Cropped denim jacket,blue denim,solid,denim
S1,Black leather loafers,black,solid,leather
S2,Black ankle boots,black,solid,leather
S3,White leather sneakers,white,solid,leather
S4,Black strappy heels,black,solid,leather
S5,Brown brogues,brown,solid,leather
G1,Black structured tote,black,solid,leather
G2,Slim black crossbody,black,solid,leather
G3,Small metallic clutch,metallic,solid,leather
A1,Small silver hoops,silver,solid,metal
A2,Thin black belt,black,solid,leather
A3,Gold pendant necklace,gold,solid,metal
A4,Black wool scarf,black,solid,wool"""

    # Persist the sample so load_closet can read it back from disk below.
    with open("closet_min.csv","w") as f:
        f.write(sample_csv)

    # Test simple Q&A
    print("=== Simple Q&A Test ===")
    user_query = "How should I dress to a funeral if I want to use my black linen pants?"
    print(answer_question(user_query))
    print("\n")

    # Test outfit generation (same query, answered with concrete closet item IDs)
    print("=== Outfit Generation Test ===")
    closet_df = load_closet("closet_min.csv")
    result = llm_outfit_from_closet(
        user_query=user_query,
        closet_df=closet_df,
        doc_path=DOC_PATH,
        k=6
    )
    print(render_outfit(result, closet_df))

=== Simple Q&A Test ===
To dress for a funeral using your black linen pants, focus on a respectful and understated tone.

Here's how to style them:

**Actionable Steps:**
1.  **Choose a Top:** Pair your black linen pants with a simple top in a dark or muted color (such as black, charcoal, navy, or deep green).
2.  **Select Shoes:** Wear closed-toe shoes.
3.  **Accessorize Minimally:** Keep jewelry and other accessories minimal.
4.  **Consider Outerwear:** If needed, add a formal coat, potentially in velvet or wool.
5.  **Ensure Fit:** Tailor small things; they make the biggest difference. Ensure your pants are neat.

**Ready-to-Wear Outfit Formula:**
Black linen pants + Simple dark top + Closed-toe shoes + Minimal accessories

**General guidance:**
The context specifies dark or muted colors for funerals, which your black linen pants fulfill. While the context doesn't explicitly mention linen as an appropriate fabric for funeral pants, it does suggest velvet/wool for formal coats. Gener