# Vibe Matcher Prototype

Demo of "vibe" matching with two interchangeable embedding backends: TF-IDF and OpenAI. Toggle `USE_OPENAI` in the Config section near the top of the script to switch the embedding backend.


In [None]:
"""
Vibe Matcher Prototype (dual TF-IDF / OpenAI)
Run: python vibe_matcher_prototype.py
Requirements: scikit-learn, pandas, numpy, matplotlib. For OpenAI embeddings, also install openai and set the OPENAI_API_KEY environment variable.
"""

import os
import time
import pandas as pd
import numpy as np
from typing import List, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# OpenAI is an optional dependency: record whether the import worked so the
# rest of the script can decide which embedding backend is usable.
try:
    import openai
except Exception:
    OPENAI_AVAILABLE = False
else:
    OPENAI_AVAILABLE = True

# --- Config ---
# Toggle: set True to use OpenAI embeddings (requires OPENAI_API_KEY)
USE_OPENAI = False
OPENAI_MODEL = "text-embedding-ada-002"
# Read the key from the environment; an empty string means "not configured".
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')

# Only hand the key to the library when both the package and the key exist.
if OPENAI_API_KEY and OPENAI_AVAILABLE:
    openai.api_key = OPENAI_API_KEY

# --- Sample data ---
# Demo catalogue: every product has a display name and a free-text "vibe"
# description. The 'desc' text is what gets embedded and searched.
products = [
    {"name": "Boho Dress", "desc": "Flowy, earthy tones for festival vibes"},
    {"name": "Denim Jacket", "desc": "Energetic urban chic, cropped with patch details"},
    {"name": "Athleisure Set", "desc": "Comfort-first, sporty, energetic for gym-to-street"},
    {"name": "Silk Slip", "desc": "Minimalist, elegant evening wear with satin sheen"},
    {"name": "Corduroy Pants", "desc": "Warm, cozy textured pants perfect for autumn walks"},
    {"name": "Street Sneaks", "desc": "Bold, high-top sneakers with urban attitude"},
    # The em dash here was previously stored as mojibake ("â€”"), i.e. UTF-8
    # bytes mis-decoded as cp1252; restored to the intended character.
    {"name": "Linen Shirt", "desc": "Light, breezy, relaxed summer vibes — clean and natural"},
    {"name": "Statement Blazer", "desc": "Sharp, energetic, structured blazer for confident looks"},
]

# One row per product with 'name' and 'desc' columns.
df = pd.DataFrame(products)

# --- Embedding functions ---
def embed_with_tfidf(texts: List[str], max_features: int = 256):
    """Fit a TF-IDF vectorizer on ``texts`` and return unit-length row vectors.

    Args:
        texts: Documents to fit the vectorizer on and to embed.
        max_features: Upper bound on the vocabulary size passed to
            ``TfidfVectorizer``.

    Returns:
        A ``(matrix, vectorizer)`` tuple: the dense, row-normalized TF-IDF
        matrix and the fitted vectorizer (needed later to embed queries into
        the same space).
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    dense = vectorizer.fit_transform(texts).toarray()
    row_norms = np.linalg.norm(dense, axis=1, keepdims=True)
    # All-zero rows keep a divisor of 1 so they stay zero instead of becoming NaN.
    safe_norms = np.where(row_norms == 0, 1.0, row_norms)
    return dense / safe_norms, vectorizer

def embed_with_openai(texts: List[str], model: str = OPENAI_MODEL):
    """Embed texts with the OpenAI embeddings endpoint and L2-normalize the rows.

    Supports both generations of the ``openai`` package: the 1.x client
    interface (``openai.OpenAI(...).embeddings.create``) and the legacy 0.x
    module-level ``openai.Embedding.create`` call, which 1.0 removed.

    Args:
        texts: Strings to embed; all are sent in a single batched request.
        model: Name of the OpenAI embedding model to use.

    Returns:
        A 2-D ``numpy`` array with one unit-length row per returned embedding.

    Raises:
        RuntimeError: If the ``openai`` package could not be imported or no
            API key has been set on it.
    """
    if not OPENAI_AVAILABLE or not openai.api_key:
        raise RuntimeError("OpenAI not available or API key not set")

    # The endpoint accepts a list of inputs, so one call covers every text.
    batch = list(texts)

    if hasattr(openai, "OpenAI"):
        # openai>=1.0: responses are typed objects accessed by attribute.
        client = openai.OpenAI(api_key=openai.api_key)
        resp = client.embeddings.create(model=model, input=batch)
        vectors = [item.embedding for item in resp.data]
    else:
        # openai<1.0: legacy module-level API with dict-style responses.
        resp = openai.Embedding.create(model=model, input=batch)
        vectors = [item['embedding'] for item in resp['data']]

    emb = np.array(vectors)
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    # Guard against division by zero for any all-zero vector.
    norms[norms == 0] = 1.0
    return emb / norms

def get_embeddings(texts: List[str], prefer_openai: bool = USE_OPENAI):
    """Embed texts with OpenAI when requested and usable, otherwise with TF-IDF.

    Args:
        texts: Strings to embed.
        prefer_openai: Try the OpenAI backend first. It is only attempted when
            the ``openai`` package imported and an API key is set; any failure
            is reported on stdout and TF-IDF is used instead.

    Returns:
        A ``(embeddings, vectorizer)`` tuple on every path. ``vectorizer`` is
        the fitted ``TfidfVectorizer`` for the TF-IDF backend and ``None``
        when the embeddings came from OpenAI.
    """
    if prefer_openai and OPENAI_AVAILABLE and openai.api_key:
        try:
            # Return the same 2-tuple shape as the TF-IDF path so callers can
            # always unpack ``emb, vec = get_embeddings(...)``.
            return embed_with_openai(texts), None
        except Exception as e:
            # Deliberate best-effort: any OpenAI failure degrades to TF-IDF.
            print("OpenAI failed, falling back to TF-IDF:", e)
    return embed_with_tfidf(texts)

# --- Build product embeddings ---
# ``embedding_vectorizer`` is the fitted TF-IDF vectorizer, or None when the
# product embeddings were produced by OpenAI.
if not USE_OPENAI:
    emb_matrix, embedding_vectorizer = embed_with_tfidf(df['desc'].tolist())
else:
    try:
        emb_matrix = embed_with_openai(df['desc'].tolist())
    except Exception as ex:
        # Best-effort: any OpenAI failure degrades to the local TF-IDF backend.
        print("OpenAI embedding failed, falling back to TF-IDF:", ex)
        emb_matrix, embedding_vectorizer = embed_with_tfidf(df['desc'].tolist())
    else:
        embedding_vectorizer = None

# Store one embedding vector (a matrix row) per product row.
df['embedding'] = [row for row in emb_matrix]

# --- Similarity search ---
def query_top_k(query: str, df: pd.DataFrame, k: int = 3, threshold: float = 0.35, prefer_openai: bool = USE_OPENAI):
    """Return the ``k`` rows of ``df`` most similar to ``query``.

    The query is embedded in the same space as the product embeddings stored
    in ``df['embedding']`` and ranked by cosine similarity.

    Args:
        query: Free-text query.
        df: Frame with an ``'embedding'`` column holding one vector per row.
        k: Number of top matches to return.
        threshold: Minimum top score for the match to count as confident.
        prefer_openai: Embed the query with OpenAI when it is usable. This is
            overridden when it would contradict how the products were
            embedded: if the module-level ``embedding_vectorizer`` is ``None``
            (no TF-IDF space exists) OpenAI is always used, and if the product
            vectors have exactly the TF-IDF vocabulary size the fitted
            vectorizer is always used.

    Returns:
        ``(results, fallback_msg)``: ``results`` is a copy of the top rows
        with an added ``'score'`` column and a fresh index; ``fallback_msg``
        is ``None`` when the best score reaches ``threshold`` and
        ``"No confident match found."`` otherwise (including when there are
        no results at all).

    Raises:
        RuntimeError: If OpenAI embeddings are required but unavailable.
        ValueError: If query and product embedding dimensions differ.
    """
    no_match_msg = "No confident match found."

    # Nothing to search: np.vstack would raise on an empty sequence.
    if df.empty:
        empty = df.copy()
        empty['score'] = np.array([], dtype=float)
        return empty.reset_index(drop=True), no_match_msg

    X = np.vstack(df['embedding'].values)

    # Pick the backend that matches the space the products live in.
    use_openai = bool(prefer_openai and OPENAI_AVAILABLE and openai.api_key)
    if embedding_vectorizer is None:
        # Products were embedded with OpenAI; there is no TF-IDF vectorizer.
        use_openai = True
    elif use_openai and len(embedding_vectorizer.vocabulary_) == X.shape[1]:
        # Products are in the TF-IDF space (e.g. OpenAI failed at build time
        # and the script fell back), so an OpenAI query could not be compared.
        use_openai = False

    if use_openai:
        q_emb = embed_with_openai([query])
    else:
        q_emb = embedding_vectorizer.transform([query]).toarray()
        norms = np.linalg.norm(q_emb, axis=1, keepdims=True)
        # A query with no known terms is all zeros; avoid dividing by zero.
        norms[norms == 0] = 1.0
        q_emb = q_emb / norms

    # Ensure shapes align
    if X.shape[1] != q_emb.shape[1]:
        raise ValueError(f"Embedding dimension mismatch: products {X.shape[1]} vs query {q_emb.shape[1]}")

    sims = cosine_similarity(q_emb, X)[0]
    top_idx = np.argsort(sims)[::-1][:k]
    results = df.iloc[top_idx].copy()
    results['score'] = sims[top_idx]

    # An empty selection (e.g. k == 0) has no top score to compare.
    confident = (not results.empty) and results['score'].iloc[0] >= threshold
    fallback_msg = None if confident else no_match_msg
    return results.reset_index(drop=True), fallback_msg

# --- Tests / sample queries ---
def run_queries(queries):
    """Run each query against the module-level ``df`` and report the outcome.

    For every query this prints the matched names and scores, the wall-clock
    latency of the lookup, and the fallback message when one is returned.

    Args:
        queries: Iterable of query strings.

    Returns:
        A list with one dict per query holding ``'query'``, ``'top_score'``
        (score of the best match as a float) and ``'latency'`` (seconds).
    """
    summary = []
    for text in queries:
        started = time.perf_counter()
        matches, fallback = query_top_k(text, df)
        elapsed = time.perf_counter() - started

        print(f"\nQuery: {text}")
        print(matches[['name', 'score']])
        print(f"Latency: {elapsed:.4f}s")
        if fallback:
            print("Fallback:", fallback)

        summary.append(dict(
            query=text,
            top_score=float(matches['score'].iloc[0]),
            latency=elapsed,
        ))
    return summary


if __name__ == '__main__':
    # Demo: run a few representative "vibe" queries against the catalogue.
    sample_queries = [
        "energetic urban chic",
        "cozy autumn outfit",
        "minimalist evening elegance",
    ]
    results = run_queries(sample_queries)

    # Simple metric: how many queries had a best match above the threshold.
    threshold = 0.35
    good = len([r for r in results if r['top_score'] > threshold])
    print(f"\nGood queries (top_score>{threshold}): {good}/{len(results)}")