In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Load the data: read data.csv as ISO-8859-1 text and parse the InvoiceDate
# column into datetimes while loading.
df = pd.read_csv("data.csv", encoding='ISO-8859-1', parse_dates=["InvoiceDate"])


In [None]:
# Preview the first rows of the loaded frame.
df.head()



In [None]:
# Column names, dtypes and non-null counts.
df.info()


In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')


In [None]:
# Number of missing values per column.
df.isnull().sum()


In [None]:
# Cast CustomerID to string while keeping missing values missing. A plain
# `.astype(str)` turns NaN into the literal text 'nan', after which
# `dropna(subset=['CustomerID'])` can no longer find the rows without a customer.
df['CustomerID'] = df['CustomerID'].astype(str).where(df['CustomerID'].notna())


In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Drop rows with missing CustomerID. The column may already have been cast
# with `.astype(str)`, which stores missing ids as the literal text 'nan', so
# that text is treated as missing as well (dropna alone would keep those rows).
df = df[df['CustomerID'].notna() & df['CustomerID'].astype(str).str.lower().ne('nan')]

# Remove cancelled orders (InvoiceNo starting with 'C'). Take an explicit copy
# so the column assignments below write to an independent frame rather than to
# a slice of the previous one (avoids SettingWithCopyWarning).
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')].copy()

# Convert InvoiceNo and CustomerID to string
df['InvoiceNo'] = df['InvoiceNo'].astype(str)
df['CustomerID'] = df['CustomerID'].astype(str)

# Add TotalPrice column (line total = quantity x unit price)
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']
df

In [None]:
pip install pandas numpy sentence-transformers pinecone-client scikit-learn tqdm

In [None]:
# ===== STEP 1: Load Data =====
import pandas as pd

# Read the transactions, normalise descriptions (lower-case, trimmed) and keep
# only the rows that have both a description and a customer id.
df = (
    pd.read_csv("data.csv", encoding='ISO-8859-1', parse_dates=["InvoiceDate"])
    .assign(Description=lambda frame: frame['Description'].str.lower().str.strip())
    .dropna(subset=['Description', 'CustomerID'])
)

# ===== STEP 2: Create Product Metadata =====
# One row per StockCode: first description seen, mean unit price, total quantity.
product_metadata = (
    df.groupby('StockCode')
    .agg(
        Description=('Description', 'first'),
        UnitPrice=('UnitPrice', 'mean'),
        Quantity=('Quantity', 'sum'),
    )
    .reset_index()
)
# String form of the stock code, used as the vector id when uploading.
product_metadata['product_id'] = product_metadata['StockCode'].astype(str)

# ===== STEP 3: Generate Embeddings =====
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every product description in a single call, then store each vector
# as a plain Python list in the 'embedding' column.
description_vectors = model.encode(
    product_metadata['Description'].tolist(),
    show_progress_bar=True,
)
product_metadata['embedding'] = description_vectors.tolist()

# ===== STEP 4: Pinecone Setup =====
from pinecone import Pinecone, ServerlessSpec
import os
import time

# SECURITY: the API key is read from the environment instead of being embedded
# in the notebook. Any key that was previously committed here should be revoked
# and rotated. Raises KeyError if PINECONE_API_KEY is not set.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index_name = "product-recs"

# Rebuild from scratch: drop an existing index with the same name first.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

pc.create_index(
    name=index_name,
    dimension=384,  # must equal the length of the vectors upserted below
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

# Wait until the new index reports ready before connecting to it, so the
# upserts that follow do not race index creation.
while not pc.describe_index(index_name).status["ready"]:
    time.sleep(1)

index = pc.Index(index_name)

# ===== STEP 5: Upload Embeddings =====
from tqdm import tqdm

# Build one record per product: id, embedding values and a small metadata dict.
vectors = [
    {
        "id": product_id,
        "values": embedding,
        "metadata": {
            "description": description,
            "price": float(unit_price),
            "sales": int(quantity),
        },
    }
    for product_id, embedding, description, unit_price, quantity in zip(
        product_metadata["product_id"],
        product_metadata["embedding"],
        product_metadata["Description"],
        product_metadata["UnitPrice"],
        product_metadata["Quantity"],
    )
]

print(f"Uploading {len(vectors)} products...")
# Send the records to the index in chunks of 100.
for start in tqdm(range(0, len(vectors), 100)):
    index.upsert(vectors=vectors[start:start + 100])

print("\nUpload complete! Index stats:")
print(index.describe_index_stats())

In [None]:
# 1. Verify vectors are properly formatted: show the first record's id and the
#    first three components of its embedding.
print("First vector sample:", vectors[0]["id"], "...", vectors[0]["values"][:3])

# 2. Alternative upload method (more reliable)
from itertools import islice
def batch_upsert(all_vectors, batch_size=100):
    """Yield successive lists of at most ``batch_size`` items from ``all_vectors``.

    Despite the name, nothing is uploaded here: the caller is expected to
    upsert each yielded batch.

    Args:
        all_vectors: Any iterable of records (a list, or a one-shot iterator).
        batch_size: Maximum number of items per batch; must be at least 1.

    Yields:
        Lists of consecutive items; only the last one may be shorter than
        ``batch_size``. Nothing is yielded for an empty input.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when iteration
            starts, because this is a generator).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Consume one shared iterator. Re-slicing from the start of the sequence
    # for every batch would re-walk all earlier items each time.
    remaining = iter(all_vectors)
    while True:
        batch = list(islice(remaining, batch_size))
        if not batch:
            return
        yield batch

print(f"\nRe-attempting upload for {len(vectors)} vectors...")
# Ceiling division gives the exact number of batches of 100. The previous
# `len(vectors)//100 + 1` over-counted by one whenever the vector count was a
# multiple of 100, so the progress bar never reached its total.
for batch in tqdm(batch_upsert(vectors), total=(len(vectors) + 99) // 100):
    index.upsert(vectors=batch)

# 3. Pause before reading stats. This does not force a refresh; it only gives
#    the index time to reflect the writes above.
import time
time.sleep(10)  # Allow index to update

# 4. Verify with deeper check: compare the reported vector count with what was sent
stats = index.describe_index_stats()
if stats['total_vector_count'] == len(vectors):
    print("✓ Upload verified!")
    print(f"Vectors in index: {stats['total_vector_count']}")
else:
    print("⚠️ Upload problem detected! Trying diagnostic:")
    print(f"Expected: {len(vectors)} | Actual: {stats['total_vector_count']}")
    print("Checking first 3 vectors in index...")
    sample_ids = [v["id"] for v in vectors[:3]]
    print("Fetch result:", index.fetch(ids=sample_ids))

In [None]:
import numpy as np

In [None]:
# 1. Create user purchase history (one row per customer, list-valued columns)
user_history = df.groupby('CustomerID').agg({
    'StockCode': list,
    'Description': list,
    'Quantity': list
}).reset_index()

# Lookup tables built once. The earlier version scanned all of
# product_metadata for every purchased item and re-filtered the whole
# transaction frame for every customer, which is quadratic in the data size.
embedding_by_stock_code = dict(zip(product_metadata['StockCode'], product_metadata['embedding']))
first_purchase_by_customer = df.groupby('CustomerID')['InvoiceDate'].min().to_dict()

# 2. Generate user embeddings (average of purchased product embeddings).
#    A product bought several times contributes once per purchase row.
user_embeddings = []
for customer_id, purchased_products in zip(user_history['CustomerID'], user_history['StockCode']):
    embeddings = [
        embedding_by_stock_code[stock_code]
        for stock_code in purchased_products
        if stock_code in embedding_by_stock_code
    ]

    # Customers with no embeddable purchase get no profile vector.
    if embeddings:
        user_embeddings.append({
            "id": str(customer_id),
            "values": np.mean(embeddings, axis=0).tolist(),
            "metadata": {
                "purchase_count": len(purchased_products),
                # Still a Timestamp here; it has to be converted to a string
                # before the record is upserted.
                "first_purchase": first_purchase_by_customer[customer_id]
            }
        })

# 3. Show sample user embedding
print(f"\nGenerated {len(user_embeddings)} user embeddings")
print("Sample user embedding:", user_embeddings[0]['values'][:5], "...")

In [None]:
# Sanity check: number of customers with a purchase history, and the first
# three stock codes recorded for the first customer.
print("Total users:", len(user_history))
print("Sample purchase history:", user_history.iloc[0]['StockCode'][:3])

In [None]:
# Length of the first user vector (the indexes in this notebook are created with dimension=384).
print("Embedding shape:", len(user_embeddings[0]['values']))

In [None]:
import time

# 1. Convert timestamps to strings in user embeddings, so the metadata holds
#    plain strings rather than Timestamp objects. str() is a no-op on values
#    that are already strings, so re-running this cell is safe.
for embedding in user_embeddings:
    embedding['metadata']['first_purchase'] = str(embedding['metadata']['first_purchase'])

# 2. Create user index (dropping any previous index of the same name)
user_index_name = "user-profiles"
if user_index_name in pc.list_indexes().names():
    pc.delete_index(user_index_name)

pc.create_index(
    name=user_index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

# Wait until the new index reports ready so the first upsert does not race
# index creation.
while not pc.describe_index(user_index_name).status["ready"]:
    time.sleep(1)

user_index = pc.Index(user_index_name)

# 3. Upload with error handling
print(f"Uploading {len(user_embeddings)} user profiles...")
success_count = 0
for i in tqdm(range(0, len(user_embeddings), 100)):
    batch = user_embeddings[i:i+100]
    try:
        user_index.upsert(vectors=batch)
        success_count += len(batch)
    except Exception as e:
        print(f"Error in batch {i//100}: {str(e)}")
        # Fallback: Upload vectors individually so one bad record does not
        # discard the other records of its batch.
        for vector in batch:
            try:
                user_index.upsert(vectors=[vector])
                success_count += 1
            except Exception as vector_error:
                # Catch Exception, not everything, so Ctrl-C still interrupts
                # the cell, and report why the record was rejected.
                print(f"Failed to upload user {vector['id']}: {vector_error}")

# 4. Verify
stats = user_index.describe_index_stats()
print(f"\nSuccessfully uploaded {success_count}/{len(user_embeddings)} users")
print("User index stats:", stats)

In [None]:
# Check embedding format: top-level keys and the Python type of each metadata value
first_profile = user_embeddings[0]
print("Sample user embedding keys:", first_profile.keys())
print("Metadata types:", {field: type(value) for field, value in first_profile['metadata'].items()})

# Verify all timestamps are strings (spot-check the first three profiles)
print("\nTimestamp samples:")
for profile in user_embeddings[:3]:
    first_purchase = profile['metadata']['first_purchase']
    print(f"User {profile['id']}: {first_purchase} ({type(first_purchase)})")

In [None]:
# Imported here so this cell does not depend on another cell having run first.
import time

# 1. Clean existing index (if any)
user_index_name = "user-profiles"
if user_index_name in pc.list_indexes().names():
    pc.delete_index(user_index_name)

# 2. Create fresh index
pc.create_index(
    name=user_index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

# Wait until the new index reports ready so the first upsert does not race
# index creation.
while not pc.describe_index(user_index_name).status["ready"]:
    time.sleep(1)

user_index = pc.Index(user_index_name)

# 3. Prepare batches with string conversion
batch_size = 50  # Smaller batches for reliability
print(f"Preparing {len(user_embeddings)} users in batches of {batch_size}...")

# 4. Upload with progress tracking
success_count = 0
for i in tqdm(range(0, len(user_embeddings), batch_size)):
    batch = user_embeddings[i:i + batch_size]

    # Ensure all metadata values are strings
    processed_batch = []
    for vec in batch:
        processed_batch.append({
            "id": vec["id"],
            "values": vec["values"],
            "metadata": {
                "purchases": str(vec["metadata"]["purchase_count"]),
                # str() makes this cell work whether or not the timestamps
                # were already converted; it is a no-op on strings.
                "first_purchase": str(vec["metadata"]["first_purchase"])
            }
        })

    # Upsert; a failed batch is reported and skipped (it is not retried)
    try:
        user_index.upsert(vectors=processed_batch)
        success_count += len(processed_batch)
    except Exception as e:
        print(f"Batch {i//batch_size} failed: {str(e)}")

# 5. Final verification
time.sleep(15)  # Allow index to stabilize
stats = user_index.describe_index_stats()
print("\nFinal Results:")
print(f"• Users uploaded: {success_count}/{len(user_embeddings)}")
print(f"• Index vector count: {stats['total_vector_count']}")
print(f"• Dimensions: {stats['dimension']}")

# 6. Sample query verification
test_user = user_embeddings[0]["id"]
print(f"\nTest query for user {test_user}:")
print(user_index.fetch(ids=[test_user]))

In [None]:
def get_personalized_recommendations(user_id, top_k=5, filter_owned=True):
    """
    Get top product recommendations for a specific user.

    The user's stored profile vector is fetched from ``user_index`` and used
    to query the product ``index`` for the nearest products.

    Args:
        user_id: Customer ID (string or numeric). It is matched by its
            ``str()`` form, both against the user-profile vector ids and
            against ``user_history['CustomerID']``.
        top_k: Number of recommendations to return.
        filter_owned: Exclude products the user already purchased.

    Returns:
        A list of at most ``top_k`` dicts with the keys ``product_id``,
        ``description``, ``price`` and ``relevance_score``, in the order the
        index returned them. On any error the message is printed and an empty
        list is returned.
    """
    # Conservative ceiling on matches requested in one query.
    # TODO confirm against the Pinecone top_k limit for the plan in use.
    max_query_top_k = 1000

    try:
        user_key = str(user_id)

        # Get user's embedding
        user_vector = user_index.fetch(ids=[user_key]).vectors[user_key].values

        # Get user's purchase history as product-id strings. A user with a
        # profile vector but no history row simply has nothing to exclude.
        purchased = set()
        if filter_owned:
            history = user_history.loc[
                user_history['CustomerID'].astype(str) == user_key, 'StockCode'
            ]
            if not history.empty:
                purchased = {str(stock_code) for stock_code in history.iloc[0]}

        # Over-fetch so that top_k results can remain after owned products
        # are removed. Heavy buyers may still get fewer than top_k results
        # because of the ceiling.
        fetch_k = min(top_k + len(purchased), max_query_top_k)

        # Query similar products. Owned products are removed client-side by
        # vector id: the uploaded product metadata has no 'product_id' field,
        # so a server-side metadata filter on it cannot be relied on.
        results = index.query(
            vector=user_vector,
            top_k=fetch_k,
            include_metadata=True
        )

        recommendations = [{
            'product_id': match.id,
            'description': match.metadata['description'],
            'price': match.metadata['price'],
            'relevance_score': match.score
        } for match in results.matches if match.id not in purchased]

        return recommendations[:top_k]

    except Exception as e:
        print(f"Recommendation error: {e}")
        return []

# Test with a sample user: the customer in row 10 of user_history. The name
# sample_user is reused by later cells.
sample_user = user_history.iloc[10]['CustomerID']
print(f"\nRecommendations for user {sample_user}:")
for rec in get_personalized_recommendations(sample_user):
    print(f"- {rec['description']} (${rec['price']:.2f}, score: {rec['relevance_score']:.3f})")

In [None]:
def get_similar_products(product_id, top_k=5):
    """Return the products closest to ``product_id`` in the product index.

    The index is queried by the id of the stored vector. One extra match is
    requested because the product itself can appear among its own neighbours;
    it is dropped from the result.

    Returns a list of at most ``top_k`` dicts with the keys ``product_id``,
    ``description``, ``price`` and ``similarity_score``. On any error the
    message is printed and an empty list is returned.
    """
    try:
        response = index.query(
            id=str(product_id),
            top_k=top_k + 1,  # room for the product itself
            include_metadata=True
        )

        neighbours = []
        for hit in response.matches:
            if hit.id == str(product_id):
                continue  # skip the queried product
            neighbours.append({
                'product_id': hit.id,
                'description': hit.metadata['description'],
                'price': hit.metadata['price'],
                'similarity_score': hit.score
            })
        return neighbours[:top_k]

    except Exception as e:
        print(f"Similar products error: {e}")
        return []

# Test with a sample product: the stock code in row 20 of product_metadata.
# The name sample_product is reused by later cells.
sample_product = product_metadata.iloc[20]['StockCode']
print(f"\nProducts similar to {sample_product}:")
for sim in get_similar_products(sample_product):
    print(f"- {sim['description']} (${sim['price']:.2f}, similarity: {sim['similarity_score']:.3f})")

In [None]:
def get_hybrid_recommendations(user_id, viewed_product=None, top_k=5):
    """
    Recommend products from a blend of:
    - the user's general preferences (their stored profile vector)
    - similarity to the currently viewed product, when one is given

    Args:
        user_id: Customer ID; looked up in ``user_index`` by its ``str()`` form.
        viewed_product: Optional product id; looked up in ``index`` by its
            ``str()`` form. When falsy, the user vector alone is used.
        top_k: Number of matches to request.

    Returns:
        A list of dicts with the keys ``product_id``, ``description``,
        ``price`` and ``match_score``. On any error the message is printed
        and an empty list is returned.
    """
    try:
        # Get user embedding. It is converted to an array up front so both
        # branches below end with the same type; previously the no-product
        # branch kept a plain list and the later .tolist() call failed.
        user_key = str(user_id)
        hybrid_vector = np.asarray(
            user_index.fetch(ids=[user_key]).vectors[user_key].values, dtype=float
        )

        # Average with the viewed product's embedding if provided
        if viewed_product:
            product_key = str(viewed_product)
            product_vector = np.asarray(
                index.fetch(ids=[product_key]).vectors[product_key].values, dtype=float
            )
            hybrid_vector = (hybrid_vector + product_vector) / 2

        # Query
        results = index.query(
            vector=hybrid_vector.tolist(),
            top_k=top_k,
            include_metadata=True
        )

        return [{
            'product_id': match.id,
            'description': match.metadata['description'],
            'price': match.metadata['price'],
            'match_score': match.score
        } for match in results.matches]

    except Exception as e:
        print(f"Hybrid recommendation error: {e}")
        return []

# Test hybrid approach, reusing sample_user and sample_product from the
# earlier demo cells.
print("\nHybrid recommendations (user + product context):")
for rec in get_hybrid_recommendations(sample_user, sample_product):
    print(f"- {rec['description']} (score: {rec['match_score']:.3f})")

In [None]:
def evaluate_recommendations(test_users, k=5, sample_size=100, random_state=None):
    """Measure precision@k, recall@k and catalogue coverage on a sample of users.

    Args:
        test_users: DataFrame of test transactions with ``CustomerID`` and
            ``StockCode`` columns. Ground truth is read from this frame only.
        k: Number of recommendations requested per user.
        sample_size: Maximum number of distinct customers to evaluate; fewer
            are used when the frame contains fewer customers.
        random_state: Optional seed for the customer sample.

    Returns:
        Dict with ``precision@k``, ``recall@k``, ``coverage`` (share of
        ``product_metadata`` rows that were recommended at least once) and
        ``test_users`` (number of customers actually evaluated).
    """
    hits = 0
    total_relevant = 0
    evaluated_users = 0
    recommended_products = set()

    # Sample distinct customers. Sampling transaction rows would evaluate
    # frequent buyers several times and would fail on small frames.
    unique_users = test_users['CustomerID'].dropna().drop_duplicates()
    if len(unique_users) > sample_size:
        unique_users = unique_users.sample(n=sample_size, random_state=random_state)

    for user_id in unique_users:
        # Get ground truth from the frame that was passed in
        actual_purchases = set(
            test_users.loc[test_users['CustomerID'] == user_id, 'StockCode'].astype(str)
        )

        if not actual_purchases:
            continue

        # Get recommendations. Owned products must NOT be filtered here: the
        # ground truth consists of products the user bought, so filtering
        # them out would make every hit impossible.
        recs = get_personalized_recommendations(user_id, top_k=k, filter_owned=False)
        recommended_ids = {r['product_id'] for r in recs}
        recommended_products.update(recommended_ids)

        # Calculate hits
        hits += len(actual_purchases & recommended_ids)
        total_relevant += len(actual_purchases)
        evaluated_users += 1

    # Precision is averaged over the users actually evaluated, not over the
    # number of transaction rows in the frame.
    precision = hits / (k * evaluated_users) if evaluated_users and k else 0
    recall = hits / total_relevant if total_relevant > 0 else 0
    coverage = len(recommended_products) / len(product_metadata) if len(product_metadata) else 0

    return {
        "precision@k": precision,
        "recall@k": recall,
        "coverage": coverage,
        "test_users": evaluated_users
    }

# Run evaluation
# A fixed seed makes the 20% sample, and therefore the reported metrics,
# reproducible across runs.
# NOTE(review): the profiles queried by the recommender appear to be built from
# the full frame, so this sample is not a true hold-out set -- confirm before
# quoting these numbers.
test_df = df.sample(frac=0.2, random_state=42)  # 20% test set
metrics = evaluate_recommendations(test_df)
print("\nEvaluation Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"- {metric_name}: {metric_value:.3f}")

In [None]:
def recommend_for_user(user_id, strategy="personalized", context_product=None, top_k=5):
    """
    Single entry point that dispatches to one of the recommenders.

    Strategies:
        "personalized" - recommendations from the user's profile
        "similar"      - products similar to ``context_product`` (required)
        "hybrid"       - user profile blended with ``context_product``

    Raises:
        ValueError: for an unknown strategy, or for "similar" without a
            context product.
    """
    if strategy == "personalized":
        return get_personalized_recommendations(user_id, top_k)

    if strategy == "hybrid":
        return get_hybrid_recommendations(user_id, context_product, top_k)

    if strategy == "similar" and context_product:
        return get_similar_products(context_product, top_k)

    raise ValueError("Invalid strategy or missing context product")

# Example usage
# The recommenders return an empty list when they fail, so the first result is
# only indexed when there is one; a bare `[0]` would raise IndexError.
print("\nUnified API examples:")
for label, call_args in (
    ("1. Personalized:", (sample_user, "personalized")),
    ("2. Similar products:", (None, "similar", sample_product)),
    ("3. Hybrid:", (sample_user, "hybrid", sample_product)),
):
    example_recs = recommend_for_user(*call_args)
    print(label, example_recs[0] if example_recs else "no recommendations returned")

In [None]:
import os

from pinecone import Pinecone

# SECURITY: the API key is read from the environment instead of being embedded
# in the notebook. Any key that was previously committed here should be revoked
# and rotated. Raises KeyError if PINECONE_API_KEY is not set.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Initialize indexes
product_index = pc.Index("product-recs")
user_index = pc.Index("user-profiles")

# Get 3 random product IDs from your metadata
sample_product_ids = product_metadata['StockCode'].sample(3).astype(str).tolist()

# Fetch product vectors
product_response = product_index.fetch(ids=sample_product_ids)

print("=== PRODUCT VECTORS ===")
for pid, vector in product_response.vectors.items():
    print(f"\nProduct ID: {pid}")
    print(f"Description: {vector.metadata['description']}")
    print(f"Price: ${vector.metadata['price']:.2f}")
    print(f"Vector (first 5 dims): {vector.values[:5]}")

In [None]:
# Pick three customers at random and use their string ids as vector ids
sample_user_ids = user_history['CustomerID'].sample(3).astype(str).tolist()

# Look the profiles up in the user index
user_response = user_index.fetch(ids=sample_user_ids)

print("\n=== USER VECTORS ===")
for uid, vector in user_response.vectors.items():
    profile_meta = vector.metadata
    # One print call; the joined lines match four separate print statements.
    print("\n".join([
        f"\nUser ID: {uid}",
        f"First Purchase: {profile_meta['first_purchase']}",
        f"Purchase Count: {profile_meta['purchases']}",
        f"Vector (first 5 dims): {vector.values[:5]}",
    ]))