# Vogue Archive Data Processing

This notebook loads Vogue Runway metadata together with pre-computed CLIP image embeddings and uploads them to a Pinecone index for semantic (text-to-image) search.

**Run this in Google Colab for free GPU access**

Runtime: ~20 minutes for 10k records (the notebook defaults to `NUM_ITEMS = 1000`; change it in section 4 to process more)

## 1. Install Dependencies

In [None]:
!pip install sentence-transformers pinecone pandas tqdm pyarrow torch transformers ftfy regex

## 2. Setup Pinecone

In [None]:
import os
from getpass import getpass

from pinecone import Pinecone, ServerlessSpec

# SECURITY: the API key must never be stored in the notebook. It is read from
# the PINECONE_API_KEY environment variable, falling back to an interactive
# prompt (input is not echoed and is not saved with the notebook outputs).
# NOTE(review): an earlier revision of this cell embedded a live key in source;
# that key should be treated as leaked and revoked/rotated in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "No Pinecone API key provided. Set the PINECONE_API_KEY environment "
        "variable or enter the key when prompted."
    )

INDEX_NAME = "vogue-archive-clip"  # New index name for CLIP embeddings

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create index if it doesn't exist
# CLIP uses 512 dimensions (vs 384 for MiniLM)
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=512,  # CLIP ViT-B/32 dimension
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Handle used by the later cells to upsert and query vectors.
index = pc.Index(INDEX_NAME)
print(f"Index '{INDEX_NAME}' ready!")

## 3. Download Pre-computed Image Embeddings

The dataset includes pre-computed CLIP image embeddings - we'll use these!

In [None]:
import numpy as np
import requests

# Define the base URL for the archive
ARCHIVE_BASE_URL = "https://archive.org/download/VogueRunway_dataset"

print("Downloading pre-computed CLIP image embeddings...")
embeddings_url = f"{ARCHIVE_BASE_URL}/img_emb/VogueRunway_image.npy"

# Stream the file to disk so it never has to fit in memory at once.
# raise_for_status() makes an HTTP error fail here, instead of silently writing
# an error page into the .npy file and failing later inside np.load().
# The timeout bounds how long a stalled connection can hang the cell.
with requests.get(embeddings_url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open('VogueRunway_image.npy', 'wb') as f:
        for chunk in response.iter_content(chunk_size=1 << 20):
            f.write(chunk)

# Load the embeddings
image_embeddings = np.load('VogueRunway_image.npy')
if image_embeddings.ndim != 2:
    raise ValueError(
        f"Expected a 2-D embeddings array, got shape {image_embeddings.shape}"
    )
print(f"✓ Loaded {len(image_embeddings):,} pre-computed CLIP image embeddings")
print(f"Embedding dimension: {image_embeddings.shape[1]}")

## 4. Load Vogue Runway Metadata

Download metadata and match with embeddings

In [None]:
import json
import pandas as pd

print("Downloading Vogue Runway metadata...")
url = f"{ARCHIVE_BASE_URL}/VogueRunway.parquet"

# Download parquet file.
# raise_for_status() stops the cell on an HTTP error instead of saving an error
# page that pd.read_parquet() would later reject with a confusing message.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open('VogueRunway.parquet', 'wb') as f:
        for chunk in response.iter_content(chunk_size=1 << 20):
            f.write(chunk)

print("Loading metadata...")
df = pd.read_parquet('VogueRunway.parquet')

print(f"Total items: {len(df):,}")
print(f"Total embeddings: {len(image_embeddings):,}")

# Take top items by aesthetic score (you can change 1000 to 10000 or 100000)
NUM_ITEMS = 1000

if 'aesthetic' in df.columns:
    df = df.nlargest(NUM_ITEMS, 'aesthetic')
    print(f"\nSelected top {NUM_ITEMS} items by aesthetic score")
else:
    df = df.head(NUM_ITEMS)
    print(f"\nTaking first {NUM_ITEMS} items")

# Reset index to get clean indices
df = df.reset_index(drop=True)

# Preview only the columns that are actually present, so a missing optional
# column does not abort the cell with a KeyError.
sample_columns = [
    column
    for column in ['key', 'designer', 'season', 'year', 'category', 'city']
    if column in df.columns
]
print(f"\nSample data:")
print(df[sample_columns].head())

## 5. Match Embeddings with Metadata and Upload to Pinecone

Use pre-computed image embeddings for multimodal search

In [None]:
from tqdm import tqdm

# Batch size for uploading: number of rows sliced off per upload-loop iteration
# and therefore the number of vectors passed to a single process_batch() call.
BATCH_SIZE = 100

def _metadata_text(value):
    """Return ``value`` as a string, mapping missing values (None/NaN/NA) to ''."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def process_batch(batch_df, batch_embeddings):
    """Build Pinecone vectors for one batch of rows and upsert them.

    Args:
        batch_df: DataFrame slice holding the metadata rows of this batch. Must
            contain a 'key' column; the other metadata columns are optional.
        batch_embeddings: Sequence of embeddings aligned *positionally* with
            ``batch_df`` (row ``n`` of the frame pairs with element ``n``).
            Each element must provide ``.tolist()`` (e.g. a NumPy row).

    Returns:
        The number of vectors sent to the index (0 for an empty batch).

    Raises:
        ValueError: If ``batch_df`` and ``batch_embeddings`` differ in length.
    """
    if len(batch_df) != len(batch_embeddings):
        raise ValueError(
            f"Batch size mismatch: {len(batch_df)} rows vs "
            f"{len(batch_embeddings)} embeddings"
        )

    vectors = []

    # Pair rows with embeddings by position. The DataFrame index label must NOT
    # be used here: the caller passes a slice of the full frame, so its labels
    # keep counting (100, 101, ...) while batch_embeddings restarts at 0.
    for position, (_, row) in enumerate(batch_df.iterrows()):
        # Get embedding (already computed from images!)
        embedding = batch_embeddings[position].tolist()

        # Normalise text fields once so missing values become '' rather than
        # the literal string 'nan'.
        designer = _metadata_text(row.get('designer', ''))
        season = _metadata_text(row.get('season', ''))
        year_text = _metadata_text(row.get('year', ''))
        category = _metadata_text(row.get('category', ''))
        city = _metadata_text(row.get('city', ''))
        section = _metadata_text(row.get('section', ''))

        # Prepare metadata
        metadata = {
            "description": f"{designer} {season} {year_text} {category} {section}".strip(),
            "designer": designer,
            "season": season,
            "year": int(row.get('year', 0)) if pd.notna(row.get('year')) else 0,
            "category": category,
            "city": city,
            "section": section,
            # A missing URL is stored as '' because a raw None/NaN is presumably
            # rejected as a metadata value — confirm against the Pinecone docs.
            "image_url": _metadata_text(row.get('url', '')),
            "aesthetic_score": float(row.get('aesthetic', 0)) if pd.notna(row.get('aesthetic')) else 0,
        }

        vectors.append({
            "id": f"vogue_runway_{row['key']}",
            "values": embedding,
            "metadata": metadata
        })

    # Nothing to send for an empty batch; skip the upsert call.
    if not vectors:
        return 0

    # Upload to Pinecone
    index.upsert(vectors=vectors)
    return len(vectors)

# Collect one embedding per selected row, in the same order as df.
# Each row's 'key' is used directly as a position in image_embeddings
# (assumes keys are integer offsets into that array — TODO confirm).
selected_embeddings = np.array([image_embeddings[key] for key in df['key']])

print(f"\nProcessing {len(df)} items with pre-computed embeddings in batches of {BATCH_SIZE}...")
total_uploaded = 0

# Walk the frame in BATCH_SIZE-sized windows, slicing rows and embeddings with
# the same bounds so the two stay aligned.
for start in tqdm(range(0, len(df), BATCH_SIZE)):
    stop = start + BATCH_SIZE
    total_uploaded += process_batch(
        df.iloc[start:stop],
        selected_embeddings[start:stop],
    )

print(f"\n✓ Upload complete! {total_uploaded} vectors uploaded to Pinecone.")
print(f"\nIndex stats: {index.describe_index_stats()}")

## 6. Test Multimodal Search

Test text queries against image embeddings - this is CLIP's superpower!

In [None]:
from sentence_transformers import SentenceTransformer

# CLIP model used to embed the text queries before searching the index
model = SentenceTransformer('clip-ViT-B-32')

# Sample text queries to run against the uploaded image embeddings
test_queries = [
    "elegant evening gown",
    "minimalist black dress",
    "tweed jacket",
    "vintage cocktail dress"
]

separator = '=' * 60

for query in test_queries:
    print(f"\n{separator}")
    print(f"Query: '{query}'")
    print(f"{separator}")

    # Embed the query text and fetch the three nearest vectors with metadata
    query_embedding = model.encode(query).tolist()
    results = index.query(vector=query_embedding, top_k=3, include_metadata=True)

    rank = 0
    for match in results['matches']:
        rank += 1
        meta = match['metadata']
        print(f"\n{rank}. Score: {match['score']:.3f}")
        print(f"   Designer: {meta['designer']}")
        print(f"   {meta['season']} {meta['year']}")
        print(f"   Category: {meta['category']}")
        print(f"   City: {meta['city']}")
        image_url = meta.get('image_url')
        if image_url:
            # Only the first 80 characters of the URL are shown
            print(f"   Image: {image_url[:80]}...")

## Done! Multimodal Search Ready

Your Vogue archive now uses **image embeddings** in the database.

When users search with text, CLIP matches:
- Text query → Image embeddings
- This finds visually similar runway looks based on semantic understanding

**Benefits:**

- ✓ Faster processing (no embedding generation needed)
- ✓ Better visual understanding (searches actual image features)
- ✓ True multimodal CLIP search (text-to-image matching)

Next steps:
1. Deploy the API (see ../api/)
2. Your React Native app is already configured!