# Vogue Archive Data Processing

This notebook processes Vogue magazine data and creates vector embeddings for semantic search.

**Run this in Google Colab for free GPU access**

Runtime: ~20 minutes for 10k records. Note that the default data-loading cell below (Option B) keeps only the top 1,000 items.

## 1. Install Dependencies

In [None]:
!pip install sentence-transformers pinecone pandas tqdm pyarrow

## 2. Setup Pinecone

In [None]:
import os
import time
from getpass import getpass

from pinecone import Pinecone, ServerlessSpec

# The API key must never be committed with the notebook. It is read from the
# PINECONE_API_KEY environment variable, falling back to an interactive prompt
# whose input is not echoed or stored in the notebook output.
# NOTE(review): a key was previously hardcoded in this cell; treat it as leaked
# and revoke/rotate it in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
INDEX_NAME = "vogue-archive"
EMBEDDING_DIM = 384  # all-MiniLM-L6-v2 dimension

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create index if it doesn't exist
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIM,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    # A freshly created index is not usable immediately; poll until Pinecone
    # reports it ready so later upserts do not fail.
    while not pc.describe_index(INDEX_NAME).status["ready"]:
        time.sleep(1)

index = pc.Index(INDEX_NAME)
print(f"Index '{INDEX_NAME}' ready!")

## 3. Load Embedding Model

In [None]:
from sentence_transformers import SentenceTransformer

# all-MiniLM-L6-v2 is a lightweight model that works on free Colab.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Report the vector size the model produces.
embedding_dimension = model.get_sentence_embedding_dimension()
print(f"Model loaded. Embedding dimension: {embedding_dimension}")

## 4. Load Vogue Runway Data

Two options:

**Option A: Use the download script (Recommended)**
1. Run `download_vogue_data.py` on your computer first
2. Upload the generated `vogue_runway_prepared.json` file to Colab

**Option B: Download directly in Colab (shown below)**

In [None]:
import json
import os

import pandas as pd

# Option A: Upload the prepared JSON file from download_vogue_data.py
# Uncomment and run this if you have the file:
# from google.colab import files
# uploaded = files.upload()  # Upload vogue_runway_prepared.json
# with open('vogue_runway_prepared.json', 'r') as f:
#     vogue_items = json.load(f)

# Option B: Download directly in Colab (small subset)
import requests

url = "https://archive.org/download/VogueRunway_dataset/VogueRunway.parquet"
parquet_path = 'VogueRunway.parquet'
MAX_ITEMS = 1000  # number of rows kept for embedding

# Skip the download when a complete copy is already on disk, so re-running the
# cell is cheap.
if not os.path.exists(parquet_path):
    print("Downloading Vogue Runway metadata...")
    # Stream into a temporary file and rename only on success, so an
    # interrupted download never leaves a truncated parquet file behind.
    partial_path = parquet_path + '.part'
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of saving an error page as data.
        response.raise_for_status()
        with open(partial_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    os.replace(partial_path, parquet_path)

print("Loading metadata...")
df = pd.read_parquet(parquet_path)

print(f"Total items: {len(df):,}")
print(f"Columns: {df.columns.tolist()}")

# Take the top items by aesthetic score when that column exists; otherwise
# fall back to the first rows of the file.
if 'aesthetic' in df.columns:
    df = df.nlargest(MAX_ITEMS, 'aesthetic')
else:
    df = df.head(MAX_ITEMS)

# Convert to format for embedding

def _text_or_empty(value):
    """Return ``str(value)``, or ``""`` when the value is missing (None/NaN).

    Plain ``str()`` on a missing value would store the literal text "nan" or
    "None" in the metadata.
    Assumes ``value`` is a scalar -- TODO confirm for every column used here.
    """
    return str(value) if pd.notna(value) else ""


def _build_vogue_item(row):
    """Turn one metadata row into the ``{id, description, metadata}`` record.

    Args:
        row: a row of the Vogue Runway frame (as yielded by ``df.iterrows()``).
            The ``key`` column is required; every other column is optional.

    Returns:
        dict with the vector id, the text to embed and the metadata fields.
    """
    designer = _text_or_empty(row.get('designer'))
    season = _text_or_empty(row.get('season'))
    category = _text_or_empty(row.get('category'))
    section = _text_or_empty(row.get('section'))
    city = _text_or_empty(row.get('city'))

    raw_year = row.get('year')
    has_year = pd.notna(raw_year)
    # int() also normalizes float years (2019.0 -> 2019) so the description
    # text never contains a trailing ".0".
    year = int(raw_year) if has_year else 0

    raw_score = row.get('aesthetic')
    aesthetic_score = float(raw_score) if pd.notna(raw_score) else 0.0

    # Create description from whichever metadata fields are present
    desc_parts = []
    if designer:
        desc_parts.append(designer)
    if season and has_year:
        desc_parts.append(f"{season} {year}")
    if category:
        desc_parts.append(category)
    if section:
        desc_parts.append(section)
    if city:
        desc_parts.append(f"from {city} Fashion Week")

    return {
        "id": f"vogue_runway_{row['key']}",
        "description": " ".join(desc_parts),
        "metadata": {
            "designer": designer,
            "season": season,
            "year": year,
            "category": category,
            "city": city,
            "section": section,
            # A missing URL becomes "" rather than NaN, which is not a valid
            # JSON / metadata value.
            "image_url": _text_or_empty(row.get('url')),
            "aesthetic_score": aesthetic_score,
        }
    }


vogue_items = [_build_vogue_item(row) for _, row in df.iterrows()]

print(f"\nPrepared {len(vogue_items)} items for embedding")
# Only show a sample when there is one; an empty frame would otherwise raise.
if vogue_items:
    print(f"\nSample item:")
    print(json.dumps(vogue_items[0], indent=2))

## 5. Create Embeddings and Upload to Pinecone

In [None]:
from tqdm import tqdm

# Number of items handed to process_batch() at a time; each batch is sent to
# Pinecone in a single upsert call.
BATCH_SIZE = 100

def process_batch(batch_items):
    """Embed a batch of Vogue items and upsert them to Pinecone.

    Uses the notebook-level ``model`` (embedding model) and ``index``
    (Pinecone index) objects.

    Args:
        batch_items: sequence of dicts with the keys ``id``, ``description``
            and ``metadata`` (the latter holding designer, season, year,
            category, city, section, image_url and aesthetic_score).

    Returns:
        Number of vectors uploaded (0 for an empty batch, which is skipped).
    """
    batch_items = list(batch_items)
    if not batch_items:
        # Nothing to embed or upload; avoid sending an empty upsert request.
        return 0

    # Encode all descriptions in one call instead of one call per item, so the
    # model can process them together.
    texts = [item['description'] for item in batch_items]
    embeddings = model.encode(texts).tolist()

    # Metadata fields copied to Pinecone, in addition to the description.
    metadata_fields = (
        "designer",
        "season",
        "year",
        "category",
        "city",
        "section",
        "image_url",
        "aesthetic_score",
    )

    vectors = []
    for item, embedding in zip(batch_items, embeddings):
        # Prepare metadata for Pinecone
        metadata = {"description": item['description']}
        for field in metadata_fields:
            metadata[field] = item['metadata'][field]

        vectors.append({
            "id": item['id'],
            "values": embedding,
            "metadata": metadata
        })

    # Upload to Pinecone
    index.upsert(vectors=vectors)
    return len(vectors)

# Process all items in batches of BATCH_SIZE, tracking how many vectors
# process_batch() reports as uploaded.
print(f"\nProcessing {len(vogue_items)} items in batches of {BATCH_SIZE}...")
total_uploaded = 0

for start in tqdm(range(0, len(vogue_items), BATCH_SIZE)):
    batch = vogue_items[start:start + BATCH_SIZE]
    total_uploaded += process_batch(batch)

# The check mark was previously stored as mis-encoded bytes and printed as
# garbage characters.
print(f"\n✓ Upload complete! {total_uploaded} vectors uploaded to Pinecone.")
# NOTE(review): the reported counts may lag shortly after an upsert -- confirm
# against the Pinecone documentation before relying on them.
print(f"\nIndex stats: {index.describe_index_stats()}")

## 6. Test Search

In [None]:
# Run one sample query against the index to check that search works.
query = "elegant evening gowns from the 1950s"

# Embed the query with the same model that produced the stored vectors.
query_embedding = model.encode(query).tolist()

search_params = {
    "vector": query_embedding,
    "top_k": 5,
    "include_metadata": True,
}
results = index.query(**search_params)

separator = "-" * 50
print(f"\nSearch results for: '{query}'\n")
for hit in results['matches']:
    # Show similarity score, vector id and the stored description per match.
    print(f"Score: {hit['score']:.3f}")
    print(f"ID: {hit['id']}")
    print(f"Description: {hit['metadata']['description']}")
    print(separator)

## Done!

Your Vogue archive is now searchable in Pinecone.

Next steps:
1. Deploy the API (see `../api/`)
2. Connect your React Native app