In [None]:
import pandas as pd
import numpy as np
import torch
import os
import json
import shutil
from sklearn.preprocessing import LabelEncoder

# Input / output locations. These are relative paths, so they resolve against
# the current working directory of the process running this cell.
RAW_USER_PATH = '../../../data/raw/ml-100k/u.user'
PROCESSED_MOVIES_PATH = '../../../data/processed/movies_graph_ready.csv'
OUTPUT_DIR = '../../../data/processed/context_data'

# Make sure the output folder exists before anything is written to it;
# exist_ok=True makes re-running the cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Banner: one line per item, followed by a 30-character rule.
print(
    "Processing context data...",
    f"User Input: {RAW_USER_PATH}",
    f"Movie Input: {PROCESSED_MOVIES_PATH}",
    "-" * 30,
    sep="\n",
)

print("1. Processing User Context (Age, Gender, Occupation)...")

# u.user is pipe-delimited and has no header row, so column names are supplied.
users_df = pd.read_csv(
    RAW_USER_PATH,
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
    sep='|',
    encoding='latin-1',
)

# --- Age -> life-stage bucket -------------------------------------------
# pd.cut uses right-closed intervals by default, so the edges below give
# (0,18], (18,25], (25,35], (35,45], (45,50], (50,55], (55,65], (65,120].
age_bins = [0, 18, 25, 35, 45, 50, 55, 65, 120]
users_df['age_bucket'] = (
    pd.cut(users_df['age'], bins=age_bins, labels=False)
    # Ages outside (0, 120] or missing come back as NaN; send them to bucket 0.
    .fillna(0)
    .astype(int)
)

# --- Categorical columns -> integer ids ----------------------------------
# One encoder per column so each keeps its own classes_ vocabulary.
le_gender = LabelEncoder()
le_occ = LabelEncoder()
users_df['gender_idx'] = le_gender.fit_transform(users_df['gender'])
users_df['occupation_idx'] = le_occ.fit_transform(users_df['occupation'])

# Model-facing table: user_id is retained so rows can be joined back to the
# graph later; the remaining columns are the encoded features.
user_context_df = users_df[['user_id', 'age_bucket', 'gender_idx', 'occupation_idx']]

print(f"   Users processed: {len(user_context_df)}")
print(f"   Age buckets created: {len(age_bins)-1}")
print(f"   Occupations found: {len(le_occ.classes_)}")

print("\n2. Processing Movie Context (Budget, Revenue, Year, Stats)...")

# Load the pre-processed movie table; the bucketing steps further down read
# its budget, revenue, popularity, vote_*, release_year and runtime columns.
movies_df = pd.read_csv(filepath_or_buffer=PROCESSED_MOVIES_PATH)

# Helper function for Quantile Bucketing (Good for Power-Law distributions like Budget)
def get_quantile_buckets(series, n_bins=5, min_valid=1000):
    """Map each value of *series* to a quantile-based integer bucket.

    Quantile edges are estimated only from values strictly greater than
    ``min_valid`` so that zeros and tiny placeholder amounts do not skew them.
    Bucket 0 is reserved for "zero / unknown": NaN and every value <= 1
    (including negatives). Values above 1 are assigned to buckets 1..k in
    increasing order, where k <= ``n_bins`` (fewer when duplicate quantile
    edges are dropped).

    Note that values in (1, min_valid] are ignored when computing the edges
    but are still placed in bucket 1 (the lowest "known" bucket).

    Args:
        series: Numeric pandas Series (e.g. budget or revenue).
        n_bins: Number of quantile bins requested for the informative values.
        min_valid: Values must exceed this to take part in the quantile
            computation. Defaults to 1000, the threshold used originally.

    Returns:
        Integer Series of bucket ids sharing ``series``' index. All zeros
        when no value exceeds ``min_valid`` or quantiles cannot be computed.
    """
    informative = series[series > min_valid]
    if informative.empty:
        return pd.Series(0, index=series.index)

    try:
        _, edges = pd.qcut(informative, q=n_bins, retbins=True, duplicates='drop')
    except (ValueError, IndexError):
        # Data too sparse/degenerate for quantiles: fall back to "all unknown".
        # Deliberately narrower than a bare except so that KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return pd.Series(0, index=series.index)

    # Final edges: (0, 1] is the "zero/unknown" bucket, then the interior
    # quantile edges, then an open-ended top bucket. Only the interior edges
    # are reused, so a degenerate result with a single edge (all informative
    # values identical) still yields [0, 1, inf] instead of collapsing to
    # [0, inf], which would put every known value in bucket 0.
    edges = np.concatenate(([0, 1], edges[1:-1], [np.inf]))

    # Values outside every interval (0, negatives, NaN) come back as NaN from
    # pd.cut and are folded into bucket 0.
    return pd.cut(series, bins=edges, labels=False).fillna(0).astype(int)

# Financials (Budget & Revenue)
# Many movies have 0 budget/revenue. We treat '0' as its own category
# (bucket 0); known amounts are spread over quantile buckets 1..N.
movies_df['budget_bucket'] = get_quantile_buckets(movies_df['budget'], n_bins=5)
movies_df['revenue_bucket'] = get_quantile_buckets(movies_df['revenue'], n_bins=5)

# Popularity & Votes (Quantile binning)
# We want roughly equal number of movies in "Low Pop", "Mid Pop", "High Pop".
# qcut leaves NaN for missing inputs (and turns the column into floats), so
# fold those into bucket 0 and force an integer dtype, like every other bucket
# column in this section.
movies_df['popularity_bucket'] = (
    pd.qcut(movies_df['popularity'], q=5, labels=False, duplicates='drop')
    .fillna(0)
    .astype(int)
)
movies_df['vote_count_bucket'] = (
    pd.qcut(movies_df['vote_count'], q=5, labels=False, duplicates='drop')
    .fillna(0)
    .astype(int)
)

# Vote Average (Standard Bins)
# Rating 0-10, binned by 1.0 intervals: (0,1], (1,2], ..., (10,11].
# A rating of exactly 0 falls outside every interval and is mapped to bucket 0
# by fillna, i.e. it shares the lowest bucket.
movies_df['vote_avg_bucket'] = pd.cut(movies_df['vote_average'], bins=list(range(0, 12)), labels=False).fillna(0).astype(int)

# Release Year (Decades)
# Convert to numeric, errors='coerce' turns bad strings to NaN; unknown years
# are then set to 1900 and therefore end up in the "<1960" bucket.
movies_df['release_year'] = pd.to_numeric(movies_df['release_year'], errors='coerce').fillna(1900)
# Bins: <1960, 1960s, 1970s, 1980s, 1990s, 2000s, 2010s, 2020+
# right=False makes the intervals left-closed ([1960,1970), [1970,1980), ...)
# so a decade's first year belongs to that decade rather than the previous
# one. The top edge is infinite so "2020+" is genuinely open-ended.
year_bins = [0, 1960, 1970, 1980, 1990, 2000, 2010, 2020, np.inf]
movies_df['year_bucket'] = pd.cut(movies_df['release_year'], bins=year_bins, right=False, labels=False).fillna(0).astype(int)

# Runtime (Short, Medium, Long, Epic)
# < 90 min, 90-120, 120-150, 150+
# Left-closed intervals so that exactly 90 minutes is "90-120", not "< 90";
# the infinite top edge keeps very long runtimes in "150+" instead of letting
# them fall out of range. Missing runtimes become bucket 0.
runtime_bins = [0, 90, 120, 150, np.inf]
movies_df['runtime_bucket'] = pd.cut(movies_df['runtime'], bins=runtime_bins, right=False, labels=False).fillna(0).astype(int)

# Model-facing table: the movie id plus one integer bucket column per feature.
movie_context_df = movies_df[[
    'ml_movie_id', 
    'year_bucket', 
    'budget_bucket', 
    'revenue_bucket', 
    'popularity_bucket', 
    'vote_avg_bucket',
    'vote_count_bucket',
    'runtime_bucket'
]]

print(f"   Movies processed: {len(movie_context_df)}")

print("\n3. Saving Data and Metadata...")

# Destination files, all inside OUTPUT_DIR.
user_out_path = os.path.join(OUTPUT_DIR, 'user_context.csv')
movie_out_path = os.path.join(OUTPUT_DIR, 'movie_context.csv')
json_path = os.path.join(OUTPUT_DIR, 'context_metadata.json')

# Write both context tables without the pandas index column.
user_context_df.to_csv(user_out_path, index=False)
movie_context_df.to_csv(movie_out_path, index=False)

# Vocabulary sizes for model initialisation: for every feature column the
# embedding table needs (largest observed id + 1) rows.
# E.g. 'gender_idx': 2 means nn.Embedding(2, embedding_dim)
metadata = {
    "user_features": {
        column: int(user_context_df[column].max() + 1)
        for column in ('age_bucket', 'gender_idx', 'occupation_idx')
    },
    "movie_features": {
        column: int(movie_context_df[column].max() + 1)
        for column in (
            'year_bucket',
            'budget_bucket',
            'revenue_bucket',
            'popularity_bucket',
            'vote_avg_bucket',
            'vote_count_bucket',
            'runtime_bucket',
        )
    },
}

# Persist the vocab sizes next to the CSVs (4-space indented JSON).
with open(json_path, 'w') as f:
    f.write(json.dumps(metadata, indent=4))

for description, location in (
    ("user context", user_out_path),
    ("movie context", movie_out_path),
    ("metadata (vocab sizes)", json_path),
):
    print(f"   Saved {description} to: {location}")

print("\n--- Feature Vocab Sizes (Embedding Input Dimensions) ---")
print(json.dumps(metadata, indent=2))

Processing context data...
User Input: ../../../data/raw/ml-100k/u.user
Movie Input: ../../../data/processed/movies_graph_ready.csv
------------------------------
1. Processing User Context (Age, Gender, Occupation)...
   Users processed: 943
   Age buckets created: 8
   Occupations found: 21

2. Processing Movie Context (Budget, Revenue, Year, Stats)...
   Movies processed: 1638

3. Saving Data and Metadata...
   Saved user context to: ../../../data/processed/context_data/user_context.csv
   Saved movie context to: ../../../data/processed/context_data/movie_context.csv
   Saved metadata (vocab sizes) to: ../../../data/processed/context_data/context_metadata.json

--- Feature Vocab Sizes (Embedding Input Dimensions) ---
{
  "user_features": {
    "age_bucket": 8,
    "gender_idx": 2,
    "occupation_idx": 21
  },
  "movie_features": {
    "year_bucket": 7,
    "budget_bucket": 6,
    "revenue_bucket": 6,
    "popularity_bucket": 5,
    "vote_avg_bucket": 9,
    "vote_count_bucket": 5,
