In [1]:
import numpy as np
import pandas as pd
import zipfile
import requests
import os
from surprise import Dataset, SVD, Reader
from surprise.model_selection import cross_validate

In [None]:
# Define dataset name
DATASET_NAME = 'ml-latest-small'

# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Download the zip file if it hasn't been downloaded yet
zip_path = f'data/{DATASET_NAME}.zip'
if not os.path.exists(zip_path):
    print("Downloading MovieLens dataset...")
    # Stream into a temporary sibling file and rename it only after the last
    # chunk has been written. An interrupted download must never leave a
    # truncated archive at zip_path: the existence check above would then skip
    # the download on every later run and extraction would keep failing.
    partial_path = f'{zip_path}.part'
    with requests.get(
        f'https://files.grouplens.org/datasets/movielens/{DATASET_NAME}.zip',
        stream=True,
        timeout=(10, 60),  # (connect, read) seconds; without it a stalled server hangs the cell
    ) as response:
        response.raise_for_status()  # Raise an exception for bad status codes

        # Save the zip file chunk by chunk so it is never held in memory whole
        with open(partial_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    os.replace(partial_path, zip_path)
    print("Download complete!")

# Extract the zip file if it hasn't been extracted yet
if not os.path.exists(f'data/{DATASET_NAME}/ratings.csv'):
    print("Extracting dataset...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('data')
    print("Extraction complete!")

# Fraction of the ratings rows to keep; 1 keeps every row
RATINGS_SUBSAMPLE = 1

# Read the ratings and links data
# ':g' trims float noise such as 7.000000000000001 from the percentage
print(f"Loading {RATINGS_SUBSAMPLE * 100:g}% of the ratings data...")
ratings_df = pd.read_csv(f'data/{DATASET_NAME}/ratings.csv')
if RATINGS_SUBSAMPLE < 1.0:
    # Fixed seed so the same subsample is drawn on every run
    ratings_df = ratings_df.sample(frac=RATINGS_SUBSAMPLE, random_state=42)
# Report the row count on every run, not only when subsampling
print(f"Loaded {len(ratings_df):,} ratings")

links_df = pd.read_csv(f'data/{DATASET_NAME}/links.csv')

In [3]:
# Convert ratings to use TMDB IDs.
# If any tmdbId in links.csv is missing, pandas parses the column as float64;
# converting it to str later would then yield ids such as '862.0' plus a
# literal 'nan' item. Drop links without a TMDB id and store the rest as
# integers before merging. Both steps are no-ops when the column is complete.
tmdb_links = (
    links_df[['movieId', 'tmdbId']]
    .dropna(subset=['tmdbId'])
    .astype({'tmdbId': 'int64'})
)
df = ratings_df.merge(tmdb_links, on='movieId', how='inner')

# Rename columns to match the expected format
df = df.drop(columns=['movieId', 'timestamp'])
df = df.rename(columns={
    'userId': 'userID',
    'tmdbId': 'movieID'
})

In [None]:
# Quick sanity check on the user ids before they are turned into strings
user_ids = df['userID']
lowest_user_id, highest_user_id = user_ids.min(), user_ids.max()
unique_user_count = user_ids.nunique()
print(f"Original userId range: {lowest_user_id} to {highest_user_id}")
print(f"Original unique users: {unique_user_count:,}")

In [5]:
# Store both id columns as strings before handing the frame to Surprise
df = df.astype({'userID': str, 'movieID': str})

# Reader configured with the 0.5-5 rating scale
reader = Reader(rating_scale=(0.5, 5))

# Surprise dataset built from the (user, item, rating) columns, in that order
surprise_columns = ['userID', 'movieID', 'rating']
data = Dataset.load_from_df(df[surprise_columns], reader)

# Train on every available rating (no hold-out split)
trainset = data.build_full_trainset()

In [None]:

# Seed SVD so repeated runs produce the same model and therefore the same
# exported recommendations; an unseeded SVD() initialises its factors randomly.
SVD_RANDOM_STATE = 42
algo = SVD(random_state=SVD_RANDOM_STATE)
algo.fit(trainset)
# Optional 3-fold evaluation of the model (left disabled):
# pd.DataFrame(cross_validate(algo, data, measures=["RMSE", "MAE"], cv=3, verbose=True))

In [None]:
# Anti-testset built from the training data; fill=None is the library default
testset = trainset.build_anti_testset(fill=None)

# Score every pair in the anti-testset with the fitted model (quietly)
predictions = algo.test(testset, verbose=False)

In [None]:
# Convert predictions to DataFrame with only the essential columns
predictions_df = pd.DataFrame(
    [(p.uid, p.iid, p.est) for p in predictions],
    columns=['user_id', 'movie_id', 'rating'],
)

# Get top 20 predictions per user.
# A stable descending sort followed by groupby.head keeps the first 20 rows of
# each user, and the final stable sort groups the rows by user again. This
# avoids groupby.apply on the grouping column, which newer pandas deprecates:
# once the grouping column is excluded from each group, reset_index(drop=True)
# would silently discard 'user_id'. Row order matches the nlargest version
# (users ascending, ratings descending, ties in original order).
top_20_predictions = (
    predictions_df
    .sort_values('rating', ascending=False, kind='stable')
    .groupby('user_id', sort=False)
    .head(20)
    .sort_values('user_id', kind='stable')
    .reset_index(drop=True)
)

# Save as parquet with compression for smaller file size.
# Create the target directory first so the run does not fail at the very last
# step when the folder is missing.
output_path = '../backend/models/collab_predictions.parquet'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
top_20_predictions.to_parquet(output_path, compression='snappy')