In [1]:
import os
import shutil
import urllib.request
import zipfile

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
# Fetch the MovieLens "ml-latest-small" archive, unpack it, and load the
# ratings / links / movies CSVs. The download and the extraction are each
# skipped when their output already exists on disk, so re-running this cell
# only re-reads the CSV files.
MOVIELENS_URL = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Download the zip file if it hasn't been downloaded yet
zip_path = 'data/ml-latest-small.zip'
if not os.path.exists(zip_path):
    print("Downloading MovieLens dataset...")
    # Stream into a temporary ".part" file and rename it only after the
    # transfer finished. Writing straight to zip_path would leave a truncated
    # archive behind on failure, and the existence check above would then
    # treat it as a complete download on every later run.
    partial_path = zip_path + '.part'
    # urlopen raises urllib.error.HTTPError for 4xx/5xx responses, so a bad
    # status aborts here before anything is renamed into place.
    with urllib.request.urlopen(MOVIELENS_URL, timeout=60) as response, \
            open(partial_path, 'wb') as f:
        shutil.copyfileobj(response, f)
    os.replace(partial_path, zip_path)
    print("Download complete!")

# Extract the zip file if it hasn't been extracted yet
if not os.path.exists('data/ml-latest-small/ratings.csv'):
    print("Extracting dataset...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('data')
    print("Extraction complete!")

# Fraction of the ratings to keep (1 = all of them); lower it for quick runs.
RATINGS_SUBSAMPLE = 1

# Read the ratings and links data
print(f"Loading {RATINGS_SUBSAMPLE*100}% of the ratings data...")
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')
if RATINGS_SUBSAMPLE < 1.0:
    # Fixed seed so the subsample is the same on every run.
    ratings_df = ratings_df.sample(frac=RATINGS_SUBSAMPLE, random_state=42)
# Report the row count whether or not a subsample was taken.
print(f"Loaded {len(ratings_df):,} ratings")

links_df = pd.read_csv('data/ml-latest-small/links.csv')
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')

Loading 100% of the ratings data...


In [4]:
# Build the movie feature table: one row per movie with its TMDB id and a
# list of genre labels.
#
# Steps, in order:
#   1. discard movies whose genre field is the "(no genres listed)" placeholder;
#   2. attach the TMDB id from links_df (inner join on the MovieLens movieId);
#   3. split the pipe-delimited genre string into a list of labels;
#   4. keep only the genres and the id, exposing the latter as `movie_id`.
# Because step 1 already removed every placeholder row, no placeholder
# substitution is needed before the split.
has_genres = movies_df['genres'] != '(no genres listed)'

movies_df = (
    movies_df[has_genres]
    .merge(links_df[['movieId', 'tmdbId']], on='movieId', how='inner')
    .assign(genres=lambda frame: frame['genres'].str.split("|"))
    .drop(columns=['movieId', 'title'])
    .rename(columns={'tmdbId': 'movie_id'})
)
movies_df.shape

(9708, 2)

In [5]:
# Re-key the ratings by TMDB id and reduce them to one rating per
# (user_id, movie_id) pair.
ratings_df = (
    ratings_df
    # Attach the TMDB id for every rated MovieLens movie.
    .merge(links_df[['movieId', 'tmdbId']], on='movieId', how='inner')
    # The MovieLens id and the timestamp are not used downstream.
    .drop(columns=['movieId', 'timestamp'])
    .rename(columns={'userId': 'user_id', 'tmdbId': 'movie_id'})
    # Keep only ratings whose movie also appears in movies_df.
    .loc[lambda frame: frame['movie_id'].isin(movies_df['movie_id'])]
    # Collapse repeated (user, movie) pairs into their mean rating.
    .groupby(['user_id', 'movie_id'])['rating']
    .mean()
    .reset_index()
)

ratings_df.nunique()

user_id      610
movie_id    9681
rating        10
dtype: int64

In [6]:
# One-hot encode the genre lists into a movie x genre indicator table.

# Keep a single row per movie_id (the first occurrence) so that the encoded
# table below has at most one row per id.
movies_df = movies_df.drop_duplicates(subset=['movie_id'], keep='first')

# Each distinct genre label becomes one binary column.
mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(movies_df['genres'])

# Rows are labelled by movie_id, columns by the genre names the binarizer learned.
genre_df = pd.DataFrame(data=genre_features, index=movies_df['movie_id'], columns=mlb.classes_)

In [7]:
# Reshape the long ratings table into a user x movie matrix; cells for
# (user, movie) pairs without a rating are NaN.
user_movie_matrix = ratings_df.pivot(
    index='user_id',
    columns='movie_id',
    values='rating',
)

# Genre indicator rows for the rated movies, in the matrix's column order.
genre_features = genre_df.loc[user_movie_matrix.columns].to_numpy()

In [8]:
# Movie ids in the exact column order of the user x movie matrix.
movie_ids = user_movie_matrix.columns

# Row k of genre_features describes the movie in column k of the matrix.
genre_features = genre_df.loc[movie_ids].to_numpy()

In [9]:
# Fit one Ridge regression per user (genre indicators -> that user's ratings),
# then score every movie for every user.
ratings_array = user_movie_matrix.values

user_models = {}
for user_id, user_ratings in zip(user_movie_matrix.index, ratings_array):
    # Boolean selector for the movies this user actually rated.
    rated_indices = ~np.isnan(user_ratings)

    # A user without a single rating gets no model.
    if not rated_indices.any():
        continue

    fitted = Ridge(alpha=1.0)
    fitted.fit(genre_features[rated_indices], user_ratings[rated_indices])
    user_models[user_id] = fitted

# One row of predicted ratings per user, laid out like user_movie_matrix.
predictions = np.zeros_like(ratings_array)

for row, user_id in enumerate(user_movie_matrix.index):
    user_model = user_models.get(user_id)
    if user_model is None:
        # No model was trained for this user: mark the whole row as unknown.
        predictions[row] = np.nan
        print("No model for user:", user_id)
    else:
        predictions[row] = user_model.predict(genre_features)



In [10]:
# Fill every unrated (user, movie) cell with the model prediction and collect
# those filled-in cells as [user_id, movie_id, rating] records.

# Work on a copy so user_movie_matrix keeps its NaNs.
imputed_values = user_movie_matrix.values.copy()

# True where the user has NOT rated the movie.
mask = np.isnan(imputed_values)

# Fill missing values with predictions
imputed_values[mask] = predictions[mask]

# Build imputed DataFrame
imputed_matrix = pd.DataFrame(
    imputed_values,
    index=user_movie_matrix.index,
    columns=user_movie_matrix.columns,
)

# Extract predicted values (only for imputed entries).
# np.nonzero yields the masked positions in row-major order, i.e. the same
# order a nested "for user / for movie" loop would visit them, without paying
# for one scalar DataFrame lookup per matrix cell.
row_idx, col_idx = np.nonzero(mask)
predictions_list = [
    [user_id, movie_id, rating]
    for user_id, movie_id, rating in zip(
        # .tolist() converts to plain Python scalars, so a DataFrame built
        # from these records infers its column dtypes from native values.
        imputed_matrix.index[row_idx].tolist(),
        imputed_matrix.columns[col_idx].tolist(),
        imputed_values[row_idx, col_idx].tolist(),
    )
]

In [11]:
# Score the predictions against the ratings users actually gave.
# The per-user models are fitted on ratings taken from this same
# user_movie_matrix, so this is an in-sample error, not a held-out estimate.
actual = user_movie_matrix.values

# True where a real rating exists (the cell is not NaN).
mask = np.logical_not(np.isnan(actual))

observed_ratings = actual[mask]
predicted_ratings = predictions[mask]

# Root of the mean squared difference over the known ratings only.
rmse = np.sqrt(mean_squared_error(observed_ratings, predicted_ratings))

print(f"RMSE on known ratings: {rmse:.4f}")

RMSE on known ratings: 0.8407


In [12]:
# Turn the imputed-rating records into a table, store both id columns as
# strings, and write the result to Parquet without the row index.
# NOTE(review): if movie_id is float-typed at this point, its string form
# carries a trailing ".0" - confirm that is what the consumer expects.
predictions_df = pd.DataFrame(
    predictions_list,
    columns=["user_id", "movie_id", "rating"],
).astype({"user_id": str, "movie_id": str})

predictions_df.to_parquet(
    "../backend/models/content_predictions.parquet",
    index=False,
)

In [13]:
# Write the cleaned ratings table (user_id, movie_id, rating) to Parquet,
# next to the predictions file.
ratings_df.to_parquet(
    "../backend/models/current_ratings.parquet",
)