In [16]:
import numpy as np
import pandas as pd
import zipfile
import requests
import os
from collections import defaultdict
from surprise import Dataset, SVD, Reader
from surprise.model_selection import cross_validate
from typing import List, Tuple
import gc
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from recommend import get_user_content_recommendations, load_model_data

In [17]:
# Fetch the MovieLens "ml-latest-small" archive once, unpack it, and load the
# ratings / links / movies CSVs into DataFrames.

# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Download the zip file if it hasn't been downloaded yet
zip_path = 'data/ml-latest-small.zip'
if not os.path.exists(zip_path):
    print("Downloading MovieLens dataset...")
    # Stream into a temporary ".part" file and rename it only on success, so an
    # interrupted download never leaves a truncated archive at zip_path (the
    # exists() check above would otherwise treat it as complete on every rerun).
    partial_path = zip_path + '.part'
    try:
        # timeout=(connect, read) keeps a stalled server from hanging the kernel;
        # the context manager releases the streamed connection.
        with requests.get(
            'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip',
            stream=True,
            timeout=(10, 60),
        ) as response:
            response.raise_for_status()  # Raise an exception for bad status codes

            # Save the zip file
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, zip_path)
    finally:
        # Only present here if the download failed part-way.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download complete!")

# Extract the zip file if it hasn't been extracted yet
if not os.path.exists('data/ml-latest-small/ratings.csv'):
    print("Extracting dataset...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('data')
    print("Extraction complete!")

# Fraction of the ratings to keep (1 = use everything).
RATINGS_SUBSAMPLE = 1

# Read the ratings and links data
# ":g" avoids float noise such as "7.000000000000001%" for fractional values.
print(f"Loading {RATINGS_SUBSAMPLE * 100:g}% of the ratings data...")
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')
if RATINGS_SUBSAMPLE < 1.0:
    # Fixed seed so the subsample is reproducible across runs.
    ratings_df = ratings_df.sample(frac=RATINGS_SUBSAMPLE, random_state=42)
# Report the row count for full loads too, not only when subsampling.
print(f"Loaded {len(ratings_df):,} ratings")

links_df = pd.read_csv('data/ml-latest-small/links.csv')
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')

Loading 100% of the ratings data...


In [18]:
# Convert ratings to use TMDB IDs.
# Links rows without a tmdbId are excluded before the merge; otherwise the
# inner merge on movieId would emit ratings whose movie_id is NaN, which
# identifies no TMDB title.
tmdb_links = links_df.loc[links_df['tmdbId'].notna(), ['movieId', 'tmdbId']]
ratings_df = ratings_df.merge(tmdb_links, on='movieId', how='inner')

# Rename columns to match the expected format
# (`columns=` already selects the axis, so no separate `axis` argument is needed)
ratings_df = ratings_df.drop(columns=['movieId', 'timestamp'])
ratings_df = ratings_df.rename(columns={
    'userId': 'user_id',
    'tmdbId': 'movie_id'
})
ratings_df.head()

Unnamed: 0,user_id,rating,movie_id
0,1,4.0,862.0
1,1,4.0,15602.0
2,1,4.0,949.0
3,1,5.0,807.0
4,1,5.0,629.0


In [19]:
# Attach TMDB IDs to the movie metadata (same movieId -> tmdbId mapping from
# links_df that is used for the ratings).
# NOTE(review): movie_id shows up as float in the preview, which suggests some
# links rows have no tmdbId; such movies would remain here with movie_id = NaN
# — confirm whether downstream code expects that.
movies_df = movies_df.merge(links_df[['movieId', 'tmdbId']], on='movieId', how='inner')

# Create genres string column for TF-IDF.
# regex=False: '|' is the literal genre separator here, but it is also a regex
# metacharacter and the default value of `regex` differs between pandas
# versions, so the literal replacement is requested explicitly.
movies_df['genres_str'] = movies_df['genres'].str.replace('|', ' ', regex=False)

# Rename columns to match the expected format
# (`columns=` already selects the axis, so no separate `axis` argument is needed)
movies_df = movies_df.drop(columns=['movieId', 'title', 'genres'])
movies_df = movies_df.rename(columns={
    'tmdbId': 'movie_id',
    'genres_str': 'genres'
})
movies_df.head()

Unnamed: 0,movie_id,genres
0,862.0,Adventure Animation Children Comedy Fantasy
1,8844.0,Adventure Children Fantasy
2,15602.0,Comedy Romance
3,31357.0,Comedy Drama Romance
4,11862.0,Comedy


In [20]:
# Create a TF-IDF vectorizer over the space-separated genre strings.
# token_pattern=r'\S+' treats every whitespace-delimited genre label as one
# token. The default pattern breaks tokens at punctuation, so a hyphenated
# label (MovieLens has e.g. "Sci-Fi") would become two separate features and
# be over-weighted in the similarity.
tfidf = TfidfVectorizer(stop_words='english', token_pattern=r'\S+')

# Create TF-IDF matrix (one row per row of movies_df)
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])

# Calculate pairwise cosine similarity between all movies.
# The result is a dense square matrix, so memory grows quadratically with the
# number of movies.
cosine_sim = cosine_similarity(tfidf_matrix)

In [21]:
# Preview the first rows of the movie table used for the TF-IDF matrix.
movies_df.head(n=5)

Unnamed: 0,movie_id,genres
0,862.0,Adventure Animation Children Comedy Fantasy
1,8844.0,Adventure Children Fantasy
2,15602.0,Comedy Romance
3,31357.0,Comedy Drama Romance
4,11862.0,Comedy


In [22]:
# Persist the similarity matrix and the two tables for the backend.
# Create the target directory first: np.save / to_parquet do not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs('../backend/models', exist_ok=True)
np.save('../backend/models/cosine_sim.npy', cosine_sim)
movies_df.to_parquet('../backend/models/movies.parquet')
ratings_df.to_parquet('../backend/models/ratings.parquet')

In [None]:
# Load the model data through the project's `load_model_data` helper (imported
# from `recommend`). The recorded output below shows a dict with 'movies_df'
# and 'ratings_df' entries.
# NOTE(review): presumably this reads the artifacts saved to ../backend/models
# by the previous cell — confirm against `recommend.load_model_data`.
model_data = load_model_data()


{'movies_df':       movie_id                                       genres
 0        862.0  Adventure Animation Children Comedy Fantasy
 1       8844.0                   Adventure Children Fantasy
 2      15602.0                               Comedy Romance
 3      31357.0                         Comedy Drama Romance
 4      11862.0                                       Comedy
 ...        ...                                          ...
 9737  432131.0              Action Animation Comedy Fantasy
 9738  445030.0                     Animation Comedy Fantasy
 9739  479308.0                                        Drama
 9740  483455.0                             Action Animation
 9741   37891.0                                       Comedy
 
 [9742 rows x 2 columns],
 'ratings_df':         user_id  rating  movie_id
 0             1     4.0     862.0
 1             1     4.0   15602.0
 2             1     4.0     949.0
 3             1     5.0     807.0
 4             1     5.0     629.0
 ..

In [None]:
# Ask the content-based recommender for user 1 (n=5).
# NOTE(review): the id is passed as a string while ratings_df.user_id is shown
# as an integer in the previews — confirm the helper converts it.
uid = "1"
get_user_content_recommendations(uid, model_data, n=5)