In [8]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from lightfm import LightFM
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from lightfm.evaluation import precision_at_k, auc_score
import pickle
import os
from scipy.sparse import save_npz, load_npz

In [2]:
# Data from MovieLens 32M dataset https://grouplens.org/datasets/movielens/32m/
ratings_df = pd.read_csv('data/ratings.csv')
links_df = pd.read_csv('data/links.csv')
movies_df = pd.read_csv('data/movies.csv')

In [3]:
# Convert ratings to use TMDB IDs
ratings_df = ratings_df.merge(links_df[['movieId', 'tmdbId']], on='movieId', how='inner')

# Rename columns to match the expected format
ratings_df = ratings_df.drop(columns=['movieId', 'timestamp'], axis=1)
ratings_df = ratings_df.rename(columns={'tmdbId': 'movieId'})

# Remove any rows with NaN values
ratings_df = ratings_df.dropna(subset=['userId', 'movieId', 'rating'])

# Ensure userId and movieId are integers
ratings_df['userId'] = ratings_df['userId'].astype(int)
ratings_df['movieId'] = ratings_df['movieId'].astype(int)

In [4]:
# Create mappings for user and movie IDs
user_ids = ratings_df['userId'].unique()
movie_ids = ratings_df['movieId'].unique()

user_id_map = {id_: i for i, id_ in enumerate(user_ids)}
movie_id_map = {id_: i for i, id_ in enumerate(movie_ids)}

# Convert ratings to matrix format
row_ind = [user_id_map[x] for x in ratings_df['userId']]
col_ind = [movie_id_map[x] for x in ratings_df['movieId']]
ratings = ratings_df['rating'].values

# Create sparse matrix
sparse_matrix = csr_matrix(
    (ratings, (row_ind, col_ind)),
    shape=(len(user_ids), len(movie_ids))
)

print("Sparse matrix shape:", sparse_matrix.shape)
print("Matrix density: {:.4f}%".format(
    100 * sparse_matrix.nnz / (sparse_matrix.shape[0] * sparse_matrix.shape[1])
))

Sparse matrix shape: (200948, 84275)
Matrix density: 0.1889%


In [5]:
# Split the ratings data into train and test
ratings_train, ratings_test = train_test_split(
    ratings_df,
    test_size=0.2,
    random_state=42
)

# Create train sparse matrix
train_row_ind = [user_id_map[x] for x in ratings_train['userId']]
train_col_ind = [movie_id_map[x] for x in ratings_train['movieId']]
train_ratings = ratings_train['rating'].values

train_matrix = csr_matrix(
    (train_ratings, (train_row_ind, train_col_ind)),
    shape=(len(user_ids), len(movie_ids))
)

# Create test sparse matrix
test_row_ind = [user_id_map[x] for x in ratings_test['userId']]
test_col_ind = [movie_id_map[x] for x in ratings_test['movieId']]
test_ratings = ratings_test['rating'].values

test_matrix = csr_matrix(
    (test_ratings, (test_row_ind, test_col_ind)),
    shape=(len(user_ids), len(movie_ids))
)

print("Training matrix shape:", train_matrix.shape)
print("Testing matrix shape:", test_matrix.shape)

Training matrix shape: (200948, 84275)
Testing matrix shape: (200948, 84275)


In [None]:
def create_movie_features():
    # Get unique genres
    all_genres = set()
    for genres in movies_df['genres'].str.split('|'):
        all_genres.update(genres)
    
    # Create a mapping of genre to index
    genre_to_idx = {genre: idx for idx, genre in enumerate(all_genres)}
    
    # Create a feature matrix for movies
    movie_features = []
    for _, row in movies_df.iterrows():
        movie_id = row['movieId']
        # Get the TMDB ID for this movie
        tmdb_id = links_df[links_df['movieId'] == movie_id]['tmdbId'].values
        if len(tmdb_id) > 0:
            tmdb_id = tmdb_id[0]
            # Create a feature vector for this movie
            feature_vector = np.zeros(len(genre_to_idx))
            for genre in row['genres'].split('|'):
                if genre in genre_to_idx:
                    feature_vector[genre_to_idx[genre]] = 1
            movie_features.append((tmdb_id, feature_vector))
    
    return movie_features, genre_to_idx

In [None]:
# Create the feature matrices
movie_features, genre_to_idx = create_movie_features()

# Convert to the format LightFM expects
item_features = []
item_feature_map = {}
for tmdb_id, features in movie_features:
    if tmdb_id in movie_id_map:
        item_idx = movie_id_map[tmdb_id]
        item_features.append(features)
        item_feature_map[item_idx] = len(item_features) - 1

item_features = np.array(item_features)

In [6]:
# Initialize the model
model = LightFM(loss='warp',
                learning_schedule='adagrad',
                learning_rate=0.05,
                item_alpha=0.1,
                user_alpha=0.1)

# Fit the model
model.fit(train_matrix,
          item_features=item_features,
          epochs=30,
          num_threads=4,
          verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


<lightfm.lightfm.LightFM at 0x168342f10>

In [9]:
# Save the sparse matrices separately
save_npz('train_matrix.npz', train_matrix)
save_npz('test_matrix.npz', test_matrix)

# Save the model and mappings
model_data = {
    'model': model,
    'user_id_map': user_id_map,
    'movie_id_map': movie_id_map,
    'user_ids': user_ids,
    'movie_ids': movie_ids
}

# Save to file
with open('lightfm_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print("Model and associated data saved successfully!")

Model and associated data saved successfully!


In [None]:

# Calculate precision@k and AUC for the test set
# train_precision = precision_at_k(model, train_matrix, k=10).mean()
# test_precision = precision_at_k(model, test_matrix, k=10).mean()

# train_auc = auc_score(model, train_matrix).mean()
# test_auc = auc_score(model, test_matrix).mean()

# print('Train precision@10: %.2f' % train_precision)
# print('Test precision@10: %.2f' % test_precision)
# print('Train AUC: %.2f' % train_auc)
# print('Test AUC: %.2f' % test_auc)

In [11]:
def load_model():
    """
    Load the saved model and associated data
    """
    # Load the sparse matrices
    train_matrix = load_npz('data/models/train_matrix.npz')
    test_matrix = load_npz('data/models/test_matrix.npz')
    
    # Load the model and mappings
    with open('data/models/lightfm_model.pkl', 'rb') as f:
        model_data = pickle.load(f)
    
    return (
        model_data['model'],
        model_data['user_id_map'],
        model_data['movie_id_map'],
        model_data['user_ids'],
        model_data['movie_ids'],
        train_matrix,
        test_matrix
    )

def get_movie_title(tmdb_id):
    try:
        movie_id = links_df[links_df['tmdbId'] == tmdb_id]['movieId'].values[0]
        return movies_df[movies_df['movieId'] == movie_id]['title'].values[0]
    except:
        return f"Movie with TMDB ID {tmdb_id} (title not found)"

def get_recommendations(user_id, n_items=10):
    # Get the internal user index
    user_idx = user_id_map[user_id]
    
    # Get scores for all items
    scores = model.predict(user_idx, np.arange(len(movie_ids)))
    
    # Get the top N items
    top_items = np.argsort(-scores)[:n_items]
    
    # Convert back to original TMDB IDs
    top_tmdb_ids = [list(movie_id_map.keys())[list(movie_id_map.values()).index(idx)] for idx in top_items]
    
    return top_tmdb_ids

In [12]:
# Example usage:
user_id = 1  # Replace with actual user ID
recommendations = get_recommendations(user_id)
print(f"Top 10 recommendations for user {user_id}:")
for tmdb_id in recommendations:
    try:
        movie_title = get_movie_title(tmdb_id)
        print(f"- {movie_title}")
    except:
        print(f"- Movie with TMDB ID {tmdb_id} (title not found)")

Top 10 recommendations for user 1:
- Nobody's Home (2013)
- Road to Yesterday (2015)
- Technicolour Daydream (2018)
- Four Sisters Before the Wedding (2020)
- For Kayako (1984)
- Under Cover (1987)
- Une affaire qui roule (2002)
- The Bellmen (2020)
- Whitney Cummings: I Love You (2014)
- The Assassination of Abraham Lincoln (2009)


In [13]:
def handle_new_user(user_ratings, n_recommendations=10):
    """
    Handle a new user by creating a temporary profile and making recommendations
    
    Parameters:
    user_ratings: list of tuples (movie_id, rating) where movie_id is the TMDB ID
    n_recommendations: number of recommendations to return
    """
    # Create a new user ID (one more than the maximum existing user ID)
    new_user_id = max(user_id_map.keys()) + 1
    
    # Create a temporary sparse matrix for the new user
    new_user_matrix = csr_matrix((1, len(movie_ids)))
    
    # Add the user's ratings to the matrix
    for movie_id, rating in user_ratings:
        if movie_id in movie_id_map:
            movie_idx = movie_id_map[movie_id]
            new_user_matrix[0, movie_idx] = rating
    
    # Get the model's predictions for the new user
    scores = model.predict(0, np.arange(len(movie_ids)))
    
    # Get the top N items that the user hasn't rated
    rated_movies = set(movie_id for movie_id, _ in user_ratings)
    top_items = []
    for idx in np.argsort(-scores):
        movie_id = list(movie_id_map.keys())[list(movie_id_map.values()).index(idx)]
        if movie_id not in rated_movies:
            top_items.append(movie_id)
            if len(top_items) >= n_recommendations:
                break
    
    return top_items

# Example of a new user with some initial ratings
new_user_ratings = [
    (862, 5.0),  # Toy Story
    (278, 4.5),  # The Shawshank Redemption
    (238, 4.0),  # The Godfather
]

print("New user's initial ratings:")
for movie_id, rating in new_user_ratings:
    print(f"- {get_movie_title(movie_id)}: {rating}")

print("\nRecommended movies for new user:")
recommendations = handle_new_user(new_user_ratings)
for movie_id in recommendations:
    print(f"- {get_movie_title(movie_id)}")



New user's initial ratings:
- Toy Story (1995): 5.0
- Shawshank Redemption, The (1994): 4.5
- Godfather, The (1972): 4.0

Recommended movies for new user:
- Nobody's Home (2013)
- Road to Yesterday (2015)
- Technicolour Daydream (2018)
- Four Sisters Before the Wedding (2020)
- For Kayako (1984)
- Under Cover (1987)
- Une affaire qui roule (2002)
- The Bellmen (2020)
- Whitney Cummings: I Love You (2014)
- The Assassination of Abraham Lincoln (2009)

Content-based recommendations for genre preferences:


KeyboardInterrupt: 

In [None]:
# Alternative approach: Content-based recommendations for completely new users
def get_content_based_recommendations(genre_preferences, n_recommendations=10):
    """
    Get recommendations based on genre preferences for users with no ratings
    
    Parameters:
    genre_preferences: list of preferred genres
    n_recommendations: number of recommendations to return
    """
    # Get movies that match the preferred genres
    genre_movies = movies_df[movies_df['genres'].str.contains('|'.join(genre_preferences), case=False)]
    
    # Get the corresponding TMDB IDs
    genre_movie_ids = []
    for movie_id in genre_movies['movieId']:
        tmdb_id = links_df[links_df['movieId'] == movie_id]['tmdbId'].values
        if len(tmdb_id) > 0:
            genre_movie_ids.append(tmdb_id[0])
    
    # Get the average rating for these movies
    movie_ratings = {}
    for tmdb_id in genre_movie_ids:
        if tmdb_id in movie_id_map:
            movie_idx = movie_id_map[tmdb_id]
            # Get the average rating for this movie
            ratings = ratings_df[ratings_df['movieId'] == tmdb_id]['rating']
            if len(ratings) > 0:
                movie_ratings[tmdb_id] = ratings.mean()
    
    # Sort by average rating and return top N
    sorted_movies = sorted(movie_ratings.items(), key=lambda x: x[1], reverse=True)
    return [movie_id for movie_id, _ in sorted_movies[:n_recommendations]]

# Example usage for content-based recommendations:
print("\nContent-based recommendations for genre preferences:")
genre_preferences = ['Action', 'Adventure']
content_recommendations = get_content_based_recommendations(genre_preferences)
for movie_id in content_recommendations:
    print(f"- {get_movie_title(movie_id)}")