In [80]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from lightfm import LightFM
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from lightfm.evaluation import precision_at_k, auc_score
import pickle
import os
from scipy.sparse import save_npz, load_npz, identity

In [81]:
# Data from MovieLens 32M dataset https://grouplens.org/datasets/movielens/100k
ratings_df = pd.read_csv('data/ratings.csv')
links_df = pd.read_csv('data/links.csv')
movies_df = pd.read_csv('data/movies.csv')

In [82]:
# # Convert ratings to use TMDB IDs
ratings_df = ratings_df.merge(links_df[['movieId', 'tmdbId']], on='movieId', how='inner')

# # Rename columns to match the expected format
ratings_df = ratings_df.drop(columns=['movieId', 'timestamp'], axis=1)
ratings_df = ratings_df.rename(columns={'tmdbId': 'movieId'})

# Remove any rows with NaN values
ratings_df = ratings_df.dropna(subset=['userId', 'movieId', 'rating'])

# Ensure userId and movieId are integers
ratings_df['userId'] = ratings_df['userId'].astype(int)
ratings_df['movieId'] = ratings_df['movieId'].astype(int)

# Binarize ratings
ratings_df['rating'] = (ratings_df['rating'] >= 4.0).astype(int)

In [83]:

# Create mappings for user and movie IDs
user_ids = ratings_df['userId'].unique()
movie_ids = ratings_df['movieId'].unique()

user_id_map = {id_: i for i, id_ in enumerate(user_ids)}
movie_id_map = {id_: i for i, id_ in enumerate(movie_ids)}

# Convert ratings to matrix format
row_ind = [user_id_map[x] for x in ratings_df['userId']]
col_ind = [movie_id_map[x] for x in ratings_df['movieId']]
ratings = ratings_df['rating'].values

# Create sparse matrix
sparse_matrix = csr_matrix(
    (ratings, (row_ind, col_ind)),
    shape=(len(user_ids), len(movie_ids))
)

print("Sparse matrix shape:", sparse_matrix.shape)
print("Matrix density: {:.4f}%".format(
    100 * sparse_matrix.nnz / (sparse_matrix.shape[0] * sparse_matrix.shape[1])
))

Sparse matrix shape: (610, 9715)
Matrix density: 1.7013%


In [84]:
def leave_one_out_split(ratings_df):
    """
    Perform a leave-one-out split: for each user, hold out 1 interaction for testing.
    """
    train_list = []
    test_list = []
    
    grouped = ratings_df.groupby('userId')
    
    for user_id, group in grouped:
        if len(group) < 2:
            # If user has only 1 interaction, keep it in training
            train_list.append(group)
        else:
            # Randomly sample one for test
            test_sample = group.sample(n=1, random_state=42)
            train_sample = group.drop(test_sample.index)
            
            test_list.append(test_sample)
            train_list.append(train_sample)
    
    ratings_train = pd.concat(train_list)
    ratings_test = pd.concat(test_list)
    
    return ratings_train, ratings_test

# Split the ratings data into train and test
ratings_train, ratings_test = leave_one_out_split(ratings_df)

# Create train sparse matrix
train_row_ind = [user_id_map[x] for x in ratings_train['userId']]
train_col_ind = [movie_id_map[x] for x in ratings_train['movieId']]
train_ratings = ratings_train['rating'].values

train_matrix = csr_matrix(
    (train_ratings, (train_row_ind, train_col_ind)),
    shape=(len(user_ids), len(movie_ids))
)

# Create test sparse matrix
test_row_ind = [user_id_map[x] for x in ratings_test['userId']]
test_col_ind = [movie_id_map[x] for x in ratings_test['movieId']]
test_ratings = ratings_test['rating'].values

test_matrix = csr_matrix(
    (test_ratings, (test_row_ind, test_col_ind)),
    shape=(len(user_ids), len(movie_ids))
)

print("Training matrix shape:", train_matrix.shape)
print("Testing matrix shape:", test_matrix.shape)

Training matrix shape: (610, 9715)
Testing matrix shape: (610, 9715)


In [85]:
def create_movie_features():
    # Get unique genres
    all_genres = set()
    for genres in movies_df['genres'].str.split('|'):
        all_genres.update(genres)
    
    # Create a mapping of genre to index
    genre_to_idx = {genre: idx for idx, genre in enumerate(all_genres)}
    
    # Create a feature matrix for movies
    movie_features = []
    for _, row in movies_df.iterrows():
        movie_id = row['movieId']
        # Get the TMDB ID for this movie
        tmdb_id = links_df[links_df['movieId'] == movie_id]['tmdbId'].values
        if len(tmdb_id) > 0:
            tmdb_id = tmdb_id[0]
            # Create a feature vector for this movie
            feature_vector = np.zeros(len(genre_to_idx))
            for genre in row['genres'].split('|'):
                if genre in genre_to_idx:
                    feature_vector[genre_to_idx[genre]] = 1
            movie_features.append((tmdb_id, feature_vector))
    
    return movie_features, genre_to_idx

In [86]:
# Create the feature matrices
movie_features, genre_to_idx = create_movie_features()

# Convert to the format LightFM expects
item_features = []
item_feature_map = {}
for tmdb_id, features in movie_features:
    if tmdb_id in movie_id_map:
        item_idx = movie_id_map[tmdb_id]
        item_features.append(features)
        item_feature_map[item_idx] = len(item_features) - 1

# Convert to sparse matrix
item_features = csr_matrix(np.array(item_features))

In [87]:
def save_model(model, user_id_map, movie_id_map, user_ids, movie_ids, item_features, genre_to_idx, train_matrix, test_matrix, save_dir='models'):
    """
    Save the model and all associated data
    
    Parameters:
    model: The trained LightFM model
    user_id_map: Dictionary mapping user IDs to indices
    movie_id_map: Dictionary mapping movie IDs to indices
    user_ids: Array of user IDs
    movie_ids: Array of movie IDs
    item_features: Sparse matrix of item features
    genre_to_idx: Dictionary mapping genres to indices
    train_matrix: Sparse matrix of training data
    test_matrix: Sparse matrix of test data
    save_dir: Directory to save the model files
    """
    # Create directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    
    # Save the model and mappings
    model_data = {
        'model': model,
        'user_id_map': user_id_map,
        'movie_id_map': movie_id_map,
        'user_ids': user_ids,
        'movie_ids': movie_ids,
        'genre_to_idx': genre_to_idx
    }
    
    # Save to file
    with open(os.path.join(save_dir, 'lightfm_model.pkl'), 'wb') as f:
        pickle.dump(model_data, f)
    
    # Save the sparse matrices separately
    save_npz(os.path.join(save_dir, 'train_matrix.npz'), train_matrix)
    save_npz(os.path.join(save_dir, 'test_matrix.npz'), test_matrix)
    save_npz(os.path.join(save_dir, 'item_features.npz'), item_features)
    
    # Print file sizes
    model_size = os.path.getsize(os.path.join(save_dir, 'lightfm_model.pkl')) / (1024 * 1024)  # Convert to MB
    train_matrix_size = os.path.getsize(os.path.join(save_dir, 'train_matrix.npz')) / (1024 * 1024)
    test_matrix_size = os.path.getsize(os.path.join(save_dir, 'test_matrix.npz')) / (1024 * 1024)
    item_features_size = os.path.getsize(os.path.join(save_dir, 'item_features.npz')) / (1024 * 1024)
    
    print("Model and associated data saved successfully!")

def load_model(load_dir='models'):
    """
    Load the saved model and associated data
    
    Parameters:
    load_dir: Directory where the model files are saved
    """
    # Load the sparse matrices
    train_matrix = load_npz(os.path.join(load_dir, 'train_matrix.npz'))
    test_matrix = load_npz(os.path.join(load_dir, 'test_matrix.npz'))
    item_features = load_npz(os.path.join(load_dir, 'item_features.npz'))
    
    # Load the model and mappings
    with open(os.path.join(load_dir, 'lightfm_model.pkl'), 'rb') as f:
        model_data = pickle.load(f)
    
    return (
        model_data['model'],
        model_data['user_id_map'],
        model_data['movie_id_map'],
        model_data['user_ids'],
        model_data['movie_ids'],
        item_features,
        model_data['genre_to_idx'],
        train_matrix,
        test_matrix
    )

# Example usage:
model, user_id_map, movie_id_map, user_ids, movie_ids, item_features, genre_to_idx, train_matrix, test_matrix = load_model()

In [88]:
# Initialize the model
model = LightFM(no_components=30,
                loss='warp',
                learning_rate=0.05,
                item_alpha=1e-6,
                user_alpha=1e-6)

# Fit the model with item features
model.fit(train_matrix,
          item_features=item_features,
          epochs=30,
          num_threads=8,
          verbose=True)

save_model(
    model=model,
    user_id_map=user_id_map,
    movie_id_map=movie_id_map,
    user_ids=user_ids,
    movie_ids=movie_ids,
    item_features=item_features,
    genre_to_idx=genre_to_idx,
    train_matrix=train_matrix,
    test_matrix=test_matrix
)

Epoch: 100%|██████████| 50/50 [39:55<00:00, 47.92s/it]


Model and associated data saved successfully!


In [89]:
def get_movie_title(tmdb_id):
    try:
        movielens_id = links_df[links_df['tmdbId'] == tmdb_id]['movieId'].values[0]
        return movies_df[movies_df['movieId'] == movielens_id]['title'].values[0]
    except:
        return f"Movie with ID {tmdb_id} (title not found)"

def get_recommendations(user_id, n_items=10):
    """
    Get recommendations for a user
    
    Parameters:
    user_id: The user ID to get recommendations for
    n_items: Number of items to recommend
    """
    # Get the internal user index
    user_idx = user_id_map[user_id]
    
    # Get scores for all items
    scores = model.predict(user_idx, 
                         np.arange(len(movie_ids)),
                         item_features=item_features)
    
    # Get the top N items
    top_items = np.argsort(-scores)[:n_items]
    
    # Convert back to original TMDB IDs and verify they exist in our dataset
    top_tmdb_ids = []
    for idx in top_items:
        tmdb_id = list(movie_id_map.keys())[list(movie_id_map.values()).index(idx)]
        # Verify the movie exists in our dataset
        if tmdb_id in links_df['tmdbId'].values:
            top_tmdb_ids.append(tmdb_id)
            if len(top_tmdb_ids) >= n_items:
                break
    
    return top_tmdb_ids

In [90]:
# Example usage:
user_id = 1  # Replace with actual user ID
recommendations = get_recommendations(user_id)
print(f"Top 10 recommendations for user {user_id}:")
for tmdb_id in recommendations:
    try:
        movie_title = get_movie_title(tmdb_id)
        print(f"- {movie_title}")
    except:
        print(f"- Movie with TMDB ID {tmdb_id} (title not found)")

Top 10 recommendations for user 1:
- Blue Juice (1995)


In [91]:
def handle_new_user(user_ratings, n_recommendations=10):
    """
    Handle a new user by creating a temporary profile and making recommendations
    
    Parameters:
    user_ratings: list of tuples (movie_id, rating) where movie_id is the TMDB ID
    n_recommendations: number of recommendations to return
    """
    # Create a new user ID (one more than the maximum existing user ID)
    new_user_id = max(user_id_map.keys()) + 1
    
    # Create a temporary sparse matrix for the new user
    new_user_matrix = csr_matrix((1, len(movie_ids)))
    
    # Add the user's ratings to the matrix
    for movie_id, rating in user_ratings:
        if movie_id in movie_id_map:
            movie_idx = movie_id_map[movie_id]
            new_user_matrix[0, movie_idx] = rating
    
    # Get the model's predictions for the new user
    scores = model.predict(0, 
                         np.arange(len(movie_ids)),
                         item_features=item_features)
    
    # Get the top N items that the user hasn't rated
    rated_movies = set(movie_id for movie_id, _ in user_ratings)
    top_items = []
    for idx in np.argsort(-scores):
        movie_id = list(movie_id_map.keys())[list(movie_id_map.values()).index(idx)]
        # Verify the movie exists in our dataset and hasn't been rated
        if movie_id in links_df['tmdbId'].values and movie_id not in rated_movies:
            top_items.append(movie_id)
            if len(top_items) >= n_recommendations:
                break
    return top_items

# Example usage:
new_user_ratings = [
    (862, 5.0),  # Toy Story
    (863, 4.5),  # Toy Story 2
    (10193, 5.0),  # Toy Story 3
    (408, 5.0), # Snow White and the Seven Dwarfs
    (38757, 5.0), 
    (37135, 5.0),
]

print("New user's initial ratings:")
for movie_id, rating in new_user_ratings:
    print(f"- {get_movie_title(movie_id)}: {rating}")

print("\nRecommended movies for new user:")
recommendations = handle_new_user(new_user_ratings)
for movie_id in recommendations:
    print(f"- {get_movie_title(movie_id)}")



New user's initial ratings:
- Toy Story (1995): 5.0
- Toy Story 2 (1999): 4.5
- Toy Story 3 (2010): 5.0
- Snow White and the Seven Dwarfs (1937): 5.0
- Tangled (2010): 5.0
- Tarzan (1999): 5.0

Recommended movies for new user:
- Blue Juice (1995)
- Happy Feet Two (2011)
- Match Factory Girl, The (Tulitikkutehtaan tyttö) (1990)
- Apartment, The (1960)
- Deepwater Horizon (2016)
- George Carlin: It's Bad for Ya! (2008)
- Walk in the Clouds, A (1995)
- Promise, The (La promesse) (1996)
- In Too Deep (1999)
- My Voyage to Italy (Il mio viaggio in Italia) (1999)


In [92]:
def get_content_based_recommendations(genre_preferences, n_recommendations=10):
    """
    Get recommendations based on genre preferences for users with no ratings
    
    Parameters:
    genre_preferences: list of preferred genres
    n_recommendations: number of recommendations to return
    """
    # Create a feature vector for the genre preferences
    feature_vector = np.zeros(len(genre_to_idx))
    for genre in genre_preferences:
        if genre in genre_to_idx:
            feature_vector[genre_to_idx[genre]] = 1
    
    # Convert to sparse matrix
    user_features = csr_matrix(feature_vector.reshape(1, -1))
    
    # Get scores for all items
    scores = model.predict(0, 
                         np.arange(len(movie_ids)),
                         item_features=item_features,
                         user_features=user_features)
    
    # Get the top N items
    top_items = np.argsort(-scores)[:n_recommendations]
    
    # Convert back to original TMDB IDs and verify they exist in our dataset
    top_tmdb_ids = []
    for idx in top_items:
        tmdb_id = list(movie_id_map.keys())[list(movie_id_map.values()).index(idx)]
        # Verify the movie exists in our dataset
        if tmdb_id in links_df['tmdbId'].values:
            top_tmdb_ids.append(tmdb_id)
            if len(top_tmdb_ids) >= n_recommendations:
                break
    
    return top_tmdb_ids

# Example usage for content-based recommendations:
print("\nContent-based recommendations for genre preferences:")
genre_preferences = ['Animation', 'Children']
content_recommendations = get_content_based_recommendations(genre_preferences)
for movie_id in content_recommendations:
    print(f"- {get_movie_title(movie_id)}")


Content-based recommendations for genre preferences:
- City Hunter (Sing si lip yan) (1993)
- Gilda (1946)
- Taking of Pelham One Two Three, The (1974)
- Grudge, The (2004)


In [93]:
# Load the model and all associated data
model, user_id_map, movie_id_map, user_ids, movie_ids, item_features, genre_to_idx, train_matrix, test_matrix = load_model()

In [95]:
# Calculate precision@k and AUC for the test set using a sample of users
nonzero_users = test_matrix.getnnz(axis=1).nonzero()[0]
sample_users = np.random.choice(nonzero_users, size=100, replace=False)

# Create a sparse matrix for user features (identity matrix for users)
user_features = identity(train_matrix.shape[0], format='csr')

train_precision = precision_at_k(model, 
                               train_matrix[sample_users], 
                               k=10,
                               item_features=item_features,
                               user_features=user_features[sample_users]).mean()
print('Train precision: %.2f' % train_precision)

test_precision = precision_at_k(model, 
                              test_matrix[sample_users], 
                              k=10,
                              item_features=item_features,
                              user_features=user_features[sample_users]).mean()
print('Test precision: %.2f' % test_precision)

train_auc = auc_score(model, 
                     train_matrix[sample_users],
                     item_features=item_features,
                     user_features=user_features[sample_users]).mean()
print('Train AUC: %.2f' % train_auc)

test_auc = auc_score(model, 
                    test_matrix[sample_users],
                    item_features=item_features,
                    user_features=user_features[sample_users]).mean()
print('Test AUC: %.2f' % test_auc)

Train precision: 0.00
Test precision: 0.00
Train AUC: 0.70
Test AUC: 0.58
