In [16]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from lightfm import LightFM
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from lightfm.evaluation import precision_at_k, auc_score
import pickle
import os
from scipy.sparse import save_npz, load_npz, identity

In [17]:
# Data from MovieLens 32M dataset https://grouplens.org/datasets/movielens/32m/
ratings_df = pd.read_csv('data/ratings.csv')
links_df = pd.read_csv('data/links.csv')
movies_df = pd.read_csv('data/movies.csv')

In [18]:
# # Convert ratings to use TMDB IDs
# ratings_df = ratings_df.merge(links_df[['movieId', 'tmdbId']], on='movieId', how='inner')

# # Rename columns to match the expected format
# ratings_df = ratings_df.drop(columns=['movieId', 'timestamp'], axis=1)
# ratings_df = ratings_df.rename(columns={'tmdbId': 'movieId'})

# Remove any rows with NaN values
ratings_df = ratings_df.dropna(subset=['userId', 'movieId', 'rating'])

# Ensure userId and movieId are integers
ratings_df['userId'] = ratings_df['userId'].astype(int)
ratings_df['movieId'] = ratings_df['movieId'].astype(int)

In [19]:
# Create mappings for user and movie IDs
user_ids = ratings_df['userId'].unique()
movie_ids = ratings_df['movieId'].unique()

user_id_map = {id_: i for i, id_ in enumerate(user_ids)}
movie_id_map = {id_: i for i, id_ in enumerate(movie_ids)}

# Convert ratings to matrix format
row_ind = [user_id_map[x] for x in ratings_df['userId']]
col_ind = [movie_id_map[x] for x in ratings_df['movieId']]
ratings = ratings_df['rating'].values

# Create sparse matrix
sparse_matrix = csr_matrix(
    (ratings, (row_ind, col_ind)),
    shape=(len(user_ids), len(movie_ids))
)

print("Sparse matrix shape:", sparse_matrix.shape)
print("Matrix density: {:.4f}%".format(
    100 * sparse_matrix.nnz / (sparse_matrix.shape[0] * sparse_matrix.shape[1])
))

Sparse matrix shape: (200948, 84275)
Matrix density: 0.1889%


In [20]:
# Split the ratings data into train and test
ratings_train, ratings_test = train_test_split(
    ratings_df,
    test_size=0.2,
    random_state=42
)

# Create train sparse matrix
train_row_ind = [user_id_map[x] for x in ratings_train['userId']]
train_col_ind = [movie_id_map[x] for x in ratings_train['movieId']]
train_ratings = ratings_train['rating'].values

train_matrix = csr_matrix(
    (train_ratings, (train_row_ind, train_col_ind)),
    shape=(len(user_ids), len(movie_ids))
)

# Create test sparse matrix
test_row_ind = [user_id_map[x] for x in ratings_test['userId']]
test_col_ind = [movie_id_map[x] for x in ratings_test['movieId']]
test_ratings = ratings_test['rating'].values

test_matrix = csr_matrix(
    (test_ratings, (test_row_ind, test_col_ind)),
    shape=(len(user_ids), len(movie_ids))
)

print("Training matrix shape:", train_matrix.shape)
print("Testing matrix shape:", test_matrix.shape)

Training matrix shape: (200948, 84275)
Testing matrix shape: (200948, 84275)


In [21]:
def create_movie_features():
    # Get unique genres
    all_genres = set()
    for genres in movies_df['genres'].str.split('|'):
        all_genres.update(genres)
    
    # Create a mapping of genre to index
    genre_to_idx = {genre: idx for idx, genre in enumerate(all_genres)}
    
    # Create a feature matrix for movies
    movie_features = []
    for _, row in movies_df.iterrows():
        movie_id = row['movieId']
        # Get the TMDB ID for this movie
        tmdb_id = links_df[links_df['movieId'] == movie_id]['tmdbId'].values
        if len(tmdb_id) > 0:
            tmdb_id = tmdb_id[0]
            # Create a feature vector for this movie
            feature_vector = np.zeros(len(genre_to_idx))
            for genre in row['genres'].split('|'):
                if genre in genre_to_idx:
                    feature_vector[genre_to_idx[genre]] = 1
            movie_features.append((tmdb_id, feature_vector))
    
    return movie_features, genre_to_idx

In [22]:
# Create the feature matrices
movie_features, genre_to_idx = create_movie_features()

# Convert to the format LightFM expects
item_features = []
item_feature_map = {}
for tmdb_id, features in movie_features:
    if tmdb_id in movie_id_map:
        item_idx = movie_id_map[tmdb_id]
        item_features.append(features)
        item_feature_map[item_idx] = len(item_features) - 1

# Convert to sparse matrix
item_features = csr_matrix(np.array(item_features))

In [23]:
def save_model(model, user_id_map, movie_id_map, user_ids, movie_ids, item_features, genre_to_idx, train_matrix, test_matrix, save_dir='models'):
    """
    Save the model and all associated data
    
    Parameters:
    model: The trained LightFM model
    user_id_map: Dictionary mapping user IDs to indices
    movie_id_map: Dictionary mapping movie IDs to indices
    user_ids: Array of user IDs
    movie_ids: Array of movie IDs
    item_features: Sparse matrix of item features
    genre_to_idx: Dictionary mapping genres to indices
    train_matrix: Sparse matrix of training data
    test_matrix: Sparse matrix of test data
    save_dir: Directory to save the model files
    """
    # Create directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    
    # Save the model and mappings
    model_data = {
        'model': model,
        'user_id_map': user_id_map,
        'movie_id_map': movie_id_map,
        'user_ids': user_ids,
        'movie_ids': movie_ids,
        'genre_to_idx': genre_to_idx
    }
    
    # Save to file
    with open(os.path.join(save_dir, 'lightfm_model.pkl'), 'wb') as f:
        pickle.dump(model_data, f)
    
    # Save the sparse matrices separately
    save_npz(os.path.join(save_dir, 'train_matrix.npz'), train_matrix)
    save_npz(os.path.join(save_dir, 'test_matrix.npz'), test_matrix)
    save_npz(os.path.join(save_dir, 'item_features.npz'), item_features)
    
    # Print file sizes
    model_size = os.path.getsize(os.path.join(save_dir, 'lightfm_model.pkl')) / (1024 * 1024)  # Convert to MB
    train_matrix_size = os.path.getsize(os.path.join(save_dir, 'train_matrix.npz')) / (1024 * 1024)
    test_matrix_size = os.path.getsize(os.path.join(save_dir, 'test_matrix.npz')) / (1024 * 1024)
    item_features_size = os.path.getsize(os.path.join(save_dir, 'item_features.npz')) / (1024 * 1024)
    
    print("Model and associated data saved successfully!")

def load_model(load_dir='models'):
    """
    Load the saved model and associated data
    
    Parameters:
    load_dir: Directory where the model files are saved
    """
    # Load the sparse matrices
    train_matrix = load_npz(os.path.join(load_dir, 'train_matrix.npz'))
    test_matrix = load_npz(os.path.join(load_dir, 'test_matrix.npz'))
    item_features = load_npz(os.path.join(load_dir, 'item_features.npz'))
    
    # Load the model and mappings
    with open(os.path.join(load_dir, 'lightfm_model.pkl'), 'rb') as f:
        model_data = pickle.load(f)
    
    return (
        model_data['model'],
        model_data['user_id_map'],
        model_data['movie_id_map'],
        model_data['user_ids'],
        model_data['movie_ids'],
        item_features,
        model_data['genre_to_idx'],
        train_matrix,
        test_matrix
    )

# Example usage:
model, user_id_map, movie_id_map, user_ids, movie_ids, item_features, genre_to_idx, train_matrix, test_matrix = load_model()

In [24]:
# Initialize the model
model = LightFM(loss='bpr',
                learning_schedule='adagrad',
                learning_rate=0.1)

# Fit the model with item features
model.fit(train_matrix,
          item_features=item_features,
          epochs=50,
          num_threads=4,
          verbose=True)

save_model(
    model=model,
    user_id_map=user_id_map,
    movie_id_map=movie_id_map,
    user_ids=user_ids,
    movie_ids=movie_ids,
    item_features=item_features,
    genre_to_idx=genre_to_idx,
    train_matrix=train_matrix,
    test_matrix=test_matrix
)

Epoch: 100%|██████████| 50/50 [49:42<00:00, 59.64s/it]   


Model and associated data saved successfully!


In [25]:
def get_movie_title(movie_id):
    try:
        return movies_df[movies_df['movieId'] == movie_id]['title'].values[0]
    except:
        return f"Movie with ID {movie_id} (title not found)"

def get_recommendations(user_id, n_items=10):
    """
    Get recommendations for a user
    
    Parameters:
    user_id: The user ID to get recommendations for
    n_items: Number of items to recommend
    """
    # Get the internal user index
    user_idx = user_id_map[user_id]
    
    # Get scores for all items
    scores = model.predict(user_idx, 
                         np.arange(len(movie_ids)),
                         item_features=item_features)  # Add item_features here
    
    # Get the top N items
    top_items = np.argsort(-scores)[:n_items]
    
    # Convert back to original TMDB IDs
    top_tmdb_ids = [list(movie_id_map.keys())[list(movie_id_map.values()).index(idx)] for idx in top_items]
    
    return top_tmdb_ids

In [26]:
# Example usage:
user_id = 1  # Replace with actual user ID
recommendations = get_recommendations(user_id)
print(f"Top 10 recommendations for user {user_id}:")
for tmdb_id in recommendations:
    try:
        movie_title = get_movie_title(tmdb_id)
        print(f"- {movie_title}")
    except:
        print(f"- Movie with TMDB ID {tmdb_id} (title not found)")

Top 10 recommendations for user 1:
- Sound of Sun (2017)
- Midnight Limited (1940)
- The Blazing World (2021)
- Perry Mason: The Case of the Murdered Madam (1987)
- My Tomorrow (2011)
- 20,000 Leagues Under the Sea (1916)
- Between Something & Nothing (2008)
- Cage II: The Arena of Death (1994)
- Quest for Fire (Guerre du feu, La) (1981)
- Bad Hair (2013)


In [27]:
def handle_new_user(user_ratings, n_recommendations=10):
    """
    Handle a new user by creating a temporary profile and making recommendations
    
    Parameters:
    user_ratings: list of tuples (movie_id, rating) where movie_id is the TMDB ID
    n_recommendations: number of recommendations to return
    """
    # Create a new user ID (one more than the maximum existing user ID)
    new_user_id = max(user_id_map.keys()) + 1
    
    # Create a temporary sparse matrix for the new user
    new_user_matrix = csr_matrix((1, len(movie_ids)))
    
    # Add the user's ratings to the matrix
    for movie_id, rating in user_ratings:
        if movie_id in movie_id_map:
            movie_idx = movie_id_map[movie_id]
            new_user_matrix[0, movie_idx] = rating
    
    # Get the model's predictions for the new user
    scores = model.predict(0, 
                         np.arange(len(movie_ids)),
                         item_features=item_features)  # Add item_features here
    
    # Get the top N items that the user hasn't rated
    rated_movies = set(movie_id for movie_id, _ in user_ratings)
    top_items = []
    for idx in np.argsort(-scores):
        movie_id = list(movie_id_map.keys())[list(movie_id_map.values()).index(idx)]
        if movie_id not in rated_movies:
            top_items.append(movie_id)
            if len(top_items) >= n_recommendations:
                break
    
    return top_items

# Example usage:
new_user_ratings = [
    (862, 5.0),  # Toy Story
    (863, 4.5),  # Toy Story 2
    (10193, 5.0),  # Toy Story 3
]

print("New user's initial ratings:")
for movie_id, rating in new_user_ratings:
    print(f"- {get_movie_title(movie_id)}: {rating}")

print("\nRecommended movies for new user:")
recommendations = handle_new_user(new_user_ratings)
for movie_id in recommendations:
    print(f"- {get_movie_title(movie_id)}")



New user's initial ratings:
- Toy Story (1995): 5.0
- Toy Story 2 (1999): 4.5
- Toy Story 3 (2010): 5.0

Recommended movies for new user:
- Sound of Sun (2017)
- Midnight Limited (1940)
- The Blazing World (2021)
- Perry Mason: The Case of the Murdered Madam (1987)
- My Tomorrow (2011)
- 20,000 Leagues Under the Sea (1916)
- Between Something & Nothing (2008)
- Cage II: The Arena of Death (1994)
- Quest for Fire (Guerre du feu, La) (1981)
- Bad Hair (2013)


In [28]:
# Alternative approach: Content-based recommendations for completely new users
def get_content_based_recommendations(genre_preferences, n_recommendations=10):
    """
    Get recommendations based on genre preferences for users with no ratings
    
    Parameters:
    genre_preferences: list of preferred genres
    n_recommendations: number of recommendations to return
    """
    # Create a feature vector for the genre preferences
    feature_vector = np.zeros(len(genre_to_idx))
    for genre in genre_preferences:
        if genre in genre_to_idx:
            feature_vector[genre_to_idx[genre]] = 1
    
    # Convert to sparse matrix
    user_features = csr_matrix(feature_vector.reshape(1, -1))
    
    # Get scores for all items
    scores = model.predict(0, 
                         np.arange(len(movie_ids)),
                         item_features=item_features,
                         user_features=user_features)
    
    # Get the top N items
    top_items = np.argsort(-scores)[:n_recommendations]
    
    # Convert back to original TMDB IDs
    top_tmdb_ids = [list(movie_id_map.keys())[list(movie_id_map.values()).index(idx)] for idx in top_items]
    
    return top_tmdb_ids

# Example usage for content-based recommendations:
print("\nContent-based recommendations for genre preferences:")
genre_preferences = ['Action', 'Adventure']
content_recommendations = get_content_based_recommendations(genre_preferences)
for movie_id in content_recommendations:
    print(f"- {get_movie_title(movie_id)}")


Content-based recommendations for genre preferences:
- Middle of the World, The (O Caminho das Nuvens) (2003)
- The Final Days (1989)
- Warren Ellis: Captured Ghosts (2011)
- The Gatling Gun (1971)
- The Bare Wench Project 2: Scared Topless (2001)
- Deadly Hero (1976)
- Soloalbum (2003)
- Geri's Game (1997)
- King for a Day (1983)
- To Sir, with Love II (1996)


In [29]:
# Load the model and all associated data
model, user_id_map, movie_id_map, user_ids, movie_ids, item_features, genre_to_idx, train_matrix, test_matrix = load_model()

In [30]:
# Calculate precision@k and AUC for the test set using a sample of users
nonzero_users = test_matrix.getnnz(axis=1).nonzero()[0]
sample_users = np.random.choice(nonzero_users, size=100, replace=False)

# Create a sparse matrix for user features (identity matrix for users)
user_features = identity(train_matrix.shape[0], format='csr')

train_precision = precision_at_k(model, 
                               train_matrix[sample_users], 
                               k=50,
                               item_features=item_features,
                               user_features=user_features[sample_users]).mean()
print('Train precision: %.2f' % train_precision)

test_precision = precision_at_k(model, 
                              test_matrix[sample_users], 
                              k=50,
                              item_features=item_features,
                              user_features=user_features[sample_users]).mean()
print('Test precision: %.2f' % test_precision)

train_auc = auc_score(model, 
                     train_matrix[sample_users],
                     item_features=item_features,
                     user_features=user_features[sample_users]).mean()
print('Train AUC: %.2f' % train_auc)

test_auc = auc_score(model, 
                    test_matrix[sample_users],
                    item_features=item_features,
                    user_features=user_features[sample_users]).mean()
print('Test AUC: %.2f' % test_auc)

Train precision: 0.00
Test precision: 0.00
Train AUC: 0.62
Test AUC: 0.54
