# Importing libraries


In [1]:
import pandas as pd
import cornac
from cornac.data import Dataset
from cornac.models import SVD
from cornac.eval_methods import RatioSplit
from cornac.metrics import RMSE, MAE, Precision, Recall


# Loading the data


In [2]:
# Loading the data
movies_path = 'output_data/movie_user_IQR.csv'
ratings_path = 'output_data/rating_user_IQR.csv'

# Load the data into DataFrames
ratings_df = pd.read_csv(ratings_path)
movies_df = pd.read_csv(movies_path)

# Display the first few rows of each DataFrame to verify correct loading
print("Ratings DataFrame:")
print(ratings_df.head())

print("Movies DataFrame:")
print(movies_df.head())


Ratings DataFrame:
   userId  movieId  rating
0       1        2     3.5
1       1       29     3.5
2       1       32     3.5
3       1       47     3.5
4       1       50     3.5
Movies DataFrame:
   movieId                        title  \
0        1                    Toy Story   
1        2                      Jumanji   
2        3             Grumpier Old Men   
3        4            Waiting to Exhale   
4        5  Father of the Bride Part II   

                                        genres    year  Action  Adventure  \
0  Adventure|Animation|Children|Comedy|Fantasy  1995.0       0          1   
1                   Adventure|Children|Fantasy  1995.0       0          1   
2                               Comedy|Romance  1995.0       0          0   
3                         Comedy|Drama|Romance  1995.0       0          0   
4                                       Comedy  1995.0       0          0   

   Animation  Children  Comedy  Crime  ...  Film-Noir  Horror  IMAX  Musical  \

In [6]:
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
...,...,...,...
9962331,138493,68954,4.5
9962332,138493,69526,4.5
9962333,138493,69644,3.0
9962334,138493,70286,5.0


# Preparing the data for Cornac

In [3]:
# Prepare the data for Cornac
# Convert the DataFrame to a list of tuples, excluding the index and without naming the columns
data = list(ratings_df[['userId', 'movieId', 'rating']].itertuples(index=False, name=None))

# Create a RatioSplit object to divide the data into training and testing sets
splitter = RatioSplit(data, test_size=0.2, seed=42)

# Extract the training and testing sets from the RatioSplit object
train_set, test_set = splitter.train_set, splitter.test_set



# Training model

In [16]:
# Initialize the SVD model
# Set the number of latent factors to 50, with 10 iterations, a learning rate of 0.001, and regularization parameter of 0.01
svd = SVD(k=50, max_iter=10, learning_rate=0.001, lambda_reg=0.01)

# Train the model using the training data
svd.fit(train_set)


<cornac.models.svd.recom_svd.SVD at 0x21085c896d0>

# Evaluate the model

In [17]:
# Define evaluation metrics
metrics = [RMSE(), MAE(), Precision(k=10), Recall(k=10)]

# Create a RatioSplit object with the evaluation metrics
# RatioSplit splits the data into training and test sets based on the given test_size and seed
# The metrics parameter is used to specify which metrics to use for evaluating the model
eval_method = RatioSplit(data, test_size=0.2, seed=42, metrics=metrics)

# Evaluate the model using Cornac
# The Experiment class runs the evaluation using the specified evaluation method and metrics
cornac.Experiment(eval_method=eval_method, models=[svd], metrics=metrics).run()



TEST:
...
    |    MAE |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
--- + ------ + ------ + ------------ + --------- + --------- + --------
SVD | 0.5802 | 0.7389 |       0.1024 |    0.0219 |   86.5048 | 117.7853



# Optimization of the model

## Optimization of the k hyperparameters

In [None]:
# List of values for max_iter
k = [100, 150, 300, 500]

for k in k:
    # Initialize the SVD model
    # Set the number of latent factors to 50, max_iter to the current value in the loop,
    # learning rate to 0.001, and regularization parameter to 0.01
    svd = SVD(k=k, max_iter=10, learning_rate=0.001, lambda_reg=0.01)

    # Train the model with the training data
    svd.fit(train_set)

    # Define evaluation metrics
    metrics = [RMSE(), MAE(), Precision(k=10), Recall(k=10)]

    # Create a RatioSplit object with the evaluation metrics
    # RatioSplit splits the data into training and testing sets, based on the specified test_size and seed
    eval_method = RatioSplit(data, test_size=0.2, seed=42, metrics=metrics)

    # Print the current max_iter value being evaluated
    print(f"Results for max_iter={k}:")

    # Evaluate the model using Cornac
    # The Experiment class runs the evaluation using the specified eval_method and metrics
    cornac.Experiment(eval_method=eval_method, models=[svd], metrics=metrics).run()

    # Print a separator to distinguish between results for different max_iter values
    print(f"Results for max_iter={k} completed.")

The optimal value is obtain at k = 300

## Optimization of the max_iter parameters

In [None]:
# List of values for max_iter
max_iter_list = [23, 24, 25, 26, 27, 28]

for max_iter in max_iter_list:
    # Initialize the SVD model
    # Set the number of latent factors to 300, max_iter to the current value in the loop,
    # learning rate to 0.003, and regularization parameter to 0.02
    svd = SVD(k=300, max_iter=max_iter, learning_rate=0.003, lambda_reg=0.02)

    # Train the model using the training data
    svd.fit(train_set)

    # Define evaluation metrics
    metrics = [RMSE(), MAE(), Precision(k=10), Recall(k=10)]

    # Create a RatioSplit object with the evaluation metrics
    # RatioSplit splits the data into training and testing sets based on the given test_size and seed
    eval_method = RatioSplit(data, test_size=0.2, seed=42, metrics=metrics)
    
    # Print the current max_iter value being evaluated
    print(f"Results for max_iter={max_iter}:")

    # Evaluate the model using Cornac
    # The Experiment class runs the evaluation with the specified eval_method and metrics
    cornac.Experiment(eval_method=eval_method, models=[svd], metrics=metrics).run()

    # Print a separator to distinguish between results for different max_iter values
    print(f"Results for max_iter={max_iter} completed.")


The optimal value is obtain at max_iter = 23

## Optimization of the learning_rate hyperparameters

In [26]:
# List of values for learning_rate
learning_rate_list = [0.0025, 0.0026, 0.0027, 0.0028, 0.0029]

# Iterate over each learning rate value
for learning_rate in learning_rate_list:
    # Initialize the SVD model
    # Set the number of latent factors to 300, max_iter to a fixed value (e.g., 23),
    # learning rate to the current value in the loop, and regularization parameter to 0.02
    svd = SVD(k=300, max_iter=23, learning_rate=learning_rate, lambda_reg=0.02)

    # Train the model using the training data
    svd.fit(train_set)

    # Define evaluation metrics
    metrics = [RMSE(), MAE(), Precision(k=10), Recall(k=10)]

    # Create a RatioSplit object with the evaluation metrics
    # RatioSplit splits the data into training and testing sets based on the given test_size and seed
    eval_method = RatioSplit(data, test_size=0.2, seed=42, metrics=metrics)

    # Print the current learning rate being evaluated
    print(f"Results for learning_rate={learning_rate}:")

    # Evaluate the model using Cornac
    # The Experiment class runs the evaluation with the specified eval_method and metrics
    cornac.Experiment(eval_method=eval_method, models=[svd], metrics=metrics).run()

    # Print a separator to distinguish between results for different learning rates
    print(f"Evaluation completed for learning_rate={learning_rate}.")


Results for learning_rate=0.0025:





TEST:
...
    |    MAE |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
--- + ------ + ------ + ------------ + --------- + --------- + --------
SVD | 0.5861 | 0.7439 |       0.1267 |    0.0262 |   30.6509 | 102.0043

Evaluation completed for learning_rate=0.0025.
Results for learning_rate=0.0026:

TEST:
...
    |    MAE |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
--- + ------ + ------ + ------------ + --------- + --------- + --------
SVD | 0.5849 | 0.7425 |       0.1271 |    0.0264 |   29.3957 |  97.1177

Evaluation completed for learning_rate=0.0026.
Results for learning_rate=0.0027:

TEST:
...
    |    MAE |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
--- + ------ + ------ + ------------ + --------- + --------- + --------
SVD | 0.5837 | 0.7412 |       0.1249 |    0.0259 |   31.9990 | 100.9704

Evaluation completed for learning_rate=0.0027.
Results for learning_rate=0.0028:

TEST:
...
    |    MAE |   RMSE | Precision@10 | Recall@10 | Train (s) |

The optimal value is obtain at learning_rate = 0.0026

## Optimization of the lambda_reg

In [29]:
# List of values for lambda_reg
lambda_reg_list = [0.005, 0.006, 0.007, 0.008, 0.009]

# Iterate over each lambda_reg value
for lambda_reg in lambda_reg_list:
    # Initialize the SVD model
    # Set the number of latent factors to 150, max_iter to 25, learning rate to 0.001,
    # and lambda_reg to the current value in the loop
    svd = SVD(k=300, max_iter=23, learning_rate=0.0026, lambda_reg=lambda_reg)

    # Train the model using the training data
    svd.fit(train_set)

    # Define evaluation metrics
    metrics = [RMSE(), MAE(), Precision(k=10), Recall(k=10)]

    # Create a RatioSplit object with the evaluation metrics
    # RatioSplit splits the data into training and testing sets based on the given test_size and seed
    eval_method = RatioSplit(data, test_size=0.2, seed=42, metrics=metrics)

    # Print the current lambda_reg value being evaluated
    print(f"Results for lambda_reg={lambda_reg}:")

    # Evaluate the model using Cornac
    # The Experiment class runs the evaluation with the specified eval_method and metrics
    cornac.Experiment(eval_method=eval_method, models=[svd], metrics=metrics).run()

    # Print a separator to distinguish between results for different lambda_reg values
    print(f"Evaluation completed for lambda_reg={lambda_reg}.")

Results for lambda_reg=0.005:





TEST:
...
    |    MAE |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
--- + ------ + ------ + ------------ + --------- + --------- + --------
SVD | 0.5890 | 0.7514 |       0.1476 |    0.0307 |   27.6231 |  92.3708

Evaluation completed for lambda_reg=0.005.
Results for lambda_reg=0.006:

TEST:
...
    |    MAE |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
--- + ------ + ------ + ------------ + --------- + --------- + --------
SVD | 0.5858 | 0.7474 |       0.1495 |    0.0310 |   27.8618 |  90.4926

Evaluation completed for lambda_reg=0.006.
Results for lambda_reg=0.007:

TEST:
...
    |    MAE |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
--- + ------ + ------ + ------------ + --------- + --------- + --------
SVD | 0.5832 | 0.7438 |       0.1501 |    0.0312 |   26.3398 |  89.6741

Evaluation completed for lambda_reg=0.007.
Results for lambda_reg=0.008:

TEST:
...
    |    MAE |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
--- + ------ +

The optimal value is obtain at lambda_reg = 0.008

## Function for all optimization

This solution is quiet long to reach the best parameters

In [None]:
# List of value
k = [100, 150, 300, 500]
max_iter_list = [23, 24, 25, 26, 27, 28]
learning_rate_list = [0.0025, 0.0026, 0.0027, 0.0028, 0.0029]
lambda_reg_list = [0.005, 0.006, 0.007, 0.008, 0.009]

# Iterate over each 
for k in k:
    
    for max_iter in max_iter_list:
        
        for learning_rate in learning_rate_list:
            
            for lambda_reg in lambda_reg_list:
                # Initialize the SVD model
                svd = SVD(k=k, max_iter=max_iter, learning_rate=learning_rate, lambda_reg=lambda_reg)

                # Train the model using the training data
                svd.fit(train_set)

                # Define evaluation metrics
                metrics = [RMSE(), MAE(), Precision(k=10), Recall(k=10)]

                # Create a RatioSplit object with the evaluation metrics
                # RatioSplit splits the data into training and testing sets based on the given test_size and seed
                eval_method = RatioSplit(data, test_size=0.2, seed=42, metrics=metrics)

                # Print the current lambda_reg value being evaluated
                print(f"Results for k={k},  max_iter={max_iter}, learning_rate={learning_rate} & lambda_reg={lambda_reg}:")

                # Evaluate the model using Cornac
                # The Experiment class runs the evaluation with the specified eval_method and metrics
                cornac.Experiment(eval_method=eval_method, models=[svd], metrics=metrics).run()

                # Print a separator to distinguish between results for different lambda_reg values
                print(f"Evaluation completed for k={k},  max_iter={max_iter}, learning_rate={learning_rate} & lambda_reg={lambda_reg}.")


# Recommendation function

In [30]:
def recommend_top_n(user_id, n=10):
    """
    Recommend the top N movies for a given user based on the trained SVD model.

    Parameters:
    user_id (int): The ID of the user for whom recommendations are to be generated.
    n (int): The number of top recommendations to return (default is 10).

    Returns:
    Tuple of two DataFrames:
        - DataFrame of movies already seen by the user with ratings.
        - DataFrame of top N recommended movies with predicted scores.
    """

    # Check if the user is in the training data
    if user_id not in train_set.user_ids:
        raise ValueError(f"User {user_id} not found in the training data.")
    
    # Movies already seen by the user, sorted by rating
    user_seen = ratings_df[ratings_df['userId'] == user_id].sort_values(by='rating', ascending=False)
    top_n_seen = user_seen[['movieId', 'rating']].merge(movies_df[['movieId', 'title', 'genres']], on='movieId').head(n)
    
    # Get all unique movie IDs
    movie_ids = ratings_df['movieId'].unique()

    # Use the trained SVD model to get predicted scores for all movies for the given user
    scores = svd.score(user_id)

    # Associate the scores with movie IDs
    user_predictions = list(zip(movie_ids, scores))

    # Sort predictions by score in descending order and get the top N
    user_predictions.sort(key=lambda x: x[1], reverse=True)
    top_n_predictions = user_predictions[:n]

    # Extract the movie IDs for the top N recommendations
    top_n_movie_ids = [pred[0] for pred in top_n_predictions]

    # Retrieve the titles and genres of the recommended movies
    top_n_movies = movies_df[movies_df['movieId'].isin(top_n_movie_ids)][['movieId', 'title', 'genres']]
    top_n_movies = top_n_movies.merge(pd.DataFrame(top_n_predictions, columns=['movieId', 'score']), on='movieId')

    # Return two DataFrames: seen movies and recommended movies
    return top_n_seen[['title', 'genres', 'rating']], top_n_movies[['title', 'genres', 'score']].sort_values(by='score', ascending=False)


In [31]:
def display_recommendations(user_id, n=10):
    top_n_seen, top_n_recommendations = recommend_top_n(user_id, n)

    print(f"Top 10 films vus par l'utilisateur {user_id} :")
    display(top_n_seen)  # Afficher le DataFrame des films vus
    
    print("\nLes recommandations faites sont les suivantes :")
    display(top_n_recommendations)  # Afficher le DataFrame des recommandations


# Used function

In [32]:
# Example usage
user_id_example = 1  # Replace with a valid user ID from your dataset
display_recommendations(user_id_example, n=10)


Top 10 films vus par l'utilisateur 1 :


Unnamed: 0,title,genres,rating
0,Freaks,Crime|Drama|Horror,5.0
1,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,5.0
2,"Lord of the Rings: The Two Towers, The",Adventure|Fantasy,5.0
3,"Lord of the Rings: The Return of the King, The",Action|Adventure|Drama|Fantasy,5.0
4,Star Wars: Episode V - The Empire Strikes Back,Action|Adventure|Sci-Fi,4.5
5,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,4.5
6,Spider-Man 2,Action|Adventure|Sci-Fi|IMAX,4.5
7,Constantine,Action|Fantasy|Horror|Thriller,4.0
8,Watership Down,Adventure|Animation|Children|Drama|Fantasy,4.0
9,Dragonslayer,Action|Adventure|Fantasy,4.0



Les recommandations faites sont les suivantes :


Unnamed: 0,title,genres,score
1,Jaws,Action|Horror,5.280891
7,Snatch,Comedy|Crime|Thriller,5.241471
4,Pi,Drama|Sci-Fi|Thriller,5.185878
8,Before the Devil Knows You're Dead,Crime|Drama|Thriller,5.160434
5,"Dirty Dozen, The",Action|Drama|War,5.089878
2,"Truman Show, The",Comedy|Drama|Sci-Fi,4.994098
6,Being John Malkovich,Comedy|Drama|Fantasy,4.937565
3,Primary Colors,Comedy|Drama,4.933546
9,Chain Reaction,Horror,4.904399
0,"Wizard of Oz, The",Adventure|Children|Fantasy|Musical,4.873779
