In [1]:
import sys
sys.path.append('../')
from hidden import MONGO_USR, MONGO_PWD
import numpy as np
from pymongo import MongoClient
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.sparse import csr_matrix
import os
import mlflow

import warnings
warnings.filterwarnings("ignore")

# Connect to the MongoDB instance.
# Credentials are passed as keyword options instead of being concatenated into
# the connection URI: inside a URI the user name and password must be
# percent-escaped, so a password containing ':', '/', '@' or '%' would break
# (or silently alter) a hand-built connection string.
client = MongoClient(
    host='127.0.0.1',
    port=27017,
    username=MONGO_USR,
    password=MONGO_PWD,
    authSource='admin',
)
db = client['Movielens']
movies = db['movies']
users = db['users']

# Point MLflow at the file-based tracking store located in ~/mlruns
mlflow.set_tracking_uri("file://" + os.path.expanduser('~/mlruns'))
# Select the MLflow experiment under which this notebook records its runs
mlflow.set_experiment("moviewise")


<Experiment: artifact_location='file:///home/kevin/mlruns/998316277040792967', creation_time=1707907843550, experiment_id='998316277040792967', last_update_time=1707907843550, lifecycle_stage='active', name='moviewise', tags={}>

In [2]:
# Load the movie catalogue (id + title) into a DataFrame.
# The collections are reached through `db` rather than through the `movies` /
# `users` names: both names are rebound to DataFrames in this cell, so reading
# from `db` keeps the cell re-runnable without re-executing the connection cell.
movies = pd.DataFrame(db['movies'].find({}, {"_id": 1, "title": 1}))

# Fetch every user document together with its embedded list of ratings
data = list(db['users'].find({}, {"movies.movieid": 1, "_id": 1, "movies.rating": 1, "movies.timestamp": 1}))

# One row per user; the ratings are still nested in the 'movies' list column
users = pd.json_normalize(data)

# One row per (user, rating): explode the 'movies' list. Rows produced by an
# empty list carry no rating dict and are dropped so the expansion below only
# sees dicts.
users = users.explode('movies').dropna(subset=['movies'])

# Expand each rating dict into its own columns in a single construction
# (a row-wise `.apply(pd.Series)` builds one Series per rating and is very slow
# on this many rows)
users_tmp = pd.DataFrame(users['movies'].tolist(), index=users.index)
users = pd.concat([users, users_tmp], axis=1).drop('movies', axis=1)

print('taille de movies :', len(movies))
print('taille de users :', len(users))
users.head()

taille de movies : 3883
taille de users : 1000209


Unnamed: 0,_id,movieid,rating,timestamp
0,6040,573,4,956704056
0,6040,589,4,956704996
0,6040,1,3,957717358
0,6040,2068,4,997453982
0,6040,592,2,956716016


In [3]:
# Attach every rating to its movie (inner join on the movie id), give the two
# colliding '_id' columns explicit names, and order the ratings chronologically
# with a fresh 0..n-1 index.
merged_df = (
    movies.merge(users, left_on='_id', right_on='movieid')
    .drop(columns=['movieid'])
    .rename(columns={"_id_x": "movie_id", "_id_y": "user_id"})
    .sort_values(by=['timestamp'])
    .reset_index(drop=True)
)

# Discard the first row (the earliest timestamp) because of the odd size of
# the dataset
merged_df = merged_df.iloc[1:]

print('Taille de merged_df :', len(merged_df))
merged_df.tail(10)

Taille de merged_df : 1000208


Unnamed: 0,movie_id,title,user_id,rating,timestamp
1000199,3098,"Natural, The (1984)",5948,4,1046437932
1000200,3267,"Mariachi, El (1992)",5312,4,1046444711
1000201,2453,"Boy Who Could Fly, The (1986)",4958,4,1046454260
1000202,2043,Darby O'Gill and the Little People (1959),4958,1,1046454282
1000203,3489,Hook (1991),4958,4,1046454320
1000204,2399,Santa Claus: The Movie (1985),4958,1,1046454338
1000205,1407,Scream (1996),4958,5,1046454443
1000206,2634,"Mummy, The (1959)",4958,3,1046454548
1000207,3264,Buffy the Vampire Slayer (1992),4958,4,1046454548
1000208,1924,Plan 9 from Outer Space (1958),4958,4,1046454590


In [4]:
# Distribution of the number of ratings received by each movie
movies_counts = merged_df['movie_id'].value_counts()
print(movies_counts.describe())

print('\n')

# Distribution of the number of ratings given by each user
cusers_counts = merged_df['user_id'].value_counts()
print(cusers_counts.describe())

# Activity thresholds: a movie / user is kept only when its number of ratings
# is strictly greater than the threshold
movies_threshold = 33
users_threshold = 44

# Keep the movies that have more than `movies_threshold` ratings
kept_movies = movies_counts.loc[movies_counts.gt(movies_threshold)].index
merged_df = merged_df.loc[merged_df['movie_id'].isin(kept_movies)]

# Keep the users that have more than `users_threshold` ratings.
# Note: the per-user counts were computed before the movie filter above.
kept_users = cusers_counts.loc[cusers_counts.gt(users_threshold)].index
merged_df = merged_df.loc[merged_df['user_id'].isin(kept_users)]

print('\n')
print('Nouvelle taille de merged_df :', len(merged_df))

count    3706.000000
mean      269.888829
std       384.046465
min         1.000000
25%        33.000000
50%       123.500000
75%       350.000000
max      3428.000000
Name: count, dtype: float64


count    6040.000000
mean      165.597351
std       192.746879
min        20.000000
25%        44.000000
50%        96.000000
75%       208.000000
max      2314.000000
Name: count, dtype: float64


Nouvelle taille de merged_df : 942215


In [5]:
# Positional hold-out split: the first 2% of the rows of merged_df form the
# training set and the next 1% form the test set
n_ratings = len(merged_df)
train_size = int(0.02 * n_ratings)
test_size = int(0.01 * n_ratings)

test_end = train_size + test_size
df_train = merged_df.iloc[:train_size]
df_test = merged_df.iloc[train_size:test_end]

print(len(df_train))
print(len(df_test))


18844
9422


In [6]:
# The model can only score (user, movie) pairs it has seen during training, so
# keep the test rows whose movie AND user both appear in df_train
films_df_train = df_train['movie_id'].unique()
users_df_train = df_train['user_id'].unique()

movie_is_known = df_test['movie_id'].isin(films_df_train)
user_is_known = df_test['user_id'].isin(users_df_train)
df_test = df_test[movie_is_known & user_is_known]


print(len(df_train))
print(len(df_test))

18844
1831


In [7]:
# Build the user x movie rating matrix of the training set: one row per user,
# one column per movie, NaN where the user has not rated the movie
ratings_train = pd.pivot(
    df_train,
    index='user_id',
    columns='movie_id',
    values='rating',
)

ratings_train

movie_id,1,2,3,4,5,6,7,8,10,11,...,3593,3598,3600,3602,3604,3605,3606,3608,3610,3614
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5880,,,,,,,,,,,...,,,,,,,,,,3.0
5881,,,,,,,,,,3.0,...,,,,,,,,,,
5884,,,,,,,,,,,...,,,,,,,,,,
5885,,,,,,,,,4.0,,...,,,,,,,,,,
5886,4.0,2.0,,,,4.0,,,3.0,3.0,...,,,,,,,,3.0,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,4.0,,1.0,2.0,1.0,,3.0,,,4.0,...,,,,,,,,,,
6036,,,,2.0,,3.0,,,,3.0,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Replace the missing ratings (NaN) with 0
ratings_train = ratings_train.fillna(0)

# Keep only the users whose row sums to a strictly positive value
row_has_rating = ratings_train.sum(axis=1).gt(0)
ratings_train = ratings_train.loc[row_has_rating]

# Sparse copy of the training matrix, with 0 as the fill value
sparse_float = pd.SparseDtype("float", 0)
ratings_train_sparse = ratings_train.astype(sparse_float)

ratings_train_sparse

movie_id,1,2,3,4,5,6,7,8,10,11,...,3593,3598,3600,3602,3604,3605,3606,3608,3610,3614
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
5881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5886,4.0,2.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,4.0,0.0,1.0,2.0,1.0,0.0,3.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# NMF hyper-parameters
n_components = 10   # number of latent factors
max_iter = 100      # maximum number of solver iterations
random_state = 42   # fixed seed so the factorisation (and every metric derived
                    # from it) is reproducible from one run to the next

nmf = NMF(n_components=n_components, max_iter=max_iter, random_state=random_state)

# Fit the model to the user-item train matrix
U_train = nmf.fit_transform(ratings_train_sparse)  # User matrix train
M = nmf.components_  # Item matrix

# Reconstructed user x movie matrix: product of the user and item factors
pred_matrix = np.dot(U_train, M)
pred_matrix

array([[3.86573870e-01, 3.70580562e-02, 3.67091713e-02, ...,
        1.34020589e-01, 1.27307340e-02, 5.60418851e-02],
       [1.05602916e+00, 4.62134776e-01, 8.03478320e-01, ...,
        2.15203282e-01, 9.33014787e-04, 7.58583426e-02],
       [5.41659793e-01, 5.13681898e-01, 2.72695869e-03, ...,
        4.98578023e-01, 0.00000000e+00, 2.26605811e-01],
       ...,
       [1.40700493e+00, 1.97225546e-01, 6.68065007e-02, ...,
        8.82128457e-01, 0.00000000e+00, 2.71362700e-01],
       [7.94008899e-01, 9.81741387e-02, 1.08438714e-01, ...,
        3.69520285e-01, 6.43624914e-02, 4.48590823e-01],
       [1.77821582e+00, 8.93090772e-02, 1.34702759e-01, ...,
        7.37907632e-01, 0.00000000e+00, 4.82758036e-02]])

In [10]:
# Wrap the reconstructed matrix in a DataFrame that carries the same row
# (user_id) and column (movie_id) labels as the training matrix
pred_df = pd.DataFrame(
    data=pred_matrix,
    index=ratings_train.index,
    columns=ratings_train.columns,
)

pred_df

movie_id,1,2,3,4,5,6,7,8,10,11,...,3593,3598,3600,3602,3604,3605,3606,3608,3610,3614
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5880,0.386574,0.037058,0.036709,0.025764,0.025552,0.043769,0.096004,0.040099,0.047871,0.159999,...,0.000000,0.000000,0.060555,0.021496,0.018057,0.024308,0.045635,0.134021,0.012731,0.056042
5881,1.056029,0.462135,0.803478,0.491094,0.068677,0.483564,0.839950,0.045794,0.375032,1.381919,...,0.016783,0.000000,0.004080,0.001129,0.001161,0.001043,0.046651,0.215203,0.000933,0.075858
5884,0.541660,0.513682,0.002727,0.001702,0.003268,0.328742,0.020765,0.056155,0.947876,0.283546,...,0.101619,0.009498,0.009629,0.002226,0.004664,0.004548,0.011968,0.498578,0.000000,0.226606
5885,1.231397,0.189163,0.237612,0.150875,0.016095,0.105159,0.239681,0.078625,0.242664,0.690310,...,0.009734,0.004006,0.004906,0.009009,0.013163,0.018322,0.111956,0.527823,0.000000,0.084713
5886,4.184745,1.389304,0.873595,0.729820,0.252112,1.428719,1.747979,0.130219,1.620545,2.739671,...,0.142102,0.013887,0.906297,0.294622,0.243705,0.341851,0.463582,1.711605,0.166175,1.258976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,1.327428,0.941013,0.952797,0.500653,0.204328,0.386440,1.094123,0.235578,1.001435,1.384635,...,0.055828,0.009126,0.027198,0.018273,0.019469,0.017951,0.052991,0.657140,0.013888,0.288599
6036,0.003953,0.000411,0.000335,1.892647,0.000000,2.861113,0.001880,0.000000,0.000000,3.136520,...,0.000000,0.003626,0.001742,0.000573,0.036923,0.000662,0.000752,0.111098,0.000323,0.068678
6037,1.407005,0.197226,0.066807,0.015210,0.000000,0.000000,0.010245,0.110296,0.384465,0.495520,...,0.026843,0.019001,0.029692,0.054523,0.078809,0.110893,0.207517,0.882128,0.000000,0.271363
6039,0.794009,0.098174,0.108439,0.117507,0.000000,0.260320,0.375364,0.002050,0.021955,0.476492,...,0.002223,0.009500,0.365488,0.148203,0.142147,0.201217,0.204662,0.369520,0.064362,0.448591


In [11]:
# Long-format predictions: one row per (user, movie) pair.
# The wide frame is rebuilt from pred_matrix instead of being read back from
# pred_df: pred_df is overwritten just below, so stacking it in place would
# make a second execution of this cell fail on an already-stacked frame.
pred_wide = pd.DataFrame(pred_matrix, index=ratings_train.index, columns=ratings_train.columns)
pred_df = pred_wide.stack().reset_index()
pred_df.columns = ['user_id', 'movie_id', 'user_movie_position'] # Rename columns

# Attach the predicted value to every train / test rating (inner join on the
# (user_id, movie_id) pair)
train_pred_df = pd.merge(df_train, pred_df, on=['user_id', 'movie_id'])
test_pred_df = pd.merge(df_test, pred_df, on=['user_id', 'movie_id'])

test_pred_df

Unnamed: 0,movie_id,title,user_id,rating,timestamp,user_movie_position
0,34,Babe (1995),5880,3,957545414,0.455918
1,1223,"Grand Day Out, A (1992)",5880,5,957545455,0.040604
2,1234,"Sting, The (1973)",5880,4,957545455,0.228984
3,3396,"Muppet Movie, The (1979)",5880,5,957545455,0.131486
4,2795,Vacation (1983),5880,3,957545455,0.070165
...,...,...,...,...,...,...
1826,1953,"French Connection, The (1971)",5880,4,957920426,0.061005
1827,3272,Bad Lieutenant (1992),5880,2,957920486,0.021668
1828,198,Strange Days (1995),5880,2,957920513,0.039469
1829,2001,Lethal Weapon 2 (1989),5880,2,957920513,0.054628


In [12]:
# Mean squared error between the actual ratings and the values reconstructed
# by the model, on the training pairs and on the test pairs
mse_train = mean_squared_error(
    y_true=train_pred_df['rating'],
    y_pred=train_pred_df['user_movie_position'],
)
mse_test = mean_squared_error(
    y_true=test_pred_df['rating'],
    y_pred=test_pred_df['user_movie_position'],
)

pred_df

Unnamed: 0,user_id,movie_id,user_movie_position
0,5880,1,0.386574
1,5880,2,0.037058
2,5880,3,0.036709
3,5880,4,0.025764
4,5880,5,0.025552
...,...,...,...
275269,6040,3605,0.000525
275270,6040,3606,0.146060
275271,6040,3608,0.737908
275272,6040,3610,0.000000


In [13]:
# Record the fitted model, its hyper-parameter and both error metrics in a
# new MLflow run
with mlflow.start_run() as run:
    mlflow.sklearn.log_model(nmf, "Model")
    mlflow.log_param("n_components", n_components)
    mlflow.log_metrics({"Training MSE": mse_train, "Test MSE": mse_test})



In [14]:
# Sort test_pred_df by user, then by decreasing predicted value, and reset the index
test_pred_df = test_pred_df.sort_values(by=['user_id', 'user_movie_position'], ascending=[True, False]).reset_index(drop=True)

# For each user, keep the 10 test rows with the highest predicted value
top_10_df = test_pred_df.groupby('user_id').head(10)

# Per-user groups (also reused by the Pearson cell)
grouped = top_10_df.groupby('user_id')

# Spearman rank correlation between actual ratings and predicted values, per
# user. The frame is built once from a list of records instead of being grown
# row by row with `.loc[len(df)]`, which is quadratic and coerces user_id to
# float. The correlation is NaN when either series is constant or the group
# has fewer than two rows.
spearman_results = pd.DataFrame(
    [
        {
            'user_id': user,
            'spearman_corr': group['rating'].corr(group['user_movie_position'], method='spearman'),
        }
        for user, group in grouped
    ],
    columns=['user_id', 'spearman_corr'],
)

# Display the results
spearman_results


Unnamed: 0,user_id,spearman_corr
0,5880.0,-0.291288
1,5881.0,-0.107204
2,5885.0,0.568535
3,5886.0,
4,5888.0,0.044495
5,5889.0,
6,5890.0,0.101703
7,5906.0,0.057354
8,5908.0,-0.113961
9,5917.0,0.113961


In [15]:
# Pearson correlation between actual ratings and predicted values, computed
# for each per-user group in `grouped`. The frame is built once from a list of
# records instead of being grown row by row with `.loc[len(df)]`, which is
# quadratic and coerces user_id to float.
pearson_results = pd.DataFrame(
    [
        {
            'user_id': user,
            'pearson_corr': group['rating'].corr(group['user_movie_position'], method='pearson'),
        }
        for user, group in grouped
    ],
    columns=['user_id', 'pearson_corr'],
)

# Display the results
pearson_results

Unnamed: 0,user_id,pearson_corr
0,5880.0,0.063502
1,5881.0,-0.097687
2,5885.0,0.530056
3,5886.0,
4,5888.0,0.106727
5,5889.0,
6,5890.0,0.084003
7,5906.0,0.095962
8,5908.0,-0.135167
9,5917.0,0.197439


In [16]:
# TODO: for each user, take the highest ratings (among the movies the user has already watched) and look at where those movies rank in the prediction matrix.
# OR: build a top 10 of movies for each user and compute the mean of those ratings (to store in a DataFrame and log to MLflow)