In [3]:
import sys
sys.path.append('../')
from hidden import MONGO_USR, MONGO_PWD
import numpy as np
from pymongo import MongoClient
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.sparse import csr_matrix
import os
import mlflow

import warnings
warnings.filterwarnings("ignore")

# Connect to the MongoDB instance on localhost.
# Credentials are passed as keyword arguments rather than concatenated into a
# connection URI: inside a URI, reserved characters in the username/password
# (e.g. '@', ':', '/') must be percent-escaped, which plain string
# concatenation does not do.
client = MongoClient(
    '127.0.0.1',
    27017,
    username=MONGO_USR,
    password=MONGO_PWD,
    authSource='admin',
)
db = client['Movielens']
movies = db['movies']
users = db['users']

# Point MLflow at the file-based tracking store under the user's home directory
mlflow.set_tracking_uri("file://" + os.path.expanduser('~/mlruns'))
# Name of the MLflow experiment the runs of this notebook are logged under
mlflow.set_experiment("moviewise")


<Experiment: artifact_location='file:///home/kevin/mlruns/998316277040792967', creation_time=1707907843550, experiment_id='998316277040792967', last_update_time=1707907843550, lifecycle_stage='active', name='moviewise', tags={}>

In [None]:
# Load the movie catalogue (id + title).
# The collections are reached through `db[...]` instead of the `movies` /
# `users` names because this cell rebinds those names to DataFrames:
# re-running the cell would otherwise call `.find` on a DataFrame and fail.
movies = pd.DataFrame(db['movies'].find({}, {"_id": 1, "title": 1}))

# Fetch the ratings stored on every user document
data = list(db['users'].find({}, {"movies.movieid": 1, "_id": 1, "movies.rating": 1, "movies.timestamp": 1}))

# One row per user document; the `movies` column holds that user's list of ratings
users = pd.json_normalize(data)

# "Unroll" the movies list: one row per (user, rating) pair.
# A user with an empty list explodes to a single NaN row, which is dropped so
# it cannot create a spurious column when the sub-documents are expanded.
users = users.explode('movies', ignore_index=True).dropna(subset=['movies'])

# Expand every rating sub-document into its own columns in a single
# constructor call (row-wise `apply(pd.Series)` builds one Series per row).
users_tmp = pd.DataFrame(users['movies'].tolist(), index=users.index)
users = pd.concat([users.drop(columns='movies'), users_tmp], axis=1)

print('taille de movies :', len(movies))
print('taille de users :', len(users))
users.head()

In [None]:
# Attach each rating to its movie and tidy the result in a single chain
merged_df = (
    movies
    .merge(users, left_on='_id', right_on='movieid')
    # the right-hand join key is redundant with the movie's own `_id`
    .drop(columns=['movieid'])
    # the two clashing `_id` columns were suffixed by the merge: name them
    # after what they identify
    .rename(columns={"_id_x": "movieid", "_id_y": "user_id"})
    # order the ratings by timestamp, then renumber the rows
    .sort_values(by=['timestamp'])
    .reset_index(drop=True)
)

# Discard the first row of the sorted frame (kept from the original notebook,
# whose stated reason was "the odd size of the dataset")
first_label = merged_df.index[0]
merged_df = merged_df.drop(first_label)

print('Taille de merged_df :', len(merged_df))
merged_df.tail(10)

In [None]:
# How many ratings each movie received
movies_counts = merged_df['movieid'].value_counts()
print(movies_counts.describe())

print('\n')

# How many ratings each user gave
cusers_counts = merged_df['user_id'].value_counts()
print(cusers_counts.describe())

# Activity thresholds: a movie / user is kept only when its number of ratings
# is strictly greater than the threshold
movies_threshold = 33
users_threshold = 44

# Ids that pass their threshold (both counts come from the unfiltered frame)
kept_movies = movies_counts.loc[movies_counts.gt(movies_threshold)].index
kept_users = cusers_counts.loc[cusers_counts.gt(users_threshold)].index

# Keep the ratings whose movie AND user both pass
is_kept_movie = merged_df['movieid'].isin(kept_movies)
is_kept_user = merged_df['user_id'].isin(kept_users)
merged_df = merged_df.loc[is_kept_movie & is_kept_user]

print('\n')
print('Nouvelle taille de merged_df :', len(merged_df))

In [None]:
# Positional 80% / 20% split: the leading rows of merged_df form the train
# set, the remaining rows form the test set
train_size = int(0.8 * len(merged_df))
df_train = merged_df.iloc[:train_size]
df_test = merged_df.iloc[train_size:]

for subset in (df_train, df_test):
    print(len(subset))


In [None]:
# Movies and users that appear in the train set
films_df_train = df_train['movieid'].unique()
users_df_train = df_train['user_id'].unique()

# Keep a test row only when both its movie and its user appear in the train set
movie_in_train = df_test['movieid'].isin(films_df_train)
user_in_train = df_test['user_id'].isin(users_df_train)
df_test = df_test[movie_in_train & user_in_train]


for subset in (df_train, df_test):
    print(len(subset))

In [None]:
# Reshape the train ratings into a user x movie matrix:
# one row per user_id, one column per movieid, cell = rating (NaN when absent)
ratings_train = (
    df_train
    .set_index(['user_id', 'movieid'])['rating']
    .unstack('movieid')
)

ratings_train

In [None]:
# Replace missing ratings with 0, then keep only the rows whose ratings sum
# to more than zero
ratings_train = (
    ratings_train
    .fillna(0)
    .loc[lambda matrix: matrix.sum(axis=1) > 0]
)

# Same matrix stored with a sparse float dtype whose fill value is 0
sparse_float = pd.SparseDtype("float", 0)
ratings_train_sparse = ratings_train.astype(sparse_float)

ratings_train_sparse

In [None]:
# NMF hyper-parameters
n_components = 10
max_iter = 100
# Fixed seed: without it the factorization (and therefore every metric
# computed from it below) is not guaranteed to be identical between runs.
random_state = 42
nmf = NMF(n_components=n_components, max_iter=max_iter, random_state=random_state)

# Fit the model to the user-item train matrix
U_train = nmf.fit_transform(ratings_train_sparse)  # User matrix train
M = nmf.components_  # Item matrix

# Reconstructed user x movie matrix: product of the two factors
pred_matrix = U_train @ M
pred_matrix

In [None]:
# "unpivot" the matrix returned to get
# Label the reconstructed matrix with the same row and column labels as
# ratings_train
pred_df = pd.DataFrame(
    data=pred_matrix,
    index=ratings_train.index,
    columns=ratings_train.columns,
)

pred_df

In [None]:
# Rebuild the wide prediction frame from pred_matrix instead of stacking
# `pred_df` in place: this cell rebinds `pred_df` to the long format, so
# stacking `pred_df` itself would, on a re-run, stack an already-long frame
# and silently produce meaningless (user_id, movieid) keys.
pred_wide = pd.DataFrame(pred_matrix, index=ratings_train.index, columns=ratings_train.columns)

# Long format: one row per (user_id, movieid) with its reconstructed value
pred_df = pred_wide.stack().reset_index()
pred_df.columns = ['user_id', 'movieid', 'user_movie_position'] # Rename columns

# Attach the reconstructed value to every train / test rating
# (inner join on the (user_id, movieid) pair)
train_pred_df = pd.merge(df_train, pred_df, on=['user_id', 'movieid'])
test_pred_df = pd.merge(df_test, pred_df, on=['user_id', 'movieid'])

test_pred_df

In [None]:
# Mean squared error between the actual rating and the reconstructed value,
# computed separately on the train and test ratings
mse_train = mean_squared_error(
    y_true=train_pred_df['rating'],
    y_pred=train_pred_df['user_movie_position'],
)
mse_test = mean_squared_error(
    y_true=test_pred_df['rating'],
    y_pred=test_pred_df['user_movie_position'],
)

pred_df

In [None]:
# Record the fitted model, its hyper-parameters and both MSEs in one MLflow run
with mlflow.start_run() as run:
    mlflow.sklearn.log_model(nmf, "Model")
    # Read the hyper-parameters from the estimator itself so the logged values
    # always describe the logged model, even if the notebook variables were
    # edited after fitting; max_iter is now recorded alongside n_components.
    mlflow.log_params({"n_components": nmf.n_components, "max_iter": nmf.max_iter})
    mlflow.log_metric("Training MSE", mse_train)
    mlflow.log_metric("Test MSE", mse_test)

In [None]:
# Sort test_pred_df by user, then by descending user_movie_position, and reset the index
test_pred_df = test_pred_df.sort_values(by=['user_id', 'user_movie_position'], ascending=[True, False]).reset_index(drop=True)

# For each user, keep the 10 test rows with the highest user_movie_position
top_10_df = test_pred_df.groupby('user_id').head(10)

# For each user, Spearman rank correlation between the actual rating and the
# reconstructed value over those rows.
# The frame is built once from a list of records: appending rows one by one
# with `.loc[len(results)]` copies the frame on every insertion and starts
# from object-dtype columns, which can keep `describe()` from returning
# numeric statistics.
grouped = top_10_df.groupby('user_id')
results = pd.DataFrame(
    [
        {
            'user_id': user,
            'spearman_corr': group['rating'].corr(group['user_movie_position'], method='spearman'),
        }
        for user, group in grouped
    ],
    columns=['user_id', 'spearman_corr'],
)

# Summary statistics of the per-user correlations
results.describe()


In [None]:
# TODO: for each user, take their highest ratings (among the movies they have already watched) and look at where those movies rank in the prediction matrix.
# OR build a top 10 of movies for each user and compute the mean of those ratings (to be stored in a DataFrame and logged to MLflow).