In [1]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import pandas as pd
import os
import mlflow

import functions as f

import warnings
warnings.filterwarnings("ignore")

# MLflow UI Identification
mlflow.set_tracking_uri("file://" + os.path.expanduser('~/mlruns'))
# MLflow Project Name Identification
mlflow.set_experiment("moviewise")

# Define the model parameters we'll use for training
N_COMPONENTS = 10
MAX_ITER = 100

In [2]:
# Load the data
merged_df = f.csv_to_df('data/merged_df.csv')

merged_df

Unnamed: 0,movie_id,title,user_id,rating,timestamp
0,593,"Silence of the Lambs, The (1991)",6040,5,956703954
1,2384,Babe: Pig in the City (1998),6040,4,956703954
2,1961,Rain Man (1988),6040,4,956703977
3,2019,Seven Samurai (The Magnificent Seven) (Shichin...,6040,5,956703977
4,1419,Walkabout (1971),6040,3,956704056
...,...,...,...,...,...
942210,2399,Santa Claus: The Movie (1985),4958,1,1046454338
942211,1407,Scream (1996),4958,5,1046454443
942212,3264,Buffy the Vampire Slayer (1992),4958,4,1046454548
942213,2634,"Mummy, The (1959)",4958,3,1046454548


In [3]:
# Partition the data
df_train, df_test, df_train_mini, df_test_mini = f.partition(merged_df)

for df in [df_train, df_test, df_train_mini, df_test_mini]:
    print(df.shape)

(753772, 5)
(188443, 5)
(22613, 5)
(5653, 5)


In [4]:
# Drop every line of df_test that contains a movieid that is not in df_train
films_df_train = df_train['movie_id'].unique()
df_test = df_test[df_test['movie_id'].isin(films_df_train)]

# Drop every line of df_test that contains a user_id that is not in df_train
users_df_train = df_train['user_id'].unique()
df_test = df_test[df_test['user_id'].isin(users_df_train)]

print(len(df_train))
print(len(df_test))

753772
188443


In [5]:
# Train the model
nmf, pred_matrix, pred_df = f.NMF_training(N_COMPONENTS, MAX_ITER, df_train)

pred_df

Unnamed: 0,user_id,movie_id,user_movie_position
0,1,1,1.754872
1,1,2,0.577280
2,1,3,0.013348
3,1,4,0.012251
4,1,5,0.015764
...,...,...,...
12474193,6040,3948,0.229524
12474194,6040,3949,0.789015
12474195,6040,3950,0.107069
12474196,6040,3951,0.147290


In [6]:
# Merge the train and test dataframes with the predictions dataframe
train_pred_df = pd.merge(df_train, pred_df, on=['user_id', 'movie_id'])
test_pred_df = pd.merge(df_test, pred_df, on=['user_id', 'movie_id'])

test_pred_df

Unnamed: 0,movie_id,title,user_id,rating,timestamp,user_movie_position
0,1258,"Shining, The (1980)",3260,4,968256288,1.652005
1,480,Jurassic Park (1993),2035,5,974667191,1.788125
2,316,Stargate (1994),1753,3,974703085,2.185789
3,1282,Fantasia (1940),757,4,975542850,0.581022
4,2085,101 Dalmatians (1961),3665,4,973903131,2.002492
...,...,...,...,...,...,...
188438,594,Snow White and the Seven Dwarfs (1937),4387,4,965170407,3.941033
188439,1199,Brazil (1985),3346,5,967776051,1.372913
188440,3068,"Verdict, The (1982)",4595,4,964654593,0.158435
188441,1371,Star Trek: The Motion Picture (1979),5650,3,1027920537,1.031233


In [7]:
# Calculate the MSE between the position of the movie in the predicted ranking and the actual ratings of the movie
mse_train = mean_squared_error(train_pred_df['rating'], train_pred_df['user_movie_position'])
mse_test = mean_squared_error(test_pred_df['rating'], test_pred_df['user_movie_position'])

pred_df

Unnamed: 0,user_id,movie_id,user_movie_position
0,1,1,1.754872
1,1,2,0.577280
2,1,3,0.013348
3,1,4,0.012251
4,1,5,0.015764
...,...,...,...
12474193,6040,3948,0.229524
12474194,6040,3949,0.789015
12474195,6040,3950,0.107069
12474196,6040,3951,0.147290


In [8]:
# sort test_pred_df by user_movie_position for every user ans reset index
test_pred_df = test_pred_df.sort_values(by=['user_id', 'user_movie_position'], ascending=[True, False]).reset_index(drop=True)

# Create a dataframe with the top 10 ratings for each user
top_10_df = test_pred_df.groupby('user_id').head(10)

# Group the dataframe by user_id
grouped = top_10_df.groupby('user_id')

grouped.head(10)

Unnamed: 0,movie_id,title,user_id,rating,timestamp,user_movie_position
0,919,"Wizard of Oz, The (1939)",1,4,978301368,1.536860
1,2355,"Bug's Life, A (1998)",1,5,978824291,1.462199
2,1197,"Princess Bride, The (1987)",1,3,978302268,1.056324
3,2294,Antz (1998),1,4,978824291,0.990801
4,783,"Hunchback of Notre Dame, The (1996)",1,4,978824291,0.958475
...,...,...,...,...,...,...
188387,260,Star Wars: Episode IV - A New Hope (1977),6040,4,956716873,3.089267
188388,908,North by Northwest (1959),6040,4,957716673,3.080410
188389,1952,Midnight Cowboy (1969),6040,5,957717017,2.701697
188390,3504,Network (1976),6040,4,960971857,2.190601


In [9]:
# For each group, calculate the R² score between the actual ratings and the position of the movie in the predicted ranking
R2_results = pd.DataFrame(columns=['user_id', 'R2_score'])
for user_id, group in grouped:
    R2 = r2_score(group['rating'], group['user_movie_position'])
    R2_results = R2_results.append({'user_id': user_id, 'R2_score': R2}, ignore_index=True)

print(R2_results)

     user_id   R2_score
0        1.0 -18.534402
1        2.0 -11.938815
2        3.0  -12.53631
3        5.0  -7.016077
4        6.0 -58.194743
...      ...        ...
4513  6035.0  -0.113991
4514  6036.0   -0.05613
4515  6037.0  -4.235989
4516  6039.0 -20.355855
4517  6040.0  -8.205176

[4518 rows x 2 columns]


In [10]:
# For each group, calculate the Spearman correlation and store the results in a dataframe
spearman_results = pd.DataFrame(columns=['user_id', 'spearman_corr'])
for user, group in grouped:
    spearman_corr = group['rating'].corr(group['user_movie_position'], method='spearman')
    spearman_results.loc[len(spearman_results)] = [user, spearman_corr]

# Print results
spearman_results

Unnamed: 0,user_id,spearman_corr
0,1.0,-0.194625
1,2.0,0.134840
2,3.0,0.517935
3,5.0,0.150795
4,6.0,0.189934
...,...,...
4513,6035.0,0.582616
4514,6036.0,0.434272
4515,6037.0,0.674200
4516,6039.0,-0.097096


In [11]:
# For every group, calculate the Pearson correlation coefficient and add it to the results dataframe
pearson_results = pd.DataFrame(columns=['user_id', 'pearson_corr'])
for user, group in grouped:
    pearson_corr = group['rating'].corr(group['user_movie_position'], method='pearson')
    pearson_results.loc[len(pearson_results)] = [user, pearson_corr]

# Print results
pearson_results

Unnamed: 0,user_id,pearson_corr
0,1.0,0.004150
1,2.0,0.075992
2,3.0,0.403296
3,5.0,0.281827
4,6.0,0.287291
...,...,...
4513,6035.0,0.618436
4514,6036.0,0.347702
4515,6037.0,0.794801
4516,6039.0,0.093442


In [12]:
# Calculate the mean of the Pearson and Spearman coefficients
pearson_mean_coeff = pearson_results['pearson_corr'].mean
spearman_mean_coeff = spearman_results['spearman_corr'].mean

# Convert Pearson and Spearman results to csv
spearman_results.to_csv('results/spearman_results.csv', index=False)
pearson_results.to_csv('results/pearson_results.csv', index=False)

# Log the results to MLflow
with mlflow.start_run() as run:
    mlflow.sklearn.log_model(nmf, "Model")
    mlflow.log_params({"n_components": N_COMPONENTS})
    mlflow.log_params({"max_iter": MAX_ITER})
    mlflow.log_metric("Training MSE", mse_train)
    mlflow.log_metric("Test MSE", mse_test)
    mlflow.log_metric("Pearson mean coefficient", pearson_mean_coeff())
    mlflow.log_metric("Spearman mean coefficient", spearman_mean_coeff())
    mlflow.log_artifact("results/spearman_results.csv")
    mlflow.log_artifact("results/pearson_results.csv")

