In [101]:
from sklearn.metrics import mean_squared_error
import pandas as pd
import os
import mlflow

import functions as f

import warnings
warnings.filterwarnings("ignore")

# MLflow UI Identification
mlflow.set_tracking_uri("file://" + os.path.expanduser('~/mlruns'))
# MLflow Project Name Identification
mlflow.set_experiment("moviewise")

# Define the model parameters we'll use for training
N_COMPONENTS = 10
MAX_ITER = 100

In [102]:
# Load the data
merged_df = f.csv_to_df('data/merged_df.csv')

merged_df

Unnamed: 0,movie_id,title,user_id,rating,timestamp
0,593,"Silence of the Lambs, The (1991)",6040,5,956703954
1,2384,Babe: Pig in the City (1998),6040,4,956703954
2,1961,Rain Man (1988),6040,4,956703977
3,2019,Seven Samurai (The Magnificent Seven) (Shichin...,6040,5,956703977
4,1419,Walkabout (1971),6040,3,956704056
...,...,...,...,...,...
942210,2399,Santa Claus: The Movie (1985),4958,1,1046454338
942211,1407,Scream (1996),4958,5,1046454443
942212,3264,Buffy the Vampire Slayer (1992),4958,4,1046454548
942213,2634,"Mummy, The (1959)",4958,3,1046454548


In [103]:
# Partition the data
df_train, df_test, df_train_mini, df_test_mini = f.partition(merged_df)

for df in [df_train, df_test, df_train_mini, df_test_mini]:
    print(df.shape)

(753772, 5)
(188443, 5)
(22613, 5)
(5653, 5)


In [104]:
# Drop every line of df_test that contains a movieid that is not in df_train
films_df_train = df_train['movie_id'].unique()
df_test = df_test[df_test['movie_id'].isin(films_df_train)]

# Drop every line of df_test that contains a user_id that is not in df_train
users_df_train = df_train['user_id'].unique()
df_test = df_test[df_test['user_id'].isin(users_df_train)]

print(len(df_train))
print(len(df_test))

753772
188443


In [105]:
# Train the model
nmf, pred_matrix, pred_df = f.NMF_training(N_COMPONENTS, MAX_ITER, df_train)

pred_df

Unnamed: 0,user_id,movie_id,user_movie_position
0,1,1,1.485173
1,1,2,0.472429
2,1,3,0.258579
3,1,4,0.104619
4,1,5,0.206577
...,...,...,...
12474193,6040,3948,0.522534
12474194,6040,3949,0.539397
12474195,6040,3950,0.123403
12474196,6040,3951,0.088646


In [106]:
# Merge the train and test dataframes with the predictions dataframe
train_pred_df = pd.merge(df_train, pred_df, on=['user_id', 'movie_id'])
test_pred_df = pd.merge(df_test, pred_df, on=['user_id', 'movie_id'])

test_pred_df

Unnamed: 0,movie_id,title,user_id,rating,timestamp,user_movie_position
0,1258,"Shining, The (1980)",3260,4,968256288,2.101901
1,480,Jurassic Park (1993),2035,5,974667191,1.341987
2,316,Stargate (1994),1753,3,974703085,1.405135
3,1282,Fantasia (1940),757,4,975542850,0.451690
4,2085,101 Dalmatians (1961),3665,4,973903131,1.163843
...,...,...,...,...,...,...
188438,594,Snow White and the Seven Dwarfs (1937),4387,4,965170407,4.343489
188439,1199,Brazil (1985),3346,5,967776051,1.309624
188440,3068,"Verdict, The (1982)",4595,4,964654593,0.165429
188441,1371,Star Trek: The Motion Picture (1979),5650,3,1027920537,0.986878


In [107]:
# Calculate the MSE between the position of the movie in the predicted ranking and the actual ratings of the movie
mse_train = mean_squared_error(train_pred_df['rating'], train_pred_df['user_movie_position'])
mse_test = mean_squared_error(test_pred_df['rating'], test_pred_df['user_movie_position'])

pred_df

Unnamed: 0,user_id,movie_id,user_movie_position
0,1,1,1.485173
1,1,2,0.472429
2,1,3,0.258579
3,1,4,0.104619
4,1,5,0.206577
...,...,...,...
12474193,6040,3948,0.522534
12474194,6040,3949,0.539397
12474195,6040,3950,0.123403
12474196,6040,3951,0.088646


In [108]:
# sort test_pred_df by user_movie_position for every user ans reset index
test_pred_df = test_pred_df.sort_values(by=['user_id', 'user_movie_position'], ascending=[True, False]).reset_index(drop=True)

# Create a dataframe with the top 10 ratings for each user
top_10_df = test_pred_df.groupby('user_id').head(10)

# Group the dataframe by user_id
grouped = top_10_df.groupby('user_id')

# For every group, calculate the Spearman correlation and store the results in a dataframe
spearman_results = pd.DataFrame(columns=['user_id', 'spearman_corr'])
for user, group in grouped:
    spearman_corr = group['rating'].corr(group['user_movie_position'], method='spearman')
    spearman_results.loc[len(spearman_results)] = [user, spearman_corr]

# Print results
spearman_results

Unnamed: 0,user_id,spearman_corr
0,1.0,-0.113904
1,2.0,0.359573
2,3.0,0.583497
3,5.0,-0.026041
4,6.0,0.575640
...,...,...
4513,6035.0,-0.111454
4514,6036.0,0.311464
4515,6037.0,0.700649
4516,6039.0,0.138708


In [109]:
# For every group, calculate the Pearson correlation coefficient and add it to the results dataframe
pearson_results = pd.DataFrame(columns=['user_id', 'pearson_corr'])
for user, group in grouped:
    pearson_corr = group['rating'].corr(group['user_movie_position'], method='pearson')
    pearson_results.loc[len(pearson_results)] = [user, pearson_corr]

# Print results
pearson_results

Unnamed: 0,user_id,pearson_corr
0,1.0,-0.179666
1,2.0,0.346508
2,3.0,0.549337
3,5.0,0.000968
4,6.0,0.366026
...,...,...
4513,6035.0,0.091446
4514,6036.0,0.197635
4515,6037.0,0.831049
4516,6039.0,0.169787


In [110]:
# Calculate the mean of the Pearson and Spearman coefficients
pearson_mean_coeff = pearson_results['pearson_corr'].mean
spearman_mean_coeff = spearman_results['spearman_corr'].mean

# Convert Pearson and Spearman results to csv
spearman_results.to_csv('results/spearman_results.csv', index=False)
pearson_results.to_csv('results/pearson_results.csv', index=False)

# Log the results to MLflow
with mlflow.start_run() as run:
    mlflow.sklearn.log_model(nmf, "Model")
    mlflow.log_params({"n_components": N_COMPONENTS})
    mlflow.log_params({"max_iter": MAX_ITER})
    mlflow.log_metric("Training MSE", mse_train)
    mlflow.log_metric("Test MSE", mse_test)
    mlflow.log_metric("Pearson mean coefficient", pearson_mean_coeff())
    mlflow.log_metric("Spearman mean coefficient", spearman_mean_coeff())
    mlflow.log_artifact("results/spearman_results.csv")
    mlflow.log_artifact("results/pearson_results.csv")

