In [34]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import pandas as pd
import os
import mlflow

import model_processing as f

import warnings
warnings.filterwarnings("ignore")

# MLflow UI Identification
mlflow.set_tracking_uri("file://" + os.path.expanduser('~/mlruns'))
# MLflow Project Name Identification
mlflow.set_experiment("moviewise")


<Experiment: artifact_location='file:///home/kevin/mlruns/693904953874716375', creation_time=1707990500633, experiment_id='693904953874716375', last_update_time=1707990500633, lifecycle_stage='active', name='moviewise', tags={}>

In [35]:
# Load the data
merged_df = pd.read_csv('data/merged_df.csv')

merged_df

Unnamed: 0,movie_id,title,user_id,rating,timestamp
0,593,"Silence of the Lambs, The (1991)",6040,5,956703954
1,2384,Babe: Pig in the City (1998),6040,4,956703954
2,1961,Rain Man (1988),6040,4,956703977
3,2019,Seven Samurai (The Magnificent Seven) (Shichin...,6040,5,956703977
4,1419,Walkabout (1971),6040,3,956704056
...,...,...,...,...,...
942210,2399,Santa Claus: The Movie (1985),4958,1,1046454338
942211,1407,Scream (1996),4958,5,1046454443
942212,3264,Buffy the Vampire Slayer (1992),4958,4,1046454548
942213,2634,"Mummy, The (1959)",4958,3,1046454548


In [51]:
# Partition the data
df_train, df_test, df_train_mini, df_test_mini = f.partition(merged_df)

for df in [df_train, df_test, df_train_mini, df_test_mini]:
    print(df.shape)

df_train

(753772, 5)
(188444, 5)
(22613, 5)
(5653, 5)


Unnamed: 0,movie_id,title,user_id,rating,timestamp
556726,3916,Remember the Titans (2000),2370,4,974508054
421655,1225,Amadeus (1984),3249,4,968296616
818699,2422,"Karate Kid III, The (1989)",1448,2,976128796
28766,990,Maximum Risk (1996),5831,2,957899468
728908,1280,Raise the Red Lantern (1991),1016,4,975009310
...,...,...,...,...,...
276258,1084,Bonnie and Clyde (1967),4079,5,965446322
388071,2322,Soldier (1998),3457,4,967240833
140150,319,Shallow Grave (1994),5032,5,962561024
715124,2739,"Color Purple, The (1985)",2030,4,974930448


In [37]:
# Drop every line of df_test that contains a movieid that is not in df_train
films_df_train = df_train['movie_id'].unique()
df_test = df_test[df_test['movie_id'].isin(films_df_train)]

# Drop every line of df_test that contains a user_id that is not in df_train
users_df_train = df_train['user_id'].unique()
df_test = df_test[df_test['user_id'].isin(users_df_train)]

print(len(df_train))
print(len(df_test))

753772
188443


In [38]:
model_options = {
    'n_components': 50,
    'max_iter': 200,
    'normalize': {
        'should': False,
        'min': 1,
        'max': 5
    }
}

# Train the model
nmf, pred_df = f.run_model(df_train, model_options)

pred_df

Unnamed: 0,user_id,movie_id,predict
0,1,1,4.125737
1,1,2,0.211606
2,1,3,0.031881
3,1,4,0.000327
4,1,5,0.008055
...,...,...,...
12474193,6040,3948,0.335029
12474194,6040,3949,0.727632
12474195,6040,3950,0.114082
12474196,6040,3951,0.179815


In [39]:
# Merge the train and test dataframes with the predictions dataframe
train_pred_df = pd.merge(df_train_mini, pred_df, on=['user_id', 'movie_id'])
test_pred_df = pd.merge(df_test_mini, pred_df, on=['user_id', 'movie_id'])

test_pred_df

Unnamed: 0,movie_id,title,user_id,rating,timestamp,predict
0,1258,"Shining, The (1980)",3260,4,968256288,1.906530
1,480,Jurassic Park (1993),2035,5,974667191,2.115206
2,316,Stargate (1994),1753,3,974703085,1.656784
3,1282,Fantasia (1940),757,4,975542850,0.457930
4,2085,101 Dalmatians (1961),3665,4,973903131,1.614968
...,...,...,...,...,...,...
5648,912,Casablanca (1942),2246,5,974596283,1.638526
5649,1449,Waiting for Guffman (1996),5042,4,962656673,1.530608
5650,745,"Close Shave, A (1995)",5956,5,959739004,3.726289
5651,2968,Time Bandits (1981),1395,5,975012119,2.257008


In [40]:
# Calculate the MSE between the position of the movie in the predicted ranking and the actual ratings of the movie
mse_train = mean_squared_error(train_pred_df['rating'], train_pred_df['predict'])
mse_test = mean_squared_error(test_pred_df['rating'], test_pred_df['predict'])

pred_df

Unnamed: 0,user_id,movie_id,predict
0,1,1,4.125737
1,1,2,0.211606
2,1,3,0.031881
3,1,4,0.000327
4,1,5,0.008055
...,...,...,...
12474193,6040,3948,0.335029
12474194,6040,3949,0.727632
12474195,6040,3950,0.114082
12474196,6040,3951,0.179815


In [41]:
# sort test_pred_df by user_movie_position for every user ans reset index
test_pred_df = test_pred_df.sort_values(by=['user_id', 'predict'], ascending=[True, False]).reset_index(drop=True)

# Create a dataframe with the top 10 ratings for each user
top_10_df = test_pred_df.groupby('user_id').head(10)

# Group the dataframe by user_id
grouped = top_10_df.groupby('user_id')

grouped.head(10)

Unnamed: 0,movie_id,title,user_id,rating,timestamp,predict
0,2236,Simon Birch (1998),2,5,978299220,0.282901
1,1372,Star Trek VI: The Undiscovered Country (1991),2,3,978299941,0.130186
2,506,Orlando (1993),5,4,978245999,0.270444
3,1650,Washington Square (1997),5,3,978245314,0.096493
4,588,Aladdin (1992),10,4,978225900,3.802395
...,...,...,...,...,...,...
5648,2124,"Addams Family, The (1991)",6036,3,956753816,0.920043
5649,173,Judge Dredd (1995),6036,2,956755038,0.736016
5650,1674,Witness (1985),6037,4,956709914,0.789404
5651,2745,"Mission, The (1986)",6040,3,956716157,1.067719


In [42]:
# For each group, calculate the Spearman correlation and store the results in a dataframe
spearman_results = pd.DataFrame(columns=['user_id', 'spearman_corr'])
for user, group in grouped:
    spearman_corr = group['rating'].corr(group['predict'], method='spearman')
    spearman_results.loc[len(spearman_results)] = [user, spearman_corr]

# Print results
spearman_results

Unnamed: 0,user_id,spearman_corr
0,2.0,1.000000
1,5.0,1.000000
2,10.0,-0.447214
3,13.0,
4,15.0,
...,...,...
2636,6030.0,
2637,6032.0,0.000000
2638,6036.0,0.353094
2639,6037.0,


In [43]:
# For every group, calculate the Pearson correlation coefficient and add it to the results dataframe
pearson_results = pd.DataFrame(columns=['user_id', 'pearson_corr'])
for user, group in grouped:
    pearson_corr = group['rating'].corr(group['predict'], method='pearson')
    pearson_results.loc[len(pearson_results)] = [user, pearson_corr]

# Print results
pearson_results

Unnamed: 0,user_id,pearson_corr
0,2.0,1.000000
1,5.0,1.000000
2,10.0,-0.444820
3,13.0,
4,15.0,
...,...,...
2636,6030.0,
2637,6032.0,0.239054
2638,6036.0,0.486208
2639,6037.0,


In [44]:
# Calculate the mean of the Pearson and Spearman coefficients
pearson_mean_coeff = pearson_results['pearson_corr'].mean
spearman_mean_coeff = spearman_results['spearman_corr'].mean

# Convert Pearson and Spearman results to csv
spearman_results.to_csv('results/spearman_results.csv', index=False)
pearson_results.to_csv('results/pearson_results.csv', index=False)

# Log the results to MLflow
with mlflow.start_run() as run:
    mlflow.sklearn.log_model(nmf, "Model")
    mlflow.log_params({"n_components": model_options['n_components']})
    mlflow.log_params({"max_iter": model_options['max_iter']})
    mlflow.log_metric("Training MSE", mse_train)
    mlflow.log_metric("Test MSE", mse_test)
    mlflow.log_metric("Pearson mean coefficient", pearson_mean_coeff())
    mlflow.log_metric("Spearman mean coefficient", spearman_mean_coeff())
    mlflow.log_artifact("results/spearman_results.csv")
    mlflow.log_artifact("results/pearson_results.csv")

