In [46]:
import warnings
import joblib
import plotly.graph_objects as go 
from plotly.subplots import make_subplots
import pandas as pd

# Silence every FutureWarning for the rest of the kernel session so that
# deprecation notices do not clutter the cell outputs.
# NOTE(review): the filter is global, so it also hides FutureWarnings that
# point at code in this notebook needing an update -- consider narrowing it
# with the `module=` / `message=` arguments.
warnings.filterwarnings('ignore', category=FutureWarning)


<div class="alert alert-block alert-danger"><b>Caution:</b> Always clear all output before pushing to GitHub to reduce size!</div>

### Import cv_results (dict) and transform to DataFrame

In [116]:
# Load the cross-validation results: a dict keyed by "cv_<model>" whose values
# hold the metrics of that model (one entry per fold).
# NOTE(review): despite the '.json' suffix the file is read with joblib, which
# is pickle-based -- only load it from a trusted source.
cv_results = joblib.load('../models/surp_cv_results.json')

keys = list(cv_results.keys()) # list of keys, which hold the model names

# Build one DataFrame per model and concatenate them in a single call.
# Growing a frame with pd.concat inside the loop copies all previous rows on
# every iteration, and concatenating onto an empty frame is deprecated.
fold_frames = []
for key in keys:
    # drop the leading "cv_" so only the model name remains
    model_name = key[3:] if key.startswith('cv_') else key
    fold_frames.append(pd.DataFrame.from_dict(cv_results[key]).assign(model=model_name))
df_cv_results = pd.concat(fold_frames, axis=0, ignore_index=True)

# rename 'rand' to 'NormalPredictor'
# A single .loc[rows, column] assignment writes into df_cv_results itself;
# the chained form df['model'].loc[mask] = ... may write into a temporary copy.
df_cv_results.loc[df_cv_results['model'] == 'rand', 'model'] = 'NormalPredictor'
display(df_cv_results.head(6))

# Mean and standard deviation of every metric per model; the result has
# two-level columns (metric, statistic) and the model name as index.
agg_cv_results = df_cv_results.groupby(by=['model']).agg(['mean', 'std'])
display(agg_cv_results)

Unnamed: 0,test_mae,train_mae,test_mse,train_mse,test_rmse,train_rmse,fit_time,test_time,model
0,0.667824,0.540952,0.780517,0.506024,0.883469,0.711354,16.011562,97.324295,knnBasic
1,0.666695,0.541205,0.777922,0.50672,0.881999,0.711843,21.801503,80.346286,knnBasic
2,0.667406,0.541022,0.778061,0.506493,0.882078,0.711683,17.293665,74.418446,knnBasic
3,0.667822,0.540955,0.780288,0.506196,0.883339,0.711475,22.30297,88.960689,knnBasic
4,0.667438,0.541085,0.779064,0.50643,0.882646,0.711639,14.682178,82.02858,knnBasic
5,0.634364,0.311406,0.706886,0.182756,0.840765,0.4275,44.004267,123.979569,knnMeans


Unnamed: 0_level_0,test_mae,test_mae,train_mae,train_mae,test_mse,test_mse,train_mse,train_mse,test_rmse,test_rmse,train_rmse,train_rmse,fit_time,fit_time,test_time,test_time
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
BaselineOnly,0.665462,0.00051,0.648018,0.000158,0.758161,0.00168,0.718052,0.000364,0.870724,0.000965,0.84738,0.000215,11.665887,1.330852,9.472557,1.12446
CoClustering,0.695054,0.000313,0.655322,0.000197,0.80669,0.00044,0.713863,0.000417,0.898159,0.000245,0.844904,0.000247,244.141468,1.897795,10.712382,1.940114
NMF,0.667881,0.006444,0.639751,0.008478,0.768519,0.017914,0.704841,0.022002,0.876605,0.010194,0.839467,0.013066,109.361153,25.573685,6.784035,1.2574
NormalPredictor,1.154854,0.00065,1.154153,0.000258,2.095736,0.002278,2.093567,0.000689,1.447665,0.000787,1.446916,0.000238,4.207122,0.699897,8.805251,1.631601
SVD,0.632689,0.000564,0.487641,0.000313,0.696614,0.001489,0.408494,0.000558,0.834634,0.000892,0.639135,0.000437,32.772085,7.332372,8.980779,2.293598
SlopeOne,0.661866,0.000395,0.625044,0.000101,0.757213,0.001055,0.673256,0.000257,0.87018,0.000606,0.820522,0.000157,47.095886,1.22044,70.712111,0.693561
knnBaseline,0.632439,0.000256,0.310237,0.000202,0.703076,0.000579,0.180352,0.000205,0.838496,0.000346,0.424678,0.000241,41.774422,5.127289,108.472653,37.580902
knnBasic,0.667437,0.000461,0.541044,0.000106,0.77917,0.001211,0.506373,0.00027,0.882706,0.000686,0.711599,0.00019,18.418376,3.447917,84.615659,8.793355
knnMeans,0.634011,0.000428,0.311516,0.000139,0.706145,0.001402,0.182854,0.000148,0.840324,0.000834,0.427614,0.000174,41.797299,3.253731,95.995085,31.168538
knnZScore,0.632987,0.000219,0.311013,0.000178,0.705597,0.000603,0.182478,0.000224,0.839998,0.000359,0.427174,0.000262,43.438091,9.301936,91.403362,17.150491


In [140]:
# Grouped bar chart of the mean MAE per model (test vs. train), with the
# standard deviation from agg_cv_results drawn as error bars.
mae_traces = (('test_mae', 'testset'), ('train_mae', 'trainset'))

fig = go.Figure()
for metric, label in mae_traces:
    fig.add_trace(
        go.Bar(
            x=agg_cv_results.index,
            y=agg_cv_results[metric]['mean'],
            error_y={'type': 'data', 'array': agg_cv_results[metric]['std'], 'visible': True},
            name=label,
            orientation='v',
        )
    )

fig.update_layout(
    # title and axis titles
    title='Average MAE from 5-fold cross-validation of Surprise models with optimized parameters',
    title_x=0.5,
    title_y=0.87,
    xaxis_title='Model',
    yaxis_title='Average MAE (cv=5)',
    # fixed figure size
    autosize=False,
    width=1000,
    height=400,
    # legend anchored at the top-left corner of the plot area
    legend={'x': 0, 'y': 1},
)

# Order the models on the x-axis by ascending mean test MAE.
# Kept as the last expression so the notebook renders the figure.
order = agg_cv_results['test_mae']['mean'].sort_values().index
fig.update_xaxes(categoryorder='array', categoryarray=order)

<div class="alert alert-block alert-danger"><b>Caution:</b> Always clear all output before pushing to GitHub to reduce size!</div>

In [143]:
# Grouped bar chart of the mean test and fit times per model, with the
# standard deviation from agg_cv_results drawn as error bars.
time_traces = (('test_time', 'test time'), ('fit_time', 'fit time'))

fig = go.Figure()
for metric, label in time_traces:
    fig.add_trace(
        go.Bar(
            x=agg_cv_results.index,
            y=agg_cv_results[metric]['mean'],
            error_y={'type': 'data', 'array': agg_cv_results[metric]['std'], 'visible': True},
            name=label,
            orientation='v',
        )
    )

fig.update_layout(
    # title and axis titles
    title='Average fit and test times in 5-fold cross-validation',
    title_x=0.5,
    title_y=0.87,
    xaxis_title='Model',
    yaxis_title='t [s]',
    # fixed figure size
    autosize=False,
    width=1000,
    height=400,
    # legend anchored at the top-left corner of the plot area
    legend={'x': 0, 'y': 1},
)

# Use the same x-axis order as the MAE chart (ascending mean test MAE) so the
# two figures can be compared model by model.
# Kept as the last expression so the notebook renders the figure.
order = agg_cv_results['test_mae']['mean'].sort_values().index
fig.update_xaxes(categoryorder='array', categoryarray=order)