# Time to get down to some modeling

In [1]:
# imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from surprise import Dataset, Reader
from surprise import accuracy

from surprise.model_selection import train_test_split, cross_validate

from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import SVDpp
from surprise.prediction_algorithms import SlopeOne
from surprise.prediction_algorithms import NMF
from surprise.prediction_algorithms import NormalPredictor
from surprise.prediction_algorithms import KNNBaseline
from surprise.prediction_algorithms import KNNBasic
from surprise.prediction_algorithms import KNNWithMeans
from surprise.prediction_algorithms import KNNWithZScore
from surprise.prediction_algorithms import BaselineOnly
from surprise.prediction_algorithms import CoClustering


Bad key "text.kerning_factor" on line 4 in
/opt/anaconda3/envs/learn-env/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


Read in the joined dataframe

In [2]:
# Load the joined dataframe that the modeling cells below are built on
df = pd.read_csv('../../../data/joined_dfs_lc')

### Start FSM

In [4]:
# Reader configured with the rating scale used to interpret the ratings column
reader = Reader(rating_scale=(0, 5))

# Build the Surprise dataset from the user / item / rating columns
data = Dataset.load_from_df(
    df[['userId', 'movieId', 'rating']],
    reader,
)

# Hold out 25% of the ratings as a test set; the seed fixes the split
trainset, testset = train_test_split(
    data,
    test_size=.25,
    random_state=11,
)

#### Find the best algorithm to use

Research led me to an article by Susan Li, who provided a method to test a variety of algorithms at once to determine the best option.

I'm going to iterate over all the algorithms to see which one returns the best RMSE value.
This one will take a while...

In [5]:
# thank you to Susan Li for this helpful code
# Benchmark every candidate algorithm with 3-fold cross-validation and rank
# the results by mean test RMSE (lowest first).
benchmark = []
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(),
                  KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(),
                  BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average each metric over the folds and keep it as a plain record.
    # Series.append was removed in pandas 2.0, and mixing a string into the
    # metric Series forced every column to object dtype, so build a dict.
    row = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()
    # Use the class name as the label instead of parsing the object's repr
    row['Algorithm'] = type(algorithm).__name__
    benchmark.append(row)

pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.869079,615.348371,17.049377
BaselineOnly,0.8772,0.252318,0.291551
SVD,0.880669,6.619767,0.434949
KNNBaseline,0.884573,0.364895,4.375455
KNNWithMeans,0.904257,0.142054,3.303707
KNNWithZScore,0.904915,0.240285,3.888966
SlopeOne,0.911984,5.459299,13.820087
NMF,0.935363,7.163997,0.257524
CoClustering,0.956829,2.552615,0.38307
KNNBasic,0.958312,0.116736,2.951808


RESULT: SVDpp has the lowest RMSE... and the longest test time (hooray).
I'm going to start with that as my model.

    The SVDpp algorithm is an extension of SVD that takes into account implicit ratings.

In [8]:
# First simple model: SVDpp with the library's default hyperparameters
algo = SVDpp()

# Fit on the training split, then score the held-out split
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

# 5-fold cross-validation, printing RMSE and MAE for every fold
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)


Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8605  0.8610  0.8616  0.8600  0.8597  0.8606  0.0007  
MAE (testset)     0.6608  0.6637  0.6596  0.6575  0.6595  0.6602  0.0020  
Fit time          1002.23 992.01  1047.20 1366.41 970.14  1075.60 147.56  
Test time         14.23   15.68   16.98   18.10   14.35   15.87   1.50    


{'test_rmse': array([0.86046732, 0.86102331, 0.86160432, 0.85999563, 0.85971291]),
 'test_mae': array([0.66081161, 0.66371268, 0.65960076, 0.65750865, 0.65953556]),
 'fit_time': (1002.2321150302887,
  992.0148119926453,
  1047.1997780799866,
  1366.40558218956,
  970.1362309455872),
 'test_time': (14.232326030731201,
  15.683311939239502,
  16.97956395149231,
  18.1004638671875,
  14.346259832382202)}

It definitely takes a while to run, especially with the cross-validation. I will probably leave that out of every iteration for the sake of time.

Before I go on, just a quick test to see that it is working as we want it to.
Let's get a rating prediction for a user.

In [11]:
# Spot-check: estimated rating for user 2 on movie 60756
algo.predict(uid=2, iid=60756)

Prediction(uid=2, iid=60756, r_ui=None, est=3.928182283000523, details={'was_impossible': False})

OK. Time to iterate.

Note: I've duplicated this notebook and named the copy '04_dcm_iteration'.
I don't know how much time it may buy me, but I am going to run iterations in both notebooks with staggered start times. Even numbers in notebook 04, odd numbers in this notebook.

### Iteration 3
increased n_factors to 50 and regularization to ~~0.005~~ 0.05

_NOTE: a typo and a poor choice to copy and paste led to decreasing the regularization when I intended to increase it. The model continued to improve nonetheless._

In [15]:
# Iteration 3: 50 latent factors, reg_all=0.05, with per-epoch progress output
algo3 = SVDpp(
    n_factors=50,
    reg_all=0.05,
    verbose=True,
)

# Fit on the training split and score the held-out split
algo3.fit(trainset)
predictions = algo3.test(testset)

# Report RMSE for the test-set predictions
accuracy.rmse(predictions)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
RMSE: 0.8601


0.8601488871465017

In [16]:
# MAE for the most recent test-set predictions (the `predictions` variable)
accuracy.mae(predictions)

MAE:  0.6617


0.6616796642106886

Both the RMSE and MAE scores are getting smaller bit by bit...

### Iteration 5
adding an adjusted learning rate of 0.01

In [74]:
# Iteration 5: same factors and regularization as before, plus lr_all=0.01
algo5 = SVDpp(
    n_factors=50,
    reg_all=0.05,
    lr_all=0.01,
    verbose=False,
)

# Fit on the training split and score the held-out split
algo5.fit(trainset)
predictions = algo5.test(testset)

# Report RMSE for the test-set predictions
accuracy.rmse(predictions)

RMSE: 0.8551


0.8551444209265346

RMSE of .8551... we were hoping to get to .86 so this is a little bonus.

We need to test the prototype app, so I am going to pickle this and we will try it with the interface.

In [75]:
import pickle

# Persist the tuned model so the prototype app can load it.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("../../../model_files/SVDpp.bin", 'wb') as f_out:
    pickle.dump(algo5, f_out)

I do want to make sure to cross-validate this model.

In [None]:
# 5-fold cross-validation of the tuned model, scored on RMSE only
# NOTE(review): the algo5 instance itself is passed in, so it may be refit on
# the CV folds here -- confirm before reusing algo5 after this cell.
cross_validate(
    algo5,
    data,
    measures=['RMSE'],
    cv=5,
    verbose=False,
)

Taking a closer look at this model's performance.

In [31]:
# thank you again to Susan Li for this helpful code

def get_Iu(uid):
    """Count how many items a user rated in the training set.

    args:
      uid: the id of the user
    returns:
      the number of items rated by the user, or 0 when the user is not
      part of the trainset
    """
    try:
        inner_uid = trainset.to_inner_uid(uid)
        rated_items = trainset.ur[inner_uid]
        n_rated = len(rated_items)
    except ValueError:
        # to_inner_uid raises ValueError for users unknown to the trainset
        n_rated = 0
    return n_rated
    
def get_Ui(iid):
    """Count how many users rated an item in the training set.

    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item, or 0 when the item is
      not part of the trainset
    """
    try:
        inner_iid = trainset.to_inner_iid(iid)
        item_ratings = trainset.ir[inner_iid]
        n_raters = len(item_ratings)
    except ValueError:
        # to_inner_iid raises ValueError for items unknown to the trainset
        n_raters = 0
    return n_raters
    
# Tabulate the test-set predictions together with their absolute error.
# NOTE: this rebinds `df`, which until now held the ratings dataframe used to
# build `data`; re-running the dataset-loading cell after this one will fail
# unless the CSV is read again first.
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df.uid.apply(get_Iu)  # items the user rated in the trainset
df['Ui'] = df.iid.apply(get_Ui)  # users who rated the item in the trainset
df['err'] = (df.est - df.rui).abs()

# Sort once by absolute error, then take both ends of the ranking
by_err = df.sort_values(by='err')
best_predictions = by_err.head(100)
worst_predictions = by_err.tail(100)

# Allow the 100-row tables to display in full
pd.set_option('display.max_rows', 100)


In [32]:
best_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
13498,122,296,5.0,5.0,{'was_impossible': False},222,235,0.0
23667,43,457,5.0,5.0,{'was_impossible': False},90,145,0.0
18195,486,110,5.0,5.0,{'was_impossible': False},48,186,0.0
4155,30,318,5.0,5.0,{'was_impossible': False},24,229,0.0
18234,594,648,5.0,5.0,{'was_impossible': False},176,117,0.0
11272,122,1208,5.0,5.0,{'was_impossible': False},222,76,0.0
11506,171,1104,5.0,5.0,{'was_impossible': False},65,13,0.0
9660,594,587,5.0,5.0,{'was_impossible': False},176,82,0.0
9608,519,318,5.0,5.0,{'was_impossible': False},19,229,0.0
19865,435,318,5.0,5.0,{'was_impossible': False},29,229,0.0


In [33]:
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
17014,580,8494,0.5,3.471931,{'was_impossible': False},309,0,2.971931
11899,224,2167,1.0,3.972384,{'was_impossible': False},40,44,2.972384
22913,45,2672,1.0,3.979373,{'was_impossible': False},304,6,2.979373
21702,603,3481,1.0,3.980787,{'was_impossible': False},701,55,2.980787
20738,104,110,1.0,3.981497,{'was_impossible': False},198,186,2.981497
18227,522,419,0.5,3.482759,{'was_impossible': False},141,14,2.982759
11331,495,45722,1.0,3.990616,{'was_impossible': False},204,56,2.990616
14658,19,1658,5.0,2.00589,{'was_impossible': False},535,4,2.99411
9690,587,1345,1.0,4.005457,{'was_impossible': False},112,30,3.005457
12520,77,4878,1.0,4.006046,{'was_impossible': False},21,79,3.006046
