# Time to get down to some modeling

In [2]:
# imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from surprise import Dataset, Reader
from surprise import accuracy

from surprise.model_selection import train_test_split, cross_validate

from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import SVDpp
from surprise.prediction_algorithms import SlopeOne
from surprise.prediction_algorithms import NMF
from surprise.prediction_algorithms import NormalPredictor
from surprise.prediction_algorithms import KNNBaseline
from surprise.prediction_algorithms import KNNBasic
from surprise.prediction_algorithms import KNNWithMeans
from surprise.prediction_algorithms import KNNWithZScore
from surprise.prediction_algorithms import BaselineOnly
from surprise.prediction_algorithms import CoClustering

Read in the joined dataframe

In [3]:
# Load the joined ratings table; the modeling cells below read its
# 'userId', 'movieId' and 'rating' columns.
df = pd.read_csv(filepath_or_buffer='../../../data/joined_dfs_lc')

### Start FSM

In [4]:
# Tell Surprise the bounds of the rating values.
# NOTE(review): ratings as low as 0.5 show up in the prediction tables further down;
# confirm whether the lower bound should be the data's true minimum rather than 0.
reader = Reader(rating_scale=(0, 5))

# Build the Surprise dataset from the user / item / rating columns, in that order
data = Dataset.load_from_df(df.loc[:, ['userId', 'movieId', 'rating']], reader)

# Hold out 25% of the ratings for testing; the fixed seed keeps the split repeatable
trainset, testset = train_test_split(data, random_state=15, test_size=0.25)

#### Find the best algorithm to use

Research led me to an article by Susan Li, who provided a method to test a variety of algorithms at once to determine the best option.

I'm going to iterate over all the algorithms to see which one returns the best RMSE value.
This one will take a while...

In [4]:
# thank you to Susan Li for this helpful code
# Benchmark every candidate algorithm with 3-fold cross-validation and rank
# them by mean test RMSE (lowest first).
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(),
                  KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(),
                  BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average each metric (test_rmse, fit_time, test_time) over the folds.
    row = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()

    # Series.append() was removed in pandas 2.0, so collect plain dict records
    # instead of appending to a Series; this also keeps the metric columns
    # numeric rather than object-typed. The class name replaces the previous
    # str()-parsing of the object repr and yields the same label.
    row['Algorithm'] = type(algorithm).__name__
    benchmark.append(row)

pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.869284,1608.443041,17.062126
BaselineOnly,0.877168,0.311693,0.256816
SVD,0.879094,6.526585,0.708741
KNNBaseline,0.884045,0.421873,3.285246
KNNWithZScore,0.90534,0.224105,3.026917
KNNWithMeans,0.905589,0.152292,2.777985
SlopeOne,0.912887,5.26794,12.567827
NMF,0.936118,5.984205,0.212911
CoClustering,0.953777,2.770812,0.347148
KNNBasic,0.956738,0.119349,2.5593


RESULT: SVDpp has the lowest RMSE... and the longest test time (hooray).
I'm going to start with that as my model.

    The SVDpp algorithm is an extension of SVD that takes into account implicit ratings.

In [None]:
# Let's pick the algorithm and run the first model on its own
algo = SVDpp(random_state=15)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

# Run 5-fold cross-validation and print results.
# cross_validate() calls .fit() on the estimator it is handed for every fold, so
# passing `algo` itself would leave it trained on the last fold instead of on
# `trainset`. A separate, identically configured instance keeps `algo` intact
# for the predictions made in the following cells.
cross_validate(SVDpp(random_state=15), data, measures=['RMSE'], cv=5, verbose=True)


It definitely takes a while to run, especially with the cross-validation. I will probably leave that out of every iteration for the sake of time.

Before I go on, just a quick test to see that it is working as we want it to.
Let's get a rating prediction for a user.

In [None]:
# Sanity check: estimated rating for user id 2 on movie id 60756
algo.predict(uid=2, iid=60756)

OK. Time to iterate.

Note: I've duplicated this notebook and named the copy '04_dcm_iteration'.
I don't know how much time it may buy me, but I am going to run iterations in both notebooks with staggered start times: even-numbered iterations in notebook 04, odd-numbered iterations in this notebook.

### Iteration 3
increased n_factors to 50 and regularization to ~~0.005~~ 0.05

_NOTE: a typo and a poor choice to copy and paste led to decreasing the regularization when I intended to increase it. The model continued to improve nonetheless._

In [None]:
# Iteration 3: SVD++ with 50 latent factors and reg_all=0.05
algo3 = SVDpp(
    n_factors=50,
    reg_all=0.05,
    random_state=15,
    verbose=True,
)

# Fit on the training split, then score every held-out rating
algo3.fit(trainset)
predictions = algo3.test(testset)

# Report the RMSE of those predictions
accuracy.rmse(predictions)

In [None]:
# Mean absolute error for the same set of predictions
accuracy.mae(predictions, verbose=True)

Both the RMSE and MAE scores are getting smaller bit by bit...

### Iteration 5
adding an adjusted learning rate of 0.01

In [4]:
# Iteration 5: same as iteration 3, plus an explicit learning rate of 0.01
algo5 = SVDpp(
    n_factors=50,
    reg_all=0.05,
    lr_all=0.01,
    random_state=15,
    verbose=True,
)

# Fit on the training split, then score every held-out rating
algo5.fit(trainset)
predictions = algo5.test(testset)

# Report the RMSE of those predictions
accuracy.rmse(predictions)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
RMSE: 0.8561


0.8560624080683475

RMSE of .8560... we were hoping to get to .86 so this is a little bonus.

We need to test the prototype app, so I am going to pickle this and we will try it with the interface.

Before that, I want to make sure to cross-validate.

In [5]:
# Run 5-fold cross-validation; the returned dict of per-fold scores is displayed.
# cross_validate() refits whatever estimator it receives on every fold, so passing
# algo5 itself would leave it trained on the last fold rather than on `trainset`
# (and its verbose=True would log every epoch of every fold). Use a fresh, quiet
# estimator with the same hyperparameters as algo5 instead - keep them in sync.
cross_validate(
    SVDpp(n_factors=50, reg_all=0.05, lr_all=0.01, random_state=15),
    data,
    measures=['RMSE'],
    cv=5,
    verbose=False,
)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8


{'test_rmse': array([0.8616375 , 0.85217975, 0.85067281, 0.85782651, 0.85714544]),
 'fit_time': (3092.0690972805023,
  2807.411642074585,
  2076.289571046829,
  1977.3715116977692,
  2112.6022658348083),
 'test_time': (11.505327224731445,
  12.394096851348877,
  12.507059812545776,
  15.4168860912323,
  20.021172046661377)}

Now I'm going to pickle this so we can test the prototype app.

In [None]:
# The model export below is commented out, so running this cell writes nothing.
# Uncomment it to serialize algo5 to ../../../model_files/SVDpp.bin.
# (The `with` block already closes the file; the explicit close() is redundant.)
#import pickle

#with open("../../../model_files/SVDpp.bin", 'wb') as f_out:
#    pickle.dump(algo5, f_out) 
#    f_out.close()

Taking a closer look at this model's performance.

In [1]:
# thank you again to Susan Li for this helpful code

def get_Iu(uid):
    """Count the items a user rated in the module-level ``trainset``.

    args:
      uid: the id of the user, as passed to ``trainset.to_inner_uid``
    returns:
      the number of entries in ``trainset.ur`` for that user, or 0 when the
      lookup raises ValueError (the user was not part of the trainset)
    """
    n_rated = 0
    try:
        inner_uid = trainset.to_inner_uid(uid)
        n_rated = len(trainset.ur[inner_uid])
    except ValueError:
        # unknown user: keep the default count of 0
        pass
    return n_rated
    
def get_Ui(iid):
    """Count the users who rated an item in the module-level ``trainset``.

    args:
      iid: the raw id of the item, as passed to ``trainset.to_inner_iid``
    returns:
      the number of entries in ``trainset.ir`` for that item, or 0 when the
      lookup raises ValueError
    """
    n_raters = 0
    try:
        inner_iid = trainset.to_inner_iid(iid)
        n_raters = len(trainset.ir[inner_iid])
    except ValueError:
        # unknown item: keep the default count of 0
        pass
    return n_raters
    
# One row per prediction, enriched with the training-set counts for its
# user (Iu) and item (Ui) and with the absolute prediction error.
# NOTE(review): this rebinds `df`, which earlier held the ratings table read from
# CSV; re-running the Dataset.load_from_df cell afterwards needs the CSV reloaded.
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df['uid'].apply(get_Iu)
df['Ui'] = df['iid'].apply(get_Ui)
df['err'] = (df['est'] - df['rui']).abs()

# Sort once by error: the head holds the smallest errors, the tail the largest
by_error = df.sort_values(by='err')
best_predictions = by_error.head(10)
worst_predictions = by_error.tail(10)

NameError: name 'pd' is not defined

In [7]:
best_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
13098,69,318,5.0,5.0,{'was_impossible': False},33,248,0.0
20868,266,296,5.0,5.0,{'was_impossible': False},136,229,0.0
24395,348,2329,5.0,5.0,{'was_impossible': False},43,89,0.0
5022,45,2571,5.0,5.0,{'was_impossible': False},312,204,0.0
24014,452,1222,5.0,5.0,{'was_impossible': False},157,79,0.0
25184,122,2959,5.0,5.0,{'was_impossible': False},222,160,0.0
8248,122,1196,5.0,5.0,{'was_impossible': False},222,152,0.0
6846,523,3147,5.0,5.0,{'was_impossible': False},54,82,0.0
4936,30,318,5.0,5.0,{'was_impossible': False},23,248,0.0
14438,391,296,5.0,5.0,{'was_impossible': False},292,229,0.0


In [8]:
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
22532,567,25771,5.0,2.088404,{'was_impossible': False},292,1,2.911596
11751,85,1358,1.0,3.913403,{'was_impossible': False},26,32,2.913403
22995,121,272,1.0,3.915821,{'was_impossible': False},44,21,2.915821
15865,71,17,1.0,3.925866,{'was_impossible': False},27,52,2.925866
9901,91,515,1.0,3.930654,{'was_impossible': False},402,15,2.930654
12291,477,783,0.5,3.432656,{'was_impossible': False},433,30,2.932656
2471,301,1994,0.5,3.435832,{'was_impossible': False},83,36,2.935832
21161,393,74458,0.5,3.444025,{'was_impossible': False},88,48,2.944025
10487,393,5445,0.5,3.44801,{'was_impossible': False},88,89,2.94801
8344,113,2791,1.0,3.95016,{'was_impossible': False},113,62,2.95016
