# Time to get down to some modeling

In [1]:
# imports 
import pandas as pd
import numpy as np


from surprise import Dataset, Reader
from surprise import accuracy

from surprise.model_selection import train_test_split, cross_validate

from surprise.prediction_algorithms import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering



Using my filtered dataframe, which applies thresholds for movies and users.

In [36]:
# Load the joined ratings table from the project data directory.
ratings_path = '../../../data/joined_dfs_lc'
df = pd.read_csv(ratings_path)
# Display (rows, columns) as a quick sanity check on the load.
df.shape

(100836, 7)

In [37]:
# Declare the rating bounds that Surprise uses for this dataset.
# NOTE(review): the recorded outputs show ratings as low as 0.5 -- confirm
# whether the lower bound should be 0.5 rather than 0.
reader = Reader(rating_scale=(0, 5))
# Build a Surprise dataset from the (user, item, rating) columns, in that order.
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

# Hold out a random 25% of the ratings as the test set.
# The seed is fixed so the split -- and every score computed from it -- is the
# same on each Restart & Run All, which keeps model comparisons meaningful.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)



In [5]:
# Instantiate SVD++ with its default hyperparameters. The fixed seed makes the
# random initialisation repeatable, so re-running the cell gives the same score.
algo = SVDpp(random_state=42)

# Fit on the training split, then predict every held-out rating.
algo.fit(trainset)
predictions = algo.test(testset)

# Report RMSE on the held-out ratings (printed and returned).
accuracy.rmse(predictions)

RMSE: 0.8720


0.8719712611260318

In [6]:
# Run 3-fold cross-validation and print results.
# A fresh SVDpp() is passed instead of `algo`: cross_validate fits the estimator
# it is given on each fold, which would replace the fit on `trainset` that the
# later `algo.predict(...)` calls rely on.
cross_validate(SVDpp(), data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8718  0.8727  0.8697  0.8714  0.0013  
MAE (testset)     0.6689  0.6678  0.6676  0.6681  0.0006  
Fit time          674.78  649.39  668.42  664.20  10.79   
Test time         20.05   19.58   21.61   20.42   0.87    


{'test_rmse': array([0.87176279, 0.87271706, 0.86965529]),
 'test_mae': array([0.66887946, 0.66778529, 0.66756038]),
 'fit_time': (674.7831881046295, 649.3946812152863, 668.4170718193054),
 'test_time': (20.05371618270874, 19.582926034927368, 21.60849094390869)}

In [30]:
# Estimate the rating user 5 would give to this item.
# NOTE(review): the Dataset in this notebook is built from the 'movieId' column,
# so a title string is only a known item id if the data was loaded with titles
# instead -- confirm which raw id the trainset uses.
algo.predict(uid=5, iid='Blade Runner (1982)')

Prediction(uid=5, iid='Blade Runner (1982)', r_ui=None, est=4.00460414925547, details={'was_impossible': False})

In [13]:
# Estimate the rating user 15 would give to this item.
# NOTE(review): the Dataset in this notebook is built from the 'movieId' column,
# so a title string is only a known item id if the data was loaded with titles
# instead -- confirm which raw id the trainset uses.
algo.predict(uid=15, iid='Matrix, The (1999)')

Prediction(uid=15, iid=1, r_ui=None, est=3.1212547093839014, details={'was_impossible': False})

In [14]:
# Estimate the rating user 2 would give to the item with raw id 60756.
algo.predict(uid=2, iid=60756)

Prediction(uid=2, iid=60756, r_ui=None, est=3.5957409537815046, details={'was_impossible': False})

In [8]:
# Let's tune: SVD++ with 50 latent factors.
# The seed is fixed so that a difference in RMSE against another configuration
# reflects the hyperparameters rather than a different random initialisation.
algo2 = SVDpp(n_factors=50, random_state=42)

# Fit on the training split, then predict every held-out rating.
algo2.fit(trainset)
predictions = algo2.test(testset)

# Report RMSE on the held-out ratings (printed and returned).
accuracy.rmse(predictions)

RMSE: 0.8722


0.8722392488769417

In [19]:
# thank you to Susan Li for this helpful code

def get_Iu(uid):
    """Count the items a user has rated in the training set.

    Reads the module-level ``trainset``.

    args:
      uid: the raw id of the user
    returns:
      the number of ratings stored for that user in ``trainset.ur``,
      or 0 when the id lookup raises ValueError.
    """
    try:
        inner_uid = trainset.to_inner_uid(uid)
        user_ratings = trainset.ur[inner_uid]
        return len(user_ratings)
    except ValueError:
        # The user never appeared in the trainset, so there is nothing to count.
        return 0
    
def get_Ui(iid):
    """Count the users who have rated an item in the training set.

    Reads the module-level ``trainset``.

    args:
      iid: the raw id of the item
    returns:
      the number of ratings stored for that item in ``trainset.ir``,
      or 0 when the id lookup raises ValueError.
    """
    try:
        inner_iid = trainset.to_inner_iid(iid)
        item_ratings = trainset.ir[inner_iid]
        return len(item_ratings)
    except ValueError:
        # The item never appeared in the trainset, so there is nothing to count.
        return 0
    
# Tabulate the current `predictions`, one row per held-out rating.
# A dedicated name is used instead of reassigning `df`, so the ratings frame
# loaded earlier stays intact and the Dataset-building cell can be re-run.
predictions_df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
# Number of items rated by the row's user, per get_Iu.
predictions_df['Iu'] = predictions_df['uid'].apply(get_Iu)
# Number of users who rated the row's item, per get_Ui.
predictions_df['Ui'] = predictions_df['iid'].apply(get_Ui)
# Absolute error between the estimate and the true rating.
predictions_df['err'] = (predictions_df['est'] - predictions_df['rui']).abs()

# Sort once by ascending error and take both ends of the ranking.
ranked_by_err = predictions_df.sort_values(by='err')
best_predictions = ranked_by_err.head(100)
worst_predictions = ranked_by_err.tail(100)

In [23]:
# Raise the global row-display limit so long frames render without truncation.
pd.options.display.max_rows = 500
# Show up to the first 100 rows of the lowest-error predictions.
best_predictions.head(100)

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
24324,382,"Matrix, The (1999)",5.0,5.0,{'was_impossible': False},218,218,0.0
583,32,"Shawshank Redemption, The (1994)",5.0,5.0,{'was_impossible': False},79,231,0.0
14429,515,Forrest Gump (1994),5.0,5.0,{'was_impossible': False},22,255,0.0
13051,610,Blade Runner (1982),5.0,5.0,{'was_impossible': False},984,91,0.0
162,555,Star Wars: Episode IV - A New Hope (1977),5.0,5.0,{'was_impossible': False},446,191,0.0
1265,480,Schindler's List (1993),5.0,5.0,{'was_impossible': False},637,174,0.0
24152,417,"Shawshank Redemption, The (1994)",5.0,5.0,{'was_impossible': False},59,231,0.0
12227,251,Forrest Gump (1994),5.0,5.0,{'was_impossible': False},18,255,0.0
1241,122,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),5.0,5.0,{'was_impossible': False},222,129,0.0
13943,171,Cool Hand Luke (1967),5.0,5.0,{'was_impossible': False},62,48,0.0


In [24]:
# Show the predictions with the largest absolute error. The frame is sorted by
# ascending error, so the very worst cases are at the bottom.
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
23571,184,The Emoji Movie (2017),0.5,3.551966,{'was_impossible': False},105,0,3.051966
8391,177,Bad Santa (2003),0.5,3.554588,{'was_impossible': False},665,22,3.054588
18498,483,Secret Society (2002),0.5,3.554722,{'was_impossible': False},559,1,3.054722
24988,418,Mission: Impossible - Rogue Nation (2015),0.5,3.557655,{'was_impossible': False},60,12,3.057655
17425,51,Star Trek: Insurrection (1998),0.5,3.560755,{'was_impossible': False},274,25,3.060755
24991,175,Stay (2005),0.5,3.563384,{'was_impossible': False},18,3,3.063384
10256,115,American Beauty (1999),1.0,4.076182,{'was_impossible': False},92,140,3.076182
18646,354,Happy Feet (2006),0.5,3.57704,{'was_impossible': False},164,4,3.07704
22075,34,"Lord of the Rings: The Two Towers, The (2002)",1.0,4.081258,{'was_impossible': False},56,138,3.081258
24449,594,"Exterminating Angel, The (Ángel exterminador, ...",0.5,3.588433,{'was_impossible': False},170,2,3.088433


In [29]:
# Let's tune again: 50 factors, 50 epochs and reg_all=0.05.
# The seed is fixed so that a difference in RMSE against another configuration
# reflects the hyperparameters rather than a different random initialisation.
# verbose=True prints one progress line per epoch.
algo4 = SVDpp(n_factors=50, n_epochs=50, reg_all=0.05, random_state=42, verbose=True)

# Fit on the training split, then predict every held-out rating.
algo4.fit(trainset)
predictions = algo4.test(testset)

# Report RMSE on the held-out ratings (printed and returned).
accuracy.rmse(predictions)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 20
 processing epoch 21
 processing epoch 22
 processing epoch 23
 processing epoch 24
 processing epoch 25
 processing epoch 26
 processing epoch 27
 processing epoch 28
 processing epoch 29
 processing epoch 30
 processing epoch 31
 processing epoch 32
 processing epoch 33
 processing epoch 34
 processing epoch 35
 processing epoch 36
 processing epoch 37
 processing epoch 38
 processing epoch 39
 processing epoch 40
 processing epoch 41
 processing epoch 42
 processing epoch 43
 processing epoch 44
 processing epoch 45
 processing epoch 46
 processing epoch 47
 p

0.863924686296065

In [31]:
# Mean absolute error of the current `predictions`; printed and returned.
accuracy.mae(predictions, verbose=True)

MAE:  0.6645


0.6644956302577323

In [None]:
# Let's tune again: 100 factors, 100 epochs, lr_all=0.005 and reg_all=0.4.
# The seed is fixed so that a difference in RMSE against another configuration
# reflects the hyperparameters rather than a different random initialisation.
# verbose=True prints one progress line per epoch.
algo6 = SVDpp(n_factors=100, n_epochs=100, lr_all=0.005, reg_all=0.4,
              random_state=42, verbose=True)

# Fit on the training split, then predict every held-out rating.
algo6.fit(trainset)
predictions = algo6.test(testset)

# Report RMSE on the held-out ratings (printed and returned).
accuracy.rmse(predictions)