# Time to get down to some modeling

In [1]:
# imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from surprise import Dataset, Reader
from surprise import accuracy

from surprise.model_selection import train_test_split, cross_validate

from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import SVDpp
from surprise.prediction_algorithms import SlopeOne
from surprise.prediction_algorithms import NMF
from surprise.prediction_algorithms import NormalPredictor
from surprise.prediction_algorithms import KNNBaseline
from surprise.prediction_algorithms import KNNBasic
from surprise.prediction_algorithms import KNNWithMeans
from surprise.prediction_algorithms import KNNWithZScore
from surprise.prediction_algorithms import BaselineOnly
from surprise.prediction_algorithms import CoClustering


Read in the joined dataframe

In [36]:
df = pd.read_csv('../../../data/joined_dfs_lc')
df.shape

(100836, 7)

In [37]:
reader = Reader(rating_scale=(0, 5))
# Load the dataset 
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)


### Iteration 2 
increase n_factors to 50

In [8]:
# Let's tune
algo2 = SVDpp(n_factors=50)

# Train the algorithm on the trainset, and predict ratings for the testset
algo2.fit(trainset)
predictions = algo2.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.8722


0.8722392488769417

Not much of a change from the FSM

### Iteration 4
increase epochs to 50

In [29]:
# Let's tune again
algo4 = SVDpp(n_factors=50, n_epochs=50,reg_all=0.05, verbose=True)

# Train the algorithm on the trainset, and predict ratings for the testset
algo4.fit(trainset)
predictions = algo4.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 20
 processing epoch 21
 processing epoch 22
 processing epoch 23
 processing epoch 24
 processing epoch 25
 processing epoch 26
 processing epoch 27
 processing epoch 28
 processing epoch 29
 processing epoch 30
 processing epoch 31
 processing epoch 32
 processing epoch 33
 processing epoch 34
 processing epoch 35
 processing epoch 36
 processing epoch 37
 processing epoch 38
 processing epoch 39
 processing epoch 40
 processing epoch 41
 processing epoch 42
 processing epoch 43
 processing epoch 44
 processing epoch 45
 processing epoch 46
 processing epoch 47
 p

0.863924686296065

In [31]:
accuracy.mae(predictions)

MAE:  0.6645


0.6644956302577323

Still heading in the right direction. 
Also the improvements are miniscule, I'm not sure it's really worth the computational cost.
While there is time and improvement, we'll keep tweaking 

### Iteration 6
Let's push the n_factors and epochs to 100, and adjust the learning rate to 0.005.

_Note: did not get time to run this before we started testing the app...

In [None]:
# Let's tune again
algo6 = SVDpp(n_factors=100, n_epochs=100, lr_all=0.005, reg_all=0.05, verbose=True)

# Train the algorithm on the trainset, and predict ratings for the testset
algo6.fit(trainset)
predictions = algo6.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)