In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import os
from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly
from surprise import accuracy
from surprise import SVD
from surprise import SVDpp
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise.model_selection import RandomizedSearchCV


In [2]:
#notify me when a long running cell is complete
%load_ext jupyternotify
import time

<IPython.core.display.Javascript object>

In [3]:
data = pd.read_csv('data/1mdf.csv')

#only columns relevant to surprise
data = data.drop(['m_decade'], axis=1)

#for split
data['r_date'] = data['r_date'].astype('datetime64[ns]')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 4 columns):
 #   Column   Non-Null Count    Dtype         
---  ------   --------------    -----         
 0   mid      1000000 non-null  int64         
 1   cust_id  1000000 non-null  int64         
 2   rating   1000000 non-null  float64       
 3   r_date   1000000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 30.5 MB


In [4]:
#reorder for suprise
data = data[['cust_id', 'mid', 'rating', 'r_date']]
data.head()

Unnamed: 0,cust_id,mid,rating,r_date
0,510180,3113,3.0,1999-12-12
1,1589382,3113,4.0,2000-03-11
2,1878798,3113,2.0,2000-08-08
3,1259176,3113,4.0,2000-11-10
4,873369,3113,3.0,2000-11-20


# Train/Test Split: out of time approach

In [5]:
testsize = round(len(data) * 0.2)
testsize

200000

In [6]:
data.groupby('cust_id')['rating'].count().sort_values(ascending=False)

cust_id
305344     192
2439493    161
1664010    155
387418     154
2118461    154
          ... 
1933293      1
834542       1
834484       1
461356       1
360932       1
Name: rating, Length: 290022, dtype: int64

This would work better with more data and less users who don't only vote once

In [7]:
data = data.sort_values(by=['cust_id'])
data

Unnamed: 0,cust_id,mid,rating,r_date
7258,6,13462,3.0,2004-11-13
361326,6,5112,3.0,2005-01-12
251023,6,6339,1.0,2004-09-25
173559,6,15105,3.0,2005-12-04
458082,6,14112,3.0,2005-10-26
...,...,...,...,...
829084,2649426,17207,3.0,2004-05-11
952614,2649426,5909,3.0,2005-07-13
756283,2649426,111,4.0,2004-05-25
458974,2649429,16792,5.0,2002-06-06


In [8]:
#splitting data into 3 groups based on how many reviews they've given
for cust in data['cust_id'].unique():
    i = data.index[data['cust_id'] == cust]
    
    #if more than 2, the final review will be in holdout, 2nd to last will be in the test
    if len(i)> 2:
        data.loc[i[-1], 'split'] = 3
        data.loc[i[-2], 'split'] = 2
        data.loc[i[:-2], 'split'] = 1
        
    #if 2, the final review will be in the test
    if len(i) == 2:
        data.loc[i[-1], 'split'] = 2
        data.loc[i[0], 'split'] = 1
        
    #everyone with one review gets a linear temporal split
    else:
        data.loc[i[0], 'split'] = 1
        
holdout = data[(data['split'] == 3)]
test = data[(data['split'] == 2)]
train = data[(data['split'] == 1 )]

In [9]:
%notify

<IPython.core.display.Javascript object>

In [10]:
print('holdout:', holdout.shape)
print('test:', test.shape)
print('train:', train.shape)

holdout: (128183, 5)
test: (185571, 5)
train: (686246, 5)


In [11]:
#sort vals by date, prep to split linearly
train = train.sort_values(by=['r_date'])
train

Unnamed: 0,cust_id,mid,rating,r_date,split
0,510180,3113,3.0,1999-12-12,1.0
731,830363,11242,3.0,1999-12-21,1.0
974,1435350,8327,5.0,1999-12-27,1.0
4106,1267764,4670,3.0,1999-12-30,1.0
4883,882798,16438,4.0,1999-12-30,1.0
...,...,...,...,...,...
678419,330800,5495,3.0,2005-12-31,1.0
747707,2536322,5919,4.0,2005-12-31,1.0
203960,2376301,2395,5.0,2005-12-31,1.0
427098,1125499,9591,4.0,2005-12-31,1.0


In [12]:
#adding only the end of training set to test
holdout2 = train[-(testsize -len(holdout)):]
holdout = pd.concat([holdout, holdout2])

#200k
print(holdout.shape)

#sanity check
holdout = holdout.sort_values(by=['r_date'])
holdout

(200000, 5)


Unnamed: 0,cust_id,mid,rating,r_date,split
6318,2114455,17563,3.0,1999-12-30,3.0
3651,872963,1642,4.0,1999-12-30,3.0
14361,1192830,16788,4.0,2000-01-05,3.0
14511,1426824,3870,2.0,2000-01-05,3.0
13315,1972971,17560,3.0,2000-01-05,3.0
...,...,...,...,...,...
14810,28205,10371,4.0,2005-12-31,1.0
995954,2235350,8993,5.0,2005-12-31,1.0
771825,361630,17149,1.0,2005-12-31,1.0
81713,1662699,16242,4.0,2005-12-31,1.0


In [13]:
#remove holdout rows
train = train.loc[~train.index.isin(holdout.index)]
train.shape

(614429, 5)

In [14]:
#fill in test with random samples
test1 = train.sample(n=(200000-len(test)), replace=False, random_state=1)
test = pd.concat([test, test1])

#200k
test.shape

(200000, 5)

In [15]:
train = train.loc[~train.index.isin(test1.index)]
train.shape

(600000, 5)

In [16]:
#train.to_csv('data/train_1M.csv', index = False)
#test.to_csv('data/test_1M.csv', index=False)
#holdout.to_csv('data/ho_1M.csv', index=False)

In [17]:
%notify

<IPython.core.display.Javascript object>

# Baseline using Surprise

In [52]:
reader = Reader(rating_scale=(1,5))

In [53]:
train_data = Dataset.load_from_df(train[['cust_id','mid','rating']], reader)
test_data = Dataset.load_from_df(test[['cust_id','mid','rating']], reader)
ho_data = Dataset.load_from_df(holdout[['cust_id','mid','rating']], reader)

In [54]:
#correct surprise dataset format
train_sr = train_data.build_full_trainset()


test_sr1 = test_data.build_full_trainset()
test_sr = test_sr1.build_testset()

ho_sr1 = ho_data.build_full_trainset()
ho_sr = ho_sr1.build_testset()

In [55]:
#using stochastic gradient descent bc that is used in svd
#20 epochs because that is the svd default
bsl_options = {'method': 'sgd', 'n_epochs':20}
bias_baseline = BaselineOnly(bsl_options)
bias_baseline.fit(train_sr)
predictions = bias_baseline.test(test_sr)

Estimating biases using sgd...


In [57]:
accuracy.mae(predictions)

MAE:  0.7935


0.7935138946573231

In [58]:
accuracy.rmse(predictions, verbose=True)

RMSE: 0.9922


0.9921980884454588

In [59]:
test_user_list = ['1458102', '484142', '1347916']

In [60]:
def get_recs(user_list):
    all_recommendations = []
    
    def rec_content(cust_id):
        # Get a list of all the movies the user has not rated
        all_content = data['mid'].unique()
        user_content = data[data['cust_id'] == 'cust_id']['mid'].unique()
        new_content = list(set(all_content) - set(user_content))

        # Predict the ratings for the new movies
        predictions = [bias_baseline.predict('cust_id', mid) for mid in new_content]

        # Sort the predictions by estimated rating
        predictions.sort(key=lambda x: x.est, reverse=True)

        # Get the top 10 recommendations
        top_recommendations = [prediction.iid for prediction in predictions[:10]]

        return [(user_id, movie_id) for movie_id in top_recommendations]

    for user_id in user_list:
        user_recs = rec_content(user_id)
        all_recommendations.extend(user_recs)

    recommendations_df = pd.DataFrame(all_recommendations, columns=["userId", "movieId"])
    return recommendations_df

In [61]:
testrecs = get_recs(test_user_list)
testrecs

Unnamed: 0,userId,movieId
0,1458102,7230
1,1458102,7057
2,1458102,2102
3,1458102,7751
4,1458102,16587
5,1458102,3928
6,1458102,3456
7,1458102,12891
8,1458102,14961
9,1458102,1476


In [None]:
def Diversity(topNPredicted, simsAlgo):
 n = 0
 total = 0
 simsMatrix = simsAlgo.compute_similarities()
 for userID in topNPredicted.keys():
 pairs = itertools.combinations(topNPredicted[userID], 2)
 for pair in pairs:
 movie1 = pair[0][0]
 movie2 = pair[1][0]
 innerID1 = simsAlgo.trainset.to_inner_iid(str(movie1))
 innerID2 = simsAlgo.trainset.to_inner_iid(str(movie2))
 similarity = simsMatrix[innerID1][innerID2]
 total += similarity
 n += 1
S = total / n
 return (1-S)
view raw

# First Simple Model

In [27]:
#instance of SVD algorithm, fit using training set
svd = SVD(random_state=1)
svd.fit(train_sr)

#predicting based on validation set
preds = svd.test(test_sr)

In [28]:
#mae
accuracy.mae(preds)

MAE:  0.8052


0.8051670661228093

In [29]:
#rmse score
accuracy.rmse(preds)

RMSE: 1.0067


1.006748871084584

# Tuning First Simple Model

This model performed slightly worse. Tuning model below using GridSearchCV and RandomizedSearchCV

**Note:** From my research, I have learned that SVD models are very sensitive and prone to overfitting. ***Because this is an academic exercise, I am taking the time to test minute changes.***

### Gridsearch SVD Model

In [48]:
#gridsearch params
param_grid = {'n_factors':[50, 100, 120, 150, 200], 
              #default 100, quality of training set preds tends to grow as n grows
              'n_epochs': [20, 30, 40, 60], #default 20
              'lr_all': [0.005, 0.008, 0.01, 0.125], 
              #learning rate all params, default .005
              'reg_all': [0.02, 0.08, 0.1, 0.12, 0.16]} 
                #reg term for all params, default .02


#default measures = mae and rmse
#default cv = 5
#refit = false
gridsearch = GridSearchCV(SVD,
                  param_grid,
                  n_jobs = -1)

gridsearch.fit(train_data)

print(gridsearch.best_score['rmse'])
print(gridsearch.best_params['rmse'])

0.9893126323300756
{'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.12}


In [50]:
algo = SVD(n_factors= 50, n_epochs= 30, lr_all= 0.005, reg_all= 0.12)
algo.fit(train_sr)
preds_final = algo.test(test_sr)

accuracy.rmse(preds_final)
accuracy.mae(preds_final)

RMSE: 0.9927
MAE:  0.7948


0.7947657472013869

In [51]:
preds_holdout = algo.test(ho_sr)

accuracy.rmse(preds_holdout)
accuracy.mae(preds_holdout)

RMSE: 0.9977
MAE:  0.8001


0.800059632300629

### RandomizedSearchCV

In [23]:
param_grid = {'n_factors':[50, 100, 120, 150, 200], 
              #default 100, quality of training set preds tends to grow as n grows
              'n_epochs': [20, 30, 40, 60], #default 20
              'lr_all': [0.005, 0.008, 0.01, 0.125], 
              #learning rate all params, default .005
              'reg_all': [0.02, 0.08, 0.1, 0.12, 0.16]} 
                #reg term for all params, default .02

rs = RandomizedSearchCV(SVD,
                        param_grid,
                       n_jobs = -1,
                       random_state=1) #defined above
#n_iter default = 10
#cv = 5
rs.fit(train_data)

print(rs.best_score['rmse'])
print(rs.best_params['rmse'])

0.992213592877159
{'n_factors': 120, 'n_epochs': 30, 'lr_all': 0.008, 'reg_all': 0.16}


In [28]:
algo1 = SVD(n_factors= 120, n_epochs= 30, lr_all= 0.008, reg_all= 0.16)
algo1.fit(train_sr)
preds_final1 = algo1.test(test_sr)

accuracy.rmse(preds_final1)
accuracy.mae(preds_final1)

RMSE: 0.9962
MAE:  0.7961


0.796111061064986

In [31]:
preds_holdout1 = algo1.test(ho_sr)

accuracy.rmse(preds_holdout1)
accuracy.mae(preds_holdout1)

RMSE: 1.0004
MAE:  0.8012


0.8012347588979716

In [46]:
algo2 = SVD(n_factors= 120, n_epochs= 30)
algo2.fit(train_sr)
preds_final2 = algo2.test(test_sr)

accuracy.rmse(preds_final2)
accuracy.mae(preds_final2)

RMSE: 1.0141
MAE:  0.8096


0.8096008532349762

In [47]:
%notify

<IPython.core.display.Javascript object>

# SVD++

In [38]:
svdpp = SVDpp()
svdpp.fit(train_sr)
preds_svdpp = svdpp.test(test_sr)

accuracy.rmse(preds_svdpp)
accuracy.mae(preds_svdpp)

RMSE: 1.0090
MAE:  0.8079


0.8078874856407008

In [40]:
#gridsearch params
param_grid = {'n_factors':[50, 100, 120, 150, 200], 
              #default 100, quality of training set preds tends to grow as n grows
              'n_epochs': [20, 30, 40, 60], #default 20
              'lr_all': [0.005, 0.008, 0.01, 0.125], 
              #learning rate all params, default .005
              'reg_all': [0.02, 0.08, 0.1, 0.12, 0.16]} 
                #reg term for all params, default .02


#default measures = mae and rmse
#default cv = 5
#refit = false
gridsearch = GridSearchCV(SVDpp,
                  param_grid,
                  n_jobs = -1)

gridsearch.fit(train_data)

print(gridsearch.best_score['rmse'])
print(gridsearch.best_params['rmse'])



0.9900869758257425
{'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.16}


In [41]:
param_grid = {'n_factors':[50, 100, 120, 150, 200], 
              #default 100, quality of training set preds tends to grow as n grows
              'n_epochs': [20, 30, 40, 60], #default 20
              'lr_all': [0.005, 0.008, 0.01, 0.125], 
              #learning rate all params, default .005
              'reg_all': [0.02, 0.08, 0.1, 0.12, 0.16]} 
                #reg term for all params, default .02

rs = RandomizedSearchCV(SVDpp,
                        param_grid,
                       n_jobs = -1,
                       random_state=1) #defined above
#n_iter default = 10
#cv = 5
rs.fit(train_data)

print(rs.best_score['rmse'])
print(rs.best_params['rmse'])

0.9923626753625271
{'n_factors': 150, 'n_epochs': 20, 'lr_all': 0.008, 'reg_all': 0.16}


In [45]:
svdpp = SVDpp(n_factors= 150, n_epochs= 20, lr_all= 0.008, reg_all=0.16)
svdpp.fit(train_sr)
preds_svdpp = svdpp.test(test_sr)

accuracy.rmse(preds_svdpp)
accuracy.mae(preds_svdpp)

RMSE: 0.9955
MAE:  0.7989


0.7988548684114468

In [48]:
svdpp3 = SVDpp(n_factors= 150, n_epochs= 20)
svdpp3.fit(train_sr)
preds_svdpp3 = svdpp3.test(test_sr)

accuracy.rmse(preds_svdpp3)
accuracy.mae(preds_svdpp3)

RMSE: 1.0276
MAE:  0.8276


0.8275835461546746

In [42]:
%notify

<IPython.core.display.Javascript object>

references: 
https://towardsdatascience.com/build-a-recommender-system-yelp-rating-prediction-example-collaborative-filtering-28a6e48a8cc

surprise tutorialL
https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b

