In [38]:
import pandas as pd
from sklearn import model_selection
from datetime import datetime
import numpy as np
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import KNNBasic
from surprise import SVDpp
from surprise import SlopeOne
from surprise import accuracy
from surprise.model_selection import GridSearchCV

## Data Loading

In [5]:
df = pd.read_json('Digital_Music_5.json', lines=True)
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5,3.0,True,"06 3, 2013",A2TYZ821XXK2YZ,3426958910,{'Format:': ' Audio CD'},Garrett,"This is awesome to listen to, A must-have for ...",Slayer Rules!,1370217600,
1,5,,True,"10 11, 2014",A3OFSREZADFUDY,3426958910,{'Format:': ' Audio CD'},Ad,bien,Five Stars,1412985600,
2,5,,True,"02 11, 2014",A2VAMODP8M77NG,3426958910,{'Format:': ' Audio CD'},JTGabq,It was great to hear the old stuff again and I...,SLAYER!!!!!!!!!!!!!!!!!!!!!,1392076800,
3,4,3.0,False,"12 7, 2013",AAKSLZ9IDTEH0,3426958910,{'Format:': ' Audio CD'},john F&#039;n doe,well best of's are a bit poison normally but t...,slayer greatest hits! you mean everything righ...,1386374400,
4,5,,True,"06 12, 2016",A3OH43OZJLKI09,5557706259,{'Format:': ' Audio CD'},melinda a goodman,What can I say? This is Casting Crowns!!!This ...,"This is a good, blessing filled",1465689600,


In [6]:
df['unixReviewTime'] = df['unixReviewTime'].map(lambda x: datetime.fromtimestamp(x))

In [7]:
df['style'] = df['style'].astype(str)
df['style'] = df['style'].apply(lambda x: x.split(':')[-1].strip(" }'"))

In [8]:
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5,3.0,True,"06 3, 2013",A2TYZ821XXK2YZ,3426958910,Audio CD,Garrett,"This is awesome to listen to, A must-have for ...",Slayer Rules!,2013-06-02 17:00:00,
1,5,,True,"10 11, 2014",A3OFSREZADFUDY,3426958910,Audio CD,Ad,bien,Five Stars,2014-10-10 17:00:00,
2,5,,True,"02 11, 2014",A2VAMODP8M77NG,3426958910,Audio CD,JTGabq,It was great to hear the old stuff again and I...,SLAYER!!!!!!!!!!!!!!!!!!!!!,2014-02-10 16:00:00,
3,4,3.0,False,"12 7, 2013",AAKSLZ9IDTEH0,3426958910,Audio CD,john F&#039;n doe,well best of's are a bit poison normally but t...,slayer greatest hits! you mean everything righ...,2013-12-06 16:00:00,
4,5,,True,"06 12, 2016",A3OH43OZJLKI09,5557706259,Audio CD,melinda a goodman,What can I say? This is Casting Crowns!!!This ...,"This is a good, blessing filled",2016-06-11 17:00:00,


## KNN Basic Model

In [12]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df[['reviewerID', 'asin', 'overall']], reader)

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

param_grid = {'n_epochs': [5, 10, 15], 'lr_all': [0.002, 0.005, 0.01],
              'reg_all': [0.02, 0.04, 0.06]}

gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=3)

gs.fit(data)


best_params = gs.best_params['rmse']

print("Best Parameters:", best_params)

best_knn_basic_model = KNNBasic(n_epochs=best_params['n_epochs'], 
                     lr_all=best_params['lr_all'], reg_all=best_params['reg_all'])

best_knn_basic_model.fit(data.build_full_trainset())

predictions = best_knn_basic_model.test(testset)

rmse = accuracy.rmse(predictions)
print("RMSE with Tuned Model:", rmse)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

## SVD++ Model

In [15]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df[['reviewerID', 'asin', 'overall']], reader)

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

param_grid = {'n_epochs': [5, 10, 15], 'lr_all': [0.002, 0.005, 0.01],
              'reg_all': [0.02, 0.04, 0.06]}

gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3)

gs.fit(data)


best_params = gs.best_params['rmse']

print("Best Parameters:", best_params)

best_svd_pp_model = SVDpp(n_epochs=best_params['n_epochs'], 
                     lr_all=best_params['lr_all'], reg_all=best_params['reg_all'])

best_svd_pp_model.fit(data.build_full_trainset())

predictions = best_svd_pp_model.test(testset)

rmse = accuracy.rmse(predictions)
print("RMSE with Tuned Model:", rmse)

Best Parameters: {'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.02}
RMSE: 0.4015
RMSE with Tuned Model: 0.4015394721076082


## Slope One Model

In [18]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df[['reviewerID', 'asin', 'overall']], reader)

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

slope_one_model = SlopeOne()

slope_one_model.fit(data.build_full_trainset())

predictions = slope_one_model.test(testset)

rmse = accuracy.rmse(predictions)
print("RMSE with Tuned Model:", rmse)

RMSE: 0.1230
RMSE with Tuned Model: 0.1229537349211004


## Baseline

predict using the mean rating for each existing product in the training dataset. If item does not exist, predict with the overall mean rating across the entire training set.

In [72]:
data = df[['reviewerID', 'asin', 'overall']]
trainset, testset = model_selection.train_test_split(data, test_size=0.2, random_state=42)

In [73]:
train_mean = trainset.groupby('asin').mean()
train_mean.head()

Unnamed: 0_level_0,overall
asin,Unnamed: 1_level_1
3426958910,4.75
5557505946,5.0
5557706259,4.555556
5559166928,4.875
7799420340,4.803571


In [74]:
def predict(test):
    overall_mean = trainset['overall'].mean()
    userId, productId, rating = test
    if productId in train_mean.index:
        return train_mean.loc[productId]['overall']
    return overall_mean

In [75]:
def rmse(predicted, expected):
    return np.sqrt(sum((predicted - expected) ** 2) / len(testset))

In [76]:
predictions = [predict(i) for i in testset.to_numpy()]

In [77]:
predictions = np.array(predictions)

In [78]:
predictions

array([4.8372093 , 4.86666667, 4.63636364, ..., 5.        , 4.88888889,
       4.4       ])

In [79]:
testset

Unnamed: 0,reviewerID,asin,overall
89071,A2C2S9BBPWTK9Y,B00CZF8B68,5
131119,AQR5ZB4BIJFLT,B00122Z6ZY,5
116727,A3QEOGJXHI6UEG,B0186SEPNU,5
76480,A18REVWRZ4GA7M,B007JCPEVM,5
160338,A2IJCEG2J34W61,B00E67KLDE,5
...,...,...,...
85246,A3IUDYSAD6LCA4,B00B3EUYAS,5
34762,A2BYV7S1QP2YIG,B0011XCZ1M,4
124658,A2JXX1BAZ19Y4X,B000VRSXHE,3
3248,A2XLYJIQHITC3X,B000THG2K6,5


In [80]:
actual = np.array(testset['overall'].values)

In [81]:
rmse(predictions, actual)

0.7063984511780854