In [75]:
# Our previous model was using ALS with Pyspark. Here we take the approach of using other matrix factorization approaches.

import pandas as pd
import numpy as np

In [76]:
# We can read in our data from our other notebook
ii = pd.read_csv('Users18324DesktopElectronics_5.json')

In [77]:
ii.head()
ii.drop('Unnamed: 0', axis=1, inplace=True)

In [78]:
ii.head()

Unnamed: 0,ProductId,overall,reviewerID,title
0,446697192,4.0,A1A8QJ282YIUQ0,Hollywood Is like High School with Money
1,446697192,4.0,AX9C10JD538D9,Hollywood Is like High School with Money
2,446697192,3.0,A2UAM0ITC30078,Hollywood Is like High School with Money
3,528881469,2.0,A2CPBQ5W4OGBX,Rand McNally 528881469 7-inch Intelliroute TND...
4,528881469,5.0,A3H86FCI0QZH7T,Rand McNally 528881469 7-inch Intelliroute TND...


In [79]:
ii.shape

(1049630, 4)

### Reducing the dataset

We can get a breakdown of how many reviews there are for a product. We can reduce our dataset to products that have at least 50 ratings.

In [80]:
new = pd.DataFrame(ii.groupby('ProductId')['overall'].count())

In [81]:
new = new[new.overall >= 50]

In [82]:
new.shape

(3802, 1)

In [83]:
new = new.sort_values(by='overall',ascending=False)
new

Unnamed: 0_level_0,overall
ProductId,Unnamed: 1_level_1
B00004ZCJJ,2786
B00004ZCJI,2656
B00009KLAE,2642
B003L1ZYYW,2549
B0019HL8Q8,2513
...,...
B004E5J61G,50
B000WEOWM6,50
B0000C8VU8,50
B0000CD0B7,50


In [84]:
df = ii[ii.ProductId.isin(list(new.index))]

In [85]:
df.shape

(575349, 4)

To make the recommender system more relevant, we can also filter the users to those who have at least 15 reviews. This reduces our dataset down to a less computationally taxing size. It will also improve the performance of the recommender system.

In [86]:
users = pd.DataFrame(ii.groupby('reviewerID')['overall'].count())
users = users[users.overall >= 15]

In [87]:
df = df[df.reviewerID.isin(users.index)]
df.shape

(16241, 4)

In [88]:
df1 = df.copy().reset_index(drop=True)

In [89]:
df1

Unnamed: 0,ProductId,overall,reviewerID,title
0,B00001OWYM,5.0,A25E8V5JMW43RY,Maxell 290058 Vhs Cleaner Dry
1,B00001OWYM,5.0,A25E8V5JMW43RY,Maxell 290058 Vhs Cleaner Dry
2,B00001OWYM,4.0,A3W4D8XOGLWUN5,Maxell 290058 Vhs Cleaner Dry
3,B00001OWYM,4.0,A3W4D8XOGLWUN5,Maxell 290058 Vhs Cleaner Dry
4,B00001P4XA,3.0,A2FD5LWT2BXO94,Koss 'The Plug' In-Ear Headphones (Black)
...,...,...,...,...
16236,B0091V0A9U,4.0,A54S9CIUV5VNB,"Pioneer Single DIN In-Dash CD/CD-R/RW, MP3/WMA..."
16237,B0091V0A9U,5.0,A2SRG247VDFTAX,"Pioneer Single DIN In-Dash CD/CD-R/RW, MP3/WMA..."
16238,B0096T7TQE,1.0,A3IY316DRNF5F2,JBL Micro II Ultra-Portable Multimedia Speaker...
16239,B0096T7TQE,4.0,AHS6PX6H22WW1,JBL Micro II Ultra-Portable Multimedia Speaker...


### Creating unique ids

In [90]:
checklist = list(df1.ProductId.unique())

In [91]:
# Creating unique product integer ids
count = 0
df1['prod_int_id'] = 0
for i in checklist:
    df1['prod_int_id'] = np.where(df1.ProductId == i, count + 1, df1.prod_int_id)
    count += 1

In [92]:
checklist2 = list(df1.reviewerID.unique())

In [93]:
# Creating unique user integer ids
count = 0
df1['user_int_id'] = 0
for i in checklist2:
    df1['user_int_id'] = np.where(df1.reviewerID == i, count + 1, df1.user_int_id)
    count += 1

In [94]:
df1 = df1.drop(['ProductId', 'reviewerID'], axis=1)
df1

Unnamed: 0,overall,title,prod_int_id,user_int_id
0,5.0,Maxell 290058 Vhs Cleaner Dry,1,1
1,5.0,Maxell 290058 Vhs Cleaner Dry,1,1
2,4.0,Maxell 290058 Vhs Cleaner Dry,1,2
3,4.0,Maxell 290058 Vhs Cleaner Dry,1,2
4,3.0,Koss 'The Plug' In-Ear Headphones (Black),2,3
...,...,...,...,...
16236,4.0,"Pioneer Single DIN In-Dash CD/CD-R/RW, MP3/WMA...",3161,275
16237,5.0,"Pioneer Single DIN In-Dash CD/CD-R/RW, MP3/WMA...",3161,141
16238,1.0,JBL Micro II Ultra-Portable Multimedia Speaker...,3162,444
16239,4.0,JBL Micro II Ultra-Portable Multimedia Speaker...,3162,1126


### Summary Statistics

In [95]:
# Avg num ratings per users
print("Avg num ratings per user: {}".format(df.groupby('reviewerID')['overall'].count().mean()))

# Avg num ratings per product
print("Avg num ratings per product: {}".format(df.groupby('ProductId')['overall'].count().mean()))

Avg num ratings per user: 10.02530864197531
Avg num ratings per product: 5.136306135357369


## The Surprise library for recommendations

We can use the surprise library to compare several approaches to our recommendation system. Here we run cross validation on several different prediction algorithms to identify accuracy and computation times and pick a top performing algorithm.

In [96]:
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split
from surprise import SVDpp, accuracy
from surprise.model_selection import cross_validate
from surprise import SVD, SlopeOne, NMF, KNNBasic, KNNWithMeans, BaselineOnly, CoClustering

In [97]:
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(df1[['user_int_id', 'prod_int_id', 'overall']], reader)

In [98]:
benchmark = []

# Iterate over all algorithms
for algorithm in [SVD(), SlopeOne(), NMF(), KNNBasic(), KNNWithMeans(), BaselineOnly(), CoClustering()]:
    
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')  

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,0.898673,0.515327,0.034333
SlopeOne,0.921118,0.152011,0.034661
KNNBasic,0.937714,0.053651,0.076683
BaselineOnly,0.940794,0.01365,0.018668
KNNWithMeans,0.941776,0.074367,0.070324
CoClustering,1.016477,0.472633,0.019013
NMF,1.06898,0.683662,0.023013


### Identifying the optimal appraoch: SVD

SVD seems to have performed best, so we expand on that below. Parameters can always be further optimized through hyperparameter tuning. The SVD uses stochastic gradient descent to minimize the error and so we can adjust parameters such as the learning rate, epochs, and other regularization parameters.

In [99]:
print('Using ALS')

algo = SVD(n_epochs=5, lr_all = 0.01, reg_all = 0.02, random_state= 42)
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

Using ALS


{'test_rmse': array([0.92062425, 0.92525717, 0.93906774]),
 'fit_time': (0.15303277969360352, 0.14400029182434082, 0.1389479637145996),
 'test_time': (0.025000572204589844, 0.05300021171569824, 0.02499985694885254)}

### Train/test split and model building using SVD

We now create the train/test sets and can make predictions for the ratings using our SVD model.

In [100]:
trainset, testset = train_test_split(data, test_size=0.25)

algo = SVD(n_epochs=5, lr_all = 0.01, reg_all = 0.02, random_state= 42)

predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 0.9309


0.9308537018543841

In [112]:
predictions[:15]

[Prediction(uid=168, iid=56, r_ui=4.0, est=4.391373283274269, details={'was_impossible': False}),
 Prediction(uid=717, iid=564, r_ui=5.0, est=4.43242118627292, details={'was_impossible': False}),
 Prediction(uid=509, iid=214, r_ui=4.0, est=4.714994559764845, details={'was_impossible': False}),
 Prediction(uid=684, iid=1684, r_ui=5.0, est=4.181861953014449, details={'was_impossible': False}),
 Prediction(uid=36, iid=2954, r_ui=4.0, est=4.395872017559877, details={'was_impossible': False}),
 Prediction(uid=930, iid=1631, r_ui=5.0, est=4.709698870037305, details={'was_impossible': False}),
 Prediction(uid=246, iid=172, r_ui=4.0, est=4.631199178773854, details={'was_impossible': False}),
 Prediction(uid=1024, iid=2813, r_ui=5.0, est=4.769432210855679, details={'was_impossible': False}),
 Prediction(uid=953, iid=1271, r_ui=5.0, est=4.504474960222594, details={'was_impossible': False}),
 Prediction(uid=1117, iid=521, r_ui=5.0, est=4.4907126999421285, details={'was_impossible': False}),
 Pred

In [102]:
def get_Iu(uid):
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0

In [103]:
df = pd.DataFrame(predictions, columns=['user_int_id', 'prod_int_id', 'rating', 'predicted_rating', 'details'])

In [104]:
df['items_rated_by_user'] = df.user_int_id.apply(get_Iu)
df['num_ratings_for_item'] = df.prod_int_id.apply(get_Ui)

In [105]:
df['error'] = abs(df.predicted_rating - df.rating)
df

Unnamed: 0,user_int_id,prod_int_id,rating,predicted_rating,details,items_rated_by_user,num_ratings_per_item,error
0,168,56,4.0,4.391373,{'was_impossible': False},23,2,0.391373
1,717,564,5.0,4.432421,{'was_impossible': False},8,2,0.567579
2,509,214,4.0,4.714995,{'was_impossible': False},7,5,0.714995
3,684,1684,5.0,4.181862,{'was_impossible': False},13,4,0.818138
4,36,2954,4.0,4.395872,{'was_impossible': False},23,2,0.395872
...,...,...,...,...,...,...,...,...
4056,1431,1611,3.0,4.306039,{'was_impossible': False},2,4,1.306039
4057,315,2709,5.0,4.742744,{'was_impossible': False},14,3,0.257256
4058,821,1242,1.0,4.187865,{'was_impossible': False},10,0,3.187865
4059,1336,795,3.0,4.330160,{'was_impossible': False},9,9,1.330160


In [106]:
df2 = df1.drop(columns = ['overall', 'user_int_id'], axis = 1)

In [107]:
df2 = df2.drop_duplicates().reset_index(drop=True)

In [108]:
result = pd.merge(df,df2, on='prod_int_id').drop('details',axis=1)

### Best predictions

In [109]:
best_predictions = result.sort_values(by='error')[:10]
best_predictions

Unnamed: 0,user_int_id,prod_int_id,rating,predicted_rating,items_rated_by_user,num_ratings_per_item,error,title
2805,524,1590,5.0,5.0,13,28,0.0,Belkin 8-Outlet Power Strip Surge Protector wi...
2441,595,211,5.0,5.0,8,18,0.0,BlueRigger High Speed HDMI to DVI Adapter Cabl...
627,302,653,5.0,5.0,10,40,0.0,AmazonBasics USB 2.0 Extension Cable - A-Male ...
141,1273,776,5.0,5.0,10,27,0.0,Tripp Lite Isobar 2 Outlet Surge Protector Pow...
2470,1422,3007,5.0,5.0,15,21,0.0,Crucial 2GB Single DDR3/DDR3L 1600 MT/S (PC3-1...
2472,193,3007,5.0,5.0,17,21,0.0,Crucial 2GB Single DDR3/DDR3L 1600 MT/S (PC3-1...
2473,338,3007,5.0,5.0,9,21,0.0,Crucial 2GB Single DDR3/DDR3L 1600 MT/S (PC3-1...
364,287,1281,5.0,5.0,8,22,0.0,Hosa CPR-203 Dual 1/4&quot; TS to Dual RCA Ste...
2485,883,2375,5.0,5.0,19,13,0.0,Belkin 6-Outlet Pivot-Plug Wall Mount Power St...
2356,308,1272,5.0,5.0,43,21,0.0,"NETGEAR 8-Port Fast Ethernet Unmanaged Switch,..."


### Worst predictions

In [110]:
worst_predictions = result.sort_values(by='error')[-10:]
worst_predictions

Unnamed: 0,user_int_id,prod_int_id,rating,predicted_rating,items_rated_by_user,num_ratings_per_item,error,title
3005,1442,972,1.0,4.556105,8,0,3.556105,JLab JBuds J5M Metal Earbuds Style Headphones ...
3874,630,2180,1.0,4.560319,4,1,3.560319,BUFFALO AirStation HighPower N300 Gigabit Wire...
2370,157,53,1.0,4.57211,4,6,3.57211,"Snap-N-Store CD Storage Box, 13.25 x 5.125 x 5..."
2167,955,1997,1.0,4.573888,9,17,3.573888,SanDisk Extreme PRO 32GB up to 95MB/s UHS-I/U3...
3111,759,1609,1.0,4.603056,8,0,3.603056,PQI USB 2.0 CompactFlash (CF) Card Reader
1551,937,739,1.0,4.626104,16,6,3.626104,iNassen Compatible Earphone Adapter Headphone
93,437,1679,1.0,4.627656,9,11,3.627656,Monoprice 6-Feet USB 2.0 A Male to A Female Ex...
2514,246,994,1.0,4.631199,10,0,3.631199,Asus ProArt PA248Q 24.1 Inch LED Monitor
283,1546,1365,1.0,4.686406,2,5,3.686406,Logitech S150 USB Speakers with Digital Sound
3250,621,3046,1.0,4.813808,6,7,3.813808,Logitech K360 Wireless USB Desktop Keyboard &m...
