# Packages

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances 
from scipy.sparse.linalg import svds
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import evaluate
from surprise.model_selection import GridSearchCV

# Helper Functions

In [19]:
#Pandas DataFrame with all Predictions
def get_Iu(uid):
    #return the number of items rated by given user
    #args: 
    #uid: the id of the user
    #returns: 
    #the number of items rated by the user

    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    #return number of users that have rated given item
    #args:
    #iid: the raw id of the item
    #returns:
    #the number of users that have rated the item.
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0

# Data Collection

The Data was taken from two different beer review sites: Ratebeer.com, BeerAdvocate.com 

To narrow down the 1.5 Million reviews I opted to only recommend canned beer. Beer Data was obtained from: CraftCans.com

# Data Preperation

In [3]:
# Import the Data
df1 = pd.read_csv('beer_reviews.csv')
df1.rename(columns={'beer_name': 'name'}, inplace=True)
df2 = pd.read_csv('beers.csv', index_col='Unnamed: 0')

In [4]:
#Merge DataFrames and drop uneeded features
final = df1.merge(df2, on='name')
final.drop(columns = 'brewery_id_x', inplace=True)
final.drop(columns = 'brewery_id_y', inplace=True)
final.drop(columns = 'beer_style', inplace=True)
final.drop(columns = 'beer_abv', inplace=True)
final.drop(columns = 'beer_beerid', inplace=True)

In [5]:
# Since there was no user id with the data we have to make our own
#Create User ID
final['user_id'] = final.review_profilename.astype('category').cat.codes

In [6]:
final.head()

Unnamed: 0,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,review_palate,review_taste,name,abv,ibu,id,style,ounces,user_id
0,Caldera Brewing Company,1251327677,4.0,3.5,3.5,NJpadreFan,4.0,4.0,Caldera Pale Ale,0.056,55.0,1419,American Pale Ale (APA),12.0,2966
1,Caldera Brewing Company,1250928902,2.5,3.0,3.5,vacax,3.5,2.5,Caldera Pale Ale,0.056,55.0,1419,American Pale Ale (APA),12.0,10492
2,Caldera Brewing Company,1249866208,4.0,3.5,4.0,d0ggnate,4.0,3.5,Caldera Pale Ale,0.056,55.0,1419,American Pale Ale (APA),12.0,5938
3,Caldera Brewing Company,1249847121,4.5,3.5,4.0,babyhobbes,3.5,4.0,Caldera Pale Ale,0.056,55.0,1419,American Pale Ale (APA),12.0,4856
4,Caldera Brewing Company,1249556277,4.5,3.5,4.0,mdagnew,4.0,4.0,Caldera Pale Ale,0.056,55.0,1419,American Pale Ale (APA),12.0,8260


In [7]:
# We have to create a dataframe of beers. To do this we will create a new dataframe from our final dataframe and
# drop the duplicated
#Create Frame with Beer attributes
beers = final[['id', 'brewery_name', 'name', 'abv', 'ibu', 'style', 'ounces' ]]
#Drop Duplicates
beers = beers.drop_duplicates(subset='name')

In [8]:
# Since not all Beers have an IBU value, we can replace them with zero
# Fill NA's with zero so none of the ibus are missing. 
beers.fillna(0, inplace=True)

In [9]:
# Now lets create a dataFrame with just the reviews
ratings = final[['id', 'user_id', 'review_profilename', 'review_taste']]

# Modeling Process

In [10]:
#Load Data Into Surprise Reader and Dataset, I will be using Review Taste as a rating metric. 
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(final[['user_id', 'id', 'review_taste']], reader)

In [11]:
# We are going to preform cross_validation to determine the test RMSE and MAE of each model to determine which one is the best
benchmark = []
# Iterate over all models
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:

# Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
    
# Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.5474  0.5449  0.5450  0.5458  0.0012  
MAE (testset)     0.4084  0.4066  0.4063  0.4071  0.0009  
Fit time          4.23    4.51    4.18    4.31    0.15    
Test time         0.35    0.30    0.30    0.32    0.02    
Evaluating RMSE, MAE of algorithm SVDpp on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.5412  0.5389  0.5382  0.5394  0.0013  
MAE (testset)     0.4016  0.4012  0.3993  0.4007  0.0010  
Fit time          31.80   31.75   31.71   31.75   0.03    
Test time         1.54    1.55    1.57    1.55    0.01    
Evaluating RMSE, MAE of algorithm SlopeOne on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.5667  0.5641  0.5722  0.5677  0.0034  
MAE (testset)     0.4126  0.4103  0.4167  0.4132  0.0027  
Fit time          0.21    0.22    0.22    0.22    0.00    
Test 

In [12]:
# Turn Results into dataframe and check results
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
surprise_results

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNNBaseline,0.509561,0.353288,5.121686,18.86993
KNNBasic,0.519836,0.349158,6.297639,20.08012
SVDpp,0.53945,0.400728,31.75016,1.551463
SVD,0.545783,0.407079,4.305197,0.319089
BaselineOnly,0.558112,0.416718,0.181858,0.236002
SlopeOne,0.567677,0.413222,0.219733,1.108178
KNNWithMeans,0.573748,0.422328,6.431787,20.404572
KNNWithZScore,0.575926,0.422875,6.518571,20.469176
CoClustering,0.63638,0.491617,2.059014,0.238973
NMF,0.660172,0.517569,5.006091,0.247454


We can see that the test rmse for KNN Baseline was the best and the NormalPredictor was the worst

# Modeling

We are going to be making 3 models. First the Normal Predictor to establish some base predictions. Then the BaselineOnly model that gives the baseline estimates for users. Finally KNNBaseline, a collaberative filtering algorithm that takes into account the baseline rating os users. 

## Normal Predictor

In [14]:
# This model will give random rating based on the distribution of the data

In [59]:
# Estimating Baseline Model using Alternating Least Squares
print('Using ALS')
algo = NormalPredictor()
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

Using ALS


{'test_rmse': array([0.92143036, 0.92005711, 0.91653945]),
 'fit_time': (0.10838603973388672, 0.11768722534179688, 0.11794304847717285),
 'test_time': (0.5653979778289795, 0.33387207984924316, 0.32597780227661133)}

In [60]:
trainset, testset = train_test_split(data, test_size=0.25)
algo = NormalPredictor()
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 0.9156


0.915608165842713

In [61]:
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)
df.rename(columns={'uid': 'user_id'}, inplace=True)
df.rename(columns={'iid': 'beer_id'}, inplace=True)
df.rename(columns={'Ui': 'number_ratings_beer'}, inplace=True)
df.rename(columns={'Iu': 'number_ratings_user'}, inplace=True)

In [62]:
df.head()
#uid is user id, iid is beer id, Ui is number of ratings on beer, Iu is number of ratings by user. 

Unnamed: 0,user_id,beer_id,rui,est,details,number_ratings_user,number_ratings_beer,err
0,5221,2578,3.5,2.09027,{'was_impossible': False},8,1184,1.40973
1,9889,1920,3.5,3.256822,{'was_impossible': False},48,360,0.243178
2,8084,1387,3.5,3.904277,{'was_impossible': False},36,217,0.404277
3,4451,2079,4.0,3.939735,{'was_impossible': False},37,1106,0.060265
4,4246,583,4.0,3.671362,{'was_impossible': False},5,796,0.328638


In [63]:
#Get Best and Worst Predictions out of Dataframe of Predictions
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [64]:
best_predictions.head()

Unnamed: 0,user_id,beer_id,rui,est,details,number_ratings_user,number_ratings_beer,err
1752,5433,18,5.0,5.0,{'was_impossible': False},5,222,0.0
23168,440,32,5.0,5.0,{'was_impossible': False},27,512,0.0
36,7170,12,5.0,5.0,{'was_impossible': False},12,927,0.0
18917,5584,1111,5.0,5.0,{'was_impossible': False},0,351,0.0
16392,8995,590,5.0,5.0,{'was_impossible': False},3,344,0.0


In [65]:
worst_predictions.head()

Unnamed: 0,user_id,beer_id,rui,est,details,number_ratings_user,number_ratings_beer,err
17933,2673,360,1.5,5.0,{'was_impossible': False},14,1085,3.5
11563,7897,1052,1.0,4.514743,{'was_impossible': False},31,52,3.514743
9169,1179,1905,1.0,4.518613,{'was_impossible': False},1,1913,3.518613
48,5993,2373,1.0,4.590823,{'was_impossible': False},5,1125,3.590823
9493,2440,814,1.0,4.676293,{'was_impossible': False},9,538,3.676293


### Normal Prediction Evaluation

In [66]:
Normal = NormalPredictor()

# Compute the RMSE of the knn algorithm.
evaluate(Normal, data, measures=['RMSE'])

Evaluating RMSE of algorithm NormalPredictor.

------------
Fold 1
RMSE: 0.9204
------------
Fold 2
RMSE: 0.9194
------------
Fold 3
RMSE: 0.9170
------------
Fold 4
RMSE: 0.9169
------------
Fold 5
RMSE: 0.9222
------------
------------
Mean RMSE: 0.9192
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.9204309683158504,
                             0.9193892973751175,
                             0.9170121451020743,
                             0.9169477519926404,
                             0.9221719487378827]})

In [67]:
Normal = NormalPredictor()

# Compute the RMSE of the knn algorithm.
evaluate(Normal, data, measures=['MAE'])

Evaluating MAE of algorithm NormalPredictor.

------------
Fold 1
MAE:  0.7254
------------
Fold 2
MAE:  0.7258
------------
Fold 3
MAE:  0.7301
------------
Fold 4
MAE:  0.7272
------------
Fold 5
MAE:  0.7226
------------
------------
Mean MAE : 0.7262
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.7254009768391977,
                             0.7258269727945603,
                             0.730107735245176,
                             0.7272377200974167,
                             0.7226088402952252]})

## Baseline Model

In [68]:
# Estimating Baseline Model using Alternating Least Squares
print('Using ALS')
bsl_options = {'method': 'als'}
algo = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

Using ALS
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([0.55907077, 0.55625219, 0.5594949 ]),
 'fit_time': (0.17580699920654297, 0.20587801933288574, 0.296816349029541),
 'test_time': (0.3072171211242676, 0.1973719596862793, 0.19332575798034668)}

In [69]:
trainset, testset = train_test_split(data, test_size=0.25)
algo = BaselineOnly(bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 0.5565


0.5565432940757297

In [70]:
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)
df.rename(columns={'uid': 'user_id'}, inplace=True)
df.rename(columns={'iid': 'beer_id'}, inplace=True)
df.rename(columns={'Ui': 'number_ratings_beer'}, inplace=True)
df.rename(columns={'Iu': 'number_ratings_user'}, inplace=True)

In [71]:
df.head()

Unnamed: 0,user_id,beer_id,rui,est,details,number_ratings_user,number_ratings_beer,err
0,4287,46,2.0,4.017294,{'was_impossible': False},37,261,2.017294
1,2791,2418,4.0,3.853563,{'was_impossible': False},50,16,0.146437
2,419,400,4.5,4.196333,{'was_impossible': False},16,1939,0.303667
3,8240,6,4.5,3.658764,{'was_impossible': False},4,516,0.841236
4,1342,2439,4.0,3.83246,{'was_impossible': False},39,1053,0.16754


In [72]:
#Get Best and Worst Predictions out of Dataframe of Predictions
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [73]:
best_predictions.head()

Unnamed: 0,user_id,beer_id,rui,est,details,number_ratings_user,number_ratings_beer,err
20310,539,646,3.5,3.50005,{'was_impossible': False},18,946,5e-05
3111,6821,1387,4.0,3.999925,{'was_impossible': False},6,205,7.5e-05
8345,6708,1327,4.0,4.000075,{'was_impossible': False},57,769,7.5e-05
10189,7752,1444,4.0,4.000095,{'was_impossible': False},7,1068,9.5e-05
2527,4816,113,4.0,3.999895,{'was_impossible': False},64,28,0.000105


In [74]:
worst_predictions.head()

Unnamed: 0,user_id,beer_id,rui,est,details,number_ratings_user,number_ratings_beer,err
21393,9525,45,1.0,3.655336,{'was_impossible': False},16,258,2.655336
2430,440,1618,1.0,3.657008,{'was_impossible': False},26,526,2.657008
4557,3008,1281,1.0,3.732898,{'was_impossible': False},1,1387,2.732898
188,10454,2578,1.0,3.746118,{'was_impossible': False},6,1186,2.746118
930,8543,346,1.0,3.800689,{'was_impossible': False},4,175,2.800689


### Baseline Model Evaluation

In [75]:
Baseline = BaselineOnly()

# Compute the RMSE of the knn algorithm.
evaluate(Baseline, data, measures=['RMSE'])

Evaluating RMSE of algorithm BaselineOnly.

------------
Fold 1
Estimating biases using als...
RMSE: 0.5607
------------
Fold 2
Estimating biases using als...
RMSE: 0.5583
------------
Fold 3
Estimating biases using als...
RMSE: 0.5540
------------
Fold 4
Estimating biases using als...
RMSE: 0.5541
------------
Fold 5
Estimating biases using als...
RMSE: 0.5520
------------
------------
Mean RMSE: 0.5558
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.560742567537501,
                             0.5582704790847489,
                             0.5539616231099461,
                             0.5540679305581494,
                             0.5519636611344069]})

In [76]:
Baseline = BaselineOnly()

# Compute the RMSE of the knn algorithm.
evaluate(Baseline, data, measures=['MAE'])

Evaluating MAE of algorithm BaselineOnly.

------------
Fold 1
Estimating biases using als...
MAE:  0.4172
------------
Fold 2
Estimating biases using als...
MAE:  0.4164
------------
Fold 3
Estimating biases using als...
MAE:  0.4141
------------
Fold 4
Estimating biases using als...
MAE:  0.4139
------------
Fold 5
Estimating biases using als...
MAE:  0.4134
------------
------------
Mean MAE : 0.4150
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.4172296695208557,
                             0.41644067060714424,
                             0.41408621105222443,
                             0.4139135214586265,
                             0.4133690761923398]})

# KNN Baseline w. Grid Search

In [41]:
param_grid = {'bsl_options': {'method': ['als'], #Best is ALS
                             'reg_i': [3], # Best is 3, 
                             'reg_u': [25], # Best is 25
                             'n_epochs': [37], # best is 37
                             'reg': [.01], # .01
                             'learning_rate': [.001]
                             },
              'sim_options': {'name': ['msd'], # Best was MSD
                              'min_support': [1], # Best was 1
                              'user_based': [True],
                              'shrinkage': [1] # best was 1
                             }
              }
gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


KeyboardInterrupt: 

In [77]:
bsl_options = {'method': 'als',
                'reg_i': 3, # Best is 3, 
                'reg_u': 25, # Best is 25
                'n_epochs': 37
               }
sim_options = {'name': 'msd',
               'min_support': 1, # Best was 1
                'user_based': True,
                'shrinkage': 1 # best was 1
              }
algo = KNNBaseline(k=40, min_k=1, sim_options=sim_options,  bsl_options=bsl_options)
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.51219125, 0.50458284, 0.50946824]),
 'fit_time': (6.570694208145142, 6.406170845031738, 6.270360946655273),
 'test_time': (23.446837902069092, 18.69985795021057, 19.62760305404663)}

In [None]:
trainset, testset = train_test_split(data, test_size=0.25)
algo = KNNBaseline(sim_options=sim_options,  bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


In [None]:
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)
df.rename(columns={'uid': 'user_id'}, inplace=True)
df.rename(columns={'iid': 'beer_id'}, inplace=True)
df.rename(columns={'Ui': 'number_ratings_beer'}, inplace=True)
df.rename(columns={'Iu': 'number_ratings_user'}, inplace=True)

In [None]:
df.head()

In [None]:
#Get Best and Worst Predictions out of Dataframe of Predictions
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [None]:
best_predictions.head()

In [None]:
worst_predictions.head()

### KNN Baseline w. Gridsearch Evaluation

In [None]:
KnnBaseline = KNNBaseline(k=40, min_k=1, sim_options=sim_options,  bsl_options=bsl_options)

# Compute the RMSE of the knn algorithm.
evaluate(KnnBaseline, data, measures=['RMSE'])

In [None]:
# Compute the RMSE of the knn algorithm.
evaluate(KnnBaseline, data, measures=['MAE'])


