In [None]:
import awswrangler as wr
# import lux
import pandas as pd
import numpy as np
import copy

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from surprise import SVD, KNNBasic, NMF
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise import KNNBaseline, SVD
from surprise import get_dataset_dir
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD



import warnings
warnings.filterwarnings("ignore")

In [None]:
pip install surprise


In [None]:
# Read the `beer_meta` table from the `beer_data` Athena database via awswrangler.
df_meta = wr.athena.read_sql_table(table='beer_meta', database='beer_data')
# Echo the type of the returned object as a quick sanity check.
type(df_meta)

In [None]:
df_meta

In [None]:
# Read the `beer_rating` table from the `beer_data` Athena database via awswrangler.
df_rating = wr.athena.read_sql_table(table='beer_rating', database='beer_data')

# Echo the type of the returned object as a quick sanity check.
type(df_rating)

In [None]:
df_rating

In [None]:
df_meta.head()

In [None]:
df_meta.info()

In [None]:
df_meta.describe()

In [None]:
df_rating.head()

In [None]:
df_rating.info()

In [None]:
df_rating.describe()

In [None]:
# Join every rating row to its beer metadata on the composite
# (brewery_id, beer_id) key, then rename the `name` column to `beer_name`.
df = (
    df_rating
    .merge(df_meta, on=['brewery_id', 'beer_id'])
    .rename(columns={'name': 'beer_name'})
)
df

In [None]:
# Keep only the columns the rest of the notebook works with.
analysis_columns = [
    'brewery_name', 'beer_name', 'beer_style', 'beer_sub_style',
    'user_name', 'user_rating',
    'rating', 'rating_count', 'review_count', 'ba_score', 'abv',
]
df = df[analysis_columns]

In [None]:
df


In [None]:
df.describe()


In [None]:
# Number of rows per beer name, i.e. how many times each beer appears in `df`.
data= df.beer_name.value_counts()

plt.figure(figsize=(20, 10))

# Bars only (kde=False) on the primary axis, using 75 bins.
# NOTE(review): `sns.distplot` is deprecated in newer seaborn releases -- confirm the
# installed version before relying on it.
ax = sns.distplot(data, bins=75, kde=False)

# Secondary y-axis sharing the same x-axis, so the density curve gets its own scale.
second_ax = ax.twinx()

# Density curve only (hist=False), drawn on the secondary axis.
sns.distplot(data, ax=second_ax, kde=True, hist=False)

# Hide the tick marks of the secondary axis.
second_ax.set_yticks([])

# Label the primary axis and title the figure.
ax.set_xlabel('Review Counts')
ax.set_ylabel('No. Beers')
ax.set_title("Histogram of Review Counts")


In [None]:
grouped_beers = df[['beer_name', 'user_name', 'user_rating']].groupby('beer_name')


In [None]:
# Upper tail of the per-beer non-null counts: quantiles from 0.99 upward in 0.0005 steps.
grouped_beers.count().sort_values(by='user_name', ascending = False).quantile(np.arange(0.99,1,.0005))


In [None]:
# Quantiles (0.00 to 0.95 in 0.05 steps) of the per-beer mean `user_rating`.
# The column is selected before averaging so the result does not depend on how the
# installed pandas treats the other grouped column (`user_name`): a blanket
# `.mean()` raises TypeError on non-numeric columns from pandas 2.0 onward.
grouped_beers[['user_rating']].mean().sort_values(by='user_rating', ascending = False).quantile(np.arange(0,1,.05))


In [None]:
# Number of rows per user name, i.e. how many times each user appears in `df`.
data2 = df.user_name.value_counts()

# Bars only (kde=False) on the primary axis.
ax2 = sns.distplot(data2, kde=False)

# Secondary y-axis sharing the same x-axis, so the density curve gets its own scale.
second_ax2 = ax2.twinx()

# Density curve only (hist=False), drawn on the secondary axis.
sns.distplot(data2, ax=second_ax2, kde=True, hist=False)

# Hide the tick marks of the secondary axis.
second_ax2.set_yticks([])

# Label the primary axis and title the figure.
ax2.set_xlabel('Review Counts')
ax2.set_ylabel('No. Users')
ax2.set_title("Histogram of User Counts")

In [None]:
grouped_users = df.groupby('user_name')


In [None]:
# Non-null value counts per user, one column per field of `df`.
grouped_users_count = grouped_users.count()

# Per-user count taken from the `rating` column.
ratings_per_user = grouped_users_count['rating']

counts = [1,2,3,4,5,10,15,20, 25, 50, 75, 100, 500, 1000, 10000]
for ct in counts:
    # Sum the boolean mask instead of `.count()[0]`: that form indexed a
    # label-indexed Series by position, which pandas has deprecated.
    num_users = int((ratings_per_user <= ct).sum())
    print('{} users rated {} or less beers'.format(num_users,ct))

print('\n')
print('Total Unique Users in this dataset: {}'.format(len(df.user_name.unique())))


In [None]:
df


In [None]:
# Mean `user_rating` per beer. The column is selected before averaging because a
# blanket `.mean()` also tries to average the other grouped column (`user_name`),
# which raises TypeError on non-numeric data from pandas 2.0 onward.
avg_rating = grouped_beers[['user_rating']].mean()


# Beers whose average rating is below 3.5 are treated as subpar; per the author's
# note, a 3.5 average sits around the bottom 10th percentile of these beers.
subpar_average_mask = avg_rating['user_rating'] < 3.5
subpar_beers_count = len(avg_rating[subpar_average_mask])

print('{} beers have a average rating less than 3.5'.format(subpar_beers_count))
print('A 3.5 avg rating puts the beer within the bottom 10%-tile')


In [None]:

# Quantiles 0.00 to 0.10 (0.01 steps) of the per-beer mean rating and per-beer counts.
# `user_rating` is selected before `.mean()` so the non-averaged `user_name` column
# cannot trigger the TypeError pandas >= 2.0 raises for non-numeric columns.
bottom_10percent_rating = grouped_beers[['user_rating']].mean().quantile(np.arange(0,.11,.01))
bottom_10percent_counts = grouped_beers.count().quantile(np.arange(0,.11,.01))

In [None]:
bottom_10percent_rating


In [None]:
bottom_10percent_counts


In [None]:
# NOTE(review): this cell repeats the user-count histogram drawn earlier in the notebook.
# Number of rows per user name, i.e. how many times each user appears in `df`.
data2= df.user_name.value_counts()

# Bars only (kde=False) on the primary axis.
ax2 = sns.distplot(data2, kde=False)

# Secondary y-axis sharing the same x-axis, so the density curve gets its own scale.
second_ax2 = ax2.twinx()

# Density curve only (hist=False), drawn on the secondary axis.
sns.distplot(data2, ax=second_ax2, kde=True, hist=False)

# Hide the tick marks of the secondary axis.
second_ax2.set_yticks([])

# Label the primary axis and title the figure.
ax2.set_xlabel('Review Counts')
ax2.set_ylabel('No. Users')
ax2.set_title("Histogram of User Counts")


In [None]:
grouped_users = df.groupby('user_name')


In [None]:

# Same threshold report as before, restricted to the low end of the range.
counts = [1,2,3,4,5,10,15,20]
for ct in counts:
    # Sum the boolean mask instead of `.count()[0]`: that form indexed a
    # label-indexed Series by position, which pandas has deprecated.
    num_users = int((grouped_users_count['rating'] <= ct).sum())
    print('{} users rated {} or less beers'.format(num_users,ct))

print('\n')
print('Total Unique Users in this dataset: {}'.format(len(df.user_name.unique())))

In [None]:
# Names of the beers flagged by the average-rating mask.
subpar_beers_list = avg_rating.loc[subpar_average_mask].index.tolist()

# Non-null value counts per beer.
ratings_count = grouped_beers.count()

# Beers with fewer than 13 ratings are treated as having too few ratings; per the
# author's note, a count of 13 sits around the bottom 10th percentile.
has_few_ratings = ratings_count['user_rating'] < 13
low_ratings_count = ratings_count.loc[has_few_ratings]
low_ratings_list = low_ratings_count.index.tolist()

In [None]:
# Compare the two exclusion lists as sets to see how much they overlap.
unique_subpar_beers = set(subpar_beers_list)
unique_low_ratings_beers = set(low_ratings_list)
overlaps = unique_subpar_beers & unique_low_ratings_beers

overlap_report = (
    ('Number of beers in bottom 10% of avg rating: {}', unique_subpar_beers),
    ('Number of beers in bottom 10% of review counts: {}', unique_low_ratings_beers),
    ('Number of beers in both of these categories: {}', overlaps),
)
for template, beers in overlap_report:
    print(template.format(len(beers)))


In [None]:
# Drop every beer that appears in either exclusion list, then take an independent
# deep copy so later column assignments do not touch `df`.
is_subpar = df.beer_name.isin(subpar_beers_list)
df1 = df[~is_subpar]

is_rarely_rated = df1.beer_name.isin(low_ratings_list)
df2 = df1[~is_rarely_rated]

final_df = df2.copy(deep=True)


In [None]:
final_df.info()


In [None]:
# Report how many distinct beer names exist before and after filtering.
n_beers_before = len(df.beer_name.unique())
n_beers_after = len(final_df.beer_name.unique())

print('Original number of unique beers: {}'.format(n_beers_before))
print('Revised number of unique beers: {}'.format(n_beers_after))


In [None]:
# `temp_df_user_idx` is first assigned in the ID-mapping cell further down, so
# evaluating it here raises NameError on a fresh kernel (Restart & Run All).
# Inspect it after that cell has run.
# temp_df_user_idx

In [None]:
# `new_dict_user` is first assigned in the ID-mapping cell further down, so
# evaluating it here raises NameError on a fresh kernel (Restart & Run All).
# Inspect it after that cell has run.
# new_dict_user

In [None]:


# Create an integer beer_id for each distinct beer name.
# NOTE(review): ids are keyed on `beer_name` alone, so beers from different breweries
# that share a name collapse into one id -- confirm that is intended.

grouped_name = final_df.groupby('beer_name')

# The count itself is not used; only its index (one entry per beer name) matters.
temp_df = grouped_name.count()
temp_df_idx = pd.DataFrame(temp_df.index)


# The positional row index of that frame becomes the beer id.
temp_df_idx['beer_id'] = temp_df_idx.index
# Two-column lookup frame (beer_id, beer_name); also read by read_item_names().
dict_df=temp_df_idx[['beer_id','beer_name']]


# Turn the lookup frame into a {beer_name: beer_id} dict.
desc_dict = dict_df.set_index('beer_name').to_dict()
new_dict = desc_dict['beer_id']

# Attach the id to every row of final_df (mutates final_df in place).
final_df['beer_id'] = final_df.beer_name.map(new_dict)



# Create an integer user_id for each distinct user name, same recipe as above.
grouped_user = final_df.groupby('user_name')

# The count itself is not used; only its index (one entry per user name) matters.
temp_df_user = grouped_user.count()
temp_df_user_idx = pd.DataFrame(temp_df_user.index)


# The positional row index of that frame becomes the user id.
temp_df_user_idx['user_id'] = temp_df_user_idx.index
dict_df_user = temp_df_user_idx[['user_id','user_name']] 


# Turn the lookup frame into a {user_name: user_id} dict.
desc_dict_user = dict_df_user.set_index('user_name').to_dict()
new_dict_user = desc_dict_user['user_id']

# Attach the id to every row of final_df (mutates final_df in place).
final_df['user_id'] = final_df.user_name.map(new_dict_user)

In [None]:

def read_item_names():
    """
    Build lookup tables between beer ids and beer names.

    Reads the notebook-level ``dict_df`` frame, whose columns are ``beer_id``
    and ``beer_name``.

    Returns:
        tuple(dict, dict): ``(rid_to_name, name_to_rid)`` -- the first maps each
        beer id to its name, the second maps each name back to its id.
    """
    # Address the columns by label. The earlier row-by-row version read
    # `line[0]` / `line[1]`, i.e. positional access on a label-indexed Series,
    # which pandas has deprecated. It also bounded its loop by the number of
    # unique names in `final_df`; `dict_df` holds one row per grouped beer
    # name, so covering the whole frame avoids that coupling.
    beer_ids = dict_df['beer_id'].tolist()
    beer_names = dict_df['beer_name'].tolist()

    rid_to_name = dict(zip(beer_ids, beer_names))
    name_to_rid = dict(zip(beer_names, beer_ids))

    return rid_to_name, name_to_rid


In [None]:
def get_rec(beer_name, k):
    """
    Return the names of the k nearest neighbours of a beer.

    Looks the beer up in the notebook-level fitted ``algo`` and translates the
    neighbours reported by ``algo.get_neighbors`` back into beer names.

    Input: String (beer name), integer (number of neighbours)
    Output: list of String
    """
    id_to_name, name_to_id = read_item_names()

    # beer name -> raw id -> the inner id used by the fitted model
    query_inner_id = algo.trainset.to_inner_iid(name_to_id[beer_name])

    recommendations = []
    for neighbor_inner_id in algo.get_neighbors(query_inner_id, k=k):
        # inner id -> raw id -> beer name
        neighbor_raw_id = algo.trainset.to_raw_iid(neighbor_inner_id)
        recommendations.append(id_to_name[neighbor_raw_id])

    return recommendations

In [None]:
# Surprise reads (user, item, rating) triples on a declared rating scale.
reader = Reader(rating_scale=(1,5))

# Train on each user's own score (`user_rating`). The `rating` column used before
# appears to be the beer-level aggregate from the metadata table (it is selected
# alongside `rating_count` / `review_count` / `ba_score`), which would give every
# user the same value for a beer and leave nothing user-specific to learn from.
# TODO confirm against the table schemas.
data = Dataset.load_from_df(final_df[['user_id', 'beer_id', 'user_rating']], reader)
trainset = data.build_full_trainset()

# Item-based KNN (user_based=False) with Pearson-baseline similarity, fitted on
# the full trainset.
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)


In [None]:

# Run 5-fold cross-validation and print results.
# A fresh estimator is evaluated so the folds do not refit `algo` in place;
# otherwise `algo` would end up trained on the last fold's training split only and
# the recommendation cells below would silently use that partial model.
cross_validate(KNNBaseline(sim_options=sim_options), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


In [None]:
# top 20 most rated beers
grouped_beer_names = final_df.groupby('beer_name')
# `.head(20)` keeps exactly 20 rows; the previous `[0:21]` slice returned 21.
grouped_beer_names.count().sort_values(by='user_name', ascending=False).head(20).index.tolist()

In [None]:
# Top 20 beers by mean `rating`. The column is selected before `.mean()` because
# averaging every column raises TypeError on non-numeric columns from pandas 2.0
# onward, and `.head(20)` replaces the `[0:21]` slice, which returned 21 rows.
grouped_beer_names[['rating']].mean().sort_values(by='rating', ascending=False).head(20).index.tolist()

In [None]:
get_rec('Headroom', 20)

In [None]:
get_rec('Two Hearted Ale', 20)

In [None]:
# The 20 beers with the most rows. `.head(20)` replaces `[0:21]`, which kept 21.
top20rated = final_df.groupby('beer_name').count().sort_values(by='user_name', ascending =False).head(20).index.tolist()
top20rated = set(top20rated)

# The 20 beers with the highest mean `rating`. Only that column is averaged, since a
# blanket `.mean()` raises TypeError on non-numeric columns from pandas 2.0 onward.
top20rating = final_df.groupby('beer_name')[['rating']].mean().sort_values(by='rating', ascending =False).head(20).index.tolist()
top20rating = set(top20rating)

# KNN recommendations for two sample beers, as sets for the overlap checks below.
beer_1 = set(get_rec('Headroom',20))
beer_2 = set(get_rec('Two Hearted Ale',20))
beer_1

In [None]:
print(beer_1.intersection(top20rated))
print(beer_1.intersection(top20rating))


In [None]:
print(beer_2.intersection(top20rated))
print(beer_2.intersection(top20rating))

In [None]:
beer_3 = set(get_rec("A Little Sumpin' Sumpin' Ale",20))
beer_4 = set(get_rec('Hazy Memory',20))
beer_5 = set(get_rec('Pilsner Urquell',20))
beer_6 = set(get_rec('831 IPA',20))
beer_7 = set(get_rec('Pliny The Elder',20))
beer_8 = set(get_rec('West Coast IPA', 20))


In [None]:
# For each recommendation set, show which of its beers are also among the
# most-rated and the highest-rated beers (in that order).
for recommended in (beer_3, beer_4, beer_5, beer_6, beer_7, beer_8):
    print(recommended.intersection(top20rated))
    print(recommended.intersection(top20rating))


In [None]:
beer_3

In [None]:
beer_6

In [None]:
beer_8

In [None]:
# User x beer matrix of each user's own score, with 0 for unrated pairs.
# `user_rating` replaces `rating`, which appears to be the beer-level aggregate
# from the metadata table (it is selected alongside `rating_count` / `ba_score`)
# and so would not vary between users -- TODO confirm against the table schemas.
final_df_pivot = final_df.pivot_table(index='user_name', columns='beer_name', values='user_rating').fillna(0)


In [None]:
final_df_pivot.info()


In [None]:
final_df_pivot_T = final_df_pivot.values.T
final_df_pivot_T.shape


In [None]:

def exp_var(list_n_components):
    """
    Total explained-variance ratio of a TruncatedSVD for each component count.

    One TruncatedSVD is fitted per entry on the notebook-level
    ``final_df_pivot_T``; each fit uses its own component count as the seed.

    Input: iterable of int (component counts)
    Output: list of (component count, summed explained variance ratio) tuples
    """
    def _total_explained_variance(n_components):
        # Fit a decomposition of the requested size and sum its per-component ratios.
        decomposition = TruncatedSVD(n_components=n_components, random_state=n_components)
        decomposition.fit_transform(final_df_pivot_T)
        return np.sum(decomposition.explained_variance_ratio_)

    return [(num, _total_explained_variance(num)) for num in list_n_components]

In [None]:
n_comp = [5,10,20,50,100,200,300]
explained_variance = exp_var(n_comp)

In [None]:
# Print one (n_components, explained variance) pair per line. A plain loop avoids
# building a throwaway list of None values that the cell would otherwise echo.
for pair in explained_variance:
    print(pair)


In [None]:
# Explained variance against the number of SVD components, labelled so the
# figure can be read on its own.
x, y = zip(*explained_variance)
plt.scatter(x, y)
plt.xlabel('Number of components')
plt.ylabel('Explained variance ratio')
plt.title('TruncatedSVD explained variance');

In [None]:
# Reduce the transposed pivot matrix (one row per beer) to 200 components.
SVD200 = TruncatedSVD(n_components=200, random_state=43)
matrix200 = SVD200.fit_transform(final_df_pivot_T)
# Echo the shape of the reduced matrix.
matrix200.shape


In [None]:
# Pairwise correlation coefficients between the rows of the reduced matrix.
corr200 = np.corrcoef(matrix200)
# Echo the shape of the correlation matrix.
corr200.shape

In [None]:

# Beer names taken from the pivot table's columns.
beer_rec_names200 = final_df_pivot.columns

# The same names as a plain list, so svd200_recs can look up positions with .index().
beer_rec_list200 = list(beer_rec_names200)

In [None]:
def svd200_recs(string, n):
    """
    Return the top n recommendations for a beer, ranked by correlation.

    Reads the notebook-level ``beer_rec_list200`` (beer names) and ``corr200``
    (correlation matrix whose rows follow the order of that list).

    inputs:
    string (name of beer) -> string
    n (n recommendations) ->  int

    returns:
    list of at most n beer names, most similar first; the input beer itself
    is never included.

    raises:
    ValueError if ``string`` is not in ``beer_rec_list200``.
    """
    # get index of beer name from list of all beers in the training data
    get_index = beer_rec_list200.index(string)

    # similarity coeff of all other beers w respect to the input beer
    similarities = np.asarray(corr200[get_index], dtype=float)

    # NaN coefficients cannot be ordered reliably, so rank them last.
    sortable = np.where(np.isnan(similarities), -np.inf, similarities)

    # Stable descending order: tied coefficients keep their original list order.
    ranking = np.argsort(-sortable, kind='stable')

    # Exclude the query beer by index instead of assuming it sorts to position 0.
    ranked_others = [idx for idx in ranking if idx != get_index]

    # Slicing returns fewer names when n exceeds the number of other beers,
    # where the earlier index loop raised IndexError.
    return [beer_rec_list200[idx] for idx in ranked_others[:max(n, 0)]]

In [None]:
svd200_recs('Spotted Cow',20)


In [None]:
# Overlap between the SVD-based and KNN-based top-20 lists for one beer.
A = set(svd200_recs('Two Hearted Ale',20))
B = set(get_rec('Two Hearted Ale',20))
shared_recs = A & B
print('common beers: {}'.format(shared_recs))
print('number of common beers: {}'.format(len(shared_recs)))


In [None]:

def compare_recs(name_list,n):
    """
    Scatter-plot how many recommendations the two recommenders share per beer.

    For every beer in ``name_list`` the top-n lists from ``svd200_recs`` and
    ``get_rec`` are compared, and the size of their overlap is plotted against
    the beer's position in ``name_list``.

    Input: list of String (beer names), integer (recommendations per beer)
    Output: None (shows a matplotlib figure)
    """
    overlaps = [
        (position, len(set(svd200_recs(beer, n)) & set(get_rec(beer, n))))
        for position, beer in enumerate(name_list)
    ]

    positions, overlap_sizes = zip(*overlaps)
    plt.scatter(positions, overlap_sizes)
    plt.xlabel('Beer No.')
    plt.ylabel('Common Recs')
    plt.show()


In [None]:
grouped = final_df.groupby('beer_name')
# Every 40th beer when ranked by mean `rating`, highest first. Only `rating` is
# averaged: a blanket `.mean()` over all columns raises TypeError on non-numeric
# columns from pandas 2.0 onward.
namelist = grouped[['rating']].mean().sort_values(by='rating', ascending =False)[::40].index.tolist()


In [None]:
compare_recs(namelist, 50)
