# Beer Recommendation KNN Fitting 

This notebook fits a KNN model on the beer ratings and saves each beer's nearest neighbours, together with their distances, measured with centered cosine similarity.

In [0]:
# Import Dependencies 
from  pyspark.sql.types import StructField, StructType, StringType, LongType, FloatType
import pyspark.sql.functions as f 
import pandas as pd
from pyspark.ml.feature import StringIndexer

# Define all dbfs filepaths
base_dir = 'dbfs:/FileStore/tables'
project_dir = base_dir + '/Capstone'
beer_dir = project_dir + '/beer'

data_dir = beer_dir + '/data'
mf_model_dir = beer_dir + '/beer_mf_model'

rec_dir = beer_dir + '/beer_recs'
exp_rec_dir = rec_dir + '/experienced'
new_rec_dir = rec_dir + '/new'

# Make sure every working directory exists.
# dbutils.fs.mkdirs creates the directory (and any missing parents) only when
# it is absent, so no existence probe is needed and no exception has to be
# swallowed to keep the cell re-runnable.
for dirname in [project_dir, beer_dir, data_dir, mf_model_dir, rec_dir, exp_rec_dir, new_rec_dir]:
    dbutils.fs.mkdirs(dirname)

# Schema for the raw beer review file.
# Each entry is (column name, Spark type); every column is declared nullable.
# The order must match the column order of the CSV.
_beer_columns = [
    ('brewery_id', LongType()),
    ('brewery_name', StringType()),
    ('review_time', LongType()),
    ('review_overall', FloatType()),
    ('review_aroma', FloatType()),
    ('review_apperance', FloatType()),
    ('review_profilename', StringType()),
    ('beer_style', StringType()),
    ('review_palate', FloatType()),
    ('review_taste', FloatType()),
    ('beer_name', StringType()),
    ('beer_abv', FloatType()),
    ('beer_beerid', LongType()),
]
beerSchema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _beer_columns]
)

# Location the raw review file is expected at inside the project data folder
raw_csv_path = data_dir + '/beer_reviews.csv'

# If the file is not there yet, copy it over from the upload folder.
# The destination is spelled out as a full file path so the result does not
# depend on how `cp` treats a directory target, and only ordinary errors
# (e.g. "path not found" from `ls`) trigger the copy.
try:
    dbutils.fs.ls(raw_csv_path)
except Exception:
    dbutils.fs.cp(base_dir + '/beer_reviews.csv', raw_csv_path)

# Loading the raw data file with the explicit schema (header row is skipped)
raw = spark.read.load(path=raw_csv_path,
                      format='csv', header=True, schema=beerSchema)

## Preprocessing Data

In [0]:
# Keep only reviewer, beer, taste score and timestamp (reviewer column is
# renamed to `username`), then drop rows with a null in any of those columns.
selected_cols = [
    f.col('review_profilename').alias('username'),
    'beer_beerid',
    'review_taste',
    'review_time',
]
taste_revs_raw = raw.select(*selected_cols).na.drop()

# A review is identified by (user, beer); count the pairs that occur more than once.
review_key = ["username", "beer_beerid"]
n_dups = (
    taste_revs_raw
    .groupBy(review_key)
    .count()
    .filter("count > 1")
    .count()
)
print(f'''
Number of Duplicated Reviews: {n_dups}
(Instance of the same person reviewing the same beer more than once.)
''' )

# Collapse repeated (user, beer) reviews into one row by averaging
# both the taste score and the review timestamp.
taste_revs_raw = (
    taste_revs_raw
    .groupBy(review_key)
    .agg(
        f.mean('review_taste').alias('review_taste'),
        f.mean('review_time').alias('review_time'),
    )
)


Number of Duplicated Reviews: 14274
(Instance of the same person reviewing the same beer more than once.)



In [0]:
# Rebuild the time-based training split: everything at or before the
# 80th percentile of review_time. It is used below to pick the beers to keep.
cutoff_row = taste_revs_raw.select(
    f.percentile(f.col('review_time'), 0.8).alias('percentile')
).first()
time_cutoff = cutoff_row['percentile']

full_train = (
    taste_revs_raw
    .where(f.col('review_time') <= time_cutoff)
    .select('username', 'beer_beerid', 'review_taste')
)


In [0]:
### Review-count thresholds
# Min number of total reviews per user
minRevsPerUser = 5
# Min number of reviews per beer
minRevsPerBeer = 50

# Beers whose review count in the training split is strictly above the beer threshold.
# NOTE(review): this comparison is strict (>) while the user filter below is
# inclusive (>=) -- confirm the asymmetry is intended.
beer_counts = full_train.groupBy('beer_beerid').count()
commonBeers = beer_counts.where(f.col('count') > minRevsPerBeer).select('beer_beerid')

# Users whose review count over all reviews is at or above the user threshold.
user_counts = taste_revs_raw.groupBy('username').count()
expUsers5 = user_counts.where(f.col('count') >= minRevsPerUser).select('username')

# Restrict the reviews to the kept beers and users; both lookup frames are
# small, so they are broadcast to every executor for the inner joins.
filtered_revs = (
    taste_revs_raw
    .join(f.broadcast(commonBeers), 'beer_beerid', 'inner')
    .join(f.broadcast(expUsers5), 'username', 'inner')
)

# Center each review on its user's mean taste score (mean taken over the
# filtered reviews, via a window partitioned by username).
knn_train = (
    filtered_revs
    .withColumn('user_mean', f.expr('mean(review_taste) over (partition by username)'))
    .withColumn('centered_rev', f.col('review_taste') - f.col('user_mean'))
)

# Attach a numeric user_id derived from the username string
strIndex = StringIndexer(inputCol='username', outputCol='user_id')
knn_train_ind = strIndex.fit(knn_train).transform(knn_train)

In [0]:
# Report the size of the KNN training set (each count triggers a Spark job, so this is slow)
n_users = knn_train_ind.select("username").distinct().count()
print(f'Number of Users: {n_users}')

n_beers = knn_train_ind.select("beer_beerid").distinct().count()
print(f'Number of Beers: {n_beers}')

n_reviews = knn_train_ind.count()
print(f'Number of Reviews: {n_reviews}')

Number of Users: 14754
Number of Beers: 4189
Number of Reviews: 1148235


## Calculating Distances through KNN

In [0]:
# Import dependencies
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

# Number of recommendations kept for each beer
n_recs = 10

# Get pd df from pyspark df (KNN better through sklearn than pyspark)
pd_train = knn_train_ind.toPandas()

# Beer x user matrix of centered reviews; (beer, user) pairs without a review become 0
uir = pd_train.pivot(index='beer_beerid', columns='user_id', values='centered_rev').fillna(0)
# Sparse copy for more efficient storage during the neighbour search
uir_sparse = csr_matrix(uir.values)

# Initiate and fit KNN (brute-force cosine distance between beer rows)
knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_recs, n_jobs=-1)
nbrs = knn_model.fit(uir_sparse)

# Get the n_recs closest neighbours of every fitted beer.
# Calling kneighbors() without a query makes sklearn return the neighbours of
# each indexed point with the point itself excluded. Asking for one extra
# neighbour and dropping column 0 only works when every beer is strictly its
# own nearest neighbour, which breaks on ties (identical or all-zero rows).
distances, indices = nbrs.kneighbors()

# Row position in the matrix -> original beer id
beer_ids = uir.index.to_numpy()

# Long-form table with one (home beer, recommended beer, distance) row per
# neighbour. Matrix positions are translated to beer ids here with plain
# array indexing, so Spark receives only flat numeric columns: no nested
# tuple column whose inferred type depends on the conversion path, and no
# Python UDF for the id lookup.
nbr_df = pd.DataFrame({
    'beer_beerid': beer_ids.repeat(indices.shape[1]),
    'nbr_id': beer_ids[indices].ravel(),
    'nbr_dis': distances.ravel(),
})

# Turn the df back into a spark df for the joins and the write
nbr_ps = spark.createDataFrame(nbr_df)

# Get the list of beer ids, names and brewery names
beer_mapper = raw.select('beer_beerid', 'brewery_name', 'beer_name').distinct()

# Two separately aliased views of the lookup table: one keyed for the home
# beer (the beer recommendations are made for) and one keyed for the
# recommended beer. Distinct column names on each side keep the double join
# unambiguous.
home_names = beer_mapper.select(
    'beer_beerid',
    f.col('brewery_name').alias('home_brewery_name'),
    f.col('beer_name').alias('home_beer_name'))
rec_names = beer_mapper.select(
    f.col('beer_beerid').alias('nbr_id'),
    f.col('brewery_name').alias('rec_brewery_name'),
    f.col('beer_name').alias('rec_beer_name'))

# Attach names for both beers, then keep only the published columns
# (ids are dropped; column order matches the existing CSV layout)
nbr_ps = nbr_ps.join(f.broadcast(home_names), 'beer_beerid', 'left_outer')\
    .join(f.broadcast(rec_names), 'nbr_id', 'left_outer')\
    .select('nbr_dis', 'home_brewery_name', 'home_beer_name',
            'rec_brewery_name', 'rec_beer_name')

# Finally, write the recommendations to file
nbr_ps.write.option("header",True)\
    .format("csv") \
        .mode("overwrite") \
            .save(new_rec_dir + "/new_user_recommendations.csv")


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

## Display Examples of Predicted Ratings CSV 

Showing 20 examples of the recommendations by beer. Here, the 'home' beer is the beer the user indicated and the 'rec' beer is the beer that is recommended. The distance measure is also included so that future queries can be sorted by it.

In [0]:
# Read the saved recommendations back from DBFS (CSV with a header row)
recs_path = new_rec_dir + '/new_user_recommendations.csv'
knn_recs = spark.read.format('csv').option('header', True).load(recs_path)

# Put the rows in random order and show the first 20 as examples
shuffled_sample = knn_recs.orderBy(f.rand()).limit(20)
shuffled_sample.display()

nbr_dis,home_brewery_name,home_beer_name,rec_brewery_name,rec_beer_name
0.7763071020257143,Mayflower Brewing Company,Mayflower Pale Ale,Rapscallion,Rapscallion Blessing
0.8486765130271251,Broughton Ales Ltd,Merlin's Ale,Broughton Ales Ltd,Black Douglas Ale
0.904616244331192,Brouwerij Lindemans,Lindemans Cassis,Wells & Young's Ltd,Young's Waggledance Honey Ale
0.835189532943269,Highland Brewing,Highland Seven Sisters Abbey Style Ale,Dragonmead Microbrewery,Dragonmead Under The Kilt Wee Heavy
0.8854377301899683,Otter Creek Brewing / Wolaver's,Wolaver's Witbier,O'Fallon Brewery,Goats Breath Bock
0.7809727491170254,Sierra Nevada Brewing Co.,Sierra Nevada Pale Ale,Stone Brewing Co.,Arrogant Bastard Ale
0.9131872660285406,Bar Harbor Brewing Company,Thunder Hole Ale,Allagash Brewing Company,Allagash Victoria Ale
0.8928876304292988,Magic Hat Brewing Company,Hex,Harpoon Brewery,Harpoon Crystal Wheat
0.801079091753079,Brauerei Beck & Co.,Beck's Premier Light,Grupo Modelo S.A. de C.V.,Corona Light
0.8617217122118361,Sprecher Brewing Company,Abbey Triple,"Bell's Brewery, Inc.",Bell's Wheat Six Ale
