In [1]:
# Import the Python packages that will be needed.  
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import pprint as pp
import os
import numpy as np

In [2]:
# Read the beer reviews file.
# The path is built from its components so the platform's separator is used;
# os.path.join called with a single argument simply returns it unchanged.
filename = os.path.join("beer_reviews", "beer_reviews.csv")
df = pd.read_csv(filename)

In [3]:
# Keep only the reviews that belong to the n most-reviewed beers.
n = 250
top_n = df.beer_name.value_counts().index[:n]
df = df[df.beer_name.isin(top_n)]

# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(df.head())

      brewery_id             brewery_name  review_time  review_overall  \
798         1075  Caldera Brewing Company   1212201268             4.5   
1559       11715  Destiny Brewing Company   1137124057             4.0   
1560       11715  Destiny Brewing Company   1129504403             4.0   
1563       11715  Destiny Brewing Company   1137125989             3.5   
1564       11715  Destiny Brewing Company   1130936611             3.0   

      review_aroma  review_appearance review_profilename  \
798            4.5                  4             grumpy   
1559           3.5                  4    blitheringidiot   
1560           2.5                  4        NeroFiddled   
1563           3.0                  4    blitheringidiot   
1564           3.0                  3             Gavage   

                            beer_style  review_palate  review_taste  \
798   American Double / Imperial Stout            4.0           4.5   
1559           American Pale Ale (APA)            3.

In [4]:
# Dimensions (rows, columns) of the review table after the top-250 filter.
df.shape

(355275, 13)

In [5]:
# Build a beer-by-reviewer matrix: one row per beer, one column per reviewer,
# each cell holding that reviewer's mean 'review_overall' score for the beer
# (NaN where the reviewer never rated it).  The rows of this matrix are the
# vectors compared with cosine similarity further down.
# print(...) is used in call form so the cell runs on both Python 2 and 3.
print("melting...")
df_wide = pd.pivot_table(df, values=["review_overall"],
                         index=["beer_name", "review_profilename"],
                         aggfunc=np.mean).unstack()

melting...


In [6]:
# Peek at the pivoted table: one row per beer, one column per reviewer.
df_wide.head()

Unnamed: 0_level_0,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall,review_overall
review_profilename,0110x011,02maxima,03SVTCobra,05Harley,0Naught0,0beerguy0,0runkp0s,0tt0,1000Bottles,1001111.0,...,zuker,zulufactor,zumicroom,zwalk8,zwoehr,zymrgy,zymurgy4all,zymurgywhiz,zythus,zyzygy
beer_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
#9,,,,,,,,,,,...,,,,,,,,,,
120 Minute IPA,,,,4.0,,,,1.5,,,...,,,,,,,,,,
1554 Enlightened Black Ale,,,,,,,,,,,...,,,,,,,,,,
60 Minute IPA,,,,,,,,,,,...,,,,,,,,,,
90 Minute IPA,5.0,,,4.0,,,,,,,...,,,,,,,,,,


In [7]:
# A NaN cell means the reviewer never rated that beer.  Replace those gaps
# with 0 so every beer row is a complete numeric vector.
df_wide = df_wide.fillna(value=0)

In [8]:
# This is the key step: use cosine_similarity from scikit-learn to score how
# alike every pair of beers is, comparing the rows of df_wide.
# Note: despite the name `dists`, these are similarities -- larger values mean
# the two beers are MORE alike, and each beer scores 1.0 against itself.
# print(...) is used in call form so the cell runs on both Python 2 and 3.
print("calculating similarity")

# Wrap the result in a DataFrame labelled with the beer names on both the rows
# and the columns, so beers can be looked up by name.  Building it in one step
# avoids binding `dists` to a bare array first and patching the index afterwards.
dists = pd.DataFrame(cosine_similarity(df_wide),
                     index=df_wide.index,
                     columns=df_wide.index)

def get_similar(beers, n=None, similarity=None):
    """
    Rank the beers most similar to a set of beers the user likes.

    Each candidate's score is the sum of its similarity to every liked beer.
    Candidates are returned best-first, and the liked beers themselves are
    left out of the result.

    beers      - list of beer names that the user likes; names that are not
                 present in the similarity matrix are ignored
    n          - maximum number of recommendations to return
                 (None, the default, returns every other beer)
    similarity - square beer-by-beer similarity DataFrame labelled with beer
                 names on both axes; defaults to the module-level `dists`

    Returns a list of beer names, most similar first.  The list is empty when
    none of the given beers are known.
    """
    if similarity is None:
        similarity = dists

    known = [beer for beer in beers if beer in similarity.columns]
    if not known:
        # Nothing to compare against, so there is no meaningful ranking.
        return []

    # Vectorised row sum over the liked beers' columns, highest total first.
    scores = similarity[known].sum(axis=1).sort_values(ascending=False)
    # Drop the liked beers themselves from the recommendations.
    ranked = scores.index[~scores.index.isin(known)].tolist()
    return ranked if n is None else ranked[:n]

# Example: the ten best recommendations for someone who likes these three beers.
get_similar(["Sierra Nevada Pale Ale", "120 Minute IPA", "Coors Light"], n=10)

calculating similarity


['Samuel Adams Boston Lager',
 'Sierra Nevada Celebration Ale',
 '90 Minute IPA',
 'Arrogant Bastard Ale',
 'Stone IPA (India Pale Ale)',
 '60 Minute IPA',
 'HopDevil Ale',
 'Stone Ruination IPA',
 'Sierra Nevada Bigfoot Barleywine Style Ale',
 'Storm King Stout']

In [10]:
# Preview the beer-by-beer similarity matrix (each beer scores 1.0 against itself).
dists.head()

beer_name,#9,120 Minute IPA,1554 Enlightened Black Ale,60 Minute IPA,90 Minute IPA,Aecht Schlenkerla Rauchbier Märzen,AleSmith IPA,AleSmith Speedway Stout,Allagash White,Alpha King Pale Ale,...,Vanilla Porter,Weihenstephaner Hefeweissbier,Weihenstephaner Korbinian,Westmalle Trappist Dubbel,Westmalle Trappist Tripel,World Wide Stout,Yeti Imperial Stout,Young's Double Chocolate Stout,Yuengling Traditional Lager,Éphémère (Apple)
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#9,1.0,0.275405,0.274103,0.388364,0.365175,0.253841,0.228479,0.227612,0.340681,0.293315,...,0.26657,0.312395,0.276463,0.233554,0.276763,0.286534,0.299032,0.32928,0.348058,0.312499
120 Minute IPA,0.275405,1.0,0.251519,0.378258,0.410366,0.262425,0.315971,0.337541,0.282273,0.336796,...,0.201428,0.312193,0.28232,0.2708,0.301144,0.418214,0.337978,0.285483,0.233014,0.280248
1554 Enlightened Black Ale,0.274103,0.251519,1.0,0.319887,0.314028,0.252486,0.266866,0.261761,0.260275,0.307296,...,0.285846,0.300474,0.292369,0.265445,0.271656,0.262771,0.295029,0.316295,0.225219,0.273763
60 Minute IPA,0.388364,0.378258,0.319887,1.0,0.533042,0.316928,0.312343,0.307627,0.360975,0.385249,...,0.285143,0.413405,0.329941,0.308774,0.355926,0.358224,0.391041,0.39984,0.326916,0.339324
90 Minute IPA,0.365175,0.410366,0.314028,0.533042,1.0,0.312861,0.344218,0.358754,0.356804,0.418582,...,0.262775,0.436398,0.343738,0.333099,0.387312,0.405116,0.414385,0.395031,0.301877,0.332292


In [9]:
# Persist the similarity matrix to disk as a pickle file.
dists.to_pickle('beer.pkl') 