Building a Recommendation System in Python
============================
> Based on the blog post and [IPython notebook](http://nbviewer.ipython.org/gist/glamp/20a18d52c539b87de2af) from yHat

In [1]:
import pandas as pd
import numpy as np
import pylab as pl

## Unpack the data file

The dataset is compressed into a tar.gz (tarball) file in the fall_2014_lessons/datasets folder. Decompress it with the following command:

In [3]:
%%sh
# Unpack the beer reviews tarball; with no -C option tar extracts into the
# directory this cell runs in.
# NOTE(review): the archive path assumes the lessons repo is checked out under
# ~/notebooks -- adjust it if your copy lives elsewhere.
tar xvf ~/notebooks/fall_2014_lessons/datasets/beer_reviews.tar.gz

tar: /home/vagrant/Downloads/beer_reviews.tar.gz: Cannot open: No such file or directory
tar: Error is not recoverable: exiting now


In [3]:
# Path to the reviews CSV, relative to the notebook's working directory.
# Change this if you unpacked the archive somewhere else (for example on
# Windows, where the file path will look different).
reviews_csv_path = "./beer_reviews/beer_reviews.csv"
df = pd.read_csv(reviews_csv_path)
df.head()


Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


## Finding People Who Have Reviewed Both Beers

In [4]:
beer_1, beer_2 = "Dale's Pale Ale", "Fat Tire Amber Ale"

# Distinct profile names that appear on at least one review of each beer.
beer_1_reviewers = df[df.beer_name==beer_1].review_profilename.unique()
beer_2_reviewers = df[df.beer_name==beer_2].review_profilename.unique()
# Keep only the users who reviewed both beers.
common_reviewers = set(beer_1_reviewers).intersection(beer_2_reviewers)
# A parenthesized, single-argument print produces the same output on
# Python 2 and Python 3 (the bare print statement is Python 2 only).
print("Users in the sameset: %d" % len(common_reviewers))
# Sets are unordered, so this is an arbitrary sample of ten names.
list(common_reviewers)[:10]

Users in the sameset: 499


['womencantsail',
 'Marty30',
 'Winter',
 'Lothore',
 'bump8628',
 'gford217',
 'lackenhauser',
 'wspscott',
 'mjurney',
 'LiquidBread219']

## Extracting Reviews

In [5]:
def get_beer_reviews(beer, common_users, reviews_df=None):
    """Return one review per user for ``beer``, limited to ``common_users``.

    Parameters
    ----------
    beer : str
        Value matched exactly against the ``beer_name`` column.
    common_users : iterable
        Profile names to keep (matched against ``review_profilename``).
    reviews_df : pandas.DataFrame, optional
        Frame of reviews to search. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Matching rows sorted by ``review_profilename``, with at most one row
        per profile name.
    """
    frame = df if reviews_df is None else reviews_df
    mask = frame.review_profilename.isin(common_users) & (frame.beer_name == beer)
    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    # A stable sort keeps each user's reviews in their original relative
    # order, so the duplicate that survives is always the first one in the
    # source frame rather than an arbitrary one.
    ordered = frame[mask].sort_values('review_profilename', kind='mergesort')
    return ordered.drop_duplicates(subset='review_profilename')
# Pull the de-duplicated reviews of each beer written by the shared reviewers.
beer_1_reviews, beer_2_reviews = (
    get_beer_reviews(beer_name, common_reviewers)
    for beer_name in (beer_1, beer_2)
)

# Identifying columns plus the four rating columns, for a compact preview.
cols = [
    'beer_name',
    'review_profilename',
    'review_overall',
    'review_aroma',
    'review_palate',
    'review_taste',
]
beer_2_reviews[cols].head()

Unnamed: 0,beer_name,review_profilename,review_overall,review_aroma,review_palate,review_taste
202456,Fat Tire Amber Ale,ATPete,4.5,4.0,4.0,4.5
201458,Fat Tire Amber Ale,AdamBear,3.5,2.5,4.5,3.5
201886,Fat Tire Amber Ale,AlCaponeJunior,2.0,3.0,3.5,3.0
202481,Fat Tire Amber Ale,AltBock,4.0,3.0,3.0,3.0
201803,Fat Tire Amber Ale,Andreji,4.0,4.5,4.0,4.0


## Calculating Distance

In [6]:
# choose your own way to calculate distance
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from scipy.stats.stats import pearsonr


# Rating columns compared between two beers, in the order the distances are returned.
ALL_FEATURES = ['review_overall', 'review_aroma', 'review_palate', 'review_taste']
def calculate_similarity(beer1, beer2):
    """Return per-feature Euclidean distances between two beers.

    For every column in ``ALL_FEATURES`` the ratings given to each beer by the
    users who reviewed both are treated as one vector per beer, and the
    Euclidean distance between the two vectors is computed.

    Despite the name, the values are distances: smaller means more alike.
    They are not normalized by the number of common reviewers.

    Parameters
    ----------
    beer1, beer2 : str
        Beer names matched exactly against ``df.beer_name``.

    Returns
    -------
    list of float
        One distance per entry of ``ALL_FEATURES``, in the same order.

    NOTE(review): when the beers share no reviewers the rating vectors are
    empty; what scikit-learn does then depends on its version -- confirm
    before running this over every beer in the dataset.
    """
    # find common reviewers
    beer_1_reviewers = df[df.beer_name==beer1].review_profilename.unique()
    beer_2_reviewers = df[df.beer_name==beer2].review_profilename.unique()
    common_reviewers = set(beer_1_reviewers).intersection(beer_2_reviewers)

    # get reviews (both frames come back sorted by reviewer, one row each,
    # so position i in either frame refers to the same reviewer)
    beer_1_reviews = get_beer_reviews(beer1, common_reviewers)
    beer_2_reviews = get_beer_reviews(beer2, common_reviewers)
    dists = []
    for f in ALL_FEATURES:
        # euclidean_distances expects 2-D input of shape (n_samples,
        # n_features); reshape each ratings vector into a single sample row
        # so the result is a 1x1 matrix holding the distance.
        ratings_1 = beer_1_reviews[f].values.reshape(1, -1)
        ratings_2 = beer_2_reviews[f].values.reshape(1, -1)
        dists.append(euclidean_distances(ratings_1, ratings_2)[0][0])

    return dists

calculate_similarity(beer_1, beer_2)

[17.727097901235837,
 17.392527130926087,
 16.598192672697831,
 17.663521732655695]

## Calculate the Similarity for a Set of Beers

In [7]:
# calculate only a subset for the demo
beers = ["Dale's Pale Ale", "Sierra Nevada Pale Ale", "Michelob Ultra",
         "Natural Light", "Bud Light", "Fat Tire Amber Ale", "Coors Light",
         "Blue Moon Belgian White", "60 Minute IPA", "Guinness Draught"]

# calculate everything for real production
# beers = df.beer_name.unique()

# One row per ordered pair of distinct beers:
# [beer1, beer2, <one distance per rating feature>].
simple_distances = []
for beer1 in beers:
    # Progress message. A parenthesized, single-argument print gives the same
    # output on Python 2 and Python 3 (the bare print statement is Python 2 only).
    print("starting %s" % beer1)
    for beer2 in beers:
        if beer1 != beer2:
            row = [beer1, beer2] + calculate_similarity(beer1, beer2)
            simple_distances.append(row)

starting Dale's Pale Ale
starting Sierra Nevada Pale Ale
starting Michelob Ultra
starting Natural Light
starting Bud Light
starting Fat Tire Amber Ale
starting Coors Light
starting Blue Moon Belgian White
starting 60 Minute IPA
starting Guinness Draught


## Inspect the Results

In [8]:
# Column labels: the two beer names followed by one distance per rating feature.
cols = ["beer1", "beer2"]
cols += ["overall_dist", "aroma_dist", "palate_dist", "taste_dist"]
# Turn the list of rows into a labelled table and show its last five rows.
simple_distances = pd.DataFrame(data=simple_distances, columns=cols)
simple_distances.tail(5)

Unnamed: 0,beer1,beer2,overall_dist,aroma_dist,palate_dist,taste_dist
85,Guinness Draught,Bud Light,43.769281,42.219664,45.508241,44.418465
86,Guinness Draught,Fat Tire Amber Ale,23.921747,21.171915,25.455844,22.825424
87,Guinness Draught,Coors Light,40.838095,38.343839,42.985463,40.32989
88,Guinness Draught,Blue Moon Belgian White,26.832816,22.248595,25.641763,23.780244
89,Guinness Draught,60 Minute IPA,31.18894,32.810059,32.622845,34.5


## Allow the User to Customize the Weights

In [9]:
def calc_distance(dists, beer1, beer2, weights):
    """Weighted sum of the per-feature distances for one (beer1, beer2) row.

    Parameters
    ----------
    dists : pandas.DataFrame
        Table with ``beer1``, ``beer2`` and the four ``*_dist`` columns.
    beer1, beer2 : str
        Pair to look up; both columns must match exactly.
    weights : sequence of 4 numbers
        Multipliers applied, in order, to the overall, aroma, palate and
        taste distances.

    Returns
    -------
    float
        The weighted total of the first matching row.

    Raises
    ------
    IndexError
        If ``dists`` has no row for the requested pair.
    """
    distance_columns = ['overall_dist', 'aroma_dist', 'palate_dist', 'taste_dist']
    is_requested_pair = (dists.beer1 == beer1) & (dists.beer2 == beer2)
    pair_distances = dists.loc[is_requested_pair, distance_columns]
    # Scale each distance column by its weight, then add across the row.
    weighted_totals = (weights * pair_distances).sum(axis=1)
    return weighted_totals.tolist()[0]

# Weights apply, in order, to the overall, aroma, palate and taste distances;
# here the overall rating counts double.
weights = [2, 1, 1, 1]
# A parenthesized, single-argument print gives the same output on Python 2 and
# Python 3 (the bare print statement is Python 2 only).
print(calc_distance(simple_distances, "Fat Tire Amber Ale", "Dale's Pale Ale", weights))
print(calc_distance(simple_distances, "Fat Tire Amber Ale", "Michelob Ultra", weights))

87.1084373388
153.501683272


## Find Similar Beers for Coors Light

In [17]:
my_beer = "Coors Light"
# Collect (my_beer, other beer, weighted distance) for every other beer.
results = []
for b in beers:
    if b == my_beer:
        continue
    weighted_distance = calc_distance(simple_distances, my_beer, b, weights)
    results.append((my_beer, b, weighted_distance))
# Smallest weighted distance first, i.e. most similar beers at the top.
sorted(results, key=lambda entry: entry[2])

[('Coors Light', 'Natural Light', 69.52320122830363),
 ('Coors Light', 'Michelob Ultra', 72.4303764655898),
 ('Coors Light', 'Bud Light', 100.45382254092895),
 ('Coors Light', 'Blue Moon Belgian White', 175.24657417286627),
 ('Coors Light', 'Fat Tire Amber Ale', 176.31863930228485),
 ('Coors Light', "Dale's Pale Ale", 181.20123311633913),
 ('Coors Light', 'Guinness Draught', 205.42636799646337),
 ('Coors Light', '60 Minute IPA', 233.40510433819486),
 ('Coors Light', 'Sierra Nevada Pale Ale', 254.78216241090442)]