In [1]:
import pandas as pd

import seaborn as sb
%matplotlib inline

## Prepare the Data

In [2]:
# Read the tab-separated beer table, then replace missing values in the
# three columns used below: 0 for WR and ABV, the placeholder string "na"
# for Type.
df = pd.read_csv('../datasets/beer.tsv', sep='\t')
df['WR'] = df['WR'].fillna(value=0)
df['ABV'] = df['ABV'].fillna(value=0)
df['Type'] = df['Type'].fillna(value="na")

In [3]:
# List the distinct values found in the Brewery column.
df['Brewery'].unique()

array(['The Alchemist', 'Russian River Brewing Company',
       'Founders Brewing Company', 'Three Floyds Brewing Co. & Brewpub',
       'Brouwerij Westvleteren (Sint-Sixtusabdij van Westvleteren)',
       'Goose Island Beer Co.', 'Firestone Walker Brewing Co.',
       'Deschute Brewery', "Bell's Brewing Company", 'Brasserie Cantillon',
       'Brasserie de Rochefort', 'Kern River Brewing Company',
       'Cigar City Brewing', 'Dark Horse Brewing Company',
       'Kuhnhenn Brewing Company', 'Surly Bewing Company',
       'Hill Farmstead Brewery', "Lawson's Finest Liquids", 'The Bruery',
       'Brouwerij St. Bernardus NV', 'The Lost Abbey',
       'Ballast Point Brewing Company',
       'Bayerishe Staatsbrauerei Weihenstephan', 'Smith Brewing Company',
       'Pelican Pub & Brewery', 'Brewery and Tasting Room',
       'Stone Brewing Co.', 'TrÌ¦egs Brewing Company',
       'Brasserie Dieu Du Ciel', 'Boston Beer Company (Samuel Adams)',
       'Lagunitas Brewing Company', 'Alpine Beer Co

In [4]:
# Preview the first five rows of the cleaned table.
df.head(5)


Unnamed: 0,Rank,Name,Brewery,Type,ABV,WR,Reviews
0,1,Heady Topper,The Alchemist,Imperial IPA,8.0,4.69,3146
1,2,Pliny The Younger,Russian River Brewing Company,Imperial IPA,11.0,4.65,1572
2,3,Pliny The Elder,Russian River Brewing Company,Imperial IPA,8.0,4.64,6129
3,4,Founders CBS Imperial Stout,Founders Brewing Company,Imperial Stout,10.6,4.63,2026
4,5,Founders KBS (Kentucky Breakfast Stout),Founders Brewing Company,Imperial Stout,11.2,4.61,4714


## Fit and score the model 

In [56]:
# Train a logistic-regression classifier that predicts a beer's Type from
# its ABV and WR columns, then report its score on a held-out split.
from sklearn.linear_model import LogisticRegression
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

model = LogisticRegression()

X = df[['ABV', 'WR']]
y = df['Type']

# Fix the seed so the train/test split -- and therefore the printed score --
# is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

model.fit(X_train, y_train)
# For a classifier, `score` is the mean accuracy on the given test data.
scores = model.score(X_test, y_test)

print(scores)


0.253968253968


## Predict a single entry

In [50]:
# Take the row at position 1 and ask the fitted model for its Type.
l = df.iloc[1][['ABV', 'WR']]
print(l)
# scikit-learn estimators expect a 2-D (n_samples, n_features) input, so
# predict on a one-row DataFrame rather than the 1-D Series `l`. Selecting
# the two feature columns before the row also avoids the object dtype that
# `l` gets from being cut out of a row mixing strings and numbers.
print("Predicted Type: ", model.predict(df[['ABV', 'WR']].iloc[[1]]))

ABV      11
WR     4.65
Name: 1, dtype: object
Predicted Type:  ['Imperial Stout']


In [57]:
# As the row below shows, the real Type of this entry is Imperial IPA.
# How can we improve the model?
df.iloc[1, :]

Rank                                   2
Name                   Pliny The Younger
Brewery    Russian River Brewing Company
Type                        Imperial IPA
ABV                                   11
WR                                  4.65
Reviews                             1572
Name: 1, dtype: object

## Using groupby


In [42]:
# Show every beer whose Brewery is exactly 'Alpine Beer Company'.
df.loc[df['Brewery'] == 'Alpine Beer Company']

Unnamed: 0,Rank,Name,Brewery,Type,ABV,WR,Reviews
58,59,Nelson,Alpine Beer Company,American IPA,7.1,4.37,1054
59,60,Exponential Hoppiness,Alpine Beer Company,Imperial IPA,11.0,4.37,708
66,67,Hoppy Birthday,Alpine Beer Company,American Pale Ale,5.25,4.36,262
70,71,Duet IPA,Alpine Beer Company,American IPA,7.0,4.35,842
79,80,Pure Hoppiness,Alpine Beer Company,Imperial IPA,8.0,4.34,1116
105,106,Bad Boy,Alpine Beer Company,Imperial IPA,9.0,4.31,253
197,198,Great,Alpine Beer Company,American Barleywine,14.0,4.23,205


In [41]:
# Count how many beers fall into each (Brewery, Type) combination.
(
    df
    .groupby(["Brewery", "Type"])[['Type']]
    .count()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Type
Brewery,Type,Unnamed: 2_level_1
Allagash Brewing Company,Belgian Strong Dark Ale,1
Allagash Brewing Company,Tripel,1
Alpine Beer Company,American Barleywine,1
Alpine Beer Company,American IPA,2
Alpine Beer Company,American Pale Ale,1
Alpine Beer Company,Imperial IPA,3
Anchorage Brewing Company,Belgian IPA,1
Avery Brewing Company,Imperial IPA,1
Ballast Point Brewing Company,American IPA,1
Ballast Point Brewing Company,Porter,1


In [None]:
# Binary (1/0) indicator columns marking whether the Type name contains each
# style keyword. `str.contains` is the vectorised replacement for a
# per-element lambda: `regex=False` keeps it a plain, case-sensitive
# substring test, and `na=False` maps a missing Type to 0 instead of raising.
df['type_Stout'] = df['Type'].str.contains('Stout', regex=False, na=False).astype('int64')
df['type_IPA'] = df['Type'].str.contains('IPA', regex=False, na=False).astype('int64')
df['type_Ale'] = df['Type'].str.contains('Ale', regex=False, na=False).astype('int64')