In [1]:
import urllib
import json
import pandas as pd
import base64
import numpy as np
import random
import re, nltk
from sklearn.feature_extraction.text import CountVectorizer        
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

In [2]:
def json_numpy_obj_hook(dct):
    """Decodes a previously encoded numpy ndarray with proper shape and dtype.
    :param dct: (dict) json encoded ndarray
    :return: (ndarray) if input was an encoded ndarray
    """
    if isinstance(dct, dict) and '__ndarray__' in dct:
        data = base64.b64decode(dct['__ndarray__'])
        return np.frombuffer(data, dct['dtype']).reshape(dct['shape'])
    return dct

In [3]:
with open('beer_1000.json') as data_file:    
    data = json.load(data_file)

In [29]:
file1 = urllib.urlopen('https://s3.amazonaws.com/stantemptesting/beers_compressed.json')
beers_compressed = json.load(file1, object_hook=json_numpy_obj_hook)

In [34]:
file2 = urllib.urlopen('https://s3.amazonaws.com/stantemptesting/features_compressed.json')
features_compressed = json.load(file2, object_hook=json_numpy_obj_hook)

#JSON to list

In [31]:
beer_text_sentiment = []
for beer in data:
    for review in data[beer]:
            beer_text_sentiment.append((beer,review['text'],review['overall']))

In [5]:
beer_text_sentiment[0]

(u"Mephistopheles' Metamorphosis",
 u"A - hazy (perhaps slightly from bottle conditioning) golden color with a decent head that leaves a nice lacing  S - strong fruity esters with sweet malt and no hop noticeable  T - complex fruity ester flavor from start to finish; sweet malt with no hops detectable - finishes very sweet; perfumy alcohol noticed on finish, but not really enough to dry out the strong malt sweetness  M - medium body, moderately high carbonation  D - I picked up this bottle in a great beer store in Lexington on a recent summer trip. I've always liked stopping at BBC in Louisville and it appears there starting to bottle special editions so I thought I'd give it a try. This tripel is definitely closer to the abbey version than the trappist versions as it finishes quite sweet. The esters really overpower and spicy phenols also. But what it lacks in phenols it somewhat makes up for in ester complexity - to a degree. It could use a bit more hop as it borders on cloying. And 

#Training Data

In [6]:
#what we want to do
#we want to randomly take 20% of the input json
#we want to create a dataframe with name, text, and overall rating
#we want to replace overall rating with sentiment

In [7]:
random.shuffle(beer_text_sentiment)
training_number = int(len(beer_text_sentiment)*.2)
training_text = beer_text_sentiment[:training_number]

In [8]:
train_data_df = pd.DataFrame(training_text).convert_objects(convert_numeric=True)
train_data_df.columns = ["Beer", "Text", "Sentiment"]

In [9]:
num = train_data_df._get_numeric_data()
num[num <= 3] = 0
num[num > 3] = 1

In [10]:
train_data_df

Unnamed: 0,Beer,Text,Sentiment
0,Point Defiance IPA,"22 oz. bottle from 99 Bottles, shared with bee...",1
1,Pike Street XXXXX Stout,On tap poured into a sampling glass. Pours a...,1
2,Kirkland Signature India Pale Ale,As a Connecticut resident I had to search for ...,1
3,Pike India Pale Ale,"Tried in Seattle in August 2008. A: Cloudy, n...",1
4,Red Cuillin,Bottle at Chaucer's-Deep mahogany liquid sits ...,0
5,Trafalgar Cedar Cream Ale,Pours a mostly clear copper-gold with a weak h...,0
6,Smashin' Berry Dark,Growler from the brewery 84% Smashin Berry 16...,1
7,Pike Pale Ale,Hazy amber with plenty of carbonation. Thin he...,1
8,Trafalgar Smoked Oatmeal Stout,This poured a perfect black with abundant tan ...,1
9,Pike Kilt Lifter Scotch Style Ale,"Lucent, dark orange-amber; butterscotch more s...",1


#Testing Data

In [11]:
testing_text = beer_text_sentiment[training_number:]
test_data_df = pd.DataFrame(testing_text)
test_data_df.columns = ["Beer", "Text", "Sentiment"]
test_data_df = test_data_df.drop('Sentiment', 1)

In [12]:
test_data_df

Unnamed: 0,Beer,Text
0,Mean Manalishi Double I.P.A.,Poured from 22 oz bomber A - Hazy burnt orang...
1,Barrel Aged B.O.R.I.S. Oatmeal Imperial Stout,I went into my one of my favorite beer stores ...
2,Golden Hawk Wheat,Nice pour for the style and for the brewpubs i...
3,Caldera IPA,"Clear orange-copper, or weak iced-tea in color..."
4,Harboe Bjørnebryg (Bear Beer),11.2oz brown bottle HAR8262 printed on the bac...
5,Pike Old Bawdy Barley Wine,"Pours a dark amber, thick head, little carbona..."
6,Amstel Light,Appearance: Very pale yellow with not much car...
7,Frog's Hollow Double Pumpkin Ale,"22 ounce bomber - Sherlock's in Marietta, Geor..."
8,La Binchoise Brune Tradition,33 cl brown bottle with a best before date on ...
9,Silk Porter,Poured into a pint glass from a 22oz bomber. ...


#Done with data manipulation

In [13]:
#Let's count how many labels do we have for each sentiment class.
train_data_df.Sentiment.value_counts()

1    1799
0     519
dtype: int64

In [14]:
#Finally, let's calculate the average number of words per sentence. 
#We could do the following using a list comprehension with the number of words per sentence.
np.mean([len(s.split(" ")) for s in train_data_df.Text])

130.76747195858499

#Preparing a corpus


In [15]:
#######
# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = re.sub("[^a-zA-Z]", " ", text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # stem
    stems = stem_tokens(tokens, stemmer)
    return stems
######## 

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = 'english',
    max_features = 85
)

In [16]:
corpus_data_features = vectorizer.fit_transform(
    train_data_df.Text.tolist() + test_data_df.Text.tolist())

In [17]:
corpus_data_features_nd = corpus_data_features.toarray()
corpus_data_features_nd.shape

(11592, 85)

In [18]:
#Let take a look at the words in the vocabulary.
vocab = vectorizer.get_feature_names()
print vocab

[u'alcohol', u'ale', u'amber', u'appear', u'aroma', u'balanc', u'beer', u'big', u'bit', u'bitter', u'black', u'bodi', u'bottl', u'bourbon', u'brew', u'brown', u'caramel', u'carbon', u'chocol', u'citru', u'coffe', u'color', u'come', u'creami', u'd', u'dark', u'decent', u'dri', u'drink', u'drinkabl', u'feel', u'finger', u'finish', u'flavor', u'fruit', u'glass', u'good', u'great', u'ha', u'head', u'hint', u'hop', u'ipa', u'just', u'lace', u'leav', u'light', u'like', u'littl', u'look', u'lot', u'm', u'make', u'malt', u'malti', u'medium', u'mouthfeel', u'nice', u'nose', u'note', u'orang', u'overal', u'oz', u'pour', u'pretti', u'quit', u'realli', u'roast', u's', u'slight', u'slightli', u'smell', u'smooth', u'spice', u'stout', u'strong', u'style', u'sweet', u't', u'tast', u'thi', u'tri', u'veri', u'wa', u'white']


In [19]:
# Sum up the counts of each vocabulary word
dist = np.sum(corpus_data_features_nd, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the data set
for tag, count in zip(vocab, dist):
    print count, tag

3263 alcohol
2650 ale
2656 amber
1832 appear
6065 aroma
2839 balanc
14621 beer
2089 big
5558 bit
6937 bitter
3050 black
6022 bodi
4113 bottl
1838 bourbon
3177 brew
3870 brown
3841 caramel
6047 carbon
4870 chocol
2746 citru
3510 coffe
5023 color
2135 come
2392 creami
3050 d
6085 dark
1924 decent
2622 dri
3069 drink
3537 drinkabl
1901 feel
1990 finger
5277 finish
8439 flavor
2286 fruit
4390 glass
7448 good
2517 great
3476 ha
11126 head
3066 hint
11427 hop
2417 ipa
4014 just
5598 lace
2107 leav
7466 light
7303 like
4851 littl
2034 look
2406 lot
4127 m
1820 make
10636 malt
2266 malti
4018 medium
4246 mouthfeel
8289 nice
2984 nose
3864 note
2580 orang
2345 overal
1846 oz
8566 pour
2801 pretti
2286 quit
3449 realli
3848 roast
10222 s
1984 slight
2378 slightli
6224 smell
3217 smooth
1889 spice
2582 stout
1856 strong
2469 style
8274 sweet
8043 t
10152 tast
20081 thi
1908 tri
10830 veri
8498 wa
3343 white


#A bag-of-words linear classifier



In [20]:
# remember that corpus_data_features_nd contains all of our 
# original train and test data, so we need to exclude
# the unlabeled test entries
X_train, X_test, y_train, y_test  = train_test_split(
        corpus_data_features_nd[0:len(train_data_df)], 
        train_data_df.Sentiment,
        train_size=0.80, 
        random_state=1234)

In [21]:
#Now we are ready to train our classifier.
log_model = LogisticRegression()
log_model = log_model.fit(X=X_train, y=y_train)

In [22]:
#Now we use the classifier to label our evaluation set. 
#We can use either predict for classes or predict_proba for probabilities.
y_pred = log_model.predict(X_test)


In [23]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

        0.0       0.47      0.33      0.39        97
        1.0       0.84      0.90      0.87       367

avg / total       0.76      0.78      0.77       464



In [24]:
#Finally, we can re-train our model with all the training data and use it for sentiment 
#classification with the original (unlabeled) test set.

# train classifier
log_model = LogisticRegression()
log_model = log_model.fit(X=corpus_data_features_nd[0:len(train_data_df)], y=train_data_df.Sentiment)

# get predictions
test_pred = log_model.predict(corpus_data_features_nd[len(train_data_df):])

# sample some of them
import random
spl = random.sample(xrange(len(test_pred)), 15)

# print text and labels
for text, sentiment in zip(test_data_df.Text[spl], test_pred[spl]):
    print sentiment, text

1.0 Poured from a silver foil wrapped bomber into a snifter  A- pours a dark mahogany, one finger dark tan head emerges and quickly dissipates down to a ring, very nice lacing clinging in a nice ring around the glass  S- dark chocolate hits the nose first, followed immediately by the smell of roasted coffee, mixed in is a hint of vanilla that makes it smell like coffee mixed with cream, a little woodiness from the oak, not too much of a bourbon smell which kind of disappoints me  T- dark roasted malt hits the palate immediately, dark bitter chocolate follows and fades into a taste of black coffee, there is a slight twinge of bourbon but not enough considering the style, the oak flavors come through in the aftertaste, really was hoping for more bourbon flavor  M- medium body which is appropriate considering the abv, creamy and smooth, nice balance of carbonation, slight alcohol burn  D- fairly easy to drink, the flavor comes out more as it warms, i personally think this is better than t