In [1]:
import urllib
import json
import pandas as pd
import base64
import numpy as np
import random
import re, nltk
from sklearn.feature_extraction.text import CountVectorizer        
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

In [2]:
def json_numpy_obj_hook(dct):
    """Decodes a previously encoded numpy ndarray with proper shape and dtype.
    :param dct: (dict) json encoded ndarray
    :return: (ndarray) if input was an encoded ndarray
    """
    if isinstance(dct, dict) and '__ndarray__' in dct:
        data = base64.b64decode(dct['__ndarray__'])
        return np.frombuffer(data, dct['dtype']).reshape(dct['shape'])
    return dct

In [3]:
with open('beer_1000.json') as data_file:    
    data = json.load(data_file)

In [29]:
file1 = urllib.urlopen('https://s3.amazonaws.com/stantemptesting/beers_compressed.json')
beers_compressed = json.load(file1, object_hook=json_numpy_obj_hook)

In [34]:
file2 = urllib.urlopen('https://s3.amazonaws.com/stantemptesting/features_compressed.json')
features_compressed = json.load(file2, object_hook=json_numpy_obj_hook)

#JSON to list

In [31]:
beer_text_sentiment = []
for beer in data:
    for review in data[beer]:
            beer_text_sentiment.append((beer,review['text'],review['overall']))

In [51]:
def training_data(beer_input):
    random.shuffle(beer_text_sentiment)
    training_number = int(len(beer_text_sentiment)*.2)
    training_text = beer_text_sentiment[:training_number]
    train_data_df = pd.DataFrame(training_text).convert_objects(convert_numeric=True)
    train_data_df.columns = ["Beer", "Text", "Sentiment"]
    num = train_data_df._get_numeric_data()
    num[num <= 3] = 0
    num[num > 3] = 1
    return train_data_df

In [41]:
def testing_data(beer_input):
    training_number = int(len(beer_text_sentiment)*.2)
    testing_text = beer_text_sentiment[training_number:]
    test_data_df = pd.DataFrame(testing_text)
    test_data_df.columns = ["Beer", "Text", "Sentiment"]
    test_data_df = test_data_df.drop('Sentiment', 1)
    return test_data_df

In [52]:
train_data_df = training_data(beer_text_sentiment)
test_data_df = testing_data(beer_text_sentiment)

#Preparing a corpus


In [15]:
# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html

stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = re.sub("[^a-zA-Z]", " ", text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # stem
    stems = stem_tokens(tokens, stemmer)
    return stems

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = 'english',
    max_features = 85
)

In [60]:
corpus_data_features = vectorizer.fit_transform(train_data_df.Text.tolist() + test_data_df.Text.tolist())
corpus_data_features_nd = corpus_data_features.toarray()
vocab = vectorizer.get_feature_names()
# Sum up the counts of each vocabulary word
dist = np.sum(corpus_data_features_nd, axis=0)

#A bag-of-words linear classifier



In [49]:
def bag_of_words_linear_classifier(training_data):
    # remember that corpus_data_features_nd contains all of our 
    # original train and test data, so we need to exclude
    # the unlabeled test entries
    X_train, X_test, y_train, y_test  = train_test_split(
            corpus_data_features_nd[0:len(train_data_df)], 
            train_data_df.Sentiment,
            train_size=0.80, 
            random_state=1234)
    #Now we are ready to train our classifier.
    log_model = LogisticRegression()
    log_model = log_model.fit(X=X_train, y=y_train)
    #Now we use the classifier to label our evaluation set. 
    #We can use either predict for classes or predict_proba for probabilities.
    y_pred = log_model.predict(X_test)
    #Finally, we can re-train our model with all the training data and use it for sentiment 
    #classification with the original (unlabeled) test set.
    # train classifier
    log_model = LogisticRegression()
    log_model = log_model.fit(X=corpus_data_features_nd[0:len(train_data_df)], y=train_data_df.Sentiment)
    # get predictions
    test_pred = log_model.predict(corpus_data_features_nd[len(train_data_df):])
    return test_pred

In [57]:
bag_of_words_linear_classifier(train_data_df)

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

#Sample and Print

In [59]:
# sample some of them
spl = random.sample(xrange(len(test_pred)), 15)
# print text and labels
for text, sentiment in zip(test_data_df.Text[spl], test_pred[spl]):
    print sentiment, text

1.0 Served from a 750ml bottle poured into a tulip glass Dark brown if not black pour with a tan head that disipates farly quickley leaving a little lacing on the glass. Huge smell of coffee, bitter chocolate, and sweet malts. Great aroma and really inviting. Taste of coffee, chocolate, and malt same as smell but something else like a sweet cigar. After taste leaves the bitter coffee flavor blanketed by the sweetness of the malt. Mouth feel is thin pretty characteristic of BBC's beers which makes this a really drinkable session stout with a lot of flavor. Would be great to see this in 12ozs and distributed to Bowling Green.  
0.0 on tap at pike pub in seattle.  pours a clear amber with a small white head, some lacing.  smell of roasted hops and nuts and a little grassy near the end.   taste is a bit different than the normal pale ale, nutty in flavor, slightly creamy, some caramel, not that bitter and slightly sweet.  
1.0 This beer is clear, straw colored, with a white head that settl