In [1]:
import urllib
import json
import pandas as pd
import base64
import numpy as np
import random
import re, nltk
from sklearn.feature_extraction.text import CountVectorizer        
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

In [2]:
def json_numpy_obj_hook(dct):
    """Turn an encoded-ndarray dictionary back into a numpy array.

    :param dct: candidate object; it is treated as an encoded ndarray when it
        is a dict holding an '__ndarray__' key (base64 payload), in which case
        its 'dtype' and 'shape' entries are read as well
    :return: (ndarray) the decoded array, or ``dct`` itself, untouched, when
        it is not an encoded ndarray
    """
    looks_encoded = isinstance(dct, dict) and '__ndarray__' in dct
    if not looks_encoded:
        return dct

    raw_bytes = base64.b64decode(dct['__ndarray__'])
    # frombuffer yields a flat array; the stored shape restores the layout.
    flat = np.frombuffer(raw_bytes, dct['dtype'])
    return flat.reshape(dct['shape'])

In [4]:
# Read the review dump from disk; the `with` block closes the file afterwards.
with open('new_transcripts_2.json') as data_file:
    data = json.loads(data_file.read())

# JSON to list

In [5]:
# Flatten the beer -> reviews mapping into one list of
# (beer, review text, overall rating) tuples.
beer_text_sentiment = []
for beer, reviews in data.items():
    beer_text_sentiment.extend(
        (beer, review['text'], review['overall']) for review in reviews
    )

In [6]:
# Number of (beer, text, overall) tuples gathered across all beers.
len(beer_text_sentiment)

1586614

In [7]:
def training_data(beer_input, train_fraction=.2, positive_above=3):
    """Shuffle the reviews and return the labelled training slice as a DataFrame.

    ``beer_input`` is shuffled IN PLACE.  That side effect is deliberate:
    ``testing_data`` slices the remainder of the same list, so it only yields
    the complementary (non-overlapping) rows when it is called after this
    function on the same list object.

    :param beer_input: (list) mutable list of (beer, text, overall) tuples
    :param train_fraction: (float) share of the shuffled reviews that goes
        into the training frame
    :param positive_above: (number) ratings strictly greater than this become
        1, all other parseable ratings become 0
    :return: (DataFrame) columns "Beer", "Text", "Sentiment"; Sentiment holds
        0.0 / 1.0, or NaN where the rating could not be parsed as a number
    """
    random.shuffle(beer_input)
    training_number = int(len(beer_input) * train_fraction)

    # Naming the columns in the constructor (instead of assigning .columns
    # afterwards) keeps an empty slice from raising a length-mismatch error.
    train_data_df = pd.DataFrame(
        beer_input[:training_number],
        columns=["Beer", "Text", "Sentiment"])

    # Only the rating column is coerced to numbers.  The removed
    # DataFrame.convert_objects(convert_numeric=True) attempted the coercion
    # on every column, and the follow-up threshold on _get_numeric_data()
    # would then have rewritten any other column that came out numeric.
    rating = pd.to_numeric(train_data_df["Sentiment"], errors="coerce")

    # Binarise, but keep unparseable ratings as NaN rather than silently
    # labelling them negative.
    train_data_df["Sentiment"] = (
        (rating > positive_above).astype(float).where(rating.notnull()))
    return train_data_df

In [8]:
def testing_data(beer_input):
    training_number = int(len(beer_text_sentiment)*.2)
    testing_text = beer_text_sentiment[training_number:]
    test_data_df = pd.DataFrame(testing_text)
    test_data_df.columns = ["Beer", "Text", "Sentiment"]
    test_data_df = test_data_df.drop('Sentiment', 1)
    return test_data_df

In [9]:
# Order matters: training_data() shuffles beer_text_sentiment in place, and
# testing_data() then slices the remainder of that same shuffled list.
train_data_df = training_data(beer_text_sentiment)
test_data_df = testing_data(beer_text_sentiment)



# Preparing a corpus


In [None]:
# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html

# Module-level Porter stemmer instance; tokenize() passes it to stem_tokens().
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    """Return a new list with ``stemmer.stem`` applied to each token, in order."""
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Split a review into stemmed, letters-only word tokens.

    Every character outside a-z / A-Z is replaced by a space, the result is
    tokenized with ``nltk.word_tokenize``, and each token is stemmed with the
    module-level ``stemmer``.

    :param text: (str) raw review text
    :return: (list) stemmed tokens
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = nltk.word_tokenize(letters_only)
    return stem_tokens(words, stemmer)

# Bag-of-words counter built on the custom tokenize() function above.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    stop_words='english',
    max_features=85,  # vocabulary size cap
)

In [None]:
# Vectorise training and test reviews together so both share one vocabulary;
# the training rows come first, followed by the test rows.
corpus_data_features = vectorizer.fit_transform(
    train_data_df.Text.tolist() + test_data_df.Text.tolist()
)
# Dense copy of the count matrix, used by the classifier below.
corpus_data_features_nd = corpus_data_features.toarray()
vocab = vectorizer.get_feature_names()
# Total count of each vocabulary word across the whole corpus.
dist = corpus_data_features_nd.sum(axis=0)

# A bag-of-words linear classifier



In [None]:
# Preview the first rows only rather than dumping the whole training frame.
train_data_df.head()

In [None]:
def bag_of_words_linear_classifier(training_data):
    """Train logistic regression on bag-of-words counts and label the test rows.

    Reads the module-level ``corpus_data_features_nd`` matrix.  Its first
    ``len(training_data)`` rows are taken as the features of the labelled
    reviews (they must be in the same order as ``training_data``), and every
    row after that is treated as an unlabeled test review.

    An 80/20 hold-out split of the labelled rows is fitted first and its
    classification report is printed; the returned predictions come from a
    second model fitted on all labelled rows.

    :param training_data: (DataFrame) labelled reviews with a ``Sentiment``
        column; inside this function the name shadows the module-level
        ``training_data`` function
    :return: (ndarray) predicted sentiment class for each unlabeled test row
    """
    # The corpus holds train rows followed by the unlabeled test rows, so the
    # labelled block is the leading len(training_data) rows.
    n_labelled = len(training_data)
    labelled_features = corpus_data_features_nd[:n_labelled]
    labels = training_data.Sentiment

    # Hold-out evaluation on the labelled data.
    X_train, X_test, y_train, y_test = train_test_split(
        labelled_features,
        labels,
        train_size=0.80,
        random_state=1234)
    holdout_model = LogisticRegression().fit(X=X_train, y=y_train)
    y_pred = holdout_model.predict(X_test)
    # Previously y_pred was computed and thrown away; report it so the
    # hold-out split actually tells us how good the model is.
    print(classification_report(y_test, y_pred))

    # Re-train on every labelled row and classify the unlabeled test set.
    log_model = LogisticRegression().fit(X=labelled_features, y=labels)
    test_pred = log_model.predict(corpus_data_features_nd[n_labelled:])
    return test_pred

In [None]:
# Fit the classifier and predict a label for the unlabeled test reviews.
test_pred = bag_of_words_linear_classifier(train_data_df)

In [None]:
# Peek at the first predicted label.
test_pred[0]

# Sample and Print

In [None]:
# Number of predictions returned by the classifier.
len(test_pred)

In [None]:
# Map each beer name to a predicted sentiment.  zip() pairs the Beer column
# with the predictions positionally, replacing the Python-2-only xrange loop
# and its one-Series-lookup-per-row cost.
# A beer with several test reviews keeps the prediction of its LAST review,
# because later pairs overwrite earlier ones.
# NOTE(review): no aggregation across a beer's reviews -- confirm intended.
beer_sentiment = dict(zip(test_data_df.Beer, test_pred))

In [None]:
# Look up the prediction stored for one beer; raises KeyError if that name
# is not a key of beer_sentiment.
beer_sentiment["Keystone Light"]