# Setting up our data

In [224]:
# Use Pandas
import pandas as pd
import math
import numpy as np
from numpy import exp
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
import string

In [225]:
# Column dtypes handed to pandas when the CSV file is parsed
dtype_dict = dict(name=str, review=str, rating=int)

In [226]:
# Load the reviews from amazon_baby.csv into a DataFrame, forcing the
# column types declared in dtype_dict.
products = pd.read_csv('amazon_baby.csv', dtype=dtype_dict)

In [227]:
# helper function to remove punctuation from reviews
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The previous implementation called ``text.translate(None, ...)``, a
    two-argument form that only Python 2 byte strings accept; it raises
    ``TypeError`` on Python 3 ``str`` (and on Python 2 ``unicode``).
    Filtering character by character behaves the same on all of them.

    Parameters
    ----------
    text : str
        The review text to clean.

    Returns
    -------
    str
        ``text`` without any character found in ``string.punctuation``.
    """
    return ''.join(ch for ch in text if ch not in string.punctuation)

In [228]:
# Preprocess: blank out missing reviews, then strip punctuation from each one
products = products.fillna(value={'review': ''})
products['review_clean'] = products['review'].map(remove_punctuation)

In [229]:
# Discard the neutral (3-star) reviews and renumber the remaining rows from 0
products = products.loc[products['rating'] != 3].reset_index(drop=True)

In [230]:
# Label each review: +1 when the rating is above 3, -1 otherwise
products['sentiment'] = (products['rating'] > 3).map({True: +1, False: -1})

# Split into training and test sets

In [None]:
# Split into training and test data using the row indices stored on disk.

with open('module-2-assignment-test-idx.json') as json_file:
    test_indices = json.load(json_file)

with open('module-2-assignment-train-idx.json') as json_file:
    train_indices = json.load(json_file)

# `.ix` is deprecated (and removed in pandas 1.0). On this integer index it
# selected rows by label, which is exactly what `.loc` does. The explicit
# copies make test_data/train_data independent frames, so adding columns to
# them later does not trigger SettingWithCopyWarning.
test_data = products.loc[test_indices].copy()
train_data = products.loc[train_indices].copy()

# Build the word count vectors

In [None]:
# Bag-of-words features. The token pattern matches any run of one or more
# word characters. The vocabulary is learned from the training reviews only;
# the test reviews are then encoded with that same vocabulary.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])

# Training LogisticRegression model on training data 

In [None]:
# Train a sentiment classifier with logistic regression: the inputs are the
# word counts of the training reviews, the targets their 'sentiment' column.
sentiment_model = linear_model.LogisticRegression(n_jobs=-1)
sentiment_model.fit(train_matrix, train_data['sentiment'])

In [None]:
# Report how many coefficients the model learned and how many of them are
# non-negative. Both values are counts, so they are formatted with %d (the
# original %f rendered the count with six decimal places). Each print() gets
# one pre-formatted string so the output is the same on Python 2 and 3.
print("Number of coefficients in the sentiment model %d" % len(sentiment_model.coef_[0]))
print("### Number of positive coefficients is %d" % sum(1 for x in sentiment_model.coef_[0] if x >= 0))

In [None]:
# Take the three test reviews at positions 10, 11 and 12 as a small sample
sample_test_data = test_data.iloc[10:13]
sample_test_data

In [None]:
# Encode the sample reviews with the fitted vocabulary and print the model's
# raw decision score for each of them.
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)

In [None]:
def my_predict(model, test_matrix):
    """Predict +1/-1 class labels from a linear model's decision scores.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``decision_function``.
    test_matrix : array-like or sparse matrix
        Feature matrix passed straight to ``model.decision_function``.

    Returns
    -------
    list of int
        ``+1`` where the score is strictly positive, ``-1`` otherwise.
    """
    # A score of exactly 0 is classified as -1 (strict inequality); this is
    # the same rule scikit-learn's linear classifiers apply in predict().
    return [+1 if score > 0 else -1 for score in model.decision_function(test_matrix)]

In [None]:
# Hand-rolled class predictions for the three sample reviews
my_predict(sentiment_model, sample_test_matrix)

In [None]:
# The model's own class predictions for the same sample, for comparison
sentiment_model.predict(sample_test_matrix)

In [None]:
def logit(x):
    """Map a decision score to a probability with the logistic function.

    Despite its name this is the sigmoid ``1 / (1 + e**-x)`` (the inverse of
    the logit): it turns a real-valued score into a value between 0 and 1.
    It is defined with ``def`` rather than as a lambda bound to a name so
    that it carries a proper name and docstring.
    """
    return 1.0 / (1 + exp(-x))

In [None]:
def my_prob_predict(model, test_matrix):
    """Return ``logit`` applied to the decision score of each row of ``test_matrix``."""
    decision_scores = model.decision_function(test_matrix)
    return list(map(logit, decision_scores))

In [None]:
# Hand-rolled probability estimates for the three sample reviews.
# my_prob_predict already returns a list, so no copying comprehension is needed.
my_prob_predict(sentiment_model, sample_test_matrix)

In [None]:
# The model's own probability estimates for the same sample, for comparison
sentiment_model.predict_proba(sample_test_matrix)

In [None]:
# The hand-rolled probability estimates again, rounded to three decimals
[round(x, 3) for x in my_prob_predict(sentiment_model, sample_test_matrix)]

# Find the most positive and negative reviews

In [None]:
# Encode the entire test set (not just the three-review sample) with the
# already-fitted vectorizer.
test_matrix = vectorizer.transform(test_data['review_clean'])

In [None]:
# Score every test review with the full model: 'decision' stores the raw
# decision_function value (the reviews are ranked by it below) and
# 'prediction' stores the predicted class label.
test_data['decision'] = sentiment_model.decision_function(test_matrix)
test_data['prediction'] = sentiment_model.predict(test_matrix)

In [None]:
# The 20 test reviews with the highest decision scores
top_20 = test_data.sort_values('decision', ascending=False).iloc[:20]
top_20

In [None]:
# The 20 test reviews with the lowest decision scores
bottom_20 = test_data.sort_values('decision').iloc[:20]
bottom_20

# Compute accuracy of the classifier

In [None]:
# Accuracy = fraction of test reviews whose predicted label equals the true one
total_n = len(test_data)
correct_n = int((test_data['sentiment'] == test_data['prediction']).sum())
# print() with a single argument behaves the same on Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print(correct_n / float(total_n))

# Learn another classifier with fewer words

In [None]:
# The 20 hand-picked words the simpler classifier is restricted to
significant_words = (
    'love great easy old little perfect loves '
    'well able car broke less even waste disappointed '
    'work product money would return'
).split()

In [None]:
# Count only the words listed in significant_words: the vocabulary is fixed
# up front, and the same vectorizer encodes both training and test reviews.
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

# Train a logistic regression model on a subset of words

In [None]:
# Fit a second logistic regression on the word-subset counts, using the same
# training labels as the full model.
simple_model = linear_model.LogisticRegression(n_jobs=-1)
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])

In [None]:
# Tabulate each significant word next to the coefficient the simple model gave it
simple_words = pd.DataFrame(
    {'word': significant_words, 'coef': simple_model.coef_[0]},
    columns=['word', 'coef']
)

In [None]:
# Display the words with non-negative coefficients, largest coefficient first
simple_words_pos = simple_words[simple_words['coef'] >= 0].sort_values(by='coef', ascending=False)

print("There are %d evaluated positive words in the simple model" % len(simple_words_pos))
# print() with a single argument works on both Python 2 and Python 3; the
# bare print statement used before is a SyntaxError on Python 3.
print(simple_words_pos)

In [None]:
# Cross-check: re-encode the reviews using the complete vocabulary.

# List the words in feature-index order. vocabulary_ maps word -> column
# index, and the dict's iteration order is unrelated to that index, so the
# raw keys() would not line up with the columns of sentiment_model.coef_.
all_words = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
# Reuse the original token pattern so single-character words are still counted.
vectorizer_word_set = CountVectorizer(vocabulary=all_words, token_pattern=r'\b\w+\b')
# Use the full-vocabulary vectorizer here (the original accidentally reused
# vectorizer_word_subset, which just rebuilt the word-subset matrices).
train_matrix_word_set = vectorizer_word_set.fit_transform(train_data['review_clean'])
test_matrix_word_set = vectorizer_word_set.transform(test_data['review_clean'])

In [None]:
# Pair every vocabulary word with its coefficient from the full sentiment model.
# NOTE(review): relies on all_words being in feature-index order so that it
# lines up with coef_ -- confirm.
sentiment_words = pd.DataFrame(
    {'word': all_words, 'sentiment_coef': sentiment_model.coef_[0]},
    columns=['word', 'sentiment_coef']
)

In [None]:
# Show the full model's coefficients for the words the simple model rated
# non-negative. print() with one argument runs on both Python 2 and 3.
print(sentiment_words[sentiment_words['word'].isin(simple_words_pos['word'])])

# Comparing models

In [None]:
# accuracy of the sentiment model on training data
train_data['sentiment'] = train_data['rating'].apply(lambda rating : +1 if rating > 3 else -1)
train_data['prediction'] = my_predict(sentiment_model, train_matrix)

total_n = len(train_data)
# Count the matches with a vectorised sum of the boolean comparison
correct_n = int((train_data['sentiment'] == train_data['prediction']).sum())
# print() with a single argument works on both Python 2 and Python 3
print(correct_n / float(total_n))

In [None]:
# accuracy of the simple model on training data
train_data['prediction'] = my_predict(simple_model, train_matrix_word_subset)

total_n = len(train_data)
# Count the matches with a vectorised sum of the boolean comparison
correct_n = int((train_data['sentiment'] == train_data['prediction']).sum())
# print() with a single argument works on both Python 2 and Python 3
print(correct_n / float(total_n))

In [None]:
# accuracy of the sentiment model on test data
test_data['sentiment'] = test_data['rating'].apply(lambda rating : +1 if rating > 3 else -1)
test_data['prediction'] = my_predict(sentiment_model, test_matrix)

total_n = len(test_data)
# Count the matches with a vectorised sum of the boolean comparison
correct_n = int((test_data['sentiment'] == test_data['prediction']).sum())
# print() with a single argument works on both Python 2 and Python 3
print(correct_n / float(total_n))

In [None]:
# accuracy of the simple model on test data
test_data['sentiment'] = test_data['rating'].apply(lambda rating : +1 if rating > 3 else -1)
test_data['prediction'] = my_predict(simple_model, test_matrix_word_subset)

total_n = len(test_data)
# Count the matches with a vectorised sum of the boolean comparison
correct_n = int((test_data['sentiment'] == test_data['prediction']).sum())
# print() with a single argument works on both Python 2 and Python 3
print(correct_n / float(total_n))

# Majority Class classifier

In [None]:
# Share of positive (+1) reviews in each split, i.e. the accuracy obtained by
# always predicting the positive class. The count uses the vectorised
# Series.sum() instead of Python's element-by-element sum(), and print() is
# called with one argument so it runs on both Python 2 and Python 3.
print(round(float((train_data['sentiment'] == 1).sum()) / len(train_data), 2))
print(round(float((test_data['sentiment'] == 1).sum()) / len(test_data), 2))