In [1]:
import pandas as pd

In [2]:
dtype_dict = {'name' : str, 'review' : str, 'rating' : int}

In [3]:
products = pd.read_csv('amazon_baby.csv', dtype=dtype_dict)
len(products)

183531

In [6]:
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)

In [7]:
products = products.fillna({'review':''}) 
products['review_clean'] = products['review'].apply(remove_punctuation)

In [8]:
import math
import numpy as np
products = products[products['rating'] != 3]
products = products.reset_index(drop=True)

In [9]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [10]:
import json

with open('module-2-assignment-test-idx.json') as json_file:
    test_indices = json.load(json_file)

with open('module-2-assignment-train-idx.json') as json_file:
    train_indices = json.load(json_file)

In [11]:
len(test_indices)

33336

In [12]:
len(train_indices)

133416

In [13]:
test_data = products.ix[test_indices]
train_data = products.ix[train_indices]

In [14]:
len(test_data)

33336

In [15]:
len(train_data)

133416

In [16]:
train_data

Unnamed: 0,name,review,rating,review_clean,sentiment
0,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
1,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
2,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
3,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
4,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1
5,A Tale of Baby's Days with Peter Rabbit,"Lovely book, it's bound tightly so you may not...",4,Lovely book its bound tightly so you may not b...,1
6,"Baby Tracker&reg; - Daily Childcare Journal, S...",Perfect for new parents. We were able to keep ...,5,Perfect for new parents We were able to keep t...,1
7,"Baby Tracker&reg; - Daily Childcare Journal, S...",A friend of mine pinned this product on Pinter...,5,A friend of mine pinned this product on Pinter...,1
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",This book is perfect! I'm a first time new mo...,5,This book is perfect Im a first time new mom ...,1
11,"Baby Tracker&reg; - Daily Childcare Journal, S...",I originally just gave the nanny a pad of pape...,4,I originally just gave the nanny a pad of pape...,1


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

In [18]:
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])

In [19]:
train_matrix

<133416x121712 sparse matrix of type '<type 'numpy.int64'>'
	with 7326618 stored elements in Compressed Sparse Row format>

In [20]:
# Train a sentiment classifier with logistic regression
from sklearn import linear_model
sentiment_model = linear_model.LogisticRegression(n_jobs=-1)
sentiment_model.fit(train_matrix, train_data['sentiment'])

###
### Wrong answer here, wrong number of coefficients
###
print("Number of coefficients in the sentiment model %f" % len(sentiment_model.coef_[0]))
print("### Number of positive coefficients is ", len([x for x in sentiment_model.coef_[0] if x >= 0]))

Number of coefficients in the sentiment model 121712.000000
('### Number of positive coefficients is ', 85913)


In [210]:
sample_test_data = test_data[10:13]
sample_test_data

Unnamed: 0,name,review,rating,review_clean,sentiment
10,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1
11,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend. The dec...,-1
12,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1


In [211]:
sample_test_data.iloc[0]['review']

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [212]:
sample_test_data.iloc[1]['review']

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

In [213]:
sample_test_data.iloc[2]['review']

"Was so excited to get this product for my baby girls bedroom!  When I got it the back is NOT STICKY at all!  Every time I walked into the bedroom I was picking up pieces off of the floor!  Very very frustrating!  Ended up having to super glue it to the wall...very disappointing.  I wouldn't waste the time or money on it."

In [214]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)

[  4.95213125  -2.99953526 -10.59510239]


In [215]:
def my_predict(model, test_matrix):
    return [+1 if s >= 0 else -1 for s in model.decision_function(test_matrix)]

In [216]:
my_predict(sentiment_model, sample_test_matrix)

[1, -1, -1]

In [217]:
sentiment_model.predict(sample_test_matrix)

array([ 1, -1, -1])

In [218]:
from numpy import exp
logit = lambda x: 1.0/(1+exp(-x))

In [219]:
def my_prob_predict(model, test_matrix):
    return [logit(s) for s in model.decision_function(test_matrix)]

In [220]:
[x for x in my_prob_predict(sentiment_model, sample_test_matrix)]

[0.99298128209115466, 0.047446872840577192, 2.5037710952405042e-05]

In [221]:
sentiment_model.predict_proba(sample_test_matrix)

array([[  7.01871791e-03,   9.92981282e-01],
       [  9.52553127e-01,   4.74468728e-02],
       [  9.99974962e-01,   2.50377110e-05]])

In [222]:
[round(x, 3) for x in my_prob_predict(sentiment_model, sample_test_matrix)]

[0.99299999999999999, 0.047, 0.0]

In [223]:
test_matrix = vectorizer.transform(test_data['review_clean'])

In [224]:
test_data['predictions'] = sentiment_model.decision_function(test_matrix)

In [225]:
###
### Wrong answer here, wrong products...
###

top_20 = test_data.sort_values(by='predictions', ascending=False).head(20)
top_20

Unnamed: 0,name,review,rating,review_clean,sentiment,predictions
15732,Baby Einstein Around The World Discovery Center,I am so HAPPY I brought this item for my 7 mon...,5,I am so HAPPY I brought this item for my 7 mon...,1,49.334376
25554,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall (38in) and thin...,5,I bought this seat for my tall (38in) and thin...,1,47.56899
24286,"Britax 2012 B-Agile Stroller, Red",[I got this stroller for my daughter prior to ...,4,[I got this stroller for my daughter prior to ...,1,45.454652
9555,Evenflo X Sport Plus Convenience Stroller - Ch...,After seeing this in Parent's Magazine and rea...,5,After seeing this in Parent's Magazine and rea...,1,45.069776
18112,"Infantino Wrap and Tie Baby Carrier, Black Blu...",I bought this carrier when my daughter was abo...,5,I bought this carrier when my daughter was abo...,1,44.744202
17558,Freemie Hands-Free Concealable Breast Pump Col...,I absolutely love this product. I work as a C...,5,I absolutely love this product. I work as a C...,1,44.122034
21531,Roan Rocco Classic Pram Stroller 2-in-1 with B...,Great Pram Rocco!!!!!!I bought this pram from ...,5,Great Pram Rocco!!!!!!I bought this pram from ...,1,43.600869
30634,Graco FastAction Fold Jogger Click Connect Str...,Graco's FastAction Jogging Stroller definitely...,5,Graco's FastAction Jogging Stroller definitely...,1,42.283704
24899,Graco Pack 'n Play Element Playard - Flint,My husband and I assembled this Pack n' Play l...,4,My husband and I assembled this Pack n' Play l...,1,41.413444
14482,"Simple Wishes Hands-Free Breastpump Bra, Pink,...","I just tried this hands free breastpump bra, a...",5,"I just tried this hands free breastpump bra, a...",1,40.793042


In [226]:
bottom_20 = test_data.sort_values(by='predictions', ascending=True).head(20)
bottom_20

Unnamed: 0,name,review,rating,review_clean,sentiment,predictions
28184,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,"This is my second video monitoring system, the...",-1,-34.220086
21700,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1,-34.151447
17069,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note: we never installed batteries in these un...,-1,-32.732747
2931,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with Fisher-Price pro...,-1,-32.507105
8818,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1,-29.839273
13939,Safety 1st Exchangeable Tip 3 in 1 Thermometer,I thought it sounded great to have different t...,1,I thought it sounded great to have different t...,-1,-25.594672
1810,Cosco Alpha Omega Elite Convertible Car Seat,I bought this car seat after both seeing the ...,1,I bought this car seat after both seeing the ...,-1,-24.36869
9655,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1,-23.697124
20594,Motorola Digital Video Baby Monitor with Room ...,DO NOT BUY THIS BABY MONITOR!I purchased this ...,1,DO NOT BUY THIS BABY MONITOR!I purchased this ...,-1,-22.344792
13751,"Peg-Perego Tatamia High Chair, White Latte",I can see why there are so many good reviews o...,2,I can see why there are so many good reviews o...,-1,-22.285908


# Compute accuracy of the classifier

In [227]:
total_n = len(test_data)
tp = len(test_data[(test_data['sentiment'] == 1) & (test_data['predictions'] > 0.)])
tn = len(test_data[(test_data['sentiment'] == -1) & (test_data['predictions'] < 0.)])
accuracy = (tp + tn) / total_n
accuracy

0.9322354211663066

# Learn another classifier with fewer words

In [228]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [229]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

# Train a logistic regression model on a subset of data

In [230]:
simple_model = linear_model.LogisticRegression(n_jobs=-1)
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [231]:
simple_model.coef_

array([[ 1.35643965,  0.94351803,  1.18326806,  0.08280298,  0.51454381,
         1.50690184,  1.677632  ,  0.53008269,  0.19308984,  0.05647776,
        -1.64216244, -0.2016549 , -0.50967924, -2.02685175, -2.35063483,
        -0.62362975, -0.32671385, -0.86948791, -0.36323413, -2.11081145]])

In [232]:
# number of positive coefficients
words_evaluated = pd.DataFrame()
words_evaluated['word'] = significant_words
words_evaluated['coef'] = simple_model.coef_[0]

In [233]:
print("There are %d evaluated positive words in the simple model" % len(words_evaluated[words_evaluated['coef'] >= 0]))
words_evaluated[words_evaluated['coef'] >= 0].sort_values(by='coef', ascending=False)

There are 10 evaluated positive words in the simple model


Unnamed: 0,word,coef
6,loves,1.677632
5,perfect,1.506902
0,love,1.35644
2,easy,1.183268
1,great,0.943518
7,well,0.530083
4,little,0.514544
8,able,0.19309
3,old,0.082803
9,car,0.056478


In [234]:
positive_words = words_evaluated[words_evaluated['coef'] >= 0].sort_values(by='coef', ascending=False)

all_words_evaluated = pd.DataFrame()
all_words_evaluated['word'] = (vectorizer.vocabulary_.keys())
all_words_evaluated['sentiment_coef'] = sentiment_model.coef_[0]

In [235]:
###
### Wrong answer here, positive worlds in the simple model should also be positive in sentimel_model
###

all_words_evaluated[all_words_evaluated['word'].isin(positive_words['word'])]

Unnamed: 0,word,sentiment_coef
2733,able,0.355083
3457,loves,0.055141
4280,love,0.019117
5029,car,-0.729822
10471,old,0.0043
28894,great,0.113521
30424,perfect,-0.001877
36989,little,-0.003206
41222,easy,0.018804
47268,well,-0.519584


# Comparing models

In [236]:
train_data['sentiment'] = train_data['rating'].apply(lambda rating : +1 if rating > 3 else -1)
train_data['predictions'] = sentiment_model.decision_function(train_matrix)

total_n = len(train_data)
tp = len(train_data[(train_data['sentiment'] == 1) & (train_data['predictions'] > 0.)])
tn = len(train_data[(train_data['sentiment'] == -1) & (train_data['predictions'] < 0.)])
accuracy = (tp + tn) / total_n
accuracy

0.9622009354200396

In [237]:
train_data['sentiment'] = train_data['rating'].apply(lambda rating : +1 if rating > 3 else -1)
train_data['predictions'] = simple_model.decision_function(train_matrix_word_subset)

total_n = len(train_data)
tp = len(train_data[(train_data['sentiment'] == 1) & (train_data['predictions'] > 0.)])
tn = len(train_data[(train_data['sentiment'] == -1) & (train_data['predictions'] < 0.)])
accuracy = (tp + tn) / total_n
accuracy

0.8672273190621814

In [238]:
###
### Wrong answer here, sentiment model should still have higher accuracy
###

test_data['sentiment'] = test_data['rating'].apply(lambda rating : +1 if rating > 3 else -1)
test_data['predictions'] = sentiment_model.decision_function(test_matrix)

total_n = len(test_data)
tp = len(test_data[(train_data['sentiment'] == 1) & (test_data['predictions'] > 0.)])
tn = len(test_data[(train_data['sentiment'] == -1) & (test_data['predictions'] < 0.)])
accuracy = (tp + tn) / total_n
accuracy



0.7310715142788576

In [239]:
test_data['sentiment'] = test_data['rating'].apply(lambda rating : +1 if rating > 3 else -1)
test_data['predictions'] = simple_model.decision_function(test_matrix_word_subset)

total_n = len(test_data)
tp = len(test_data[(train_data['sentiment'] == 1) & (test_data['predictions'] > 0.)])
tn = len(test_data[(train_data['sentiment'] == -1) & (test_data['predictions'] < 0.)])
accuracy = (tp + tn) / total_n
accuracy



0.7826073914086873

# Majority Class classifier

In [240]:
from sklearn import dummy
dummy_model = dummy.DummyClassifier()
dummy_model.fit(train_matrix, train_data['sentiment'])

DummyClassifier(constant=None, random_state=None, strategy='stratified')

In [241]:
###
### Wrong answer here
###

test_data['sentiment'] = test_data['rating'].apply(lambda rating : +1 if rating > 3 else -1)
test_data['predictions'] = dummy_model.predict(test_matrix)

total_n = len(test_data)
tp = len(test_data[(train_data['sentiment'] == 1) & (test_data['predictions'] > 0.)])
tn = len(test_data[(train_data['sentiment'] == -1) & (test_data['predictions'] < 0.)])
accuracy = (tp + tn) / total_n
accuracy



0.7155627549796016