In [1]:
import sklearn
import pandas as pd
import numpy as np

In [2]:
products = pd.read_csv('amazon_baby.csv')

In [3]:
products.keys()

Index(['name', 'review', 'rating'], dtype='object')

In [4]:
import string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation)) 

In [5]:
remove_punctuation("I'd")

'Id'

In [6]:
products = products.fillna({'review': ''})

In [7]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [8]:
products = products[products['rating'] != 3]

In [9]:
def rating_to_sentiment(rating):
    return +1 if rating > 3 else -1

products['sentiment'] = products['rating'].apply(rating_to_sentiment)

In [18]:
import json

def read_idx(idx_file):
    with open(idx_file) as fp:
        return json.loads(fp.readline())

train_idx = read_idx('module-2-assignment-train-idx.json')

test_idx = read_idx('module-2-assignment-test-idx.json')

In [77]:
train_data = products.iloc[train_idx]

test_data = products.iloc[test_idx]

In [117]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

In [118]:
from sklearn.linear_model import LogisticRegression

In [160]:
sentiment_model = LogisticRegression().fit(train_matrix, train_data['sentiment'])

In [120]:
sentiment_model.coef_.shape

cnt = 0
for i in sentiment_model.coef_[0]:
    if i > 0:
        cnt += 1
print(cnt)

87243


In [121]:
sample_test_data = test_data[10:13]

In [122]:
sample_test_data

Unnamed: 0,name,review,rating,review_clean,sentiment,score,probability,predicted_sentiment,correct
59,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1,5.601538,0.996321,1,True
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1,-3.170456,0.040293,-1,True
91,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1,-10.42328,3e-05,-1,True


In [123]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)

In [124]:
scores

array([  5.60153832,  -3.17045551, -10.42328017])

In [125]:
import math

def calc_probability(score):
    return 1 / (1 + math.exp(-score))

In [126]:
for x in scores:
    print('%f\t%f' % (x, calc_probability(x)))

5.601538	0.996321
-3.170456	0.040293
-10.423280	0.000030


In [127]:
test_data['score'] = sentiment_model.decision_function(test_matrix)

test_data['probability'] = test_data['score'].apply(calc_probability)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [85]:
test_data.sort_values('score', ascending=False)

Unnamed: 0,name,review,rating,review_clean,sentiment,score,probability
100166,"Infantino Wrap and Tie Baby Carrier, Black Blu...",I bought this carrier when my daughter was abo...,5,I bought this carrier when my daughter was abo...,1,53.813121,1.000000e+00
87017,Baby Einstein Around The World Discovery Center,I am so HAPPY I brought this item for my 7 mon...,5,I am so HAPPY I brought this item for my 7 mon...,1,52.197238,1.000000e+00
133651,"Britax 2012 B-Agile Stroller, Red",[I got this stroller for my daughter prior to ...,4,I got this stroller for my daughter prior to t...,1,48.572162,1.000000e+00
140816,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall (38in) and thin...,5,I bought this seat for my tall 38in and thin 2...,1,47.052687,1.000000e+00
137034,Graco Pack 'n Play Element Playard - Flint,My husband and I assembled this Pack n' Play l...,4,My husband and I assembled this Pack n Play la...,1,44.642744,1.000000e+00
50315,"P'Kolino Silly Soft Seating in Tias, Green",I've purchased both the P'Kolino Little Reader...,4,Ive purchased both the PKolino Little Reader C...,1,43.602838,1.000000e+00
119182,Roan Rocco Classic Pram Stroller 2-in-1 with B...,Great Pram Rocco!!!!!!I bought this pram from ...,5,Great Pram RoccoI bought this pram from Europe...,1,41.868777,1.000000e+00
180646,Mamas &amp; Papas 2014 Urbo2 Stroller - Black,After much research I purchased an Urbo2. It's...,4,After much research I purchased an Urbo2 Its e...,1,41.196018,1.000000e+00
168081,Buttons Cloth Diaper Cover - One Size - 8 Colo...,"We are big Best Bottoms fans here, but I wante...",4,We are big Best Bottoms fans here but I wanted...,1,40.519008,1.000000e+00
80155,"Simple Wishes Hands-Free Breastpump Bra, Pink,...","I just tried this hands free breastpump bra, a...",5,I just tried this hands free breastpump bra an...,1,40.453880,1.000000e+00


In [179]:
test_data.sort_values('score')

Unnamed: 0,name,review,rating,review_clean,sentiment,score,probability,predicted_sentiment,correct
16042,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with FisherPrice prod...,-1,-34.708518,8.438831e-16,-1,True
120209,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1,-34.066778,1.603194e-15,-1,True
77072,Safety 1st Exchangeable Tip 3 in 1 Thermometer,I thought it sounded great to have different t...,1,I thought it sounded great to have different t...,-1,-30.136170,8.166342e-14,-1,True
48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1,-29.948755,9.849656e-14,-1,True
155287,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,This is my second video monitoring system the ...,-1,-29.278337,1.925660e-13,-1,True
94560,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note we never installed batteries in these uni...,-1,-28.731307,3.327760e-13,-1,True
53207,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1,-24.142195,3.274747e-11,-1,True
81332,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1,-24.127949,3.321734e-11,-1,True
10677,Philips AVENT Newborn Starter Set,"It's 3am in the morning and needless to say, t...",1,Its 3am in the morning and needless to say thi...,-1,-23.079523,9.477431e-11,-1,True
113995,Motorola Digital Video Baby Monitor with Room ...,DO NOT BUY THIS BABY MONITOR!I purchased this ...,1,DO NOT BUY THIS BABY MONITORI purchased this m...,-1,-23.065220,9.613956e-11,-1,True


## Compute accuracy of the classifier

In [148]:
def get_sentiment(score):
    return +1 if score > 0 else -1

In [149]:
def calc_accuracy(predicted, real):
    assert(predicted.shape == real.shape)
    size = predicted.shape[0]
    correct_cnt = 0
    for i in range(size):
        if get_sentiment(predicted[i]) == real[i]:
            correct_cnt += 1
    return correct_cnt / size


In [150]:
predicted = sentiment_model.decision_function(test_matrix)
real = np.array(test_data['sentiment'])

calc_accuracy(predicted, real)

0.9322954163666907

## Learn another classifier with fewer words

In [151]:
significant_words = [
    'love',
    'great',
    'easy',
    'old',
    'little',
    'perfect',
    'loves', 
    'well',
    'able',
    'car',
    'broke',
    'less',
    'even',
    'waste',
    'disappointed', 
    'work',
    'product',
    'money',
    'would',
    'return'
]

In [152]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words

train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])

test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [161]:
simple_model = LogisticRegression().fit(train_matrix_word_subset, train_data['sentiment'])

In [154]:
simple_model.coef_

array([[ 1.36368976,  0.94399959,  1.19253827,  0.08551278,  0.52018576,
         1.50981248,  1.67307389,  0.50376046,  0.19090857,  0.05885467,
        -1.65157634, -0.20956286, -0.51137963, -2.03369861, -2.34829822,
        -0.62116877, -0.32055624, -0.89803074, -0.36216674, -2.10933109]])

In [155]:
weights = simple_model.coef_[0].tolist()

print('Total %d positive weights' % len(list(filter(lambda x: x > 0, weights))))

for word, weight in zip(significant_words, weights):
    print('%s\t%f' % (word, weight))

Total 10 positive weights
love	1.363690
great	0.944000
easy	1.192538
old	0.085513
little	0.520186
perfect	1.509812
loves	1.673074
well	0.503760
able	0.190909
car	0.058855
broke	-1.651576
less	-0.209563
even	-0.511380
waste	-2.033699
disappointed	-2.348298
work	-0.621169
product	-0.320556
money	-0.898031
would	-0.362167
return	-2.109331


## Comparing models

In [166]:
sentiment_acc_train = calc_accuracy(sentiment_model.decision_function(train_matrix),
                                    np.array(train_data['sentiment']))
print(sentiment_acc_train)

simple_acc_train = calc_accuracy(simple_model.decision_function(train_matrix_word_subset),
                                 np.array(train_data['sentiment']))
print(simple_acc_train)

0.9684895364873778
0.8668225700065959


In [167]:
sentiment_acc_test = calc_accuracy(sentiment_model.decision_function(test_matrix),
                                   np.array(test_data['sentiment']))
print(sentiment_acc_test)

simple_acc_test = calc_accuracy(simple_model.decision_function(test_matrix_word_subset),
                                np.array(test_data['sentiment']))
print(simple_acc_test)

0.9322954163666907
0.8693604511639069


## Baseline: Majority class prediction

In [178]:
good = 0
total = 0

for s in test_data['sentiment']:
    total += 1
    if s == 1:
        good += 1

good / total

0.8427825773938085