In [357]:
import pandas as pd
import string
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier

products = pd.read_csv('amazon_baby.csv')

In [358]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183531 entries, 0 to 183530
Data columns (total 3 columns):
name      183213 non-null object
review    182702 non-null object
rating    183531 non-null int64
dtypes: int64(1), object(2)
memory usage: 4.2+ MB


In [359]:
products.sample(5)

Unnamed: 0,name,review,rating
98097,GEENNY Musical Mobile For Boutique Airplane Av...,Very cute and easy to put together. Matches we...,4
108801,Chicco Keyfit 22 Pound Infant Car Seat And Bas...,Overall I'm a fan. The snap'n'go makes this ca...,4
53687,Medela Medium-Flow Wide Base Nipples (3-Pack/ ...,I ordered this item thinking that it would hav...,4
79489,Fisher-Price Ocean Wonders Kick and Crawl Gym,I love this product. I saw it an a store but l...,5
89165,Ergo Baby BCP03405 Performance Carrier With a ...,I finally had the freedom I needed when I got ...,5


In [360]:
def remove_punctuation(text):
    return re.sub(r'[^\w\s]','',text)

In [361]:
products = products.fillna({'review':''})
products['review_clean'] = products['review'].apply(remove_punctuation)

In [362]:
products.sample(3)

Unnamed: 0,name,review,rating,review_clean
10320,Snugli Front and Backpack Carrier,I had purchased the old Snugli back in Dec of ...,5,I had purchased the old Snugli back in Dec of ...
140126,[Award Winning] Kidsme Food Feeder (Large size),I purchased this based on the reviews and the ...,3,I purchased this based on the reviews and the ...
165218,Dreambaby Soft Potty Seat - Blue,My 2 year old took interest immediately. He sa...,5,My 2 year old took interest immediately He saw...


In [363]:
products = products[products['rating'] != 3]

In [364]:
products['sentiment'] = products['rating'].apply(lambda rating: 1 if rating > 3 else -1)

In [365]:
X = pd.DataFrame(products['review_clean'])
Y = pd.DataFrame(products['sentiment'])

In [366]:
train_data, test_data = train_test_split(products, test_size=0.20, random_state=1)

In [367]:
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
     # Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

In [368]:
train_matrix

<133401x121534 sparse matrix of type '<class 'numpy.int64'>'
	with 7325477 stored elements in Compressed Sparse Row format>

In [369]:
X_train = train_matrix

In [370]:
Y_train = train_data['sentiment']

In [371]:
sentiment_model = LogisticRegression(random_state=1)
sentiment_model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [372]:
print(sentiment_model.coef_)

[[-1.24694112e+00  8.20351005e-04 -6.80556094e-03 ...  1.04013270e-02
   8.83219867e-03 -5.89207195e-05]]


In [373]:
(sentiment_model.coef_ >= 0).sum()

85698

In [374]:
sample_test_data = test_data[10:13]
print(sample_test_data)

                                                     name  \
117165       Lassig Glam Small Messenger Diaper Bag ,navy   
30667   BOB Weather Shield for Single Revolution/Strol...   
60268                Tiny Love Sweet Island Dreams Mobile   

                                                   review  rating  \
117165  While I'm sure this bag is a wonderful diaper ...       5   
30667   This weather shield has been a great accessory...       5   
60268   And we managed to get it to attach to the Grac...       5   

                                             review_clean  sentiment  
117165  While Im sure this bag is a wonderful diaper b...          1  
30667   This weather shield has been a great accessory...          1  
60268   And we managed to get it to attach to the Grac...          1  


In [375]:
sample_test_data.iloc[0]['review']

"While I'm sure this bag is a wonderful diaper bag, I purchased it for traveling purposes.  It's just the right size and doesn't look like a traditional diaper bag at all.  Great pockets - inside and out; easy access to everything I need when I need it.  Love this bag!  I originally bought the navy and have since purchased it in brown and black."

In [376]:
sample_test_data.iloc[1]['review']

'This weather shield has been a great accessory.  It is a little cumbersome at first to put it on the stroller but after the first time it has been very easy.The shield definitely keeps my little girl warm during the windy days with out needing to tuck a blanket around her. Also, dry during the rainy ones.The full-body window provides me with the ability to see that she is buckled up safely at any time during the walk.  It enables her to see things just about as well as when the shield is off.The yellow color is definitely a color contrast to our navy colored stroller but it sure makes us visible to oncoming vehicles.Construction appears to be very good. The material reminds me of a very duty plastic tarp (not like the cheap ones you find at the big discount retailers).  I would imagine this would be repairable with a inner tube patch kit or clear adhesive caulk for the window if it ever needed it.I would definitely purchase this again.'

In [377]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)

[ 7.9568419  11.11529274  1.13462755]


In [378]:
Y_sample_predict = sentiment_model.predict(sample_test_matrix)

In [379]:
Y_sample_predict

array([1, 1, 1], dtype=int64)

In [380]:
Y_probability = sentiment_model.predict_proba(sample_test_matrix)
print(Y_probability)

[[3.50134883e-04 9.99649865e-01]
 [1.48827516e-05 9.99985117e-01]
 [2.43308114e-01 7.56691886e-01]]


In [381]:
Y_test_predict = sentiment_model.predict(test_matrix)
print(Y_test_predict)

[1 1 1 ... 1 1 1]


In [382]:
Y_test_probability = sentiment_model.predict_proba(test_matrix)
print(Y_test_probability)

[[7.90740806e-02 9.20925919e-01]
 [3.12654823e-03 9.96873452e-01]
 [2.21623964e-03 9.97783760e-01]
 ...
 [1.21906461e-04 9.99878094e-01]
 [1.31562302e-07 9.99999868e-01]
 [1.17852999e-02 9.88214700e-01]]


In [383]:
positive = pd.DataFrame(Y_test_probability).sort_values(by=1, ascending=False)

In [384]:
negative = pd.DataFrame(Y_test_probability).sort_values(by=0, ascending=False)

In [385]:
accuracy = accuracy_score(test_data['sentiment'], Y_test_predict)
print(accuracy)

0.9317261851218854


In [386]:
Y_train_predict = sentiment_model.predict(train_matrix)

In [387]:
accuracy_train = accuracy_score(train_data['sentiment'], Y_train_predict)
print(accuracy_train)

0.9683660542274796


In [388]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [389]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [390]:
simple_model = LogisticRegression(random_state=0)
simple_model.fit(train_matrix_word_subset, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [391]:
simple_model_coef_table = pd.DataFrame({'word':significant_words,
                                         'coefficient':simple_model.coef_.flatten()})

In [392]:
simple_model_coef_table.sort_values(by='coefficient', ascending=False)

Unnamed: 0,word,coefficient
6,loves,1.730101
5,perfect,1.520583
0,love,1.358388
2,easy,1.172422
1,great,0.953138
7,well,0.516279
4,little,0.48667
8,able,0.208226
3,old,0.092202
9,car,0.062287


In [393]:
Y_test_predict_simple = simple_model.predict(test_matrix_word_subset)

In [394]:
accuracy_simple_test = accuracy_score(test_data['sentiment'], Y_test_predict_simple)
print(accuracy_simple_test)

0.8672003837965878


In [395]:
Y_train_predict_simple = simple_model.predict(train_matrix_word_subset)

In [396]:
accuracy_simple_train = accuracy_score(train_data['sentiment'], Y_train_predict_simple)
print(accuracy_simple_train)

0.867114939168372


In [397]:
base_model = DummyClassifier()
base_model.fit(train_matrix, Y_train)

DummyClassifier(constant=None, random_state=None, strategy='stratified')

In [398]:
Y_base_model_predict = base_model.predict(test_matrix)

In [399]:
accuracy_base_model = accuracy_score(test_data['sentiment'], Y_base_model_predict)
print(accuracy_base_model)

0.7355701478216545


In [400]:
test_data.iloc[positive[0:21].index]


Unnamed: 0,name,review,rating,review_clean,sentiment
179871,"Thirsties Diaper Cover with Hook and Loop, Aqu...",The Thirsties are really an awesome concept th...,5,The Thirsties are really an awesome concept th...,1
125408,"UPPAbaby G-Luxe Stroller, Jake/Black",UPDATE: 2.5 years in and we still love it. We...,5,UPDATE 25 years in and we still love it We ha...,1
60298,"Ju-Ju-Be Be Right Back Backpack Diaper Bag, Bl...",This review is going to compare 3 Ju-Ju-Be bag...,5,This review is going to compare 3 JuJuBe bags ...,1
162687,"Joovy Caboose Too Rear Seat, Greenie",We are thrilled with this rear seat. This litt...,5,We are thrilled with this rear seat This littl...,1
86800,"Baby Jogger 2010 City Mini Single Stroller, Bl...",i love this stroller. It took 4 months researc...,5,i love this stroller It took 4 months research...,1
66059,"Evenflo 6 Pack Classic Glass Bottle, 4-Ounce",It's always fun to write a review on those pro...,5,Its always fun to write a review on those prod...,1
73725,"Chicco Cortina Keyfit 30 Travel System, Miro",UPDATE 11/20/13 - I went ahead and used a tiny...,4,UPDATE 112013 I went ahead and used a tiny bi...,1
93170,"Contours Options Tandem II Stroller, Tangerine","We have 7 month old twins, and have used it si...",5,We have 7 month old twins and have used it sin...,1
172946,Spectra Baby USA S2 Hospital Grade Double/sing...,Long Review but worth the read for those who a...,5,Long Review but worth the read for those who a...,1
111120,Gocrib Portable Baby Travel Crib and Play Yard,My husband says one should be skeptical of 5 s...,5,My husband says one should be skeptical of 5 s...,1


In [401]:
positive[0:20].index

Int64Index([31258, 17753, 27612, 33311, 21595,  9998, 26229, 28340, 20948,
            28494, 15047, 20217,  4300, 25946, 29824, 24375, 18048, 26300,
             7028, 11354],
           dtype='int64')

In [402]:
test_data.iloc[negative[0:21].index]

Unnamed: 0,name,review,rating,review_clean,sentiment
120707,The European NANNY Baby Movement Monitor - EU ...,"The previous reviewers laud the ""piece of mind...",1,The previous reviewers laud the piece of mind ...,-1
120219,Levana Safe N'See Digital Video Baby Monitor w...,I have NEVER written a review before for anyth...,1,I have NEVER written a review before for anyth...,-1
89902,"Peg-Perego Aria Twin Stroller, Java",I am so incredibly disappointed with the strol...,1,I am so incredibly disappointed with the strol...,-1
121755,The First Years Home and Away Portable Video M...,"With modern technology, I really can't believe...",1,With modern technology I really cant believe w...,-1
66359,Levana BABYVIEW20 Interference Free Digital Wi...,where do i even begin? this baby monitor is no...,1,where do i even begin this baby monitor is not...,-1
134999,Infant Optics DXR-5 2.4 GHz Digital Video Baby...,Let me begin with the fact that the monitor wo...,1,Let me begin with the fact that the monitor wo...,-1
89904,"Peg-Perego Aria Twin Stroller, Java",ahhhh where do I begin. I had such high hopes...,1,ahhhh where do I begin I had such high hopes ...,-1
31741,"Regalo My Cot Portable Bed, Royal Blue",If I could give this product zero stars I woul...,1,If I could give this product zero stars I woul...,-1
3746,Playtex Diaper Genie - First Refill Included,"Prior to parenthood, I had heard several paren...",1,Prior to parenthood I had heard several parent...,-1
172090,Belkin WeMo Wi-Fi Baby Monitor for Apple iPhon...,I read so many reviews saying the Belkin WiFi ...,2,I read so many reviews saying the Belkin WiFi ...,-1


In [403]:
sentiment_model_coef_table = pd.DataFrame({'coefficient':sentiment_model.coef_.flatten()})

In [404]:
sentiment_model_coef_table

Unnamed: 0,coefficient
0,-1.246941e+00
1,8.203510e-04
2,-6.805561e-03
3,3.308026e-03
4,4.464479e-05
5,1.448229e-06
6,3.204435e-03
7,2.434166e-01
8,-1.641002e-03
9,2.291315e-02
