In [2]:
from __future__ import division
import math
import graphlab
import string

In [4]:
# Load the review data from the on-disk SFrame directory 'amazon_baby.gl/'.
products = graphlab.SFrame('amazon_baby.gl/')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1479937283.log


This non-commercial license of GraphLab Create for academic use is assigned to <redacted-email> and will expire on November 22, 2017.


In [5]:
# Preview the loaded SFrame (the recorded output shows columns name, review, rating).
products

name,review,rating
Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3.0
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5.0
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5.0
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5.0
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0


In [6]:
#Remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Characters listed in ``string.punctuation`` are dropped; all other
    characters (letters, digits, whitespace, ...) are kept in their
    original order. An empty string yields an empty string.

    The earlier implementation called ``text.translate(None, ...)``, a form
    that only exists on Python 2 byte strings and raises ``TypeError`` for
    unicode text (and on Python 3). Filtering character by character gives
    the same result for byte strings and also accepts unicode input.
    """
    import string  # local import keeps the function self-contained
    return ''.join(ch for ch in text if ch not in string.punctuation)


In [11]:
# Strip punctuation from every review, then store the result of
# graphlab.text_analytics.count_words in a new 'word_count' column.
# This column is the feature used by the classifiers trained below.
review_without_punctuation = products['review'].apply(remove_punctuation)
products['word_count'] = graphlab.text_analytics.count_words(review_without_punctuation) 

In [18]:
#Drop the reviews whose rating is exactly 3: they are treated as neutral and
#take no part in the positive/negative classification below.
products = products[products['rating']!=3]

In [23]:
#Create a new 'sentiment' column labelling each review:
#+1 (positive) when rating > 3, otherwise -1 (negative).
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [27]:
#Splitting data into training and test sets.
#.8 is the fraction passed to random_split for the first returned set;
#the fixed seed keeps the split reproducible between runs.
train_data, test_data = products.random_split(.8,seed=1)

In [32]:
#Training the logistic classifier on the full word_count dictionaries,
#with 'sentiment' (+1 / -1) as the target.
#validation_set = None: presumably disables GraphLab's automatic validation
#hold-out so all of train_data is used for fitting -- confirm against the API docs.
sentiment_model = graphlab.logistic_classifier.create(train_data,
                                                      target = 'sentiment', 
                                                      features = ['word_count'],
                                                     validation_set = None)


In [38]:
# Keep the learned coefficients table for later inspection; according to the
# recorded output it has the columns name, index, class, value and stderr.
weights = sentiment_model.coefficients
weights.column_names()


name,index,class,value,stderr
(intercept),,1,1.30337080544,
word_count,recommend,1,0.303815600015,
word_count,moist,1,0.671556821414,
word_count,osocozy,1,0.426326525702,
word_count,keps,1,7.3963370872,
word_count,leak,1,-0.24658014554,
word_count,holder,1,-0.0300523581013,
word_count,was,1,-0.0530004786379,
word_count,now,1,0.0383787882079,
word_count,wipe,1,0.165506649337,


In [48]:
#Classifying into positive and negative wieghts from the coefficients 

num_positive_weights = (weights['value']>1).sum()
num_negative_weights = (weights['value']<1).sum()

print "Number of positive weights: %s " % num_positive_weights
print "Number of negative weights: %s " % num_negative_weights

Number of positive weights: 37407 
Number of negative weights: 84306 


In [49]:
#Making predictions with the logistic classifier.
#Rows 10, 11 and 12 of test_data serve as a small worked example.
sample_test_data = test_data[10:13]
print sample_test_data['rating']
sample_test_data

[5.0, 2.0, 1.0]


name,review,rating,word_count,sentiment
Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in ...,5.0,"{'and': 2, 'all': 1, 'love': 1, 'purchased': ...",1
Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The decals ...,2.0,"{'and': 1, 'would': 2, 'almost': 1, 'decals' ...",-1
New Style Trailing Cherry Blossom Tree Decal ...,Was so excited to get this product for my baby ...,1.0,"{'all': 1, 'money': 1, 'into': 1, 'back': 1, ...",-1


In [57]:
# Show the full text of the first two sample reviews.
print sample_test_data['review'][0]
print sample_test_data['review'][1]

Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.
Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.


In [72]:
#Logistic regression model
#score_i = w^T h(x_i)
#output_type = 'margin' asks predict for this raw score instead of a class label.

scores = sentiment_model.predict(sample_test_data, output_type = 'margin')
print scores

[6.734619727059352, -5.734130996760369, -14.66846040446858]


In [62]:
# Turn the margin scores into class labels: +1 for a positive score, -1 otherwise.
# The lambda has to run over `scores`; applied to the SFrame `sample_test_data`
# it would receive whole rows (dicts) rather than numbers, so the `> 0`
# comparison would not be testing the score at all.
yhat = scores.apply(lambda score : +1 if score > 0 else -1)

In [84]:
#function to generate predictions based on the scores

def class_predictions(scores):
    """Map every score to a class label.

    A strictly positive score becomes +1; anything else (including a score
    of exactly 0) becomes -1. The labels are returned as a list in the same
    order as ``scores``.
    """
    return [1 if value > 0 else -1 for value in scores]


# Labels derived by hand from the margin scores of the three sample reviews.
print class_predictions(scores)

[1, -1, -1]


In [77]:
#Making predictions using GraphLab Create.
#Called without an output_type, predict is expected to return class labels
#that can be compared with the hand-derived ones.

print "Class predictions according to GraphLab Create:" 
print sentiment_model.predict(sample_test_data)

Class predictions according to GraphLab Create:
[1, -1, -1]


In [97]:
#creating a function to calculate the probability predictions from the scores

#P(yi=+1|xi,w) = 1 / (1 + exp(-w^T h(xi)))
 
    
def prob_predictions(scores):
    """Convert raw scores into probabilities of the positive class.

    Applies the sigmoid ``1 / (1 + exp(-score))`` to each entry of
    ``scores`` and returns the results as a list, in the same order.

    The sigmoid is evaluated in a numerically stable way. For negative
    scores the algebraically equal form ``exp(score) / (1 + exp(score))`` is
    used, so ``math.exp`` is never called with a large positive argument,
    which would raise ``OverflowError`` (for example with a score of -1000).
    """
    prob_preds = []
    for score in scores:
        if score >= 0:
            prob_pred = 1.0 / (1.0 + math.exp(-score))
        else:
            # exp of a very negative number underflows to 0.0 without raising
            exp_score = math.exp(score)
            prob_pred = exp_score / (1.0 + exp_score)
        prob_preds.append(prob_pred)
    return prob_preds

# Probabilities derived by hand from the margin scores of the three sample reviews.
print prob_predictions(scores)

[0.9988123848377198, 0.0032232681818003235, 4.261557996655163e-07]


In [98]:
# Ask the model for probabilities directly, to compare with the hand-computed values.
print "Probability predictions according to Graphlab create"
print sentiment_model.predict(sample_test_data, output_type = "probability")

Probability predictions according to Graphlab create
[0.9988123848377198, 0.003223268181800325, 4.2615579966551607e-07]


In [100]:
#Applying this to the entire test dataset: the model's probability output for
#every test review is stored in a new 'prob_pred' column.

test_data['prob_pred'] = sentiment_model.predict(test_data, output_type = 'probability')
print test_data

+-------------------------------+-------------------------------+--------+
|              name             |             review            | rating |
+-------------------------------+-------------------------------+--------+
| Baby Tracker&reg; - Daily ... | This has been an easy way ... |  4.0   |
| Baby Tracker&reg; - Daily ... | I love this journal and ou... |  4.0   |
| Nature's Lullabies First Y... | I love this little calende... |  5.0   |
| Nature's Lullabies Second ... | I had a hard time finding ... |  5.0   |
|  Lamaze Peekaboo, I Love You  | One of baby's first and fa... |  4.0   |
|  Lamaze Peekaboo, I Love You  | My son loved this book as ... |  5.0   |
|  Lamaze Peekaboo, I Love You  | Our baby loves this book &... |  5.0   |
| SoftPlay Giggle Jiggle Fun... | This bear is absolutely ad... |  2.0   |
| SoftPlay Peek-A-Boo Where'... | I bought two for recent ba... |  5.0   |
| Baby's First Year Undated ... | I searched high and low fo... |  5.0   |
+------------------------

In [107]:
#Displaying the 20 test reviews with the highest, and then the 20 with the
#lowest, predicted probability of being positive.
print "The reviews with the highest probability of positive comments are"
test_data['name','prob_pred'].topk('prob_pred', k = 20).print_rows(20)
print "The reviews with the lowest probability of positive comments are"
# reverse = True makes topk return the smallest values instead of the largest
test_data['name','prob_pred'].topk('prob_pred', k = 20,reverse = True ).print_rows(20)

The reviews with the highest probability of positive comments are
+-------------------------------+-----------+
|              name             | prob_pred |
+-------------------------------+-----------+
|   Munchkin Mozart Magic Cube  |    1.0    |
|  BABYBJORN Potty Chair - Red  |    1.0    |
| Safety 1st Tot-Lok Four Lo... |    1.0    |
| Summer Infant Complete Nur... |    1.0    |
| Leachco Snoogle Total Body... |    1.0    |
| HALO SleepSack Micro-Fleec... |    1.0    |
| Peg Perego Primo Viaggio C... |    1.0    |
|   Capri Stroller - Red Tech   |    1.0    |
| Wizard Convertible Car Sea... |    1.0    |
| Britax Marathon Convertibl... |    1.0    |
| Britax Decathlon Convertib... |    1.0    |
| North States Supergate Pre... |    1.0    |
|  Fisher-Price Deluxe Jumperoo |    1.0    |
| Lilly Gold Sit 'n' Stroll ... |    1.0    |
| Fisher-Price Rainforest Me... |    1.0    |
| JP Lizzy Chocolate Ice Cla... |    1.0    |
| Cloud b Sound Machine Soot... |    1.0    |
| Shermag Glid

In [118]:
#Computing the accuracy of the classifier
"""Step 1: Use the trained model to compute class predictions (Hint: Use the predict method)
Step 2: Count the number of data points when the predicted class labels match the ground truth labels (called true_labels below).
Step 3: Divide the total number of correct predictions by the total number of data points in the dataset."""



def prediction_accuracy(model, data, true_labels):
    """Return the fraction of rows in ``data`` that ``model`` labels correctly.

    Parameters
    ----------
    model : object exposing ``predict(data)``, which must return one class
        label per row as an array type supporting element-wise ``==`` and
        ``.sum()`` (e.g. the SArray returned by a GraphLab Create classifier).
    data : the dataset to classify; it is only passed to ``model.predict``
        and to ``len``.
    true_labels : ground-truth labels aligned row by row with ``data``.

    Returns
    -------
    float
        Number of predictions equal to the true label, divided by ``len(data)``.

    Raises
    ------
    ZeroDivisionError
        If ``data`` is empty.
    """
    predictions = model.predict(data)
    # Count the matches with the array's own vectorised sum instead of
    # walking the elements one by one through the builtin ``sum``.
    num_correct = (predictions == true_labels).sum()
    # Explicit float division: the result must not silently truncate to 0 on
    # Python 2 when ``from __future__ import division`` is not in effect.
    return float(num_correct) / len(data)


#Computing the classification accuracy of the sentiment model on the test data
#(the data the model was not trained on), using the 'sentiment' column as ground truth.
print "Prediction accuracy =" ,prediction_accuracy(sentiment_model, test_data, test_data['sentiment'])


Prediction accuracy = 0.914536837053


In [119]:
# Building a Logistic classifier to predict reviews with significant words

#Hand-picked vocabulary of 20 words; the simpler classifier trained below
#only sees the counts of these words.
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [122]:
#Keep only the significant_words entries of each review's word_count dictionary.
#SArray.dict_trim_by_keys with exclude = False retains the listed keys.

for dataset in (train_data, test_data):
    dataset['word_count_subset'] = dataset['word_count'].dict_trim_by_keys(
        significant_words, exclude = False)

In [126]:
# Compare the full word-count dictionary of the first training review with its
# trimmed version to check that only significant words remain.
print train_data[0]['word_count']
print " "
print "Showing the word count of just the significant words in the reviews"
print train_data[0]['word_count_subset']

{'and': 3, 'love': 1, 'it': 3, 'highly': 1, 'osocozy': 1, 'bags': 1, 'holder': 1, 'leak': 1, 'moist': 1, 'does': 1, 'recommend': 1, 'was': 1, 'wipes': 1, 'early': 1, 'not': 2, 'now': 1, 'disappointed': 1, 'wipe': 1, 'keps': 1, 'wise': 1, 'i': 1, 'planet': 1, 'my': 2, 'came': 1}
 
Showing the word count of just the significant words in the reviews
{'love': 1, 'disappointed': 1}


In [129]:
#Training a second logistic classifier that uses only the significant-word
#counts ('word_count_subset') as its feature; target and validation settings
#are the same as for sentiment_model.

new_model = graphlab.logistic_classifier.create(train_data,
                                               target = 'sentiment',
                                               features =['word_count_subset'],
                                               validation_set = None)

In [130]:
# Display the summary of the simpler model.
new_model

Class                          : LogisticClassifier

Schema
------
Number of coefficients         : 21
Number of examples             : 133416
Number of classes              : 2
Number of feature columns      : 1
Number of unpacked features    : 20

Hyperparameters
---------------
L1 penalty                     : 0.0
L2 penalty                     : 0.01

Training Summary
----------------
Solver                         : newton
Solver iterations              : 6
Solver status                  : SUCCESS: Optimal solution found.
Training time (sec)            : 0.892

Settings
--------
Log-likelihood                 : 44323.7254

Highest Positive Coefficients
-----------------------------
word_count_subset[loves]       : 1.6773
word_count_subset[perfect]     : 1.5145
word_count_subset[love]        : 1.3654
(intercept)                    : 1.2995
word_count_subset[easy]        : 1.1937

Lowest Negative Coefficients
----------------------------
word_count_subset[disappointed] : -2.3551
wor

In [135]:
#Prediction accuracy of the new (significant-words) classifier on the test data

print "Prediction accuracy of new classifier =",prediction_accuracy(new_model, test_data, test_data['sentiment'])

Prediction accuracy of new classifier = 0.869300455964


In [136]:
#Inspecting the weights (coefficients table) of the new model

new_model.coefficients

name,index,class,value,stderr
(intercept),,1,1.2995449552,0.0120888541331
word_count_subset,disappointed,1,-2.35509250061,0.0504149888557
word_count_subset,love,1,1.36543549368,0.0303546295109
word_count_subset,little,1,0.520628636025,0.0214691475665
word_count_subset,loves,1,1.67727145556,0.0482328275384
word_count_subset,product,1,-0.320555492996,0.0154311321362
word_count_subset,well,1,0.504256746398,0.021381300631
word_count_subset,great,1,0.94469126948,0.0209509926591
word_count_subset,easy,1,1.19366189833,0.029288869202
word_count_subset,work,1,-0.621700012425,0.0230330597946


In [138]:
# List the coefficients from largest to smallest; 21 rows are requested to
# match the model's 21 coefficients (20 words plus the intercept).
new_model.coefficients.sort('value', ascending = False).print_rows(num_rows = 21)

+-------------------+--------------+-------+-----------------+-----------------+
|        name       |    index     | class |      value      |      stderr     |
+-------------------+--------------+-------+-----------------+-----------------+
| word_count_subset |    loves     |   1   |  1.67727145556  | 0.0482328275384 |
| word_count_subset |   perfect    |   1   |  1.51448626703  |  0.049861952294 |
| word_count_subset |     love     |   1   |  1.36543549368  | 0.0303546295109 |
|    (intercept)    |     None     |   1   |   1.2995449552  | 0.0120888541331 |
| word_count_subset |     easy     |   1   |  1.19366189833  |  0.029288869202 |
| word_count_subset |    great     |   1   |  0.94469126948  | 0.0209509926591 |
| word_count_subset |    little    |   1   |  0.520628636025 | 0.0214691475665 |
| word_count_subset |     well     |   1   |  0.504256746398 |  0.021381300631 |
| word_count_subset |     able     |   1   |  0.191438302295 | 0.0337581955697 |
| word_count_subset |     ol

In [142]:
new_model_weights = new_model.coefficients
positive_words = new_model_weights[(new_model_weights['value'] > 0)& 
                                  (new_model_weights['name']=='word_count_subset')]['index']
print len(positive_words)
print positive_words

10
['love', 'little', 'loves', 'well', 'great', 'easy', 'able', 'perfect', 'old', 'car']


In [145]:
# Look up the same positive words in the coefficients of the full
# sentiment_model, for comparison with new_model.
weights.filter_by(positive_words, 'index')

name,index,class,value,stderr
word_count,love,1,1.43301685439,
word_count,little,1,0.674162457499,
word_count,loves,1,1.5664851757,
word_count,well,1,0.627964877567,
word_count,great,1,1.31459245039,
word_count,easy,1,1.21346937822,
word_count,able,1,0.174331272552,
word_count,perfect,1,1.75190114392,
word_count,old,1,0.0091223011367,
word_count,car,1,0.195263670618,


In [148]:
#Comparing the two models on the training data (the data both were fitted on)

print "classification accuracy of the sentiment_model on the train_data:"
print prediction_accuracy(sentiment_model, train_data, train_data['sentiment'])

print "classification accuracy of the new_model on the train_data:"
print prediction_accuracy(new_model, train_data, train_data['sentiment'])

classification accuracy of the sentiment_model on the train_data:
0.979440247047
classification accuracy of the new_model on the train_data:
0.866815074654


In [149]:
#Comparing the models on test data

print "classification accuracy of the sentiment_model on the test_data:"
print prediction_accuracy(sentiment_model, test_data, test_data['sentiment'])
print "classification accuracy of the sentiment_model on the test_data:"
print prediction_accuracy(new_model, test_data, test_data['sentiment'])

classification accuracy of the sentiment_model on the test_data:
0.914536837053
classification accuracy of the sentiment_model on the test_data:
0.869300455964


In [154]:
#Computing the accuracy for the baseline model.
#First step: count positive and negative reviews in the training data to see
#which class is the majority.
num_positive  = (train_data['sentiment'] == +1).sum()
num_negative = (train_data['sentiment'] == -1).sum()
print num_positive
print num_negative


112164
21252


In [152]:

# Same class counts on the test data: positives first, then negatives.
print (test_data['sentiment'] == +1).sum()
print (test_data['sentiment'] == -1).sum()

28095
5241


In [153]:
# The value printed is the share of +1 labels in test_data, i.e. the accuracy
# of a model that always predicts +1. The division is a true division only
# because of `from __future__ import division` in the first cell.
print "The accuracy of the baseline model is : "
print (test_data['sentiment'] == +1).sum()/len(test_data['sentiment'])


The accuracy of the baseline model is : 
0.842782577394


In [155]:
# Share of +1 labels in the training data, for comparison with the test-set figure.
print (train_data['sentiment']==+1).sum()/len(train_data)

0.840708760568
