In [75]:
from __future__ import division
import graphlab
import math
import string

## Load products

In [76]:
products = graphlab.SFrame('amazon_baby.sframe/')

In [77]:
products

name,review,rating
Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3.0
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5.0
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5.0
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5.0
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0


## Build the word count vector for each review

In [78]:
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation) 

review_without_punctuation = products['review'].apply(remove_punctuation)
products['word_count'] = graphlab.text_analytics.count_words(review_without_punctuation)

In [79]:
products[269]

{'name': 'The First Years Massaging Action Teether',
 'rating': 5.0,
 'review': 'A favorite in our house!',
 'word_count': {'a': 1L, 'favorite': 1L, 'house': 1L, 'in': 1L, 'our': 1L}}

## Sentiment extraction

In [80]:
products = products[products['rating'] != 3]
len(products)

166752

In [81]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)
products

name,review,rating,word_count,sentiment
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0,"{'and': 3L, 'love': 1L, 'it': 3L, 'highly': 1L, ...",1
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0,"{'and': 2L, 'quilt': 1L, 'it': 1L, 'comfortable': ...",1
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5.0,"{'and': 3L, 'ingenious': 1L, 'love': 2L, 'is': ...",1
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5.0,"{'and': 2L, 'all': 2L, 'help': 1L, 'cried': 1L, ...",1
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5.0,"{'and': 2L, 'cute': 1L, 'help': 2L, 'habit': 1L, ...",1
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4.0,"{'shop': 1L, 'be': 1L, 'is': 1L, 'bound': 1L, ...",1
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5.0,"{'and': 2L, 'all': 1L, 'right': 1L, 'able': 1L, ...",1
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5.0,"{'and': 1L, 'fantastic': 1L, 'help': 1L, 'give': ...",1
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0,"{'all': 1L, 'standarad': 1L, 'another': 1L, ...",1
"Baby Tracker&reg; - Daily Childcare Journal, ...",I love this journal and our nanny uses it ...,4.0,"{'all': 2L, 'nannys': 1L, 'just': 1L, 'sleep': 2L, ...",1


## Split into training and test sets

In [82]:
train_data, test_data = products.random_split(.8, seed=1)
print len(train_data)
print len(test_data)

133416
33336


## Train a sentiment classifier with logistic regression

In [83]:
sentiment_model = graphlab.logistic_classifier.create(train_data,
                                                      target = 'sentiment',
                                                      features=['word_count'],
                                                      validation_set=None)

In [84]:
sentiment_model

Class                          : LogisticClassifier

Schema
------
Number of coefficients         : 121713
Number of examples             : 133416
Number of classes              : 2
Number of feature columns      : 1
Number of unpacked features    : 121712

Hyperparameters
---------------
L1 penalty                     : 0.0
L2 penalty                     : 0.01

Training Summary
----------------
Solver                         : lbfgs
Solver iterations              : 6
Solver status                  : TERMINATED: Terminated due to numerical difficulties.
Training time (sec)            : 7.4204

Settings
--------
Log-likelihood                 : inf

Highest Positive Coefficients
-----------------------------
word_count[mobileupdate]       : 41.9847
word_count[placeid]            : 41.7354
word_count[labelbox]           : 41.151
word_count[httpwwwamazoncomreviewrhgg6qp7tdnhbrefcmcrprcmtieutf8asinb00318cla0nodeid] : 40.0454
word_count[knobskeeping]       : 36.2091

Lowest Negative Coeffi

In [85]:
weights = sentiment_model.coefficients
weights.column_names()

['name', 'index', 'class', 'value', 'stderr']

In [95]:
num_positive_weights = len(weights[weights['value'] >= 0 ])
num_negative_weights = len(weights[weights['value'] < 0 ])
print "Number of positive weights: %s " % num_positive_weights
print "Number of negative weights: %s " % num_negative_weights

Number of positive weights: 68419 
Number of negative weights: 53294 


## Making predictions with logistic regression

In [96]:
sample_test_data = test_data[10:13]
print sample_test_data['rating']
sample_test_data

[5.0, 2.0, 1.0]


name,review,rating,word_count,sentiment
Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in ...,5.0,"{'and': 2L, 'all': 1L, 'love': 1L, ...",1
Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The decals ...,2.0,"{'and': 1L, 'wall': 1L, 'them': 1L, 'decals': ...",-1
New Style Trailing Cherry Blossom Tree Decal ...,Was so excited to get this product for my baby ...,1.0,"{'all': 1L, 'money': 1L, 'into': 1L, 'it': 3L, ...",-1


In [97]:
sample_test_data[0]['review']

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [98]:
sample_test_data[1]['review']

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

##### We will now make a class prediction for the sample_test_data. The sentiment_model should predict +1 if the sentiment is positive and -1 if the sentiment is negative. Recall from the lecture that the score (sometimes called margin) for the logistic regression model is defined as: scorei=w⊺h(xi)

In [99]:
scores = sentiment_model.predict(sample_test_data, output_type='margin')
print scores

[6.734619727060567, -5.734130996761049, -14.668460404469824]


## Prediciting Sentiment

In [100]:
class_prediction = map(lambda x: +1 if x>0 else -1,scores)
print class_prediction

[1, -1, -1]


In [101]:
print "Class predictions according to GraphLab Create:" 
print sentiment_model.predict(sample_test_data)

Class predictions according to GraphLab Create:
[1L, -1L, -1L]


## Probability predictions

In [102]:
def sigmoid(score):
    return (1+math.exp(-score))**(-1)
print map(sigmoid,scores)
print min(map(sigmoid,scores))

[0.9988123848377214, 0.003223268181798138, 4.261557996649856e-07]
4.26155799665e-07


In [103]:
print "Class predictions according to GraphLab Create:" 
print sentiment_model.predict(sample_test_data, output_type='probability')
print min( sentiment_model.predict(sample_test_data, output_type='probability'))

Class predictions according to GraphLab Create:
[0.9988123848377213, 0.0032232681817981374, 4.2615579966498535e-07]
4.26155799665e-07


## Find the most positive (and negative) review

In [104]:
predictions = sentiment_model.predict(test_data, output_type='probability')
test_data['pro'] = sentiment_model.predict(test_data, output_type='probability')

test_data.topk('prob',20).head(20)

RuntimeError: Runtime Exception. Column name prob does not exist.

In [None]:
test_data.topk('prob',20,reverse=True).head(20)

## Compute accuracy of the classifier

In [None]:
def get_classification_accuracy(model, data, true_labels):
    accuracy=None
    # First get the predictions
    prediction=model.predict(data)
    # Compute the number of correctly classified examples
    correct = prediction+true_labels
    correct = list(correct)
    true_correct=correct.count(2)
    false_correct = correct.count(-2)
    allc=false_correct+true_correct
    print " correctly classified examples: ",allc
    print " total examples: ", len(true_labels)
    
    # Then compute accuracy by dividing num_correct by total number of examples
    accuracy = allc / len(true_labels)
    print "accuracy: ", accuracy
    return accuracy

In [105]:
get_classification_accuracy(sentiment_model, test_data, test_data['sentiment'])

get_classification_accuracy(sentiment_model, train_data, train_data['sentiment'])

 correctly classified examples:  30487
 total examples:  33336
accuracy:  0.914536837053
 correctly classified examples:  130673
 total examples:  133416
accuracy:  0.979440247047


0.979440247046831

In [106]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [107]:
len(significant_words)

20

In [108]:
train_data['word_count_subset'] = train_data['word_count'].dict_trim_by_keys(significant_words, exclude=False)
test_data['word_count_subset'] = test_data['word_count'].dict_trim_by_keys(significant_words, exclude=False)

In [109]:
train_data[0]['review']

'it came early and was not disappointed. i love planet wise bags and now my wipe holder. it keps my osocozy wipes moist and does not leak. highly recommend it.'

In [110]:
print train_data[0]['word_count']

{'and': 3L, 'love': 1L, 'it': 3L, 'highly': 1L, 'osocozy': 1L, 'bags': 1L, 'leak': 1L, 'moist': 1L, 'does': 1L, 'recommend': 1L, 'was': 1L, 'wipes': 1L, 'disappointed': 1L, 'early': 1L, 'not': 2L, 'now': 1L, 'holder': 1L, 'wipe': 1L, 'keps': 1L, 'wise': 1L, 'i': 1L, 'planet': 1L, 'my': 2L, 'came': 1L}


In [111]:
print train_data[0]['word_count_subset']

{'love': 1L, 'disappointed': 1L}


## Train a logistic regression model on a subset of data

In [112]:
simple_model = graphlab.logistic_classifier.create(train_data,
                                                   target = 'sentiment',
                                                   features=['word_count_subset'],
                                                   validation_set=None)
simple_model

Class                          : LogisticClassifier

Schema
------
Number of coefficients         : 21
Number of examples             : 133416
Number of classes              : 2
Number of feature columns      : 1
Number of unpacked features    : 20

Hyperparameters
---------------
L1 penalty                     : 0.0
L2 penalty                     : 0.01

Training Summary
----------------
Solver                         : newton
Solver iterations              : 6
Solver status                  : SUCCESS: Optimal solution found.
Training time (sec)            : 1.1087

Settings
--------
Log-likelihood                 : 44323.7254

Highest Positive Coefficients
-----------------------------
word_count_subset[loves]       : 1.6773
word_count_subset[perfect]     : 1.5145
word_count_subset[love]        : 1.3654
(intercept)                    : 1.2995
word_count_subset[easy]        : 1.1937

Lowest Negative Coefficients
----------------------------
word_count_subset[disappointed] : -2.3551
wo

In [113]:
get_classification_accuracy(simple_model, test_data, test_data['sentiment'])

 correctly classified examples:  28979
 total examples:  33336
accuracy:  0.869300455964


0.8693004559635229

In [114]:
simple_model.coefficients

name,index,class,value,stderr
(intercept),,1,1.2995449552,0.0120888541331
word_count_subset,disappointed,1,-2.35509250061,0.0504149888557
word_count_subset,love,1,1.36543549368,0.0303546295109
word_count_subset,well,1,0.504256746398,0.021381300631
word_count_subset,product,1,-0.320555492996,0.0154311321362
word_count_subset,loves,1,1.67727145556,0.0482328275384
word_count_subset,little,1,0.520628636025,0.0214691475665
word_count_subset,work,1,-0.621700012425,0.0230330597946
word_count_subset,easy,1,1.19366189833,0.029288869202
word_count_subset,great,1,0.94469126948,0.0209509926591


In [115]:
simple_model.coefficients.sort('value', ascending=False).print_rows(num_rows=21)

+-------------------+--------------+-------+-----------------+-----------------+
|        name       |    index     | class |      value      |      stderr     |
+-------------------+--------------+-------+-----------------+-----------------+
| word_count_subset |    loves     |   1   |  1.67727145556  | 0.0482328275384 |
| word_count_subset |   perfect    |   1   |  1.51448626703  |  0.049861952294 |
| word_count_subset |     love     |   1   |  1.36543549368  | 0.0303546295109 |
|    (intercept)    |     None     |   1   |   1.2995449552  | 0.0120888541331 |
| word_count_subset |     easy     |   1   |  1.19366189833  |  0.029288869202 |
| word_count_subset |    great     |   1   |  0.94469126948  | 0.0209509926591 |
| word_count_subset |    little    |   1   |  0.520628636025 | 0.0214691475665 |
| word_count_subset |     well     |   1   |  0.504256746398 |  0.021381300631 |
| word_count_subset |     able     |   1   |  0.191438302295 | 0.0337581955697 |
| word_count_subset |     ol

In [116]:
good= map(lambda x: 1 if x>0 else -1,simple_model.coefficients['value'])
print good.count(1)

11


In [117]:
temp=sentiment_model.coefficients['index','value']
temp2 = simple_model.coefficients['index','value']
for i in significant_words:
    if temp[temp['index']==i]:
        print i ,' : ',
        print temp[temp['index']==i]['value'] , temp2[temp2['index']==i]['value']

love  :  [1.4330168543928692, ... ] [1.3654354936790394, ... ]
great  :  [1.3145924503860642, ... ] [0.9446912694798449, ... ]
easy  :  [1.2134693782160904, ... ] [1.1936618983284666, ... ]
old  :  [0.009122301136664302, ... ] [0.08539618866781733, ... ]
little  :  [0.6741624574994768, ... ] [0.52062863602502, ... ]
perfect  :  [1.7519011439201353, ... ] [1.5144862670271366, ... ]
loves  :  [1.5664851756956746, ... ] [1.6772714555592931, ... ]
well  :  [0.6279648775668364, ... ] [0.5042567463979309, ... ]
able  :  [0.17433127255187547, ... ] [0.1914383022947517, ... ]
car  :  [0.1952636706177672, ... ] [0.05883499006802089, ... ]
broke  :  [-2.2508055151605793, ... ] [-1.6579644783802756, ... ]
less  :  [-0.20314996272370037, ... ] [-0.20970981521595558, ... ]
even  :  [-0.6001109070842315, ... ] [-0.5117385512700553, ... ]
waste  :  [-3.621380024220667, ... ] [-2.0427736110037222, ... ]
disappointed  :  [-3.957486183926941, ... ] [-2.3550925006107244, ... ]
work  :  [-0.60515383596316

## Comparing models

In [118]:
get_classification_accuracy(sentiment_model, train_data, train_data['sentiment'])

 correctly classified examples:  130673
 total examples:  133416
accuracy:  0.979440247047


0.979440247046831

In [119]:
get_classification_accuracy(simple_model, train_data, train_data['sentiment'])

 correctly classified examples:  115647
 total examples:  133416
accuracy:  0.866815074654


0.8668150746537147

In [120]:
get_classification_accuracy(sentiment_model, test_data, test_data['sentiment'])

 correctly classified examples:  30487
 total examples:  33336
accuracy:  0.914536837053


0.9145368370530358

In [121]:
get_classification_accuracy(simple_model, test_data, test_data['sentiment'])

 correctly classified examples:  28979
 total examples:  33336
accuracy:  0.869300455964


0.8693004559635229

## Baseline: Majority class prediction

In [122]:
num_positive  = (train_data['sentiment'] == +1).sum()
num_negative = (train_data['sentiment'] == -1).sum()
print num_positive
print num_negative

112164
21252


In [123]:
num_positive  = (test_data['sentiment'] == +1).sum()
num_negative = (test_data['sentiment'] == -1).sum()
print num_positive
print num_negative

print num_positive/(num_negative+num_positive)

28095
5241
0.842782577394
