# Loading data

In [1]:
import turicreate as tc
products = tc.load_sframe('amazon_baby.sframe/')

In [2]:
products.head()

name,review,rating
Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3.0
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5.0
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5.0
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5.0
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0


# Build word count vector

In [3]:
products['word_count'] = tc.text_analytics.count_words(products['review'])

In [4]:
products.head()

name,review,rating,word_count
Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3.0,"{'handles': 1.0, 'stripping': 1.0, ..."
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0,"{'recommend': 1.0, 'highly': 1.0, ..."
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0,"{'quilt': 1.0, 'of': 1.0, 'the': 1.0, 'than': 1.0, ..."
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5.0,"{'tool': 1.0, 'clever': 1.0, 'approach': 2.0, ..."
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5.0,"{'rock': 1.0, 'many': 1.0, 'headaches': 1.0, ..."
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5.0,"{'thumb': 1.0, 'or': 1.0, 'break': 1.0, 'trying': ..."
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4.0,"{'for': 1.0, 'barnes': 1.0, 'at': 1.0, 'is': ..."
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5.0,"{'right': 1.0, 'because': 1.0, 'questions': 1.0, ..."
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5.0,"{'like': 1.0, 'and': 1.0, 'changes': 1.0, 'the': ..."
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0,"{'in': 1.0, 'pages': 1.0, 'out': 1.0, 'run': 1.0, ..."


In [5]:
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']

# Count selected words by loop

In [6]:
sum_awesome = 0
for dic in products['word_count']:
    if 'awesome' in dic:
        count_awesome = dic['awesome']
        sum_awesome = sum_awesome + count_awesome
print (sum_awesome)

4075.0


In [7]:
sum_great = 0
for dic in products['word_count']:
    if 'great' in dic:
        count_great = dic['great']
        sum_great = sum_great + count_great
print (sum_great)

59536.0


In [8]:
def sum_word(x):
    sum_x = 0
    for dic in products['word_count']:
        if x in dic:
            count_x = dic[x]
            sum_x = sum_x + count_x
    return sum_x

In [9]:
for x in selected_words:
    print ("%s : %i" % (x,sum_word(x)))

awesome : 4075
great : 59536
fantastic : 1765
amazing : 2726
love : 43867
horrible : 1245
bad : 4950
terrible : 1282
awful : 753
wow : 461
hate : 1285


# Count selected words by .apply()

In [10]:
def count_word(x,y):
    if x in y:
        return y[x]
    else:
        return 0

In [11]:
for word in selected_words:
    products[word] = products['word_count'].apply(lambda dic: count_word(word,dic))
    print ( "%s : %i" % (word,products[word].sum()))

awesome : 4075
great : 59536
fantastic : 1765
amazing : 2726
love : 43867
horrible : 1245
bad : 4950
terrible : 1282
awful : 753
wow : 461
hate : 1285


In [12]:
products.head()

name,review,rating,word_count,awesome,great
Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3.0,"{'handles': 1.0, 'stripping': 1.0, ...",0.0,0.0
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0,"{'recommend': 1.0, 'highly': 1.0, ...",0.0,0.0
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0,"{'quilt': 1.0, 'of': 1.0, 'the': 1.0, 'than': 1.0, ...",0.0,0.0
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5.0,"{'tool': 1.0, 'clever': 1.0, 'approach': 2.0, ...",0.0,0.0
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5.0,"{'rock': 1.0, 'many': 1.0, 'headaches': 1.0, ...",0.0,1.0
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5.0,"{'thumb': 1.0, 'or': 1.0, 'break': 1.0, 'trying': ...",0.0,1.0
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4.0,"{'for': 1.0, 'barnes': 1.0, 'at': 1.0, 'is': ...",0.0,0.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5.0,"{'right': 1.0, 'because': 1.0, 'questions': 1.0, ...",0.0,0.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5.0,"{'like': 1.0, 'and': 1.0, 'changes': 1.0, 'the': ...",0.0,0.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0,"{'in': 1.0, 'pages': 1.0, 'out': 1.0, 'run': 1.0, ...",0.0,0.0

fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate
0.0,0.0,0.0,0,0.0,0,0,0,0
0.0,0.0,1.0,0,0.0,0,0,0,0
0.0,0.0,0.0,0,0.0,0,0,0,0
0.0,0.0,2.0,0,0.0,0,0,0,0
0.0,0.0,1.0,0,0.0,0,0,0,0
0.0,0.0,0.0,0,0.0,0,0,0,0
0.0,0.0,0.0,0,0.0,0,0,0,0
0.0,0.0,0.0,0,0.0,0,0,0,0
1.0,0.0,0.0,0,0.0,0,0,0,0
0.0,0.0,0.0,0,0.0,0,0,0,0


## Define what's a positive and a negative sentiment

In [13]:
#ignore all 3* reviews
products = products[products['rating'] !=3]

In [14]:
#positive sentiment = 4* or 5* reviews
products['sentiment'] = products['rating'] >= 4

In [15]:
products.head()

name,review,rating,word_count,awesome,great
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0,"{'recommend': 1.0, 'highly': 1.0, ...",0.0,0.0
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0,"{'quilt': 1.0, 'of': 1.0, 'the': 1.0, 'than': 1.0, ...",0.0,0.0
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5.0,"{'tool': 1.0, 'clever': 1.0, 'approach': 2.0, ...",0.0,0.0
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5.0,"{'rock': 1.0, 'many': 1.0, 'headaches': 1.0, ...",0.0,1.0
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5.0,"{'thumb': 1.0, 'or': 1.0, 'break': 1.0, 'trying': ...",0.0,1.0
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4.0,"{'for': 1.0, 'barnes': 1.0, 'at': 1.0, 'is': ...",0.0,0.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5.0,"{'right': 1.0, 'because': 1.0, 'questions': 1.0, ...",0.0,0.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5.0,"{'like': 1.0, 'and': 1.0, 'changes': 1.0, 'the': ...",0.0,0.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0,"{'in': 1.0, 'pages': 1.0, 'out': 1.0, 'run': 1.0, ...",0.0,0.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",I love this journal and our nanny uses it ...,4.0,"{'tracker': 1.0, 'now': 1.0, 'its': 1.0, 'sti ...",0.0,0.0

fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate,sentiment
0.0,0.0,1.0,0,0.0,0,0,0,0,1
0.0,0.0,0.0,0,0.0,0,0,0,0,1
0.0,0.0,2.0,0,0.0,0,0,0,0,1
0.0,0.0,1.0,0,0.0,0,0,0,0,1
0.0,0.0,0.0,0,0.0,0,0,0,0,1
0.0,0.0,0.0,0,0.0,0,0,0,0,1
0.0,0.0,0.0,0,0.0,0,0,0,0,1
1.0,0.0,0.0,0,0.0,0,0,0,0,1
0.0,0.0,0.0,0,0.0,0,0,0,0,1
0.0,0.0,2.0,0,0.0,0,0,0,0,1


## Let's train the sentiment classifier

In [16]:
train_data, test_data = products.random_split(.8,seed=0)

In [17]:
selected_words_model = tc.logistic_classifier.create(train_data,
                                               target='sentiment',
                                               features=selected_words,
                                                validation_set=test_data)

In [26]:
coef = selected_words_model.coefficients
coef


name,index,class,value,stderr
(intercept),,1,1.3365913848877558,0.0089299697876567
awesome,,1,1.133534666034145,0.0839964398318752
great,,1,0.8630655001196618,0.0189550524443773
fantastic,,1,0.8858047568814295,0.1116759129339965
amazing,,1,1.1000933113660285,0.0995477626046598
love,,1,1.3592688669225153,0.0280683001520994
horrible,,1,-2.251335236759093,0.0802024938878844
bad,,1,-0.9914778800650564,0.0384842866469906
terrible,,1,-2.223661436085127,0.0773173620378575
awful,,1,-2.0529082040313518,0.1009973543525925


In [33]:
#coef.sort('value').print_rows(rows=12)
print_rows?

Object `print_rows` not found.


# Evaluate the model

In [20]:
selected_words_model.evaluate(test_data, metric='roc_curve')

{'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 1001
 
 Data:
 +-----------+--------------------+-----+-------+------+
 | threshold |        fpr         | tpr |   p   |  n   |
 +-----------+--------------------+-----+-------+------+
 |    0.0    |        1.0         | 1.0 | 27976 | 5328 |
 |   0.001   | 0.9994369369369369 | 1.0 | 27976 | 5328 |
 |   0.002   | 0.9992492492492493 | 1.0 | 27976 | 5328 |
 |   0.003   | 0.9990615615615616 | 1.0 | 27976 | 5328 |
 |   0.004   | 0.9990615615615616 | 1.0 | 27976 | 5328 |
 |   0.005   | 0.9981231231231231 | 1.0 | 27976 | 5328 |
 |   0.006   | 0.9981231231231231 | 1.0 | 27976 | 5328 |
 |   0.007   | 0.9979354354354354 | 1.0 | 27976 | 5328 |
 |   0.008   | 0.9973723723723724 | 1.0 | 27976 | 5328 |
 |   0.009   | 0.9971846846846847 | 1.0 | 27976 | 5328 |
 +-----------+--------------------+-----+-------+------+
 [1001 rows x 5 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(nu

In [21]:
selected_words_model.summary

<bound method Model.summary of Class                          : LogisticClassifier

Schema
------
Number of coefficients         : 12
Number of examples             : 133448
Number of classes              : 2
Number of feature columns      : 11
Number of unpacked features    : 11

Hyperparameters
---------------
L1 penalty                     : 0.0
L2 penalty                     : 0.01

Training Summary
----------------
Solver                         : newton
Solver iterations              : 6
Solver status                  : SUCCESS: Optimal solution found.
Training time (sec)            : 1.3926

Settings
--------
Log-likelihood                 : 52926.808

Highest Positive Coefficients
-----------------------------
love                           : 1.3593
(intercept)                    : 1.3366
awesome                        : 1.1335
amazing                        : 1.1001
fantastic                      : 0.8858

Lowest Negative Coefficients
----------------------------
horrible     

In [22]:
selected_words_model.evaluate(test_data)

{'accuracy': 0.8463848186404036,
 'auc': 0.6935096220934976,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      1       |        0        |  159  |
 |      0       |        0        |  371  |
 |      0       |        1        |  4957 |
 |      1       |        1        | 27817 |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.9157860082304526,
 'log_loss': 0.3962265467087378,
 'precision': 0.8487520595594068,
 'recall': 0.9943165570488991,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 1001
 
 Data:
 +-----------+--------------------+-----+-------+------+
 | threshold |        fpr         | tpr |   p   |  n   |
 +-----------+--------------------+-----+-------+------+
 |    0.0    |        1.0         | 1.0 | 27976 | 5328 