In [1]:
import pandas as pd
import numpy as np
import string

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

products = pd.read_csv('amazon_baby.csv')


Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 29 days


<a href="https://eventing.coursera.org/api/redirectStrict/_d3kgMAwyyfHuIys9QgZRmpE3ZOOJcLr7_Q69QSTOf6yfTjc55IWZoO1aG9nHGAtcimq7ozgWzN2uQEVMgS-CQ.anw9EdGhhpFu2oGHhdGeCw.QytjdNO1YrdoZZmsEq9svvJjL8z8B3mUJhwNnpDoqfASNIhbrZViXcQVMqIzSrBczSXqU-0-cXGH0hF06WkDlmo_qIH86YCYgJrk8pSl8--P-hizUoQ5_bFP9wfZijyn-iWylqDmN3HwQ1p1lY9a2QG8cqNEV-T_8ZduadxefK6h0w0RI0El662M0P_O0xR5igY8tRezJ4JnTCHRed1cwdPjvXmeyR2LaI6UvrV2q7oZ2nFnkqJpGQRwMo_nrAPKq6G2lzFsHatgjzmkTdi6GM-9XNuOEu05eJu6-7j3c3TfjdE7Y6KZa1lJ4Mpg-OwJYYym7rIkdOjGpd_m9VL9hqK0iHNsfd34LBjsfiBgTzDeblvPINfAyxjTnwNx6xCqy_-k8QQKn6ty9A4D6phgPIYqUo0jCtMT_GMLEGpgm3wrPmStZFxWjlgBWCzw70s6">Download amazon_baby.csv data here</a>

In [2]:
products.columns

Index([u'name', u'review', u'rating'], dtype='object')

In [3]:
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [4]:
products.describe()

Unnamed: 0,rating
count,183531.0
mean,4.120448
std,1.285017
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


## Perform text cleaning

We start by removing punctuation, so that words "cake." and "cake!" are counted as the same word.

* Write a function remove_punctuation that strips punctuation from a line of text
* Apply this function to every element in the review column of products, and save the result to a new column review_clean.

In [5]:
def remove_punctuation(x):
    """Return ``x`` with every ASCII punctuation character removed.

    Parameters
    ----------
    x : str
        A line of text (for example one product review).

    Returns
    -------
    str
        ``x`` without any of the characters listed in ``string.punctuation``.
        All other characters, including whitespace, are kept in their
        original order.
    """
    # The two-argument form ``x.translate(None, string.punctuation)`` exists
    # only for Python 2 byte strings; it raises TypeError for Python 2
    # ``unicode`` objects and for every Python 3 ``str``.  Filtering the
    # characters gives the same result and works for all of them.
    punctuation = frozenset(string.punctuation)
    return ''.join(ch for ch in x if ch not in punctuation)


In [6]:
# Any missing review (NaN) would be turned into the literal text 'nan' by
# astype(str) and then be counted as a word; replace missing values with an
# empty string before stripping punctuation.
products['review_clean'] = (
    products['review'].fillna('').astype(str).apply(remove_punctuation)
)

In [7]:
type(products['review'])

pandas.core.series.Series

In [8]:
products.head()

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


## Extract sentiments

* We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment. With pandas, a boolean mask on the `rating` column does this:

In [9]:
# Drop the neutral (rating == 3) reviews.  Take an explicit copy so that the
# column added to `products` afterwards is written to an independent frame
# rather than to a slice of the original (avoids SettingWithCopyWarning).
products = products[products['rating'] != 3].copy()


Now, we will assign reviews with a rating of 4 or higher to be positive reviews, while the ones with a rating of 2 or lower are negative. For the sentiment column, we use +1 for the positive class label and -1 for the negative class label. A good way is to write a small function that converts a rating into a class label and then apply that function to every element in the rating column. With pandas, you can use `Series.apply()`:

In [10]:
def _rating_to_sentiment(rating):
    """Map a star rating to a class label: +1 above 3 stars, otherwise -1."""
    if rating > 3:
        return +1
    return -1


# Apply the labelling rule to every rating and store the result as a column.
products['sentiment'] = products['rating'].apply(_rating_to_sentiment)


Now, we can see that the dataset contains an extra column called sentiment which is either positive (+1) or negative (-1).

In [11]:
#train_data, test_data = train_test_split(products, test_size=0.2)

In [12]:
# Read the predefined train/test row indices from the two JSON files.
train_idx = pd.read_json('module-2-assignment-train-idx.json')
test_idx = pd.read_json('module-2-assignment-test-idx.json')

# Column 0 of each index frame is used as a list of row positions.  .iloc
# selects by position, so the numbers refer to rows of `products` as it is at
# this point (after the rating == 3 rows were removed), not to index labels.
train_data = products.iloc[train_idx[0]]
test_data = products.iloc[test_idx[0]]

In [13]:
train_data.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


In [14]:
train_data.shape

(133416, 5)

In [15]:
test_data.shape

(33336, 5)

In [16]:
products.shape

(166752, 5)

In [17]:
train_idx.shape

(133416, 1)

In [18]:
test_idx.shape

(33336, 1)

## Build the word count vector for each review

We will now compute the word count for each word that appears in the reviews. A vector consisting of word counts is often referred to as bag-of-word features. Since most words occur in only a few reviews, word count vectors are sparse. For this reason, scikit-learn and many other tools use sparse matrices to store a collection of word count vectors. Refer to appropriate manuals to produce sparse word count vectors. General steps for extracting word count vectors are as follows:

* Learn a vocabulary (set of all words) from the training data. Only the words that show up in the training data will be considered for feature extraction.

* Compute the occurrences of the words in each review and collect them into a row vector.
* Build a sparse matrix where each row is the word count vector for the corresponding review. Call this matrix train_matrix.
* Using the same mapping between words and columns, convert the test data into a sparse matrix test_matrix.

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Use this token pattern to keep single-letter words.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# First, learn the vocabulary from the training data only and assign a column
# to each word, then convert the training reviews into a sparse count matrix.
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test reviews into a sparse matrix using the same
# word-to-column mapping; no vocabulary is learned from the test data here.
test_matrix = vectorizer.transform(test_data['review_clean'])

In [100]:
train_matrix.shape

(133416, 121713)

In [101]:
test_matrix.shape

(33336, 121713)

In [20]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(train_matrix, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
lr_model.coef_.shape

(1, 121713)

In [22]:
coef = lr_model.coef_[0]

In [23]:
# The quiz asks for the number of weights that are >= 0, so weights that are
# exactly zero must be counted as well.
coef[coef >= 0].shape

(85765,)

## Quiz question 1 (answered by the cell above)
How many weights are >= 0?

In [24]:
sample_test_data = test_data[10:13]
sample_test_data.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
59,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1
91,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1


In [25]:
sample_test_data.iloc[0]['review']

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [26]:
sample_test_data.iloc[1]['review']

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

We will now make a **class** prediction for the **sample_test_data**. The `sentiment_model` should predict **+1** if the sentiment is positive and **-1** if the sentiment is negative. Recall from the lecture that the **score** (sometimes called **margin**) for the logistic regression model  is defined as:

$$
\mbox{score}_i = \mathbf{w}^T h(\mathbf{x}_i)
$$ 

where $h(\mathbf{x}_i)$ represents the features for example $i$.  We will write some code to obtain the **scores** using scikit-learn's `decision_function`. For each row, the **score** (or margin) is a number in the range **(-inf, inf)**.

In [27]:
# Re-vectorize the three sample reviews with the vocabulary learned on the
# training data, then ask the model for the raw scores (margins).
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = lr_model.decision_function(sample_test_matrix)
# Single-argument print(...) produces the same text under Python 2 and also
# runs under Python 3, unlike the bare print statement.
print(scores)

[  5.58657766  -3.151758   -10.41782941]


### Predicting sentiment

These scores can be used to make class predictions as follows:

$$
\hat{y} = 
\left\{
\begin{array}{ll}
      +1 & \mathbf{w}^T h(\mathbf{x}_i) > 0 \\
      -1 & \mathbf{w}^T h(\mathbf{x}_i) \leq 0 \\
\end{array} 
\right.
$$

Using scores, write code to calculate $\hat{y}$, the class predictions:

In [28]:
# The predictions come from the scikit-learn LogisticRegression model, so the
# label names that library.  Single-argument print(...) produces the same
# text under Python 2 and also runs under Python 3.
print("Class predictions according to scikit-learn:")
print(lr_model.predict(sample_test_matrix))

Class predictions according to GraphLab Create:
[ 1 -1 -1]


## Quiz question:
Of the three data points in sample_test_data, which one (first, second, or third) has the lowest probability of being classified as a positive review?

In [29]:
# One row per sample review; in the output below the second column is the
# probability of the positive (+1) class.  Single-argument print(...) works
# under both Python 2 and Python 3.
print(lr_model.predict_proba(sample_test_matrix))

[[  3.73383851e-03   9.96266161e-01]
 [  9.58977936e-01   4.10220639e-02]
 [  9.99970106e-01   2.98938045e-05]]


Answer: the third one, with a probability of about 2.99e-05 of being classified as +1.

## Find the most positive (and negative) review

We now turn to examining the full test dataset, test_data, and use sklearn.linear_model.LogisticRegression to form predictions on all of the test data points.

* Using the sentiment_model, find the 20 reviews in the entire test_data with the highest probability of being classified as a positive review. We refer to these as the "most positive reviews."

* To calculate these top-20 reviews, use the following steps:

* Make probability predictions on test_data using the sentiment_model.
* Sort the data according to those predictions and pick the top 20.

In [32]:
test_pred_proba = lr_model.predict_proba(test_matrix)


In [35]:
test_pred_proba_p = test_pred_proba[:,1]
test_pred_proba_p

array([ 0.77657793,  0.99999928,  0.93255077, ...,  0.99999439,
        0.9999974 ,  0.9806904 ])

In [36]:
np.argsort(test_pred_proba_p)

array([ 2931, 21700, 13939, ...,  9125, 25554, 20743])

In [38]:
np.argsort(test_pred_proba, axis=0)

array([[24899,  2931],
       [25554, 21700],
       [30535, 13939],
       ..., 
       [13939,  9125],
       [21700, 25554],
       [ 2931, 20743]])

In [63]:
# Row positions of test_data ordered by the predicted probability of the
# positive class: lowest first, and (by sorting the negated values) highest
# first.
ascending_positions = np.argsort(test_pred_proba_p)
descending_positions = np.argsort(-test_pred_proba_p)

sorted_test_data_ascending = test_data.iloc[ascending_positions]
sorted_test_data_descending = test_data.iloc[descending_positions]


In [64]:
sorted_test_data_ascending.head(20)['review'].values[0] # most negative review: lowest predicted probability of +1



In [67]:
sorted_test_data_descending.head(20)['review'].values[0] # most positive review: highest predicted probability of +1

"My husband and I assembled this Pack n' Play last night and so far we are very happy with it.  The assembly seemed a bit complicated but I think that's a normal reaction from a couple who has never put together baby gear before.  My husband was able to put it together in about 15-20 minutes and the entire setup seems very sturdy and secure.  The elevated bassinet is also 100% level (we checked with an actual level) and the changer is solid.  I was able to move the Pack n' Play easily and get it through our doors with no issues (it just fit) although I'm almost eight months along.  We haven't had a chance to try the mechanical portion yet as it appears you need to add your own batteries.My only surprise thus far was with the exact color.  The black looks great; I purposely was looking for simple modern colors.  However, I thought the trim color was a beige but it is actually a medium gray.  While I like the clean and simple black coloring of the main section, the gray makes it look a l

## 20 most positive reviews:

In [68]:
sorted_test_data_descending.head(20)

Unnamed: 0,name,review,rating,review_clean,sentiment
137034,Graco Pack 'n Play Element Playard - Flint,My husband and I assembled this Pack n' Play l...,4,My husband and I assembled this Pack n Play la...,1
140816,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall (38in) and thin...,5,I bought this seat for my tall 38in and thin 2...,1
168081,Buttons Cloth Diaper Cover - One Size - 8 Colo...,"We are big Best Bottoms fans here, but I wante...",4,We are big Best Bottoms fans here but I wanted...,1
180646,Mamas &amp; Papas 2014 Urbo2 Stroller - Black,After much research I purchased an Urbo2. It's...,4,After much research I purchased an Urbo2 Its e...,1
168697,Graco FastAction Fold Jogger Click Connect Str...,Graco's FastAction Jogging Stroller definitely...,5,Gracos FastAction Jogging Stroller definitely ...,1
66059,"Evenflo 6 Pack Classic Glass Bottle, 4-Ounce",It's always fun to write a review on those pro...,5,Its always fun to write a review on those prod...,1
100166,"Infantino Wrap and Tie Baby Carrier, Black Blu...",I bought this carrier when my daughter was abo...,5,I bought this carrier when my daughter was abo...,1
97325,Freemie Hands-Free Concealable Breast Pump Col...,I absolutely love this product. I work as a C...,5,I absolutely love this product I work as a Cu...,1
119182,Roan Rocco Classic Pram Stroller 2-in-1 with B...,Great Pram Rocco!!!!!!I bought this pram from ...,5,Great Pram RoccoI bought this pram from Europe...,1
52631,Evenflo X Sport Plus Convenience Stroller - Ch...,After seeing this in Parent's Magazine and rea...,5,After seeing this in Parents Magazine and read...,1


## 20 most negative reviews:

In [69]:
sorted_test_data_ascending.head(20)

Unnamed: 0,name,review,rating,review_clean,sentiment
16042,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with FisherPrice prod...,-1
120209,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1
77072,Safety 1st Exchangeable Tip 3 in 1 Thermometer,I thought it sounded great to have different t...,1,I thought it sounded great to have different t...,-1
48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1
155287,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,This is my second video monitoring system the ...,-1
94560,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note we never installed batteries in these uni...,-1
53207,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1
81332,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1
113995,Motorola Digital Video Baby Monitor with Room ...,DO NOT BUY THIS BABY MONITOR!I purchased this ...,1,DO NOT BUY THIS BABY MONITORI purchased this m...,-1
10677,Philips AVENT Newborn Starter Set,"It's 3am in the morning and needless to say, t...",1,Its 3am in the morning and needless to say thi...,-1


In [70]:
sorted_test_data_ascending.head(20)['review'].values[18] # 19th entry (index 18) of the 20 most negative reviews

'My Experience: Babykicks Inserts failure vs RAVING Success of Thirsties insertsPurchased the Thirsties Hemp inserts 2-Pack, Lg 18-40lbs and the BabyKicks 3 Pack Joey Bunz Hemp Inserts, as both products seemed equally matched based on reviews. Put each brand to the "overnight" diapering test over a week, alternating nights. (CD\'ing fleece liner over 4x8x4 unbleached indian cotton premium prefolds with hemp inserts wrapped outside, then a Thirsties Duo Wrap cover size 2 over.) Both brands of inserts were prepped in same wash loads with same detergent, same dryers all at same time prior to the test.THE RESULTS?  Thirsties won and Babykicks was NOT remotely CLOSE!The Babykicks Hemp Inserts, when compared to the Thirsties inserts, performed little better than paper towel! The little one was completely SOAKED after a mere 3-4 hours on nights the Babykicks. And not just the baby, but the mattress, diaper cover exterior, EVERYTHING. (**On a side note, I must commend the Thristies Duo Diaper 

## Compute accuracy of the classifier

In [71]:
# Predicted class label for every review in the test set.
test_pred = lr_model.predict(test_matrix)

# Comparing the true labels with the predictions gives a boolean Series;
# summing it counts the correctly classified reviews.
num_correct_predic = (test_data['sentiment'] == test_pred).sum()

In [74]:
accuracy_test = float(num_correct_predic) / test_data.shape[0]

In [76]:
accuracy_test

0.9325953923686106

### Or with sklearn:

In [77]:
from sklearn.metrics import accuracy_score

accuracy_score(test_data['sentiment'], test_pred)

0.93259539236861055

## Learn another classifier with fewer words

In [78]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [79]:
# Restrict the features to the hand-picked words only.  Later cells index the
# coefficients by position in significant_words, so they rely on the matrix
# columns following the order of that list.
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
# Count the selected words in the training reviews and in the test reviews.
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [80]:
from sklearn.linear_model import LogisticRegression

lr_model_simple = LogisticRegression()
lr_model_simple.fit(train_matrix_word_subset, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [82]:
lr_model_simple.coef_.shape

(1, 20)

### Quiz Question: Consider the coefficients of simple_model. How many of the 20 coefficients (corresponding to the 20 significant_words) are positive for the simple_model?

In [83]:
coef_simple = lr_model_simple.coef_[0]
coef_simple[coef_simple>0].shape

(10,)

In [87]:
coef_simple[coef_simple>0]

array([ 1.36368976,  0.94399959,  1.19253827,  0.08551278,  0.52018576,
        1.50981248,  1.67307389,  0.50376046,  0.19090857,  0.05885467])

In [89]:
coef_simple.shape

(20,)

In [92]:
np.array(significant_words)[coef_simple>0]

array(['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves',
       'well', 'able', 'car'], 
      dtype='|S12')

In [93]:
np.array(significant_words)[coef_simple<0]

array(['broke', 'less', 'even', 'waste', 'disappointed', 'work', 'product',
       'money', 'would', 'return'], 
      dtype='|S12')

### Quiz Question: Are the positive words in the simple_model also positive words in the sentiment_model?

In [102]:
significant_words_matrix = vectorizer.transform(significant_words)

In [104]:
significant_words_matrix.shape

(20, 121713)

In [123]:
type(significant_words_matrix[0])

scipy.sparse.csr.csr_matrix

In [124]:
significant_words_matrix.indices

array([ 63567,  48789,  37640,  74107,  62602,  78983,  63646, 117907,
         7386,  22122,  20190,  61494,  39961, 117083,  34453, 119933,
        83730,  68076, 120337,  89500], dtype=int32)

In [125]:
significant_words_matrix.indptr

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20], dtype=int32)

In [132]:
lr_model.coef_[0][significant_words_matrix.indices]

array([ 1.58238993,  1.23328876,  1.35967256,  0.05424918,  0.6403053 ,
        1.86188428,  1.52489301,  0.54061138,  0.3957783 ,  0.12648197,
       -1.38211822, -0.27637368, -0.46562762, -1.98606474, -2.18768289,
       -0.45727579, -0.18652524, -0.7846135 , -0.28734847, -1.64702197])

### To answer the question above: (Yes)

In [133]:
significant_words_positive = np.array(significant_words)[coef_simple>0]
significant_positive_words_matrix = vectorizer.transform(significant_words_positive)

lr_model.coef_[0][significant_positive_words_matrix.indices]

array([ 1.58238993,  1.23328876,  1.35967256,  0.05424918,  0.6403053 ,
        1.86188428,  1.52489301,  0.54061138,  0.3957783 ,  0.12648197])

In [134]:
lr_model_simple.coef_[0]

array([ 1.36368976,  0.94399959,  1.19253827,  0.08551278,  0.52018576,
        1.50981248,  1.67307389,  0.50376046,  0.19090857,  0.05885467,
       -1.65157634, -0.20956286, -0.51137963, -2.03369861, -2.34829822,
       -0.62116877, -0.32055624, -0.89803074, -0.36216674, -2.10933109])

## Comparing models

### Quiz Question: Which model (sentiment_model or simple_model) has higher accuracy on the TRAINING set, and how about Test set?

#### On train data:

In [135]:
from sklearn.metrics import accuracy_score

accuracy_score(train_data['sentiment'], lr_model.predict(train_matrix))

0.96759758949451335

In [136]:
accuracy_score(train_data['sentiment'], lr_model_simple.predict(train_matrix_word_subset))

0.8668225700065959

#### On test data:

In [137]:
accuracy_score(test_data['sentiment'], lr_model.predict(test_matrix))

0.93259539236861055

In [138]:
accuracy_score(test_data['sentiment'], lr_model_simple.predict(test_matrix_word_subset))

0.86936045116390692

## Baseline: Majority class prediction

In [139]:
# Class balance of the training labels.  The last value printed is the share
# of +1 labels, i.e. the accuracy of always predicting the positive class.
num_positive = (train_data['sentiment'] == +1).sum()
num_negative = (train_data['sentiment'] == -1).sum()
# Single-argument print(...) produces the same text under Python 2 and also
# runs under Python 3, unlike the bare print statement.
print(num_positive)
print(num_negative)

print(float(num_positive) / (num_positive + num_negative))

112164
21252
0.840708760568


In [140]:
# Class balance of the test labels.  The last value printed is the share of
# +1 labels, i.e. the test accuracy of always predicting the positive class.
num_positive = (test_data['sentiment'] == +1).sum()
num_negative = (test_data['sentiment'] == -1).sum()
# Single-argument print(...) produces the same text under Python 2 and also
# runs under Python 3, unlike the bare print statement.
print(num_positive)
print(num_negative)

print(float(num_positive) / (num_positive + num_negative))

28095
5241
0.842782577394
