# Predicting sentiment from product reviews

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import string
import json

DATA_DIR = './data/week1/'

## Inspect and Clean Data

In [3]:
amazon_baby_csv = pd.read_csv(DATA_DIR + 'amazon_baby.csv')

amazon_baby_csv.info()
print('\n')
amazon_baby_csv.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183531 entries, 0 to 183530
Data columns (total 3 columns):
name      183213 non-null object
review    182702 non-null object
rating    183531 non-null int64
dtypes: int64(1), object(2)
memory usage: 4.2+ MB




Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [3]:
def remove_ponctuation(s):
    s = s.translate(None, string.punctuation)
    return s

# remove ponctuation from the review colum
amazon_baby_csv['review'] = amazon_baby_csv['review'].fillna('')
amazon_baby_csv['review'] = amazon_baby_csv['review'].apply(remove_ponctuation)

# remove ratings < 3 and create a sentiment column
amazon_baby_csv = amazon_baby_csv[amazon_baby_csv['rating'] != 3]
amazon_baby_csv['sentiment'] = amazon_baby_csv['rating'].apply(lambda x: 1 if x > 3 else -1)

# Split the dataframe into a training and test set
train_indices = open(DATA_DIR + 'module-2-assignment-train-idx.json').read()
train_indices = json.loads(train_indices)
test_indices = open(DATA_DIR + 'module-2-assignment-test-idx.json').read()
test_indices = json.loads(test_indices)
train_amazon_baby = amazon_baby_csv.iloc[train_indices]
test_amazon_baby = amazon_baby_csv.iloc[test_indices]

## Bag-of-words Model

### Training

In [4]:
# Construct bag-of-words features
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_amazon_baby['review'])
test_matrix = vectorizer.transform(test_amazon_baby['review'])

# Train the logistic regression model
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_amazon_baby['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### How many weights are >= 0 ?

In [57]:
positive_coef = np.sum(sentiment_model.coef_ >= 0)
print('There are %s positives coefficients in the sentiment model.' % positive_coef) 

There are 85974 positives coefficients in the sentiment model.


### Predictions

In [55]:
sample_test_data = test_amazon_baby.iloc[10:13]
sample_test_matrix = vectorizer.transform(sample_test_data['review'])
sample_test_scores = sentiment_model.decision_function(sample_test_matrix)
predictions = np.empty(3)
predictions.fill(-1) 
predictions[sample_test_scores > 0] = 1

print('Based on the score given by the decision function we can deduce that the predictions are : %s %s %s.' % 
      (int(predictions[0]), int(predictions[1]), int(predictions[2])))

Based on the score given by the decision function we can deduce that the predictions are : 1 -1 -1.


In [56]:
sample_test_predictions = sentiment_model.predict(sample_test_matrix)

print('The prediction previously computed matches the predictions given by the model : %s %s %s.' % 
      (int(sample_test_predictions[0]), int(sample_test_predictions[1]), int(sample_test_predictions[2])))

The prediction previously computed matches the predictions given by the model : 1 -1 -1.


#### Of the three data points in sample_test_data, which one (first, second, or third) has the lowest probability of being classified as a positive review ?

In [54]:
def sigmoid(z):
    proba = 1 / (1 + np.exp(-z))
    return proba
    
probabilities = []
for score in sample_test_scores:
    proba = sigmoid(score)
    probabilities.append(proba)
    
print('Based on the score given by the decision function we can compute the probabilities that a given review \
       is positive using the sigmoid function. The probabilities are : %s, %s and %s.' % 
      (probabilities[0], probabilities[1], probabilities[2]))
print('\n')
print('So it is the third data point that has the lowest probability of being classified as a positive review.')

Based on the score given by the decision function we can compute the probabilities that a given review        is positive using the sigmoid function. The probabilities are : 0.996337061949, 0.0411715733413 and 2.96845484816e-05.


So it is the third data point that has the lowest probability of being classified as a positive review.


#### Which of the following products are represented in the 20 most positive reviews ?

In [9]:
test_probas = sentiment_model.predict_proba(test_matrix)
test_positive_probas = test_probas[:,1]
most_positive_probas = list(test_positive_probas.argsort()[::-1][0:20])
most_positive_baby = test_amazon_baby.iloc[most_positive_probas]

print('The most positive products are : \n')
print(most_positive_baby['name'])

The most positive products are : 

114796    Fisher-Price Cradle 'N Swing,  My Little Snuga...
80155     Simple Wishes Hands-Free Breastpump Bra, Pink,...
87017       Baby Einstein Around The World Discovery Center
168081    Buttons Cloth Diaper Cover - One Size - 8 Colo...
137034           Graco Pack 'n Play Element Playard - Flint
100166    Infantino Wrap and Tie Baby Carrier, Black Blu...
180646        Mamas &amp; Papas 2014 Urbo2 Stroller - Black
52631     Evenflo X Sport Plus Convenience Stroller - Ch...
168697    Graco FastAction Fold Jogger Click Connect Str...
50315            P'Kolino Silly Soft Seating in Tias, Green
133651                    Britax 2012 B-Agile Stroller, Red
66059          Evenflo 6 Pack Classic Glass Bottle, 4-Ounce
97325     Freemie Hands-Free Concealable Breast Pump Col...
119182    Roan Rocco Classic Pram Stroller 2-in-1 with B...
140816           Diono RadianRXT Convertible Car Seat, Plum
147949    Baby Jogger City Mini GT Single Stroller, Shad...
22586

#### Which of the following products are represented in the 20 most negative reviews ?

In [10]:
test_probas = sentiment_model.predict_proba(test_matrix)
test_negative_probas = test_probas[:,1]
most_negative_probas = list(test_positive_probas.argsort()[0:20])
most_negative_baby = test_amazon_baby.iloc[most_negative_probas]

print('The most negative products are : \n')
print(most_negative_baby['name'])

The most negative products are : 

16042           Fisher-Price Ocean Wonders Aquarium Bouncer
120209    Levana Safe N'See Digital Video Baby Monitor w...
77072        Safety 1st Exchangeable Tip 3 in 1 Thermometer
48694     Adiri BPA Free Natural Nurser Ultimate Bottle ...
155287    VTech Communications Safe &amp; Sounds Full Co...
94560     The First Years True Choice P400 Premium Digit...
53207                   Safety 1st High-Def Digital Monitor
81332                 Cloth Diaper Sprayer--styles may vary
113995    Motorola Digital Video Baby Monitor with Room ...
10677                     Philips AVENT Newborn Starter Set
9915           Cosco Alpha Omega Elite Convertible Car Seat
59546                Ellaroo Mei Tai Baby Carrier - Hershey
172090    Belkin WeMo Wi-Fi Baby Monitor for Apple iPhon...
75994            Peg-Perego Tatamia High Chair, White Latte
40079     Chicco Cortina KeyFit 30 Travel System in Adve...
149987                     NUK Cook-n-Blend Baby Food Maker
15487

#### What is the accuracy of the sentiment_model on the test_data ?

In [53]:
test_predictions = sentiment_model.predict(test_matrix)

accuracy = accuracy_score(test_amazon_baby['sentiment'], test_predictions)

print('The accuracy on the test data is %s.' % accuracy)

The accuracy on the test data is 0.932265418766.


#### Does a higher accuracy value on the training_data always imply that the classifier is better ?

No because the model might have overfit the training data thus would generalyze poorly on the test data.

## Bag-of-words Model with reduced vocabulary

In [21]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
                     'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
                     'work', 'product', 'money', 'would', 'return']

# Construct bag-of-words features 
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words)
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_amazon_baby['review'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_amazon_baby['review'])

# Train the logistic regression features
simple_model = LogisticRegression()
simple_model.fit(train_matrix_word_subset, train_amazon_baby['sentiment'])

# Construct dataframe to store (word, coefficient) pairs
simple_model_data = {
    'words': significant_words,
    'coefs': simple_model.coef_[0,:]
}
simple_model_coef_table = pd.DataFrame(data=simple_model_data)

In [22]:
#### How many of the 20 coefficients (corresponding to the 20 significant_words) are positive for the simple_model ?

In [52]:
simple_model_positive_coefs = np.sum(simple_model.coef_ > 0) 

print('There are %s positive coefficients for the simple model.' % simple_model_positive_coefs)

There are 10 positive coefficients for the simple model.


#### Are the positive words in the simple_model also positive words in the sentiment_model?

In [51]:
positive_words = list(simple_model_coef_table['words'].iloc[np.arange(10)])
indices_positive_words = [ vectorizer.vocabulary_[word] for word in positive_words]

sentiment_model_coefs = sentiment_model.coef_[0,:]
coefs_positive_words_subset = sentiment_model_coefs[indices_positive_words]
coefs_positive_words_subset = [str(coef) for coef in coefs_positive_words_subset]
print('The coefficients for the words in the sentiment model are : ' + ', '.join(coefs_positive_words_subset) + '.')
print('\n')
print('We can see that the coefficients are all positive so the words in the sentiment model are also positive words.')

The coefficients for the words in the sentiment model are : 1.57301531631, 1.22507913667, 1.35195262572, 0.0548859014267, 0.638852688253, 1.86278360017, 1.51794273443, 0.541220062444, 0.389968628061, 0.124447349271.


We can see that the coefficients are all positive so the words in the sentiment model are also positive words.


## Comparing model

In [49]:
train_predictions = sentiment_model.predict(train_matrix)
accuracy = accuracy_score(train_amazon_baby['sentiment'], train_predictions)

print('The accuracy for the sentiment model on the train data is %s.' % accuracy)

The accuracy for the sentiment model on the train data is 0.967912394316.


In [50]:
train_predictions_subset = simple_model.predict(train_matrix_word_subset)
accuracy_subset = accuracy_score(train_amazon_baby['sentiment'], train_predictions_subset)

print('The accuracy for the sentiment model on the train data is %s.' % accuracy_subset)

The accuracy for the sentiment model on the train data is 0.866822570007.


#### Which model (sentiment_model or simple_model) has higher accuracy on the TRAINING set ?

It is the sentiment_model that has the higher accuracy on the training set.

In [58]:
test_predictions_subset = simple_model.predict(test_matrix_word_subset)
accuracy = accuracy_score(test_amazon_baby['sentiment'], test_predictions_subset)

print('The accuracy for the simple model on the test data is %s.' % accuracy)

The accuracy for the simple model on the test data is 0.869360451164.


## Baseline: Majority class prediction

In [61]:
dummy_classifier = DummyClassifier()
dummy_classifier.fit(train_matrix, train_amazon_baby['sentiment'])
train_predictions = dummy_classifier.predict(train_matrix)
accuracy = accuracy_score(train_amazon_baby['sentiment'], train_predictions)

print('The accuracy for the dummy classifier on the train data is %s.' % accuracy)

The accuracy for the dummy classifier on the train data is 0.732460874258.


#### Is the sentiment_model definitely better than the majority class classifier (the baseline)?