# Predicting sentiment from product reviews

In this notebook, we try to predict whether a product review has positive or negative sentiment using logistic regression.

We'll use a dataset of Amazon baby product reviews.

We'll work with text data and bag-of-words representations for feature extraction. A tf-idf model will also be implemented.

### Load the libraries

In [1]:
import pandas as pd
import math

from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

### Load the data

In [2]:
# Read the Amazon baby-product reviews from the CSV in the working directory
# and preview the first five rows.
products = pd.read_csv(filepath_or_buffer='amazon_baby.csv')
products.head(n=5)

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


### Data cleaning

In [3]:
# Replace missing review text with an empty string so every review is a str.
products = products.fillna({'review': ''})

# Remove reviews with a score of 3 (treated as neutral).
# .copy() makes the filtered frame independent of the original, so adding a
# column below does not trigger pandas' SettingWithCopyWarning.
products = products[products['rating'] != 3].copy()

# Add the target column: +1 if the rating is above 3 (positive), -1 otherwise (negative).
products['sentiment'] = products['rating'].apply(lambda rating: +1 if rating > 3 else -1)
products.head()

Unnamed: 0,name,review,rating,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,1


### Split dataset

In [4]:
# Use 80% of the reviews for training and keep the rest for testing;
# the fixed random_state makes the split repeatable.
train_data, test_data = train_test_split(
    products,
    train_size=0.8,
    random_state=10
)

## Model 1. Classifier using logistic regression and all words in the corpus

### Feature extraction

In [5]:
# Bag-of-words features: learn the vocabulary from the training reviews only
# and encode each training review as a row of word counts.
vectorizer = CountVectorizer()
word_count = vectorizer.fit_transform(raw_documents=train_data['review'].values)

### Logistic Regression model

In [7]:
# Model definition: logistic regression with scikit-learn's default settings,
# fitted on the bag-of-words counts against the +1 / -1 sentiment labels.
targets = train_data['sentiment'].values
classifier = linear_model.LogisticRegression()
classifier.fit(X=word_count, y=targets)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
# Example review classification: one clearly positive and one clearly negative
# hand-written review.
examples = [
    'I love it! Perfect for my cloth wipes!',
    "it was awful. i didn't like it all",
]

# transform (not fit_transform): encode the examples with the vocabulary that
# was learned from the training reviews.
example_counts = vectorizer.transform(examples)

predictions = classifier.predict(example_counts)              # hard labels (+1 / -1)
predictions_proba = classifier.predict_proba(example_counts)  # one column per class, ordered as classifier.classes_

# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(predictions)
print(predictions_proba)

[ 1 -1]
[[ 0.00722719  0.99277281]
 [ 0.67636933  0.32363067]]


In [9]:
# Predict sentiment on test data.
# The test reviews are encoded with transform (not fit_transform) so they use
# the vocabulary learned from the training set.
test_data_word_count = vectorizer.transform(test_data['review'].values)
predictions_test_data = classifier.predict(test_data_word_count)

# print() works on both Python 2 and Python 3; `print x` fails on Python 3.
print(predictions_test_data)

[ 1 -1  1 ..., -1  1  1]


In [10]:
# Print coefficients: coef_ holds one learned weight per vocabulary word
# (a single row, since this is a binary classifier).
coefficients = classifier.coef_

# print() works on both Python 2 and Python 3; `print x` fails on Python 3.
print(coefficients)

[[-0.33042862  0.15670811  0.00949425 ...,  0.00519866  0.03212715
   0.00320389]]


In [11]:
# Fraction of test reviews whose predicted sentiment matches the true label (Model 1).
accuracy_score(y_true=test_data['sentiment'], y_pred=predictions_test_data)

0.93166621690504037

In [12]:
# Probabilities: class-membership probabilities for every test review,
# one row per review and one column per class.
predictions_proba_test_data = classifier.predict_proba(test_data_word_count)

# print() works on both Python 2 and Python 3; `print x` fails on Python 3.
print(predictions_proba_test_data)

[[  5.72935680e-03   9.94270643e-01]
 [  8.49456024e-01   1.50543976e-01]
 [  9.96582299e-06   9.99990034e-01]
 ..., 
 [  6.63944486e-01   3.36055514e-01]
 [  2.94171635e-04   9.99705828e-01]
 [  1.17445221e-01   8.82554779e-01]]


## Model 2. Simple classifier (only 20 words)

In [13]:
# The 20 hand-picked words used as the only features of Model 2.
# Keep the order stable: the list is passed as a fixed vocabulary below.
significant_words = [
    'love', 'great', 'easy', 'old', 'little',
    'perfect', 'loves', 'well', 'able', 'car',
    'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return',
]

In [14]:
# Count only the hand-picked words: the vocabulary is fixed up front instead
# of being learned from the training reviews.
vectorizer2 = CountVectorizer(vocabulary=significant_words)
word_count2 = vectorizer2.fit_transform(raw_documents=train_data['review'].values)

In [15]:
# Model 2: logistic regression (default settings) on the 20-word count features,
# trained against the same sentiment labels as Model 1.
classifier2 = linear_model.LogisticRegression()
classifier2.fit(X=word_count2, y=targets)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
# Encode the test reviews with the fixed 20-word vocabulary and predict
# their sentiment with Model 2.
test_data_word_count2 = vectorizer2.transform(test_data['review'].values)
predictions_test_data2 = classifier2.predict(test_data_word_count2)

# print() works on both Python 2 and Python 3; `print x` fails on Python 3.
print(predictions_test_data2)

[1 1 1 ..., 1 1 1]


In [17]:
# Learned weights of Model 2, one per word in significant_words.
coefficients2 = classifier2.coef_

# print() works on both Python 2 and Python 3; `print x` fails on Python 3.
print(coefficients2)

[[ 1.36258866  0.95188095  1.16567375  0.07084353  0.50704601  1.48341667
   1.71173377  0.51955938  0.20361731  0.06965207 -1.6163692  -0.13500917
  -0.52166598 -2.00558081 -2.39622512 -0.63473968 -0.32338326 -0.87424943
  -0.34257254 -2.11117408]]


In [18]:
# Accuracy of Model 2 (20-word classifier) on the test data.
accuracy_score(y_true=test_data['sentiment'], y_pred=predictions_test_data2)

0.86489160744805249

## Model 3. Classifier with tf-idf transform

In [19]:
# Model 3: same logistic regression as Model 1, but trained on tf-idf
# weighted features instead of raw word counts.
tfidf_vectorizer = TfidfVectorizer()
word_count3 = tfidf_vectorizer.fit_transform(raw_documents=train_data['review'].values)

classifier3 = linear_model.LogisticRegression()
classifier3.fit(X=word_count3, y=targets)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
# Encode the test reviews with the tf-idf vectorizer fitted on the training
# set and predict their sentiment with Model 3.
test_data_word_count3 = tfidf_vectorizer.transform(test_data['review'].values)
predictions_test_data3 = classifier3.predict(test_data_word_count3)

# print() works on both Python 2 and Python 3; `print x` fails on Python 3.
print(predictions_test_data3)

[ 1 -1  1 ..., -1  1  1]


In [21]:
# Learned weights of Model 3, one per word in the tf-idf vocabulary.
coefficients3 = classifier3.coef_

# print() works on both Python 2 and Python 3; `print x` fails on Python 3.
print(coefficients3)

[[-1.4450426  -0.02209531  0.03779681 ...,  0.01203408  0.00629359
   0.00639339]]


In [22]:
# Accuracy of Model 3 (tf-idf features) on the test data.
accuracy_score(y_true=test_data['sentiment'], y_pred=predictions_test_data3)

0.93292554945878681

In [23]:
# Same accuracy, computed through the classifier's own score() method
# as a cross-check of the value above.
classifier3.score(X=test_data_word_count3, y=test_data['sentiment'])

0.93292554945878681

## Best and worst words in Model 1

In [24]:
# vocabulary_ maps each word to its feature position. Turn it into a frame
# indexed by that position (index named 0) with the word in the 'index'
# column, sorted by position.
list_of_words = (
    pd.DataFrame.from_dict(vectorizer.vocabulary_, orient='index')
    .reset_index()
    .set_index(0)
    .sort_index()
)

In [25]:
# coefficients is a single row of weights; transpose it into one column
# (labelled 0) with one row per feature position.
list_of_coefficients = pd.DataFrame(data=coefficients).T

In [26]:
# Join each word with its coefficient on the shared feature position.
results = pd.merge(list_of_words, list_of_coefficients, left_index=True, right_index=True)

# The merged index inherits the name 0 from list_of_words, and the coefficient
# column is also labelled 0. Newer pandas versions are expected to reject
# sort_values(by=0) as ambiguous between the index level and the column, so
# drop the index name before sorting.
results.index.name = None

# Most positive words first, most negative words last.
results = results.sort_values(by=0, ascending=False)

In [27]:
# The 20 words with the largest positive coefficients.
results.head(n=20)

Unnamed: 0_level_0,index,0
0,Unnamed: 1_level_1,Unnamed: 2_level_1
29536,lifesaver,2.320043
45730,skeptical,2.291736
50508,thankful,2.261218
11629,code,2.086964
12426,con,2.073053
42703,rich,2.070038
41735,relax,2.005231
30017,locate,1.998506
19247,excellent,1.956949
56703,worry,1.956884


In [28]:
# The 20 words with the most negative coefficients.
results.tail(n=20)

Unnamed: 0_level_0,index,0
0,Unnamed: 1_level_1,Unnamed: 2_level_1
9094,bummer,-2.04762
9093,bummed,-2.070134
42511,returned,-2.085465
42514,returning,-2.099537
38394,poor,-2.124387
28861,lame,-2.135806
12454,concept,-2.181294
41377,rediculous,-2.208805
37651,pinches,-2.238534
16099,disappointed,-2.276401


## Best and worst words in Model 3 (tf-idf)

In [29]:
# Build a frame indexed by feature position with the word in the 'index'
# column (vocabulary_ maps word -> position).
list_of_words = (
    pd.DataFrame.from_dict(tfidf_vectorizer.vocabulary_, orient='index')
    .reset_index()
    .set_index(0)
    .sort_index()
)

# One row per feature position, a single column (labelled 0) of Model 3 weights.
list_of_coefficients = pd.DataFrame(data=coefficients3).transpose()

# Join words and weights on the feature position.
results = pd.merge(list_of_words, list_of_coefficients, left_index=True, right_index=True)

# The merged index inherits the name 0 and the coefficient column is also
# labelled 0. Newer pandas versions are expected to reject sort_values(by=0)
# as ambiguous, so drop the index name before sorting.
results.index.name = None
results = results.sort_values(by=0, ascending=False)

# The 20 words with the largest positive coefficients.
results.head(20)

Unnamed: 0_level_0,index,0
0,Unnamed: 1_level_1,Unnamed: 2_level_1
30339,love,14.075142
23302,great,12.604814
17745,easy,11.472226
37087,perfect,10.659853
30368,loves,9.758368
37101,perfectly,7.465394
7231,best,7.401948
24769,highly,7.029631
24100,happy,6.853185
22664,glad,6.215139


In [30]:
# The 20 words with the most negative coefficients in the tf-idf model.
results.tail(n=20)

Unnamed: 0_level_0,index,0
0,Unnamed: 1_level_1,Unnamed: 2_level_1
32582,money,-5.073166
12454,concept,-5.553375
26007,impossible,-5.570638
8772,broke,-5.576125
53450,unfortunately,-5.609352
6512,barely,-5.723863
25211,horrible,-5.848213
25696,idea,-5.886024
38399,poorly,-6.066991
50389,terrible,-6.15554
