In [1]:
import sklearn
import pandas as pd
import numpy as np

In [2]:
# Load the Amazon baby-product review subset from the working directory.
products = pd.read_csv(filepath_or_buffer='amazon_baby_subset.csv')

In [3]:
# Preview the first ten rows to check the loaded columns.
products.head(10)

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1
5,Our Baby Girl Memory Book,"Beautiful book, I love it to record cherished ...",5,1
6,Hunnt&reg; Falling Flowers and Birds Kids Nurs...,"Try this out for a spring project !Easy ,fun a...",5,1
7,Blessed By Pope Benedict XVI Divine Mercy Full...,very nice Divine Mercy Pendant of Jesus now on...,5,1
8,Cloth Diaper Pins Stainless Steel Traditional ...,We bought the pins as my 6 year old Autistic s...,4,1
9,Cloth Diaper Pins Stainless Steel Traditional ...,It has been many years since we needed diaper ...,5,1


In [4]:
# Number of reviews whose sentiment label is positive.
positive_mask = products['sentiment'] > 0
products.loc[positive_mask, 'sentiment'].count()

26579

In [5]:
# Number of reviews whose sentiment label is negative.
negative_mask = products['sentiment'] < 0
products.loc[negative_mask, 'sentiment'].count()

26493

### Apply text cleaning on the review data

In [6]:
import json

# Read the list of feature words from the JSON file next to the notebook.
with open('important_words.json') as words_file:
    important_words = json.loads(words_file.read())

In [7]:
# Missing reviews become empty strings so the text-cleaning step below
# only ever receives str values.
products = products.fillna(value={'review': ''})

In [8]:
import string

# Translation table that deletes every ASCII punctuation character.
table = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return `text` with all characters in `string.punctuation` removed."""
    return text.translate(table)

In [9]:
# Punctuation-free copy of each review, stored alongside the raw text.
products['review_clean'] = products['review'].map(remove_punctuation)

In [10]:
# Split every cleaned review into whitespace-delimited tokens exactly once.
# The original re-split each review for every important word, repeating the
# same work len(important_words) times.
review_tokens = products['review_clean'].apply(str.split)

for word in important_words:
    # One column per important word: how many times it occurs as a whole
    # token in each review.
    products[word] = review_tokens.apply(lambda tokens: tokens.count(word))

**QUIZ 1**: How many reviews contain the word **perfect**?

In [11]:
# Reviews in which the word "perfect" occurs at least once.
(products['perfect'] > 0).sum()

2955

### Convert data frame to multi-dimensional array

In [12]:
def get_numpy_data(dataframe, features, label):
    """Build the design matrix and label vector for the model.

    A column named 'constant' filled with 1 is placed in front of the
    requested feature columns so that the first coefficient acts as the
    intercept.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table; it is NOT modified (the original version inserted the
        'constant' column into the caller's frame as a side effect).
    features : sequence of str
        Names of the feature columns, in the order they should appear.
    label : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(feature_frame, label_series)`` where ``feature_frame`` is a
        DataFrame with columns ``['constant', *features]`` and
        ``label_series`` is the ``label`` column.
    """
    # list(...) lets callers pass any sequence (tuple, Index), not only a list.
    feature_columns = ['constant'] + list(features)
    # assign() returns a new frame, leaving the caller's frame untouched.
    with_intercept = dataframe.assign(constant=1)
    return (with_intercept[feature_columns], with_intercept[label])

In [13]:
# Design matrix (intercept + one column per important word) and labels.
feature_matrix, sentiment = get_numpy_data(dataframe=products,
                                           features=important_words,
                                           label='sentiment')

**QUIZ 2**: How many features are there in the `feature_matrix`?

In [14]:
# (rows, columns) of feature_matrix; the columns include the leading 'constant'.
feature_matrix.shape

(53072, 194)

**QUIZ 3**: Assuming that the intercept is present, how does the number of features in `feature_matrix` relate to the number of features in the logistic regression model?

### Estimating conditional probability with link function

In [17]:
def predict_probability(feature_matrix, coefficients):
    """Apply the logistic (sigmoid) link to the linear scores.

    The score of each row is the dot product of that row of
    `feature_matrix` with `coefficients`; the returned value is
    1 / (1 + exp(-score)) for each row.
    """
    linear_scores = np.dot(feature_matrix, coefficients)
    return 1 / (1. + np.exp(-linear_scores))

In [30]:
def feature_derivative(errors, features):
    """Return the dot product of `errors` with `features`.

    When `features` is a 2-D matrix this yields one derivative value per
    column; when it is a single column it yields a scalar.
    """
    return np.dot(errors, features)

In [31]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Log likelihood of the observed labels under the logistic model.

    Computes ``sum_i ((1[y_i == 1] - 1) * s_i - log(1 + exp(-s_i)))`` where
    ``s_i`` is the dot product of row ``i`` of `feature_matrix` with
    `coefficients`.

    Parameters
    ----------
    feature_matrix : array-like, one row per observation
    sentiment : array-like of labels; entries equal to 1 are the positive class
    coefficients : array-like, one weight per column of `feature_matrix`

    Returns
    -------
    float
        The summed log likelihood (always <= 0).
    """
    indicator = (sentiment == 1)
    scores = np.dot(feature_matrix, coefficients)
    # log(1 + exp(-s)) == logaddexp(0, -s).  The naive form overflows to inf
    # once -s exceeds ~709, which turns the whole sum into -inf/nan;
    # logaddexp evaluates the same quantity without overflowing.
    log_normalizer = np.logaddexp(0., -scores)
    return np.sum((indicator - 1) * scores - log_normalizer)

### Taking gradient steps

In [44]:
from math import sqrt

def logistic_regression(features,
                        label,
                        coefficients,
                        step_size,
                        max_iter):
    """Fit logistic-regression coefficients by batch gradient ascent.

    Parameters
    ----------
    features : 2-D array-like, one row per observation
    label : array-like of labels; entries equal to +1 are the positive class
    coefficients : numpy.ndarray
        Starting coefficients, one per column of `features`.  The array
        passed in is not modified; a new array is produced on each step.
    step_size : float
        Multiplier applied to the gradient on every update.
    max_iter : int
        Number of gradient-ascent iterations to run.

    Returns
    -------
    numpy.ndarray
        Coefficients after `max_iter` updates (the input unchanged when
        `max_iter` is 0).

    Notes
    -----
    The original body read the notebook globals `feature_matrix` and
    `sentiment` instead of the `features` and `label` arguments, so the
    arguments were silently ignored for prediction and error computation.
    """
    print(features.shape)
    print(coefficients.shape)

    # The labels do not change between iterations, so build the 0/1
    # indicator of the positive class once.
    indicator = (label == +1)

    for i in range(max_iter):
        # P(y = +1 | x, w) under the current coefficients.
        predictions = predict_probability(features, coefficients)

        errors = indicator - predictions

        # Gradient of the log likelihood with respect to every coefficient.
        derivative = feature_derivative(errors, features)

        coefficients = coefficients + step_size * derivative

        # Checking whether log likelihood is increasing
        if i <= 15 or (i <= 100 and i % 10 == 0) or (i <= 1000 and i % 100 == 0) or (i <= 10000 and i % 1000 == 0) or i % 10000 == 0:
            lp = compute_log_likelihood(features, label, coefficients)
            print('iteration %*d: log likelihood of observed labels = %.8f' %
                  (int(np.ceil(np.log10(max_iter))), i, lp))

    return coefficients

In [45]:
# Start from all-zero coefficients, one per column of the feature matrix.
# Deriving the length from the matrix (instead of hard-coding 194) keeps the
# cell valid if the list of important words changes.
initial_coefficients = np.zeros(feature_matrix.shape[1])

coefficients = logistic_regression(feature_matrix,
                                   sentiment,
                                   coefficients=initial_coefficients,
                                   step_size=1e-7,
                                   max_iter=301)

(53072, 194)
(194,)
iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed 

**Quiz 4**: As each iteration of gradient ascent passes, does the log likelihood *increase* or *decrease*?

**increases**

### Predicting sentiments

In [72]:
# Confirm the operand shapes line up before taking the dot product.
for operand in (feature_matrix, coefficients):
    print(operand.shape)

# Raw linear scores and their sigmoid-transformed probabilities.
scores = np.dot(feature_matrix, coefficients)
predictions = predict_probability(feature_matrix, coefficients)

(53072, 194)
(194,)


In [73]:
# Collect labels, probabilities and raw scores side by side for inspection.
prediction_df = pd.DataFrame({
    'sentiment': sentiment,
    'prediction': predictions,
    'score': scores,
})

**Quiz 5**: How many reviews were predicted to have positive sentiment?

In [75]:
# A positive score corresponds to a predicted probability above 0.5.
(prediction_df['score'] > 0).sum()

25126

### Measuring accuracy

**Quiz 6**: What is the accuracy of the model on predictions made above? (round to 2 digits of accuracy)

In [76]:
# Threshold the probability at 0.5: +1 above it, -1 otherwise.
prediction_df['predicted_sentiment'] = np.where(prediction_df['prediction'] > 0.5, 1, -1)

In [78]:
# A prediction is correct exactly when it equals the true label.
prediction_df['is_correct'] = prediction_df['predicted_sentiment'] == prediction_df['sentiment']

In [97]:
# Accuracy = correctly classified reviews / all reviews.
num_correct = int(prediction_df['is_correct'].sum())
accuracy = num_correct / len(prediction_df['is_correct'])

In [98]:
# Show the fraction of reviews whose predicted sentiment matched the label.
print(accuracy)

0.7518653904130238


### Which words contribute most to positive & negative sentiments

In [99]:
# Exclude the intercept (first coefficient) so the remaining weights line up
# one-to-one with important_words.  The slice is stored under a new name:
# the original rebound `coefficients` itself, so re-running the cell dropped
# one more weight each time and shifted every word onto the wrong coefficient.
word_coefficients = list(coefficients[1:])

# (word, weight) pairs ordered from most positive to most negative weight.
word_coefficient_tuples = sorted(zip(important_words, word_coefficients),
                                 key=lambda pair: pair[1],
                                 reverse=True)

In [100]:
# The list is sorted by descending coefficient, so the first ten entries are
# the words with the largest positive weights.
word_coefficient_tuples[:10]

[('great', 0.06654608417045775),
 ('love', 0.065890762922123258),
 ('easy', 0.064794586802578394),
 ('little', 0.045435626308421365),
 ('loves', 0.044976401394906038),
 ('well', 0.030135001092107067),
 ('perfect', 0.029739937104968459),
 ('old', 0.020077541034775385),
 ('nice', 0.018408707995268989),
 ('daughter', 0.017703199905701704)]

**Quiz 7**: Which word is not present in the top 10 "most positive" words?

In [101]:
# The last ten entries of the descending sort are the words with the most
# negative weights (the most negative is last).
word_coefficient_tuples[-10:]

[('monitor', -0.024482100545891724),
 ('return', -0.026592778462247283),
 ('back', -0.027742697230661331),
 ('get', -0.028711552980192585),
 ('disappointed', -0.028978976142317068),
 ('even', -0.030051249236035808),
 ('work', -0.033069515294752716),
 ('money', -0.038982037286487123),
 ('product', -0.041511033392108883),
 ('would', -0.053860148445203121)]

**Quiz 8**: Which word is not present in the top 10 "most negative" words?