In [4]:
import pandas as pd
import json
import string
import numpy as np
import math

In [5]:
dtype_dict = {'name': str, 'review': str, 'rating': int, 'sentiment':int}

In [6]:
products = pd.read_csv('amazon_baby_subset.csv', dtype=dtype_dict)

In [7]:
# try listing names of the first 10 products
products['name'][0:10]

0    Stop Pacifier Sucking without tears with Thumb...
1      Nature's Lullabies Second Year Sticker Calendar
2      Nature's Lullabies Second Year Sticker Calendar
3                          Lamaze Peekaboo, I Love You
4    SoftPlay Peek-A-Boo Where's Elmo A Children's ...
5                            Our Baby Girl Memory Book
6    Hunnt&reg; Falling Flowers and Birds Kids Nurs...
7    Blessed By Pope Benedict XVI Divine Mercy Full...
8    Cloth Diaper Pins Stainless Steel Traditional ...
9    Cloth Diaper Pins Stainless Steel Traditional ...
Name: name, dtype: object

In [8]:
# number of positive reviews
print "number of positive reviews", len(products[products['sentiment'] >= 0])

number of positive reviews 26579


In [9]:
# number of negative reviews
print "number of negative reviews", len(products[products['sentiment'] <= 0])

number of negative reviews 26493


In [10]:
# load the important words
with open('important_words.json') as wfile:
    important_words = json.load(wfile)

In [11]:
print important_words

[u'baby', u'one', u'great', u'love', u'use', u'would', u'like', u'easy', u'little', u'seat', u'old', u'well', u'get', u'also', u'really', u'son', u'time', u'bought', u'product', u'good', u'daughter', u'much', u'loves', u'stroller', u'put', u'months', u'car', u'still', u'back', u'used', u'recommend', u'first', u'even', u'perfect', u'nice', u'bag', u'two', u'using', u'got', u'fit', u'around', u'diaper', u'enough', u'month', u'price', u'go', u'could', u'soft', u'since', u'buy', u'room', u'works', u'made', u'child', u'keep', u'size', u'small', u'need', u'year', u'big', u'make', u'take', u'easily', u'think', u'crib', u'clean', u'way', u'quality', u'thing', u'better', u'without', u'set', u'new', u'every', u'cute', u'best', u'bottles', u'work', u'purchased', u'right', u'lot', u'side', u'happy', u'comfortable', u'toy', u'able', u'kids', u'bit', u'night', u'long', u'fits', u'see', u'us', u'another', u'play', u'day', u'money', u'monitor', u'tried', u'thought', u'never', u'item', u'hard', u'plast

In [12]:
# pre-processing
products = products.fillna({'review' : ''})

In [13]:
def remove_punctuation(text):
    return text.translate(None, string.punctuation)

In [14]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [15]:
#
for word in important_words:
    print word,
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

baby one great love use would like easy little seat old well get also really son time bought product good daughter much loves stroller put months car still back used recommend first even perfect nice bag two using got fit around diaper enough month price go could soft since buy room works made child keep size small need year big make take easily think crib clean way quality thing better without set new every cute best bottles work purchased right lot side happy comfortable toy able kids bit night long fits see us another play day money monitor tried thought never item hard plastic however disappointed reviews something going pump bottle cup waste return amazon different top want problem know water try received sure times chair find hold gate open bottom away actually cheap worked getting ordered came milk bad part worth found cover many design looking weeks say wanted look place purchase looks second piece box pretty trying difficult together though give started anything last company c

### Quiz Question. How many reviews contain the word perfect?



In [16]:
len(products[products['perfect'] > 0])

2955

In [17]:
def get_numpy_data(data_frame, features, output):
    data_frame['constant'] = 1
    features = ['constant'] + features

    features_frame = data_frame[features]
    feature_matrix = features_frame.as_matrix()
    
    output_array = data_frame[output]
    output_array = output_array.as_matrix()
    
    return(feature_matrix, output_array)

In [18]:
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')

### Quiz Question: How many features are there in the feature_matrix?

In [71]:
print feature_matrix.shape, 'features'

(53072, 194) features


### Quiz Question: Assuming that the intercept is present, how does the number of features in feature_matrix relate to the number of features in the logistic regression model?

they are equal

In [20]:
print 'size of important_words (', len(important_words), ') + 1'

size of important_words ( 193 ) + 1


In [40]:
'''
produces probablistic estimate for P(y_i = +1 | x_i, w).
estimate ranges between 0 and 1.
'''
def predict_probability(feature_matrix, coefficients):
    # Take dot product of feature_matrix and coefficients  
    score = np.dot(feature_matrix, coefficients)
    
    # Compute P(y_i = +1 | x_i, w) using the link function
    predictions = 1.0 / (1.0 + np.exp(-score))
    
    # return predictions
    return predictions

In [41]:
dummy_feature_matrix = np.array([[1.,2.,3.], [1.,-1.,-1]])
dummy_coefficients = np.array([1., 3., -1.])

correct_scores      = np.array( [ 1.*1. + 2.*3. + 3.*(-1.),          1.*1. + (-1.)*3. + (-1.)*(-1.) ] )
correct_predictions = np.array( [ 1./(1+np.exp(-correct_scores[0])), 1./(1+np.exp(-correct_scores[1])) ] )

print 'The following outputs must match '
print '------------------------------------------------'
print 'correct_predictions           =', correct_predictions
print 'output of predict_probability =', predict_probability(dummy_feature_matrix, dummy_coefficients)

The following outputs must match 
------------------------------------------------
correct_predictions           = [ 0.98201379  0.26894142]
output of predict_probability = [ 0.98201379  0.26894142]


In [22]:
def feature_derivative(errors, feature):     
    # Compute the dot product of errors and feature
    derivative = np.dot(errors, feature)
    # Return the derivative
    return derivative

In [23]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator-1)*scores - np.log(1. + np.exp(-scores)))
    return lp

In [76]:
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    for itr in xrange(max_iter):
        # Predict P(y_i = +1|x_1,w) using your predict_probability() function
        predictions = predict_probability(feature_matrix, coefficients)

        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)

        # Compute the errors as indicator - predictions
        errors = indicator - predictions

        for j in xrange(len(coefficients)): # loop over each coefficient
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j]
            # compute the derivative for coefficients[j]. Save it in a variable called derivative
            # YOUR CODE HERE
            derivative = feature_derivative(errors, feature_matrix[:, j])

            # add the step size times the derivative to the current coefficient
            # YOUR CODE HERE
            coefficients[j] += step_size * derivative

        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print 'iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp)
            print lp
    return coefficients

In [77]:
initial_coefficients = np.zeros(feature_matrix.shape[1])
step_size = 1e-7
max_iter = 301

In [78]:
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients,
                                  step_size, max_iter)
print coefficients

iteration   0: log likelihood of observed labels = -36780.91768478
-36780.9176848
iteration   1: log likelihood of observed labels = -36775.13434712
-36775.1343471
iteration   2: log likelihood of observed labels = -36769.35713564
-36769.3571356
iteration   3: log likelihood of observed labels = -36763.58603240
-36763.5860324
iteration   4: log likelihood of observed labels = -36757.82101962
-36757.8210196
iteration   5: log likelihood of observed labels = -36752.06207964
-36752.0620796
iteration   6: log likelihood of observed labels = -36746.30919497
-36746.309195
iteration   7: log likelihood of observed labels = -36740.56234821
-36740.5623482
iteration   8: log likelihood of observed labels = -36734.82152213
-36734.8215221
iteration   9: log likelihood of observed labels = -36729.08669961
-36729.0866996
iteration  10: log likelihood of observed labels = -36723.35786366
-36723.3578637
iteration  11: log likelihood of observed labels = -36717.63499744
-36717.6349974
iteration  12: lo

### Quiz question: As each iteration of gradient ascent passes, does the log likelihood increase or decrease?

#### should increase!


### Quiz question: How many reviews were predicted to have positive sentiment?



In [79]:
scores = np.dot(feature_matrix, coefficients)
predicted_sentiments = np.array([+1 if s > 0 else -1 for s in scores])

In [80]:
print np.where(predicted_sentiments == 1)[0].shape[0]

25126


### Quiz question: What is the accuracy of the model on predictions made above? (round to 2 digits of accuracy)

In [67]:
print sum(predicted_sentiments==sentiment) / float(len(sentiment))

0.74114410612


In [68]:
coefficients = list(coefficients[1:]) # exclude intercept
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)

### Quiz question: Which word is not present in the top 10 "most positive" words?

In [69]:
word_coefficient_tuples[0:10]

[(u'great', 0.00023011095749548731),
 (u'love', 0.00022919083832962883),
 (u'easy', 0.00022415562487307223),
 (u'little', 0.00015544207691545761),
 (u'loves', 0.00015417337068725665),
 (u'well', 0.00010086492750064691),
 (u'perfect', 0.00010084183986166351),
 (u'old', 6.7367412681327744e-05),
 (u'nice', 6.1407869651496232e-05),
 (u'soft', 5.9646857678954867e-05)]

### Quiz question: Which word is not present in the top 10 "most negative" words?

In [70]:
word_coefficient_tuples[-11:-1]

[(u'waste', -8.3670176248573709e-05),
 (u'return', -9.2913592987661818e-05),
 (u'monitor', -9.3995419225179145e-05),
 (u'disappointed', -9.9820964146925048e-05),
 (u'back', -0.00010483521247037151),
 (u'even', -0.00011109959075874404),
 (u'get', -0.00011269149624664059),
 (u'work', -0.00012016286066171855),
 (u'money', -0.00013753480269165251),
 (u'product', -0.00015748674652922082)]