In [51]:
import pandas as pd
import json
import string
import numpy as np
import math

In [2]:
dtype_dict = {'name': str, 'review': str, 'rating': int, 'sentiment':int}

In [3]:
products = pd.read_csv('amazon_baby_subset.csv', dtype=dtype_dict)

In [7]:
# try listing names of the first 10 products
products['name'][0:10]

0    Stop Pacifier Sucking without tears with Thumb...
1      Nature's Lullabies Second Year Sticker Calendar
2      Nature's Lullabies Second Year Sticker Calendar
3                          Lamaze Peekaboo, I Love You
4    SoftPlay Peek-A-Boo Where's Elmo A Children's ...
5                            Our Baby Girl Memory Book
6    Hunnt&reg; Falling Flowers and Birds Kids Nurs...
7    Blessed By Pope Benedict XVI Divine Mercy Full...
8    Cloth Diaper Pins Stainless Steel Traditional ...
9    Cloth Diaper Pins Stainless Steel Traditional ...
Name: name, dtype: object

In [11]:
# number of positive reviews
print "number of positive reviews", len(products[products['sentiment'] >= 0])

number of positive reviews 26579


In [12]:
# number of negative reviews
print "number of negative reviews", len(products[products['sentiment'] <= 0])

number of negative reviews 26493


In [14]:
# load the important words
with open('important_words.json') as wfile:
    important_words = json.load(wfile)

In [15]:
print important_words

[u'baby', u'one', u'great', u'love', u'use', u'would', u'like', u'easy', u'little', u'seat', u'old', u'well', u'get', u'also', u'really', u'son', u'time', u'bought', u'product', u'good', u'daughter', u'much', u'loves', u'stroller', u'put', u'months', u'car', u'still', u'back', u'used', u'recommend', u'first', u'even', u'perfect', u'nice', u'bag', u'two', u'using', u'got', u'fit', u'around', u'diaper', u'enough', u'month', u'price', u'go', u'could', u'soft', u'since', u'buy', u'room', u'works', u'made', u'child', u'keep', u'size', u'small', u'need', u'year', u'big', u'make', u'take', u'easily', u'think', u'crib', u'clean', u'way', u'quality', u'thing', u'better', u'without', u'set', u'new', u'every', u'cute', u'best', u'bottles', u'work', u'purchased', u'right', u'lot', u'side', u'happy', u'comfortable', u'toy', u'able', u'kids', u'bit', u'night', u'long', u'fits', u'see', u'us', u'another', u'play', u'day', u'money', u'monitor', u'tried', u'thought', u'never', u'item', u'hard', u'plast

In [16]:
# pre-processing
products = products.fillna({'review' : ''})

In [18]:
def remove_punctuation(text):
    return text.translate(None, string.punctuation)

In [19]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [26]:
#
for word in important_words:
    print word,
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

baby one great love use would like easy little seat old well get also really son time bought product good daughter much loves stroller put months car still back used recommend first even perfect nice bag two using got fit around diaper enough month price go could soft since buy room works made child keep size small need year big make take easily think crib clean way quality thing better without set new every cute best bottles work purchased right lot side happy comfortable toy able kids bit night long fits see us another play day money monitor tried thought never item hard plastic however disappointed reviews something going pump bottle cup waste return amazon different top want problem know water try received sure times chair find hold gate open bottom away actually cheap worked getting ordered came milk bad part worth found cover many design looking weeks say wanted look place purchase looks second piece box pretty trying difficult together though give started anything last company c

### Quiz Question. How many reviews contain the word perfect?



In [35]:
len(products[products['perfect'] > 0])

2955

In [39]:
def get_numpy_data(data_frame, features, output):
    data_frame['constant'] = 1
    features = ['constant'] + features

    features_frame = data_frame[features]
    feature_matrix = features_frame.as_matrix()
    
    output_array = data_frame[output]
    output_array = output_array.as_matrix()
    
    return(feature_matrix, output_array)

In [40]:
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')

### Quiz Question: How many features are there in the feature_matrix?

In [46]:
print feature_matrix.shape[1], 'features'

194 features


### Quiz Question: Assuming that the intercept is present, how does the number of features in feature_matrix relate to the number of features in the logistic regression model?

In [47]:
print 'size of important_words (', len(important_words), ') + 1'

size of important_words ( 193 ) + 1


In [53]:
'''
produces probablistic estimate for P(y_i = +1 | x_i, w).
estimate ranges between 0 and 1.
'''
def predict_probability(feature_matrix, coefficients):
    # Take dot product of feature_matrix and coefficients  
    score = np.dot(feature_matrix, coefficients)
    
    # Compute P(y_i = +1 | x_i, w) using the link function
    predictions = 1 / (1 + math.log(-score))
    
    # return predictions
    return predictions

In [55]:
def feature_derivative(errors, feature):     
    # Compute the dot product of errors and feature
    derivative = np.dot(errors, feature)
    # Return the derivative
    return derivative

In [57]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator-1)*scores - np.log(1. + np.exp(-scores)))
    return lp

In [59]:
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    for itr in xrange(max_iter):
        # Predict P(y_i = +1|x_1,w) using your predict_probability() function
        predictions = predict_probability(feature_matrix, coefficients)

        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)

        # Compute the errors as indicator - predictions
        errors = indicator - predictions

        for j in xrange(len(coefficients)): # loop over each coefficient
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j]
            # compute the derivative for coefficients[j]. Save it in a variable called derivative
            # YOUR CODE HERE
            derivative = feature_derivative(errors, feature_matrix)

            # add the step size times the derivative to the current coefficient
            # YOUR CODE HERE
            coefficients += 

        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print 'iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp)
    return coefficients

SyntaxError: invalid syntax (<ipython-input-59-5a57d4b70ced>, line 21)