In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
# Read the Amazon baby-product review subset (CSV) into a DataFrame.
products = pd.read_csv(filepath_or_buffer='amazon_baby_subset.csv')

In [3]:
# Load the list of 'important words' from its JSON file.
# json.load parses the whole file, so the JSON may span several lines
# (parsing only the first line would fail on a pretty-printed file).
with open('important_words.json') as f:
    important_words = json.load(f)

In [4]:
# Print the first 10 product names, numbered from 1.
# print(...) with a single parenthesized argument is valid on Python 2 and 3.
for rank, name in enumerate(products['name'][:10], 1):
    print('{}. {}'.format(rank, name))

1. Stop Pacifier Sucking without tears with Thumbuddy To Love's Binky Fairy Puppet and Adorable Book
2. Nature's Lullabies Second Year Sticker Calendar
3. Nature's Lullabies Second Year Sticker Calendar
4. Lamaze Peekaboo, I Love You
5. SoftPlay Peek-A-Boo Where's Elmo A Children's Book
6. Our Baby Girl Memory Book
7. Hunnt&reg; Falling Flowers and Birds Kids Nursery Home Decor Vinyl Mural Art Wall Paper Stickers
8. Blessed By Pope Benedict XVI Divine Mercy Full Color Medal
9. Cloth Diaper Pins Stainless Steel Traditional Safety Pin (Black)
10. Cloth Diaper Pins Stainless Steel Traditional Safety Pin (Black)


In [5]:
# Count positive (+1) and negative (-1) reviews by label.
# value_counts() orders its result by frequency, so unpacking it positionally
# would swap the two numbers whenever negatives outnumber positives.
positive = int((products['sentiment'] == 1).sum())
negative = int((products['sentiment'] == -1).sum())
print('Number of positive reviews is {}.'.format(positive))
print('Number of negative reviews is {}.'.format(negative))

Number of positive reviews is 26579.
Number of negative reviews is 26493.


In [6]:
# Replace missing review text with '' so later string operations never see NaN.
products = products.fillna(value={'review': ''})

In [7]:
def remove_punctuation(text):
    """Return `text` with every ASCII punctuation character removed.

    The characters dropped are exactly those in `string.punctuation`.
    Filtering character by character works for Python 2 byte strings,
    Python 2 unicode strings and Python 3 strings alike, whereas
    `text.translate(None, ...)` is only accepted by Python 2 byte strings.
    """
    import string
    punctuation = frozenset(string.punctuation)
    return ''.join(ch for ch in text if ch not in punctuation)

In [None]:
# Store a punctuation-free copy of every review in a new column.
products['review_clean'] = products['review'].map(remove_punctuation)

In [None]:
# Add one column per important word holding how many times that word occurs,
# as a whitespace-separated token, in each cleaned review.
# Each review is tokenized once up front instead of once per word; this still
# takes some time because every word scans every review.
review_tokens = products['review_clean'].apply(lambda s: s.split())
for word in important_words:
    products[word] = review_tokens.apply(lambda tokens: tokens.count(word))

In [None]:
# Flag reviews that mention 'perfect' at least once (1) versus not at all (0).
products['contains_perfect'] = (products['perfect'] >= 1).astype(int)

In [None]:
# Quiz question 2: How many reviews in amazon_baby_subset.csv contain the word perfect?
print(products['contains_perfect'].sum())

In [None]:
def get_numpy_data(dataframe, features, label):
    """Convert selected DataFrame columns into NumPy arrays for model fitting.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data; it is not modified.
    features : sequence of column names
        Feature columns to extract, in the order they should appear.
    label : column name
        Column holding the target values.

    Returns
    -------
    (feature_matrix, label_array)
        `feature_matrix` is a 2-D array whose first column is the constant 1
        (the intercept term) followed by the requested feature columns;
        `label_array` is the 1-D array of values from the `label` column.
    """
    # Work on a copy so the 'constant' column never leaks into the caller's frame.
    dataframe = dataframe.copy()
    dataframe['constant'] = 1
    # Build a new list so the caller's `features` sequence is left untouched.
    features = ['constant'] + list(features)
    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
    feature_matrix = dataframe[features].values
    label_array = dataframe[label].values
    return (feature_matrix, label_array)

In [None]:
# Build the design matrix (constant column + word counts) and the label vector.
feature_matrix, sentiment = get_numpy_data(
    dataframe=products,
    features=important_words,
    label='sentiment',
)

In [None]:
# Quiz question 3
# Consider the feature_matrix that was obtained by converting our data to NumPy format.
# How many features are there in the feature_matrix?
# (The count is the number of columns, which includes the constant column.)
print(feature_matrix.shape[1])

In [None]:
# Quiz question 4
# Assuming that the intercept is present,
# how does the number of features in feature_matrix relate to the number of features in the logistic regression model?
# Let x = [number of features in feature_matrix] and y = [number of features in logistic regression model].
print('y = x')

In [None]:
def predict_probability(feature_matrix, coefficients):
    """Estimate P(y_i = +1 | x_i, w) for every row of `feature_matrix`.

    The score of a row is its dot product with `coefficients`; the score is
    passed through the logistic (sigmoid) link, so each returned value lies
    between 0 and 1.
    """
    linear_scores = np.dot(feature_matrix, coefficients)
    # Sigmoid link: 1 / (1 + e^(-score)).
    return 1. / (1 + np.exp(-linear_scores))

In [None]:
def feature_derivative(errors, feature):
    """Return the dot product of `errors` with one feature column.

    In the gradient-ascent loop this is the partial derivative of the log
    likelihood with respect to the coefficient of that feature.
    """
    derivative = np.dot(errors, feature)
    return derivative

In [None]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Return the log likelihood of the observed labels under the model.

    Computes sum_i [ (1[y_i = +1] - 1) * score_i - log(1 + exp(-score_i)) ]
    where score_i is the dot product of row i with `coefficients`.

    Parameters
    ----------
    feature_matrix : 2-D array, one row per observation.
    sentiment : array-like of labels; entries equal to +1 count as positive.
    coefficients : 1-D array with one weight per column of `feature_matrix`.
    """
    indicator = (np.asarray(sentiment) == +1)
    scores = np.dot(feature_matrix, coefficients)
    # logaddexp(0, -s) equals log(1 + exp(-s)) but does not overflow to inf
    # for large negative scores, which would turn the whole sum into -inf.
    lp = np.sum((indicator - 1) * scores - np.logaddexp(0., -scores))
    return lp

In [None]:
from math import sqrt

def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    """Fit logistic-regression coefficients by batch gradient ascent.

    Parameters
    ----------
    feature_matrix : 2-D array, one row per observation, one column per feature.
    sentiment : array of labels; entries equal to +1 are the positive class.
    initial_coefficients : array-like of starting weights, one per feature column.
        It is copied, so the caller's array is not modified.
    step_size : multiplier applied to each partial derivative.
    max_iter : number of gradient-ascent iterations to run.

    Returns
    -------
    numpy.ndarray
        The coefficients after `max_iter` iterations.

    The log likelihood is printed on a thinning schedule (every iteration at
    first, then every 10th, 100th, ...) so progress can be checked.
    """
    # Float copy: with an integer array the tiny in-place updates below would
    # be truncated on assignment and the coefficients would never move.
    coefficients = np.array(initial_coefficients, dtype=float)

    # The labels never change, so the indicator for (y_i = +1) is loop-invariant.
    indicator = (sentiment == +1)

    # Field width of the iteration counter in the progress line.
    width = int(np.ceil(np.log10(max_iter))) if max_iter > 0 else 0

    # range() (rather than xrange) keeps the loop valid on Python 2 and 3.
    for itr in range(max_iter):
        # Predict P(y_i = +1 | x_i, w) with the current coefficients.
        predictions = predict_probability(feature_matrix, coefficients)

        # Errors are indicator - predictions.
        errors = indicator - predictions

        for j in range(len(coefficients)):
            # feature_matrix[:, j] is the feature column associated with coefficients[j].
            derivative = feature_derivative(errors, feature_matrix[:, j])

            # Gradient ascent: step in the direction of the derivative.
            coefficients[j] += step_size * derivative

        # Report the log likelihood on a thinning schedule to check it increases.
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
                or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print('iteration %*d: log likelihood of observed labels = %.8f'
                  % (width, itr, lp))
    return coefficients

In [None]:
# Start from all-zero weights, one per column of feature_matrix
# (the constant column plus one column per important word).
initial_coefficients = np.zeros(feature_matrix.shape[1])
step_size = 1e-7
max_iter = 301

In [None]:
# Fit the model; the log likelihood is printed as training progresses.
coef_ = logistic_regression(
    feature_matrix=feature_matrix,
    sentiment=sentiment,
    initial_coefficients=initial_coefficients,
    step_size=step_size,
    max_iter=max_iter,
)

In [None]:
# Quiz question 5:
# Run your logistic regression solver with provided parameters.
# As each iteration of gradient ascent passes, does the log-likelihood increase or decrease?
print('It increases.')

In [None]:
# Raw (pre-sigmoid) score of every review under the learned coefficients.
scores = feature_matrix.dot(coef_)

In [None]:
# Quiz question 6: We make predictions using the weights just learned.
# How many reviews were predicted to have positive sentiment?
# A review is predicted positive when its score is > 0.
print(np.sum(scores > 0))

In [None]:
# Quiz question 7: What is the accuracy of the model on predictions made above? (round to 2 digits of accuracy)
def predict(p):
    """Map score(s) to class labels: +1 where the score is > 0, otherwise -1.

    Accepts a scalar or an array and returns a NumPy array of labels
    (0-dimensional for a scalar input).
    """
    # np.where is vectorized natively, so no np.vectorize wrapper is needed.
    return np.where(np.asarray(p) > 0, 1, -1)

predictions = predict(scores)

# Accuracy is the fraction of reviews whose predicted label matches the true label.
accuracy = np.mean(predictions == sentiment)
print(round(float(accuracy), 2))

In [None]:
# Skip the intercept at index 0 so each remaining weight lines up with its word.
coefficients = list(coef_[1:])
# Pair every word with its weight, then order the pairs from largest to smallest weight.
word_coefficient_tuples = list(zip(important_words, coefficients))
word_coefficient_tuples.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
# Quiz question 8:
# We look at "most positive" words, the words that correspond most strongly with positive reviews.
# Which of the following words is not present in the top 10 "most positive" words?
# word_coefficient_tuples is sorted by descending coefficient, so rank 1 is the most positive word.
for rank, (word, _weight) in enumerate(word_coefficient_tuples[:10], 1):
    print('{}. {}'.format(rank, word))

In [None]:
print('Cheap is not one of the 10 most positive words.')

In [None]:
# Quiz question 9:
# Similarly, we look at "most negative" words, the words that correspond most strongly with negative reviews.
# Which of the following words is not present in the top 10 "most negative" words?
# The list is sorted by descending coefficient, so the last 10 entries are the
# most negative; walk them in reverse so rank 1 is the most negative word.
for rank, (word, _weight) in enumerate(reversed(word_coefficient_tuples[-10:]), 1):
    print('{}. {}'.format(rank, word))

In [None]:
# Answer to quiz question 9, which asks about the most *negative* words.
print('Need is not one of the 10 most negative words.')