In [None]:
import pandas as pd
import json
import string
import numpy as np
import math

In [None]:
# Column dtypes applied when the reviews CSV is parsed.
dtype_dict = dict(name=str, review=str, rating=int, sentiment=int)

In [None]:
# Load the review subset; dtype_dict pins the column types at parse time.
products = pd.read_csv('amazon_baby_subset.csv', dtype=dtype_dict)

In [None]:
# Vocabulary of words; each one becomes a count column on `products` further down.
with open('important_words.json') as wfile:
    important_words = json.load(wfile)

In [None]:
# Replace missing reviews with empty strings so the string cleaning below can run on every row.
products = products.fillna({'review' : ''})

In [None]:
def remove_punctuation(text):
    """Return `text` with every character found in `string.punctuation` removed.

    The previous `text.translate(None, string.punctuation)` form is only valid
    for Python 2 byte strings: it raises TypeError for Python 2 `unicode`
    objects and for Python 3 `str`. Filtering character by character gives the
    same result for plain strings and works for all three types.
    """
    return ''.join(char for char in text if char not in string.punctuation)

In [None]:
# Punctuation-free version of each review; the word-count columns are derived from it.
products['review_clean'] = products['review'].apply(remove_punctuation)

In [None]:
# Tokenize every cleaned review once. Previously each review was re-split for
# every important word, repeating identical work len(important_words) times.
review_tokens = products['review_clean'].apply(lambda review: review.split())

# Add one column per important word holding its number of occurrences in each review.
for word in important_words:
    # Progress output. Single-argument print(...) is valid on Python 2 and 3;
    # each word is now reported on its own line.
    print(word)
    products[word] = review_tokens.apply(lambda tokens: tokens.count(word))

In [None]:
# Row indices of the training split.
with open('module-4-assignment-train-idx.json') as train_json_file:
    train_idx = json.load(train_json_file)

# `.ix` was deprecated in pandas 0.20 and removed in 1.0. `products` still
# carries the default 0..n-1 index assigned by read_csv, so positional `.iloc`
# selects the same rows. The explicit copy makes the split independent of
# `products`, so adding columns to it later does not warn about (or depend on)
# the parent frame.
train_data = products.iloc[train_idx].copy()

In [None]:
# Row indices of the validation split.
with open('module-4-assignment-validation-idx.json') as valida_json_file:
    valida_idx = json.load(valida_json_file)

# `.ix` was deprecated in pandas 0.20 and removed in 1.0. `products` still
# carries the default 0..n-1 index assigned by read_csv, so positional `.iloc`
# selects the same rows. The explicit copy makes the split independent of
# `products`, so adding columns to it later does not warn about (or depend on)
# the parent frame.
validation_data = products.iloc[valida_idx].copy()

In [None]:
def get_numpy_data(data_frame, features, output):
    """Extract a feature matrix and an output vector from a DataFrame.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Source data. It is left unmodified: the previous version added a
        'constant' column to the caller's frame as a side effect.
    features : list of str
        Names of the feature columns, in the order they should appear.
    output : str
        Name of the target column.

    Returns
    -------
    (feature_matrix, output_array)
        `feature_matrix` is 2-D: a leading 'constant' column of ones (the
        intercept feature) followed by the `features` columns.
        `output_array` holds the values of the `output` column.
    """
    # assign() returns a new frame, so the intercept column never touches the
    # caller's data (and cannot trigger SettingWithCopyWarning on a slice).
    with_constant = data_frame.assign(constant=1)
    columns = ['constant'] + list(features)

    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
    feature_matrix = with_constant[columns].values
    output_array = data_frame[output].values

    return (feature_matrix, output_array)

In [None]:
# NumPy feature matrices (intercept column first, then one column per important word)
# and sentiment label vectors for the training and validation splits.
feature_matrix_train, sentiment_train = get_numpy_data(train_data, important_words, 'sentiment')
feature_matrix_valid, sentiment_valid = get_numpy_data(validation_data, important_words, 'sentiment') 

In [None]:
def predict_probability(feature_matrix, coefficients):
    '''
    Produce the probabilistic estimate of P(y_i = +1 | x_i, w) for each row
    of feature_matrix. Every estimate lies between 0 and 1.
    '''
    # Linear score x_i . w for every row.
    linear_scores = np.dot(feature_matrix, coefficients)

    # Map the scores to probabilities with the logistic link function.
    return 1.0 / (1.0 + np.exp(-linear_scores))

In [None]:
def feature_derivative_with_L2(errors, feature, coefficient, l2_penalty, feature_is_constant):
    """Derivative of the L2-penalised log likelihood with respect to one coefficient.

    errors              -- indicator minus predicted probability, one entry per example
    feature             -- the feature column that belongs to this coefficient
    coefficient         -- current value of the coefficient
    l2_penalty          -- strength of the L2 penalty
    feature_is_constant -- True for the intercept, which is left unpenalised
    """
    # Likelihood part: dot product of the errors with the feature column.
    gradient = np.dot(errors, feature)

    # The intercept is not regularised, so it gets no penalty term.
    if feature_is_constant:
        return gradient

    # Every other coefficient is pulled towards zero by the penalty.
    return gradient - 2 * l2_penalty * coefficient

### Quiz question: In the code above, was the intercept term regularized?

#### NO!

In [None]:
def compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty):
    """Log likelihood of the observed labels minus the L2 penalty.

    Computes
        sum_i [ (1[y_i = +1] - 1) * score_i - log(1 + exp(-score_i)) ]
            - l2_penalty * sum_{j >= 1} w_j ** 2
    with score_i = x_i . w. The intercept w_0 is excluded from the penalty.

    Parameters
    ----------
    feature_matrix : 2-D array, one row per example (intercept column first).
    sentiment : 1-D array of labels; entries equal to +1 are the positive class.
    coefficients : 1-D array of model weights.
    l2_penalty : strength of the L2 penalty.

    Returns
    -------
    The penalised log likelihood as a scalar.
    """
    indicator = (sentiment == +1)
    scores = np.dot(feature_matrix, coefficients)

    # log(1 + exp(-s)) evaluated as logaddexp(0, -s). The direct form overflows
    # to inf for large negative scores and turns the whole sum into -inf.
    log_normalizer = np.logaddexp(0., -scores)

    data_term = np.sum((indicator - 1) * scores - log_normalizer)
    penalty_term = l2_penalty * np.sum(coefficients[1:] ** 2)

    return data_term - penalty_term

### Quiz question: Does the term with L2 regularization increase or decrease ℓℓ(w)?

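#### Decrease: the penalty term (the L2 penalty times the sum of squared coefficients) is never negative, and it is subtracted from the log likelihood.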

In [None]:
def logistic_regression_with_L2(feature_matrix, sentiment, initial_coefficients, step_size, l2_penalty, max_iter):
    """Fit logistic regression with an L2 penalty by gradient ascent.

    Parameters
    ----------
    feature_matrix : 2-D array, one row per example; column 0 is treated as
        the intercept feature and is not penalised.
    sentiment : 1-D array of labels; entries equal to +1 are the positive class.
    initial_coefficients : starting weights, one per feature column. The input
        is copied, never modified.
    step_size : multiplier applied to each derivative before it is added.
    l2_penalty : strength of the L2 penalty.
    max_iter : number of gradient-ascent passes.

    Returns
    -------
    1-D float array with the fitted coefficients.

    The penalised log likelihood is printed on a thinning schedule (every
    iteration up to 15, then every 10th, 100th, 1000th, 10000th).
    """
    # Float copy: an integer-typed input would otherwise silently truncate the
    # in-place `+=` updates below.
    coefficients = np.array(initial_coefficients, dtype=float)

    # The labels never change, so the indicator for (y_i = +1) is loop-invariant.
    indicator = (sentiment == +1)

    # range/print(...) in this form run on both Python 2 and Python 3.
    for itr in range(max_iter):
        # Predict P(y_i = +1|x_i,w) with the coefficients from the previous pass.
        predictions = predict_probability(feature_matrix, coefficients)

        # Errors are indicator - predictions. They are computed once per pass,
        # so every coefficient update below uses the same errors.
        errors = indicator - predictions

        for j in range(len(coefficients)):  # loop over each coefficient
            is_intercept = (j == 0)
            # feature_matrix[:, j] is the feature column associated with coefficients[j].
            derivative = feature_derivative_with_L2(errors, feature_matrix[:, j],
                                                    coefficients[j], l2_penalty, is_intercept)

            # Gradient ascent step.
            coefficients[j] += step_size * derivative

        # Report the objective on a thinning schedule to check that it is increasing.
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty)
            print('iteration %*d: log likelihood of observed labels = %.8f' %
                  (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [None]:
# Settings shared by every L2-penalised training run below.
feature_matrix = feature_matrix_train
sentiment = sentiment_train
# Start from all-zero coefficients, one per feature column.
initial_coefficients = np.zeros(feature_matrix.shape[1])
step_size = 5e-6
max_iter = 501

In [None]:
# Baseline model: no L2 penalty.
coefficients_0_penalty = logistic_regression_with_L2(
    feature_matrix, sentiment, initial_coefficients,
    step_size=step_size, l2_penalty=0, max_iter=max_iter)

In [None]:
# Model trained with L2 penalty 4.
coefficients_4_penalty = logistic_regression_with_L2(
    feature_matrix, sentiment, initial_coefficients,
    step_size=step_size, l2_penalty=4, max_iter=max_iter)

In [None]:
# Model trained with L2 penalty 10.
coefficients_10_penalty = logistic_regression_with_L2(
    feature_matrix, sentiment, initial_coefficients,
    step_size=step_size, l2_penalty=10, max_iter=max_iter)

In [None]:
# Model trained with L2 penalty 1e2.
coefficients_1e2_penalty = logistic_regression_with_L2(
    feature_matrix, sentiment, initial_coefficients,
    step_size=step_size, l2_penalty=1e2, max_iter=max_iter)

In [None]:
# Model trained with L2 penalty 1e3.
coefficients_1e3_penalty = logistic_regression_with_L2(
    feature_matrix, sentiment, initial_coefficients,
    step_size=step_size, l2_penalty=1e3, max_iter=max_iter)

In [None]:
# Model trained with L2 penalty 1e5.
coefficients_1e5_penalty = logistic_regression_with_L2(
    feature_matrix, sentiment, initial_coefficients,
    step_size=step_size, l2_penalty=1e5, max_iter=max_iter)

In [None]:
# coefficients_0_penalty[0] is the intercept (the 'constant' column comes first
# in the feature matrix), so the word coefficients start at index 1. Zipping the
# unsliced array paired every word with the coefficient of the previous feature.
word_coefficient_tuples = sorted(
    zip(important_words, coefficients_0_penalty[1:]),
    key=lambda pair: pair[1], reverse=True)  # most positive coefficient first

### Quiz Question. Which of the following is not listed in either positive_words or negative_words?

In [None]:
# The list is sorted by descending coefficient, so the first five entries are
# the five most positive words. Single-argument print(...) produces the same
# output on Python 2 and also runs on Python 3.
print(word_coefficient_tuples[0:5])
positive_words = [str(entry[0]) for entry in word_coefficient_tuples[0:5]]
print(positive_words)

In [None]:
# The list is sorted by descending coefficient, so the five most negative words
# are the last five entries. The previous slice [-6:-1] skipped the final
# (most negative) entry and picked up the sixth-from-last instead.
print(word_coefficient_tuples[-5:])
negative_words = [str(entry[0]) for entry in word_coefficient_tuples[-5:]]
print(negative_words)

In [None]:
# One row per coefficient: the intercept first, then each important word.
table = pd.DataFrame({'word': ['(intercept)'] + important_words})
def add_coefficients_to_table(coefficients, column_name):
    """Store `coefficients` as column `column_name` of the module-level `table`.

    The shared `table` is modified in place and also returned.
    """
    table[column_name] = coefficients
    return table

In [None]:
# Add one coefficient column per trained model, in order of increasing penalty.
for penalty_coefficients, penalty_column in [
        (coefficients_0_penalty, 'coefficients [L2=0]'),
        (coefficients_4_penalty, 'coefficients [L2=4]'),
        (coefficients_10_penalty, 'coefficients [L2=10]'),
        (coefficients_1e2_penalty, 'coefficients [L2=1e2]'),
        (coefficients_1e3_penalty, 'coefficients [L2=1e3]'),
        (coefficients_1e5_penalty, 'coefficients [L2=1e5]')]:
    add_coefficients_to_table(penalty_coefficients, penalty_column)

# Display the completed table as the cell output.
table

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Default matplotlib figure size (width, height) for the plots in this notebook.
plt.rcParams['figure.figsize'] = 10, 6

def make_coefficient_plot(table, positive_words, negative_words, l2_penalty_list):
    """Plot how the coefficients of selected words change with the L2 penalty.

    Parameters
    ----------
    table : pandas.DataFrame
        Has a 'word' column; every other column is treated as the coefficients
        of one model, in the same order as `l2_penalty_list`.
    positive_words, negative_words : list of str
        Words to draw, in shades of red and blue respectively.
    l2_penalty_list : list of numbers
        Penalty value for each coefficient column (the x coordinates).

    Each curve is looked up by its word, so the legend label always matches
    the curve. The previous version took rows in table order but labelled them
    in `positive_words` / `negative_words` order, which mislabels curves
    whenever the two orders differ.
    """
    cmap_positive = plt.get_cmap('Reds')
    cmap_negative = plt.get_cmap('Blues')

    xx = l2_penalty_list
    # Dashed reference line at coefficient value 0.
    plt.plot(xx, [0.]*len(xx), '--', lw=1, color='k')

    # Index rows by word; what remains are only the coefficient columns.
    # set_index returns a new frame, so the caller's table is untouched.
    coefficients_by_word = table.set_index('word')

    def _plot_paths(words, cmap):
        # Draw one coefficient path per word, shading later words darker.
        for i, word in enumerate(words):
            color = cmap(0.8*((i+1)/(len(words)*1.2)+0.15))
            # `.values` replaces `.as_matrix()`, removed in pandas 1.0.
            path = coefficients_by_word.loc[word].values.flatten()
            plt.plot(xx, path, '-', label=word, linewidth=4.0, color=color)

    _plot_paths(positive_words, cmap_positive)
    _plot_paths(negative_words, cmap_negative)

    plt.legend(loc='best', ncol=3, prop={'size':16}, columnspacing=0.5)
    # NOTE(review): the x range starts at 1 on a log axis, so a penalty of 0 in
    # l2_penalty_list falls outside the visible range.
    plt.axis([1, 1e5, -1, 2])
    plt.title('Coefficient path')
    # Raw string: '\l' is not a valid escape sequence in a normal string literal.
    plt.xlabel(r'L2 penalty ($\lambda$)')
    plt.ylabel('Coefficient value')
    plt.xscale('log')
    plt.rcParams.update({'font.size': 18})
    plt.tight_layout()

# Draw the coefficient paths; l2_penalty_list gives the penalties in the same
# order as the coefficient columns that were added to `table`.
make_coefficient_plot(table, positive_words, negative_words, l2_penalty_list=[0, 4, 10, 1e2, 1e3, 1e5])

### Quiz Question: (True/False) All coefficients consistently get smaller in size as L2 penalty is increased.

#### True

### Quiz Question: (True/False) Relative order of coefficients is preserved as L2 penalty is increased. (If word 'cat' was more positive than word 'dog', then it remains so as L2 penalty is increased.)
#### False

In [None]:
def get_classification_accuracy(feature_matrix, sentiment, coefficients):
    """Fraction of examples whose predicted class equals the given label.

    Parameters
    ----------
    feature_matrix : 2-D array, one row per example.
    sentiment : 1-D array of labels, compared against predictions of +1 / -1.
    coefficients : 1-D array of model weights.

    Returns
    -------
    Accuracy as a float between 0 and 1.

    Raises
    ------
    ValueError
        If `feature_matrix` has no rows (accuracy is undefined).
    """
    num_examples = len(feature_matrix)
    if num_examples == 0:
        raise ValueError('cannot compute accuracy for an empty feature matrix')

    scores = np.dot(feature_matrix, coefficients)

    # Predict +1 for a strictly positive score, -1 otherwise (a score of 0 maps
    # to -1). np.where does this in one vectorised step instead of calling a
    # Python lambda per element through np.vectorize.
    predictions = np.where(scores > 0, 1., -1.)

    num_correct = (predictions == sentiment).sum()
    return num_correct / float(num_examples)

### Quiz question: Which model (L2 = 0, 4, 10, 100, 1e3, 1e5) has the highest accuracy on the training data?

In [None]:
# Training accuracy of each model, in order of increasing L2 penalty
# (0, 4, 10, 1e2, 1e3, 1e5). Single-argument print(...) produces the same
# output on Python 2 and also runs on Python 3.
print(get_classification_accuracy(feature_matrix_train, sentiment_train, coefficients_0_penalty))
print(get_classification_accuracy(feature_matrix_train, sentiment_train, coefficients_4_penalty))
print(get_classification_accuracy(feature_matrix_train, sentiment_train, coefficients_10_penalty))
print(get_classification_accuracy(feature_matrix_train, sentiment_train, coefficients_1e2_penalty))
print(get_classification_accuracy(feature_matrix_train, sentiment_train, coefficients_1e3_penalty))
print(get_classification_accuracy(feature_matrix_train, sentiment_train, coefficients_1e5_penalty))

### Quiz question: Which model (L2 = 0, 4, 10, 100, 1e3, 1e5) has the highest accuracy on the validation data?

In [None]:
# Validation accuracy of each model, in order of increasing L2 penalty
# (0, 4, 10, 1e2, 1e3, 1e5). Single-argument print(...) produces the same
# output on Python 2 and also runs on Python 3.
print(get_classification_accuracy(feature_matrix_valid, sentiment_valid, coefficients_0_penalty))
print(get_classification_accuracy(feature_matrix_valid, sentiment_valid, coefficients_4_penalty))
print(get_classification_accuracy(feature_matrix_valid, sentiment_valid, coefficients_10_penalty))
print(get_classification_accuracy(feature_matrix_valid, sentiment_valid, coefficients_1e2_penalty))
print(get_classification_accuracy(feature_matrix_valid, sentiment_valid, coefficients_1e3_penalty))
print(get_classification_accuracy(feature_matrix_valid, sentiment_valid, coefficients_1e5_penalty))

### Quiz question: Does the highest accuracy on the training data imply that the model is the best one?

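#### No: a high training accuracy can simply reflect overfitting, so models should be compared on the validation accuracy instead.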