In [1]:
import numpy as np 
import pandas as pd
# Show full cell contents when displaying DataFrames. pandas >= 1.0 spells
# "no limit" as None; older releases only accept an int and use -1 for the
# same meaning, so fall back when None is rejected.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
# Never summarise printed arrays. np.nan is rejected as a threshold by newer
# NumPy releases; np.inf disables truncation on old and new versions alike.
np.set_printoptions(threshold=np.inf)

Data preparation

In [33]:
# Load the review subset; a missing review becomes an empty string so the
# text-cleaning steps below always receive a str.
products = pd.read_csv('amazon_baby_subset.csv').fillna({'review': ''})

# remove all punctuation

def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The two-argument ``translate(None, chars)`` form only exists for Python 2
    byte strings: it raises ``TypeError`` for ``unicode`` values and on
    Python 3. Filtering character by character gives the same result and
    works for every string type.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` without any character from ``string.punctuation``.
    """
    import string
    punctuation = frozenset(string.punctuation)
    return ''.join(char for char in text if char not in punctuation)

products['review_clean'] = products['review'].apply(remove_punctuation)

# import important words

import json

# Use separate names for the file handles and the parsed JSON so a handle is
# never rebound to the data that was read from it.
with open('important_words.json') as words_file:
    important_words = json.load(words_file)
with open('module-4-assignment-validation-idx.json') as valid_file:
    valid_index = json.load(valid_file)
with open('module-4-assignment-train-idx.json') as train_file:
    train_index = json.load(train_file)


# Tokenise each cleaned review once, then add one count column per important
# word (splitting inside the loop would redo the same work for every word).
review_tokens = products['review_clean'].apply(lambda text: text.split())
for word in important_words:
    products[word] = review_tokens.apply(lambda tokens: tokens.count(word))

# split data into training and validation

# .copy() gives each split its own data, so columns added to a split later on
# do not raise SettingWithCopyWarning.
train_data = products.loc[train_index].copy()

validation_data = products.loc[valid_index].copy()

# convert data frame into matrix

def get_numpy_data(dataframe, features, target):
    """Build the design matrix and label vector for a data set.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain every column named in ``features`` and ``target``.
    features : list of str
        Feature column names, in the order they should appear in the matrix.
    target : str
        Name of the label column.

    Returns
    -------
    feature_matrix : numpy.ndarray
        One row per row of ``dataframe``; column 0 is the constant 1
        (intercept) and the remaining columns follow ``features``.
    target_array : numpy.ndarray
        The values of the ``target`` column.
    """
    # assign() returns a new frame, so the caller's dataframe is not given an
    # extra 'constant' column (and no SettingWithCopyWarning is raised when it
    # is a slice of another frame).
    with_intercept = dataframe.assign(constant=1)
    columns = ['constant'] + list(features)
    feature_matrix = with_intercept[columns].values
    target_array = with_intercept[target].values
    return feature_matrix, target_array

# Design matrices (intercept column first) and label vectors for both splits.
feature_matrix_train, sentiment_train = get_numpy_data(
    dataframe=train_data, features=important_words, target='sentiment')
feature_matrix_valid, sentiment_valid = get_numpy_data(
    dataframe=validation_data, features=important_words, target='sentiment')

L2 penalty

In [58]:
# Write a function to produce a probabilistic estimate for P(y_i = +1 | x_i, w)

def predict_probability(feature_matrix, coefficients):
    """Return the logistic (sigmoid) of ``feature_matrix.dot(coefficients)``.

    Each returned value is 1 / (1 + exp(-score)), i.e. the model's estimate
    of P(y = +1) for the corresponding score.
    """
    linear_scores = feature_matrix.dot(coefficients)
    return 1.0 / (1.0 + np.exp(-linear_scores))

# write a function to compute the derivative

def feature_derivative_with_l2(errors, features, coefficient, l2_penalty,feature_is_constant):
    """Derivative of the L2-penalised log likelihood for one coefficient.

    The unpenalised part is ``features.dot(errors)``. Unless
    ``feature_is_constant`` is true (the intercept), the penalty term
    ``2 * l2_penalty * coefficient`` is subtracted from it.
    """
    gradient = features.dot(errors)
    if feature_is_constant:
        # The intercept is left unregularised.
        return gradient
    return gradient - 2*l2_penalty*coefficient
            
# write a function to compute log-likelihood

def compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty):
    """Return the L2-penalised log likelihood of the observed labels.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        Design matrix; column 0 is treated as the intercept.
    sentiment : array-like
        Labels; entries equal to +1 are the positive class.
    coefficients : array-like
        One coefficient per column of ``feature_matrix``.
    l2_penalty : float
        Regularisation strength. The intercept (index 0) is not penalised.

    Returns
    -------
    float
        sum_i[(1[y_i = +1] - 1) * score_i - log(1 + exp(-score_i))]
        minus ``l2_penalty * sum(coefficients[1:] ** 2)``.
    """
    indicator = (np.asarray(sentiment) == +1)
    scores = np.dot(feature_matrix, coefficients)

    # log(1 + exp(-s)) computed as logaddexp(0, -s): the naive form overflows
    # to inf for very negative scores, which turned the result into -inf.
    log_likelihood = np.sum((indicator - 1) * scores - np.logaddexp(0, -scores))
    penalty = l2_penalty * np.sum(np.asarray(coefficients)[1:] ** 2)

    return log_likelihood - penalty

# write a function logistic_regression_with_l2 to fit a logistic regression model under L2 regularization.

def logistic_regression_with_l2(feature_matrix, sentiment, initial_coefficients, step_size, l2_penalty, max_iter):
    """Fit L2-regularised logistic regression by gradient ascent.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        Design matrix; column 0 is treated as the intercept.
    sentiment : array-like
        Labels; entries equal to +1 are the positive class.
    initial_coefficients : array-like
        Starting point, one value per column of ``feature_matrix``.
        It is copied, never modified.
    step_size : float
        Multiplier applied to the gradient at every update.
    l2_penalty : float
        Regularisation strength. The intercept is not penalised.
    max_iter : int
        Number of gradient-ascent updates to perform.

    Returns
    -------
    numpy.ndarray
        The coefficients after ``max_iter`` updates.

    The penalised log likelihood is printed on a thinning schedule: every
    iteration up to 15, then every 10th up to 100, every 100th up to 1000,
    every 1000th up to 10000, and every 10000th afterwards.
    """
    # Force a float copy: an integer-typed start vector would silently
    # truncate every update, and the caller's array must stay untouched.
    coefficients = np.array(initial_coefficients, dtype=float)
    # The labels do not change between iterations, so compare them once.
    indicator = (np.asarray(sentiment) == +1)
    for itr in range(max_iter):
        predictions = predict_probability(feature_matrix, coefficients)
        errors = indicator - predictions

        # Gradient for all coefficients at once. Every component uses the
        # errors and coefficient values from the start of the iteration,
        # exactly as a per-coefficient loop would.
        gradient = feature_matrix.T.dot(errors)
        # Index 0 is the intercept and is left unregularised.
        gradient[1:] -= 2 * l2_penalty * coefficients[1:]
        coefficients += step_size * gradient

        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) or \
                (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('iteration %*d: log likelihood of observed labels = %.8f' %
                  (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

def sorted_coefficients(coefficients, reverse_value):
    """Pair each important word with its coefficient and sort by coefficient.

    ``coefficients[0]`` (the intercept) is skipped; the remaining values are
    matched, in order, with the module-level ``important_words``. The result
    is a list of ``(word, coefficient)`` tuples, in descending order when
    ``reverse_value`` is true and ascending order otherwise.
    """
    word_weights = zip(important_words, list(coefficients[1:]))
    return sorted(word_weights, key=lambda pair: pair[1], reverse=reverse_value)
    
    

In [44]:
# Training inputs and hyper-parameters shared by every fit below.
feature_matrix = feature_matrix_train
sentiment = sentiment_train
# One coefficient per design-matrix column, sized from the data instead of a
# hard-coded 194 so it stays correct if the feature list changes.
initial_coefficients = np.zeros(feature_matrix.shape[1])
step_size = 5e-6
max_iter = 501
# L2 penalty strengths to compare.
l2_0_penalty = 0.0
l2_4_penalty = 4.0
l2_10_penalty = 10.0
l2_1e2_penalty = 1e2
l2_1e3_penalty = 1e3
l2_1e5_penalty = 1e5

In [52]:
# Fit with l2_0_penalty.
coefficients_0_penalty = logistic_regression_with_l2(
    feature_matrix, sentiment, initial_coefficients, step_size,
    l2_0_penalty, max_iter)

iteration   0: log likelihood of observed labels = -29179.39138303
iteration   1: log likelihood of observed labels = -29003.71259047
iteration   2: log likelihood of observed labels = -28834.66187288
iteration   3: log likelihood of observed labels = -28671.70781507
iteration   4: log likelihood of observed labels = -28514.43078198
iteration   5: log likelihood of observed labels = -28362.48344665
iteration   6: log likelihood of observed labels = -28215.56713122
iteration   7: log likelihood of observed labels = -28073.41743783
iteration   8: log likelihood of observed labels = -27935.79536396
iteration   9: log likelihood of observed labels = -27802.48168669
iteration  10: log likelihood of observed labels = -27673.27331484
iteration  11: log likelihood of observed labels = -27547.98083656
iteration  12: log likelihood of observed labels = -27426.42679977
iteration  13: log likelihood of observed labels = -27308.44444728
iteration  14: log likelihood of observed labels = -27193.8767

In [53]:
# Fit with l2_4_penalty.
coefficients_4_penalty = logistic_regression_with_l2(
    feature_matrix, sentiment, initial_coefficients, step_size,
    l2_4_penalty, max_iter)

iteration   0: log likelihood of observed labels = -29179.39508175
iteration   1: log likelihood of observed labels = -29003.73417180
iteration   2: log likelihood of observed labels = -28834.71441858
iteration   3: log likelihood of observed labels = -28671.80345068
iteration   4: log likelihood of observed labels = -28514.58077957
iteration   5: log likelihood of observed labels = -28362.69830317
iteration   6: log likelihood of observed labels = -28215.85663259
iteration   7: log likelihood of observed labels = -28073.79071393
iteration   8: log likelihood of observed labels = -27936.26093762
iteration   9: log likelihood of observed labels = -27803.04751805
iteration  10: log likelihood of observed labels = -27673.94684207
iteration  11: log likelihood of observed labels = -27548.76901327
iteration  12: log likelihood of observed labels = -27427.33612958
iteration  13: log likelihood of observed labels = -27309.48101569
iteration  14: log likelihood of observed labels = -27195.0462

In [54]:
# Fit with l2_10_penalty.
coefficients_10_penalty = logistic_regression_with_l2(
    feature_matrix, sentiment, initial_coefficients, step_size,
    l2_10_penalty, max_iter)

iteration   0: log likelihood of observed labels = -29179.40062984
iteration   1: log likelihood of observed labels = -29003.76654163
iteration   2: log likelihood of observed labels = -28834.79322654
iteration   3: log likelihood of observed labels = -28671.94687528
iteration   4: log likelihood of observed labels = -28514.80571589
iteration   5: log likelihood of observed labels = -28363.02048079
iteration   6: log likelihood of observed labels = -28216.29071186
iteration   7: log likelihood of observed labels = -28074.35036891
iteration   8: log likelihood of observed labels = -27936.95892966
iteration   9: log likelihood of observed labels = -27803.89576265
iteration  10: log likelihood of observed labels = -27674.95647005
iteration  11: log likelihood of observed labels = -27549.95042714
iteration  12: log likelihood of observed labels = -27428.69905549
iteration  13: log likelihood of observed labels = -27311.03455140
iteration  14: log likelihood of observed labels = -27196.7989

In [55]:
# Fit with l2_1e2_penalty.
coefficients_1e2_penalty = logistic_regression_with_l2(
    feature_matrix, sentiment, initial_coefficients, step_size,
    l2_1e2_penalty, max_iter)

iteration   0: log likelihood of observed labels = -29179.48385120
iteration   1: log likelihood of observed labels = -29004.25177457
iteration   2: log likelihood of observed labels = -28835.97382190
iteration   3: log likelihood of observed labels = -28674.09410083
iteration   4: log likelihood of observed labels = -28518.17112932
iteration   5: log likelihood of observed labels = -28367.83774654
iteration   6: log likelihood of observed labels = -28222.77708939
iteration   7: log likelihood of observed labels = -28082.70799392
iteration   8: log likelihood of observed labels = -27947.37595368
iteration   9: log likelihood of observed labels = -27816.54738615
iteration  10: log likelihood of observed labels = -27690.00588850
iteration  11: log likelihood of observed labels = -27567.54970126
iteration  12: log likelihood of observed labels = -27448.98991327
iteration  13: log likelihood of observed labels = -27334.14912742
iteration  14: log likelihood of observed labels = -27222.8604

In [56]:
# Fit with l2_1e3_penalty.
coefficients_1e3_penalty = logistic_regression_with_l2(
    feature_matrix, sentiment, initial_coefficients, step_size,
    l2_1e3_penalty, max_iter)

iteration   0: log likelihood of observed labels = -29180.31606471
iteration   1: log likelihood of observed labels = -29009.07176112
iteration   2: log likelihood of observed labels = -28847.62378912
iteration   3: log likelihood of observed labels = -28695.14439397
iteration   4: log likelihood of observed labels = -28550.95060743
iteration   5: log likelihood of observed labels = -28414.45771129
iteration   6: log likelihood of observed labels = -28285.15124375
iteration   7: log likelihood of observed labels = -28162.56976044
iteration   8: log likelihood of observed labels = -28046.29387744
iteration   9: log likelihood of observed labels = -27935.93902900
iteration  10: log likelihood of observed labels = -27831.15045502
iteration  11: log likelihood of observed labels = -27731.59955260
iteration  12: log likelihood of observed labels = -27636.98108219
iteration  13: log likelihood of observed labels = -27547.01092670
iteration  14: log likelihood of observed labels = -27461.4242

In [57]:
# Fit with l2_1e5_penalty.
coefficients_1e5_penalty = logistic_regression_with_l2(
    feature_matrix, sentiment, initial_coefficients, step_size,
    l2_1e5_penalty, max_iter)

iteration   0: log likelihood of observed labels = -29271.85955115
iteration   1: log likelihood of observed labels = -29271.71006589
iteration   2: log likelihood of observed labels = -29271.65738833
iteration   3: log likelihood of observed labels = -29271.61189923
iteration   4: log likelihood of observed labels = -29271.57079975
iteration   5: log likelihood of observed labels = -29271.53358505
iteration   6: log likelihood of observed labels = -29271.49988440
iteration   7: log likelihood of observed labels = -29271.46936584
iteration   8: log likelihood of observed labels = -29271.44172890
iteration   9: log likelihood of observed labels = -29271.41670149
iteration  10: log likelihood of observed labels = -29271.39403722
iteration  11: log likelihood of observed labels = -29271.37351294
iteration  12: log likelihood of observed labels = -29271.35492661
iteration  13: log likelihood of observed labels = -29271.33809523
iteration  14: log likelihood of observed labels = -29271.3228