# Logistic Regression with L2 regularization

Goal: implement logistic regression classifier with L2 regularization from scratch

 * Extract features from Amazon product reviews.
 * Convert dataframe into a NumPy array.
 * Write a function to compute the derivative of log likelihood function with an L2 penalty with respect to a single coefficient.
 * Implement gradient ascent with an L2 penalty.
 * Empirically explore how the L2 penalty can ameliorate overfitting.

In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
products = pd.read_csv('amazon_baby_subset.csv')

### Data pre-processing:

#### Fill NA, remove punctuation

In [8]:
products['review'].isnull().sum()

0

In [3]:
products = products.fillna({'review':''})

def remove_punctuation(text):
    import string
    return text.translate(str.maketrans('','',string.punctuation))
products['review_clean'] = products['review'].apply(remove_punctuation)

#### Count words

In [6]:
with open('important_words.json') as important_words_file:  # convert json file into python file
    important_words = json.load(important_words_file)
print (important_words[:10])    # important_words is a list 

['baby', 'one', 'great', 'love', 'use', 'would', 'like', 'easy', 'little', 'seat']


list

In [5]:
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))   # lambda s  as a defined function

In [8]:
products.iloc[:3]

Unnamed: 0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Split train and validation data

In [9]:
with open('module-4-assignment-train-idx.json') as train_data_file:    
    train_data_idx = json.load(train_data_file)
with open('module-4-assignment-validation-idx.json') as validation_data_file:    
    validation_data_idx = json.load(validation_data_file)

In [10]:
print(train_data_idx[:5])
print(validation_data_idx[:5])

[0, 1, 3, 4, 5]
[2, 9, 23, 26, 27]


In [11]:
train_data = products.iloc[train_data_idx]
validation_data=products.iloc[validation_data_idx]
train_data.iloc[:3]  # train_data[:3]
validation_data.iloc[:3]

Unnamed: 0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Cloth Diaper Pins Stainless Steel Traditional ...,It has been many years since we needed diaper ...,5,1,It has been many years since we needed diaper ...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23,Fisher Price Nesting Action Vehicles,For well over a year my son has enjoyed stacki...,5,1,For well over a year my son has enjoyed stacki...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant'] + features
    features_frame = dataframe[features]
    feature_matrix = features_frame.to_numpy()
    label_sarray = dataframe[label]
    label_array = label_sarray.to_numpy()
    return(feature_matrix, label_array)

In [13]:
feature_matrix_train, sentiment_train = get_numpy_data(train_data, important_words, 'sentiment')
feature_matrix_valid, sentiment_valid = get_numpy_data(validation_data, important_words, 'sentiment')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
print (feature_matrix_train.shape)
print (feature_matrix_valid.shape)

(42361, 194)
(10711, 194)


### Build Functions for penalized logistic regression

In [15]:
def predict_probability (feature_matrix,coefficients):
    score=np.dot(feature_matrix,coefficients)  # feature_matrix: N*D  coefficients: D*1 score: N*1
    predictions=1.0/(1+np.exp(-score))         #prediction: N*1
    return predictions

In [16]:
def feature_derivative_with_L2(errors, feature, coefficient, l2_penalty, constant_feature):
    derivative= np.dot(np.transpose(errors),feature)      # errors:N*1; feature:N*1; derivative:1*1
    if constant_feature!=1:  # execute if constant_feature is 0 or empty
        derivative -= 2 * l2_penalty * coefficient
    return derivative

In [17]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients,l2_penalty):
    indicator=(sentiment==+1)   
    scores= np.dot(feature_matrix, coefficients)     # scores: N*1 
    lp=np.sum((indicator-1)*scores-np.log(1+np.exp(-scores)))-l2_penalty*np.sum(coefficients[1:]**2)     # sum elements in a N*1 array
    return lp

In [18]:
def logistic_regression_with_L2(feature_matrix, sentiment, initial_coefficients, step_size, l2_penalty, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    
    for itr in range(max_iter):
        predictions = predict_probability(feature_matrix, coefficients)
        indicator = (sentiment==+1)
        errors = np.transpose(np.array([indicator])) - predictions
        
        for j in range(len(coefficients)): 
            is_intercept = (j == 0)  # an indicator function, =1 only when j=0

            derivative = feature_derivative_with_L2(errors, feature_matrix[:,j], coefficients[j], l2_penalty, is_intercept)
            coefficients[j] += step_size*derivative
        
        # Checking whether log likelihood is increasing
        #if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        #or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            #lp = compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty)
            #print 'iteration %*d: log likelihood of observed labels = %.8f' % \
                #(int(np.ceil(np.log10(max_iter))), itr, lp)
    return coefficients

In [19]:
initial_coefficients = np.zeros((194,1))
step_size = 5e-6
max_iter = 501

#### penalty=0

In [20]:
coefficients_0_penalty = logistic_regression_with_L2(feature_matrix_train , sentiment_train , initial_coefficients, step_size, 0, max_iter)

In [21]:
coef_table = pd.DataFrame({'word':important_words, 'coefficients': coefficients_0_penalty.flatten()[1:]})  # exclude coef of constant
coef_table_sort=coef_table.sort_values(by=['coefficients'], ascending=False)
print(coef_table_sort[:10])
coef_table_sort=coef_table.sort_values(by=['coefficients'], ascending=True)
print(coef_table_sort[:10])

       word  coefficients
3      love      1.058554
22    loves      1.052484
7      easy      0.984559
33  perfect      0.835693
2     great      0.801625
82    happy      0.557395
75     best      0.535034
8    little      0.524419
90     fits      0.487652
11     well      0.453866
             word  coefficients
105  disappointed     -0.955437
96          money     -0.768793
113        return     -0.742085
112         waste     -0.617809
168      returned     -0.572707
171         broke     -0.555195
77           work     -0.526716
99        thought     -0.477856
175          idea     -0.465370
133         cheap     -0.458912


#### penalty=4,10,1e2,1e3,1e5

In [22]:
coefficients_4_penalty = logistic_regression_with_L2(feature_matrix_train , sentiment_train , initial_coefficients, step_size, 4, max_iter)

In [23]:
coefficients_10_penalty = logistic_regression_with_L2(feature_matrix_train , sentiment_train , initial_coefficients, step_size, 10, max_iter)

In [24]:
coefficients_1e2_penalty = logistic_regression_with_L2(feature_matrix_train , sentiment_train , initial_coefficients, step_size, 1e2, max_iter)

In [25]:
coefficients_1e3_penalty = logistic_regression_with_L2(feature_matrix_train , sentiment_train , initial_coefficients, step_size, 1e3, max_iter)

In [26]:
coefficients_1e5_penalty = logistic_regression_with_L2(feature_matrix_train , sentiment_train , initial_coefficients, step_size, 1e5, max_iter)

#### Compare coefficients under different penalty

In [27]:
table = pd.DataFrame(data=[coefficients_0_penalty.flatten(), coefficients_4_penalty.flatten(), coefficients_10_penalty.flatten(), coefficients_1e2_penalty.flatten(), coefficients_1e3_penalty.flatten(), coefficients_1e5_penalty.flatten()], 
                     index=[0, 4, 10, 100.0, 1000.0, 100000.0],
                     columns=['(intercept)'] + important_words) 
table

Unnamed: 0,(intercept),baby,one,great,love,use,would,like,easy,little,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0.0,-0.063742,0.074073,0.012753,0.801625,1.058554,-0.000104,-0.287021,-0.003384,0.984559,0.524419,...,0.058308,-0.196906,-0.277845,0.173191,-0.132197,0.052494,0.00496,-0.166745,-0.031916,-0.228852
4.0,-0.063143,0.073994,0.012495,0.796897,1.050856,0.000163,-0.286027,-0.003442,0.9776,0.521385,...,0.057905,-0.195273,-0.275461,0.17164,-0.131083,0.05213,0.004907,-0.165367,-0.031621,-0.226793
10.0,-0.062256,0.073877,0.012115,0.789935,1.039529,0.000556,-0.284564,-0.003527,0.967362,0.516917,...,0.057312,-0.192866,-0.271947,0.169352,-0.129441,0.051594,0.00483,-0.163338,-0.031186,-0.223758
100.0,-0.050438,0.07236,0.007247,0.701425,0.896644,0.005481,-0.265993,-0.004635,0.838245,0.460235,...,0.049753,-0.162143,-0.227098,0.140022,-0.108471,0.044805,0.003848,-0.137693,-0.025604,-0.184986
1000.0,5.4e-05,0.059752,-0.008761,0.376012,0.418354,0.017326,-0.188662,-0.007043,0.401904,0.251221,...,0.022875,-0.061171,-0.081775,0.044374,-0.040331,0.021026,0.001084,-0.054778,-0.007361,-0.061138
100000.0,0.011362,0.001784,-0.001827,0.00895,0.009042,0.000418,-0.008127,-0.000827,0.008808,0.005941,...,0.000329,-0.001151,-0.001421,0.000468,-0.000792,0.000365,1.7e-05,-0.000936,-0.000125,-0.00098


In [28]:
table = pd.DataFrame(data=[coefficients_0_penalty.flatten(), coefficients_4_penalty.flatten()], 
                     index=[0, 4],
                     columns=['(intercept)'] + important_words) 
table

Unnamed: 0,(intercept),baby,one,great,love,use,would,like,easy,little,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,-0.063742,0.074073,0.012753,0.801625,1.058554,-0.000104,-0.287021,-0.003384,0.984559,0.524419,...,0.058308,-0.196906,-0.277845,0.173191,-0.132197,0.052494,0.00496,-0.166745,-0.031916,-0.228852
4,-0.063143,0.073994,0.012495,0.796897,1.050856,0.000163,-0.286027,-0.003442,0.9776,0.521385,...,0.057905,-0.195273,-0.275461,0.17164,-0.131083,0.05213,0.004907,-0.165367,-0.031621,-0.226793


#### Compare coeff of some certain words under different penalty

In [39]:
compare_table = pd.DataFrame({'word':important_words, 'coefficients_0': coefficients_0_penalty.flatten()[1:],'coefficients_4': coefficients_4_penalty.flatten()[1:],
                          'coefficients_10': coefficients_10_penalty.flatten()[1:],'coefficients_1e2': coefficients_1e2_penalty.flatten()[1:],
                          'coefficients_1e3': coefficients_1e3_penalty.flatten()[1:],'coefficients_1e5': coefficients_1e5_penalty.flatten()[1:]})  # exclude coef of constant
compare_table_sort = compare_table.sort_values(by=['coefficients_0'], ascending=False)
print(compare_table_sort[:5])
compare_table_sort = compare_table.sort_values(by=['coefficients_0'], ascending=True)
print(compare_table_sort[:5])

       word  coefficients_0  coefficients_4  coefficients_10  \
3      love        1.058554        1.050856         1.039529   
22    loves        1.052484        1.043903         1.031265   
7      easy        0.984559        0.977600         0.967362   
33  perfect        0.835693        0.828555         0.818038   
2     great        0.801625        0.796897         0.789935   

    coefficients_1e2  coefficients_1e3  coefficients_1e5  
3           0.896644          0.418354          0.009042  
22          0.870794          0.345870          0.006150  
7           0.838245          0.401904          0.008808  
33          0.684143          0.250614          0.003989  
2           0.701425          0.376012          0.008950  
             word  coefficients_0  coefficients_4  coefficients_10  \
105  disappointed       -0.955437       -0.946980        -0.934518   
96          money       -0.768793       -0.762734        -0.753818   
113        return       -0.742085       -0.735502  

In [29]:
training_accuracy = []
for coefficient in [coefficients_0_penalty, coefficients_4_penalty, coefficients_10_penalty, coefficients_1e2_penalty, coefficients_1e3_penalty, coefficients_1e5_penalty]:
    predictions = predict_probability(feature_matrix_train, coefficient)
    correct_num = np.sum((predictions.flatten()> 0.5) == (sentiment_train>0))
    total_num = len(sentiment_train)
    print ("correct_num: {}, total_num: {}".format(correct_num, total_num))
    training_accuracy.append(correct_num * 1./ total_num)
print (training_accuracy)
print (predictions[:5])

correct_num: 33260, total_num: 42361
correct_num: 33258, total_num: 42361
correct_num: 33253, total_num: 42361
correct_num: 33210, total_num: 42361
correct_num: 32866, total_num: 42361
correct_num: 28821, total_num: 42361
[0.7851561577866434, 0.7851089445480512, 0.7849909114515711, 0.7839758268218409, 0.7758551497839994, 0.6803663747314747]
[[0.50404412]
 [0.50132932]
 [0.50272311]
 [0.50705669]
 [0.50706524]]


In [30]:
a= np.array(sentiment_train)[:5]
print(a)
a.shape

[1 1 1 1 1]


(5,)

In [31]:
b=sentiment_train[:5]
print(b)
b.shape

[1 1 1 1 1]


(5,)

In [32]:
e=predictions[:5]
print(e)
e.shape

[[0.50404412]
 [0.50132932]
 [0.50272311]
 [0.50705669]
 [0.50706524]]


(5, 1)

In [33]:
c=predictions.flatten()[:5]
print(c)
c.shape

[0.50404412 0.50132932 0.50272311 0.50705669 0.50706524]


(5,)

In [34]:
d=np.transpose(predictions.flatten())[:5]
print(d)
d.shape

[0.50404412 0.50132932 0.50272311 0.50705669 0.50706524]


(5,)

In [36]:
valid_accuracy = []
for coefficient in [coefficients_0_penalty, coefficients_4_penalty, coefficients_10_penalty, coefficients_1e2_penalty, coefficients_1e3_penalty, coefficients_1e5_penalty]:
    predictions = predict_probability(feature_matrix_valid, coefficient)
    correct_num = np.sum((predictions.flatten()> 0.5) == (sentiment_valid>0))
    total_num = len(sentiment_valid)
    print ("correct_num: {}, total_num: {}".format(correct_num, total_num))
    valid_accuracy.append(correct_num * 1./ total_num)
print (valid_accuracy)
print (predictions[:5])

correct_num: 8370, total_num: 10711
correct_num: 8371, total_num: 10711
correct_num: 8373, total_num: 10711
correct_num: 8366, total_num: 10711
correct_num: 8262, total_num: 10711
correct_num: 7153, total_num: 10711
[0.781439641490057, 0.7815330034543927, 0.7817197273830642, 0.781066193632714, 0.7713565493417982, 0.667818130893474]
[[0.50301941]
 [0.50012794]
 [0.50418525]
 [0.50608206]
 [0.51048022]]
