In [1]:
import pandas as pd
import numpy as np

In [2]:
products = pd.read_csv('amazon_baby_subset.csv')

In [3]:
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


**Fill /na values in the review column with empty strings**

In [4]:
products = products.fillna({'review': ''})

In [5]:
import string

table = str.maketrans({c : '' for c in string.punctuation})

def remove_punctuation(text):
    return text.translate(table)

In [6]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [7]:
import json

with open('important_words.json') as fp:
    important_words = json.load(fp)

In [8]:
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

### Train-Validation split

In [9]:
with open('module-4-assignment-train-idx.json') as fp:
    train_indices = json.load(fp)

train_data = products.iloc[train_indices]

In [10]:
with open('module-4-assignment-validation-idx.json') as fp:
    validation_indices = json.load(fp)

validation_data = products.iloc[validation_indices]

### Convert data frame to multi-dimensional array

In [11]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant'] + features
    return (dataframe[features], dataframe[label])

In [12]:
feature_matrix_train, sentiment_train = get_numpy_data(train_data, important_words, 'sentiment')

feature_matrix_valid, sentiment_valid = get_numpy_data(validation_data, important_words, 'sentiment') 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
def predict_probability(feature_matrix, coefficients):
    score = np.dot(feature_matrix, coefficients)
    
    predictions = 1 / (1. + np.exp(-score))
    
    return predictions

### Adding L2 penalty to the derivative

In [15]:
def feature_derivative_with_L2(errors, feature, coefficient, l2_penalty, feature_is_constant):
    derivative = np.dot(errors, features)
    if not feature_is_constant:
        derivative -= 2 * l2_penalty * coefficient
    return derivative

In [16]:
def compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty):
    indicator = (sentiment == 1)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator-1) * scores - np.log(1. + np.exp(-scores)))
       - l2_penalty * np.sum(coefficients[1:]**2)
    return lp

### Taking gradient steps

In [18]:
from math import sqrt

def logistic_regression(features,
                        label,
                        coefficients,
                        step_size,
                        l2_penalty,
                        max_iter):
    print(features.shape)
    print(coefficients.shape)

    for i in range(max_iter):
        predictions = predict_probability(feature_matrix, coefficients)

        indicator = (sentiment == +1)
        
        errors = indicator - predictions

        for j in range(len(coefficients)):
            derivative = feature_derivative_with_L2(errors,
                                                    feature_matrix[:,j],
                                                    coefficients[j],
                                                    l2_penalty,
                                                    j == 0)
            coefficients[j] = coefficients[j] + (step_size * derivative)
        

        # Checking whether log likelihood is increasing
        if i <= 15 or (i <= 100 and i % 10 == 0) or (i <= 1000 and i % 100 == 0) or (i <= 10000 and i % 1000 == 0) or i % 10000 == 0:
            lp = compute_log_likelihood_with_L2(features, sentiment, coefficients, l2_penalty)
            print('iteration %*d: log likelihood of observed labels = %.8f' %
                  (int(np.ceil(np.log10(max_iter))), i, lp))

    return coefficients