# Implementing logistic regression from scratch

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from math import sqrt
import pandas as pd
import numpy as np
import string, json

DATA_DIR = './data/week2/'

## Inspect and clean data

In [2]:
# Load the review subset stored under DATA_DIR into a DataFrame.
products = pd.read_csv(DATA_DIR + 'amazon_baby_subset.csv')

# Print the column / dtype / non-null summary, then a blank spacer, and end
# the cell on head() so the first rows are rendered as the cell output.
products.info()
print('\n')
products.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53072 entries, 0 to 53071
Data columns (total 4 columns):
name         52982 non-null object
review       52831 non-null object
rating       53072 non-null int64
sentiment    53072 non-null int64
dtypes: int64(2), object(2)
memory usage: 1.6+ MB




Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [3]:
# Names of the first ten rows, printed as a dash-bulleted list.
ten_first_products = products['name'].head(10).tolist()

print('The name of the ten first products are : \n')
print('- ' + '\n- '.join(ten_first_products))

The name of the ten first products are : 

- Stop Pacifier Sucking without tears with Thumbuddy To Love's Binky Fairy Puppet and Adorable Book
- Nature's Lullabies Second Year Sticker Calendar
- Nature's Lullabies Second Year Sticker Calendar
- Lamaze Peekaboo, I Love You
- SoftPlay Peek-A-Boo Where's Elmo A Children's Book
- Our Baby Girl Memory Book
- Hunnt&reg; Falling Flowers and Birds Kids Nursery Home Decor Vinyl Mural Art Wall Paper Stickers
- Blessed By Pope Benedict XVI Divine Mercy Full Color Medal
- Cloth Diaper Pins Stainless Steel Traditional Safety Pin (Black)
- Cloth Diaper Pins Stainless Steel Traditional Safety Pin (Black)


In [4]:
# Count the rows whose sentiment label is strictly positive and strictly
# negative, respectively.
nb_positive_reviews = (products['sentiment'] > 0).sum()
nb_negative_reviews = (products['sentiment'] < 0).sum()

print('there are %s positive reviews and %s negative reviews.' % (nb_positive_reviews, nb_negative_reviews))

there are 26579 positive reviews and 26493 negative reviews.


In [9]:
def remove_ponctuation(s):
    """Return ``s`` with every ASCII punctuation character removed.

    Characters are filtered one by one instead of calling the two-argument
    ``s.translate(None, string.punctuation)``: that form only exists for
    Python 2 byte strings and raises ``TypeError`` on Python 3 ``str`` and on
    Python 2 ``unicode`` objects.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        ``s`` without any character listed in ``string.punctuation``.
    """
    # A set gives O(1) membership tests while scanning the text.
    punctuation = set(string.punctuation)
    return ''.join(ch for ch in s if ch not in punctuation)

# Remove punctuation from the review column; missing reviews are first
# replaced by the empty string so that every entry is a string.
products['review'] = products['review'].fillna('')
products['review'] = products['review'].apply(remove_ponctuation)

# Load the important words from JSON. The context manager guarantees the file
# handle is closed as soon as the content has been parsed.
with open(DATA_DIR + 'important_words.json') as words_file:
    important_words = json.load(words_file)

products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried nonstop when I tried...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago S...,5,1
3,"Lamaze Peekaboo, I Love You",One of babys first and favorite books and it i...,4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book My son loves this b...,5,1


In [11]:
# Tokenise every review once up front: the whitespace split does not depend on
# the word being counted, so redoing it for each important word is wasted work.
review_tokens = products['review'].apply(lambda s: s.split())

# One column per important word holding the number of exact (case-sensitive)
# token matches found in each review.
for word in important_words:
    products[word] = review_tokens.apply(lambda tokens: tokens.count(word))

Unnamed: 0,name,review,rating,sentiment,baby,one,great,love,use,would,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried nonstop when I tried...,5,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago S...,5,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lamaze Peekaboo, I Love You",One of babys first and favorite books and it i...,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book My son loves this b...,5,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


#### How many reviews contain the word perfect?

In [12]:
# A review contains the word when its 'perfect' count column is positive.
nb_contains_perfect = (products['perfect'] > 0).sum()

print('There are %s reviews that contains the word perfect.' % nb_contains_perfect)

There are 2955 reviews that contains the word perfect


In [13]:
def get_numpy_data(dataframe, features, label):
    """Extract a feature matrix (with intercept column) and a label array.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data. NOTE: a ``'constant'`` column filled with 1 is added to
        this frame in place; that side effect is kept for backward
        compatibility.
    features : sequence of str
        Names of the columns to use as features, in output order.
    label : str
        Name of the target column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(feature_matrix, label_array)``: the matrix has the constant column
        first, followed by ``features``; the array holds the ``label`` column.
    """
    # Intercept term: a column of ones placed ahead of the real features.
    dataframe['constant'] = 1
    # list(...) lets callers pass any sequence (tuple, Index, ...), not only a list.
    feature_columns = ['constant'] + list(features)
    # ``.values`` replaces ``as_matrix()``, which was deprecated in pandas 0.23
    # and removed in pandas 1.0.
    feature_matrix = dataframe[feature_columns].values
    label_array = dataframe[label].values
    return (feature_matrix, label_array)

#### How many features are there in the feature_matrix ?

In [16]:
# One feature per important word, plus 1 for the 'constant' column that
# get_numpy_data prepends to the feature matrix.
print('There are %s features in the feature_matrix' % (len(important_words)+1) )

There are 194 features in the feature_matrix


#### Assuming that the intercept is present, how does the number of features in feature_matrix relate to the number of features in the logistic regression model ?

y = x - 1

## Build Logistic Regression Model

In [103]:
def predict_probability(feature_matrix, coefficients):
    """Return P(y_i = +1 | x_i, w) for every row of ``feature_matrix``.

    Each row's linear score (its dot product with ``coefficients``) is mapped
    through the logistic (sigmoid) link function.
    """
    linear_scores = np.dot(feature_matrix, coefficients)
    return 1 / (1 + np.exp(-linear_scores))

def feature_derivative(errors, feature):
    """Return the dot product of ``errors`` and one ``feature`` column.

    The gradient-ascent loop uses this value as the derivative of the log
    likelihood with respect to the coefficient attached to ``feature``.
    """
    return np.dot(errors, feature)

def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Return the log likelihood of the observed labels under the model.

    Computes ``sum_i [(1[y_i = +1] - 1) * score_i - log(1 + exp(-score_i))]``
    where ``score_i`` is the dot product of row ``i`` with ``coefficients``.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        One row per observation, one column per coefficient.
    sentiment : array-like
        Observed labels; entries equal to +1 are the positive class.
    coefficients : array-like
        Model coefficients.

    Returns
    -------
    float
        The (finite) log likelihood.
    """
    indicator = (np.asarray(sentiment) == +1)
    scores = np.dot(feature_matrix, coefficients)
    # log(1 + exp(-s)) evaluated as logaddexp(0, -s): the naive form overflows
    # to inf once -s is large, which turns the whole sum into -inf.
    log_one_plus_exp = np.logaddexp(0., -scores)
    return np.sum((indicator - 1) * scores - log_one_plus_exp)

def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    """Fit logistic-regression coefficients by batch gradient ascent.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        One row per observation; column ``j`` is the feature attached to
        ``coefficients[j]``.
    sentiment : numpy.ndarray
        Observed labels; entries equal to +1 are the positive class.
    initial_coefficients : array-like
        Starting point. It is copied, so the caller's object is not modified.
    step_size : float
        Multiplier applied to each derivative before it is added.
    max_iter : int
        Number of gradient-ascent iterations to run.

    Returns
    -------
    numpy.ndarray
        The coefficients after ``max_iter`` iterations (float dtype).
    """
    # Force a float copy: with an integer-typed start vector every in-place
    # update below would be silently truncated back to an integer.
    coefficients = np.array(initial_coefficients, dtype=float)

    # Indicator for (y_i = +1); it does not change between iterations.
    indicator = (sentiment == +1)

    for itr in range(max_iter):
        # P(y_i = +1 | x_i, w) under the current coefficients.
        predictions = predict_probability(feature_matrix, coefficients)

        # errors are computed once per iteration, so the per-coefficient loop
        # below still performs a simultaneous update of all coefficients.
        errors = indicator - predictions

        for j in range(len(coefficients)):
            # feature_matrix[:, j] is the feature column of coefficients[j].
            derivative = feature_derivative(errors, feature_matrix[:, j])
            coefficients[j] = coefficients[j] + step_size * derivative

        # Report the log likelihood on a progressively sparser schedule so
        # that long runs do not flood the output.
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
                or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            # Parenthesised single-argument print works on Python 2 and 3.
            print('iteration %*d: log likelihood of observed labels = %.8f' %
                  (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')
# Size the all-zero starting vector from the matrix itself so it keeps
# matching the number of columns if the word list changes.
coefficients = logistic_regression(
    feature_matrix,
    sentiment,
    np.zeros(feature_matrix.shape[1]),
    0.0000007,
    301,
)

iteration   0: log likelihood of observed labels = -36746.24540276
iteration   1: log likelihood of observed labels = -36706.08205005
iteration   2: log likelihood of observed labels = -36666.21111798
iteration   3: log likelihood of observed labels = -36626.62693353
iteration   4: log likelihood of observed labels = -36587.32411808
iteration   5: log likelihood of observed labels = -36548.29756600
iteration   6: log likelihood of observed labels = -36509.54242490
iteration   7: log likelihood of observed labels = -36471.05407729
iteration   8: log likelihood of observed labels = -36432.82812375
iteration   9: log likelihood of observed labels = -36394.86036733
iteration  10: log likelihood of observed labels = -36357.14679912
iteration  11: log likelihood of observed labels = -36319.68358502
iteration  12: log likelihood of observed labels = -36282.46705344
iteration  13: log likelihood of observed labels = -36245.49368401
iteration  14: log likelihood of observed labels = -36208.7600

#### As each iteration of gradient ascent passes, does the log likelihood increase or decrease ?

The log likelihood increases with each iteration.

## Predicting sentiments

In [104]:
# Class prediction is the sign of the linear score: +1 when score > 0, else -1.
score = np.dot(feature_matrix, coefficients)
# Built from `score` itself, so no row count has to be hard-coded.
predictions = np.where(score > 0, 1., -1.)
# NOTE: this rebinds `coefficients` to the per-word weights only (intercept
# dropped), which later cells zip with important_words. Re-running this cell
# without re-running the training cell therefore makes the np.dot above fail
# with a shape mismatch.
coefficients = list(coefficients[1:])

#### How many reviews were predicted to have positive sentiment?

In [105]:
# Positive predictions are the entries of `predictions` greater than zero.
nb_positive_predictions = (predictions > 0).sum()

print('There are %s reviews that were predicted to have positive sentiment.' % nb_positive_predictions)

There are 27049 reviews that were predicted to have positive sentiment.


#### What is the accuracy of the model on predictions made above ? 

In [106]:
# Share of reviews whose predicted label equals the observed sentiment.
accuracy = accuracy_score(y_true=sentiment, y_pred=predictions)

print('the accuracy of the model is : %s.' % accuracy)

the accuracy of the model is : 0.76481006934.


## Which words contribute most to positive & negative sentiments

In [111]:
# Pair every important word with its coefficient and order the pairs from the
# largest coefficient to the smallest.
word_coefficient_tuples = sorted(
    zip(important_words, coefficients),
    key=lambda pair: pair[1],
    reverse=True
)

#### Which word is not present in the top 10 "most positive" words ?

In [112]:
# The list is sorted by descending coefficient, so the head holds the ten
# largest coefficients.
ten_most_positive_words = [entry[0] for entry in word_coefficient_tuples[:10]]
print('The ten most positive words are : ' + ', '.join(ten_most_positive_words) + '.')

The ten most positive words are : love, easy, great, loves, little, perfect, well, nice, happy, soft.


In [116]:
# The list is sorted by descending coefficient, so the tail holds the ten
# smallest coefficients (the last word printed has the lowest one).
ten_most_negative_words = [entry[0] for entry in word_coefficient_tuples[-10:]]
print('The ten most negative words are : ' + ', '.join(ten_most_negative_words) + '.')

The ten most negative words are : get, back, waste, even, return, work, product, disappointed, would, money.
