In [101]:
import pandas as pd
import string
import re
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier


In [102]:
products = pd.read_csv('amazon_baby_subset.csv')

In [103]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53072 entries, 0 to 53071
Data columns (total 4 columns):
name         52982 non-null object
review       52831 non-null object
rating       53072 non-null int64
sentiment    53072 non-null int64
dtypes: int64(2), object(2)
memory usage: 1.6+ MB


In [104]:
important_words = pd.read_json('important_words.json')

In [105]:
important_words.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 1 columns):
0    193 non-null object
dtypes: object(1)
memory usage: 1.6+ KB


In [106]:
products.name[0:10]

0    Stop Pacifier Sucking without tears with Thumb...
1      Nature's Lullabies Second Year Sticker Calendar
2      Nature's Lullabies Second Year Sticker Calendar
3                          Lamaze Peekaboo, I Love You
4    SoftPlay Peek-A-Boo Where's Elmo A Children's ...
5                            Our Baby Girl Memory Book
6    Hunnt&reg; Falling Flowers and Birds Kids Nurs...
7    Blessed By Pope Benedict XVI Divine Mercy Full...
8    Cloth Diaper Pins Stainless Steel Traditional ...
9    Cloth Diaper Pins Stainless Steel Traditional ...
Name: name, dtype: object

In [107]:
products.review[products.sentiment==1].count()

26438

In [108]:
products.review[products.sentiment==-1].count()

26393

In [109]:
products.review.isna().sum()

241

In [110]:
products['review'] = products.review.fillna('')

In [111]:
products.review.isna().sum()

0

In [112]:
def remove_punctuation(text):
    """Return `text` with punctuation stripped.

    Every character that is neither a word character (as matched by ``\\w``:
    letters, digits, underscore) nor whitespace is deleted; everything else
    is kept in its original order.
    """
    unwanted = re.compile(r'[^\w\s]')
    return unwanted.sub('', text)

In [113]:
# Punctuation-free copy of every review; the raw text stays in `review`.
products['review_clean'] = products['review'].apply(remove_punctuation)

In [114]:
# `important_words` is read from JSON as a one-column DataFrame. Convert it to
# a plain list only while it still is a DataFrame, so re-running this cell
# does not fail on the already-converted list.
if isinstance(important_words, pd.DataFrame):
    important_words = important_words[0].tolist()

# Split each cleaned review on whitespace once, instead of once per word.
review_tokens = products['review_clean'].apply(str.split)

# One integer column per important word: the number of tokens in the review
# that equal the word exactly (the comparison is case-sensitive).
for word in important_words:
    products[word] = review_tokens.apply(lambda tokens: tokens.count(word))

In [115]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53072 entries, 0 to 53071
Columns: 198 entries, name to either
dtypes: int64(195), object(3)
memory usage: 80.2+ MB


In [116]:
products.sample(5)

Unnamed: 0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
21908,"Ciao! Baby Portable Travel High Chair, Black",My daughter in law says she loves the chair fo...,5,1,My daughter in law says she loves the chair fo...,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
36952,Princess Toddler Bed,I was not very happy with this produce. It sai...,2,-1,I was not very happy with this produce It said...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48928,BRICA Roll 'n Go Car Seat Transporter,"I really, really wanted to love it. I really d...",2,-1,I really really wanted to love it I really did...,0,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0
12444,"TL Care Organic Cotton Nursing Pads, Natural, ...",These breast pads are very soft. I don't even...,5,1,These breast pads are very soft I dont even n...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6920,HALO SleepSack Micro-Fleece Early Walker Weara...,"Once winter came, I had trouble figuring out h...",5,1,Once winter came I had trouble figuring out ho...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [117]:
products.review[products.perfect>0].count()

2955

In [118]:
def get_numpy_data(dataframe, features, label):
    """Build the design matrix and label vector from a DataFrame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table; it is read but not modified.
    features : sequence of column names
        Columns to use as features, in the order they should appear.
    label : column name
        Column holding the target values.

    Returns
    -------
    (feature_matrix, label_array) : tuple of numpy.ndarray
        `feature_matrix` has one row per row of `dataframe`; its first column
        is an intercept column of ones named 'constant', followed by the
        requested feature columns. `label_array` is `dataframe[label]` as an
        array.
    """
    # Work on a private copy so the intercept column is not written back into
    # the caller's DataFrame (which would also warn when it is a slice).
    feature_frame = dataframe[list(features)].copy()
    feature_frame.insert(0, 'constant', 1, allow_duplicates=True)
    feature_matrix = feature_frame.to_numpy()
    label_array = dataframe[label].to_numpy()
    return (feature_matrix, label_array)

In [119]:
feature_matrix, sentiment = get_numpy_data(products,important_words,'sentiment')

In [120]:
feature_matrix.shape

(53072, 194)

In [121]:
def predict_probability(feature_matrix, coefficients):
    """Logistic-link probability for each row of `feature_matrix`.

    Computes score = feature_matrix . coefficients and returns
    1 / (1 + exp(-score)) element-wise.
    """
    score = np.dot(feature_matrix, coefficients)
    # 1/(1+exp(-s)) == exp(-log(1+exp(-s))). np.logaddexp(0, -s) evaluates
    # log(1+exp(-s)) without ever forming exp(-s) on its own, so a very
    # negative score yields 0.0 instead of an overflow warning.
    predictions = np.exp(-np.logaddexp(0., -score))

    return predictions

In [122]:
# Hand-checkable example: two data points with three features each.
dummy_feature_matrix = np.array([
    [1., 2., 3.],
    [1., -1., -1],
])
dummy_coefficients = np.array([1., 3., -1.])

# Expected scores, expanded term by term (feature * coefficient) for each row.
correct_scores = np.array([
    1.*1. + 2.*3. + 3.*(-1.),
    1.*1. + (-1.)*3. + (-1.)*(-1.),
])
# Expected probabilities: 1/(1+exp(-score)) applied to each score in turn.
correct_predictions = np.array([1./(1+np.exp(-score)) for score in correct_scores])

print('The following outputs must match ')
print('------------------------------------------------')
print('correct_predictions           =', correct_predictions)
print('output of predict_probability =', predict_probability(dummy_feature_matrix, dummy_coefficients))

The following outputs must match 
------------------------------------------------
correct_predictions           = [0.98201379 0.26894142]
output of predict_probability = [0.98201379 0.26894142]


In [123]:
def feature_derivative(errors, feature):
    """Return the dot product of `errors` with `feature`.

    `logistic_regression` calls this with errors = indicator - predictions and
    one column of the feature matrix, and uses the result as the derivative
    for that column's coefficient.
    """
    return np.dot(errors, feature)

In [124]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Log likelihood of the observed labels under the logistic model.

    With score_i = feature_matrix[i] . coefficients and indicator_i = 1 when
    sentiment[i] == +1 (0 otherwise), returns

        sum_i (indicator_i - 1) * score_i - log(1 + exp(-score_i))

    `sentiment` must be a NumPy array (it is compared element-wise to +1).
    """
    indicator = (sentiment==+1).astype(int)
    scores = np.dot(feature_matrix, coefficients)

    # log(1 + exp(-score)) evaluated as logaddexp(0, -score): this never forms
    # exp(-score) on its own, so very negative scores give -score directly
    # instead of overflowing to inf and having to be patched afterwards.
    logexp = np.logaddexp(0., -scores)

    lp = np.sum((indicator-1)*scores - logexp)
    return lp
    

In [125]:
# Same coefficients as the probability check, now with one label per row.
dummy_coefficients = np.array([1., 3., -1.])
dummy_sentiment = np.array([-1, 1])

# Hand-built pieces of the log likelihood for the two dummy rows.
correct_indicators = np.array([-1==+1, 1==+1])
correct_scores = np.array([
    1.*1. + 2.*3. + 3.*(-1.),
    1.*1. + (-1.)*3. + (-1.)*(-1.),
])
# (indicator - 1) * score for each row.
correct_first_term = np.array([
    (flag-1)*score for flag, score in zip(correct_indicators, correct_scores)
])
# log(1 + exp(-score)) for each row.
correct_second_term = np.array([np.log(1. + np.exp(-score)) for score in correct_scores])

correct_ll = sum(first - second for first, second in zip(correct_first_term, correct_second_term))

print('The following outputs must match ')
print('------------------------------------------------')
print('correct_log_likelihood           =', correct_ll)
print('output of compute_log_likelihood =', compute_log_likelihood(dummy_feature_matrix, dummy_sentiment, dummy_coefficients))

The following outputs must match 
------------------------------------------------
correct_log_likelihood           = -5.331411615436032
output of compute_log_likelihood = -5.331411615436032


In [126]:
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    """Fit logistic-regression coefficients by gradient ascent.

    Parameters
    ----------
    feature_matrix : 2-D array, one row per data point, one column per feature.
    sentiment : NumPy array of labels; entries equal to +1 count as positive.
    initial_coefficients : array-like with one starting value per column of
        `feature_matrix`. It is copied, never modified.
    step_size : multiplier applied to each derivative before it is added.
    max_iter : number of passes over all coefficients.

    Returns
    -------
    numpy.ndarray with the coefficients after `max_iter` iterations.

    The log likelihood is printed at iterations 0-15, then every 10th up to
    100, every 100th up to 1000, every 1000th up to 10000, and every 10000th
    after that.
    """
    coefficients = np.array(initial_coefficients)
    # Integer starting values (e.g. [0, 0, 0]) would make the per-element
    # `+=` below truncate every update back to an integer, so the
    # coefficients would never move. Promote non-floating input to float.
    if not np.issubdtype(coefficients.dtype, np.floating):
        coefficients = coefficients.astype(float)

    # The labels do not change between iterations, so compare them once.
    indicator = (sentiment==+1)

    for itr in range(max_iter):
        predictions = predict_probability(feature_matrix, coefficients)
        errors = indicator - predictions

        # `errors` is fixed for the whole pass, so every coefficient is
        # updated from the same predictions.
        for j in range(len(coefficients)):
            derivative = feature_derivative(errors, feature_matrix[:,j])
            coefficients[j] += (step_size * derivative)

        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            # The field width is the number of digits needed for max_iter.
            print('iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [127]:
# Start from all-zero coefficients, one per column of feature_matrix
# (intercept included), so the call stays valid if the feature list changes.
coefficients = logistic_regression(feature_matrix, sentiment,
                                   initial_coefficients=np.zeros(feature_matrix.shape[1]),
                                   step_size=1e-7, max_iter=301)

iteration   0: log likelihood of observed labels = -36780.91765822
iteration   1: log likelihood of observed labels = -36775.13429407
iteration   2: log likelihood of observed labels = -36769.35705617
iteration   3: log likelihood of observed labels = -36763.58592657
iteration   4: log likelihood of observed labels = -36757.82088750
iteration   5: log likelihood of observed labels = -36752.06192130
iteration   6: log likelihood of observed labels = -36746.30901047
iteration   7: log likelihood of observed labels = -36740.56213761
iteration   8: log likelihood of observed labels = -36734.82128550
iteration   9: log likelihood of observed labels = -36729.08643701
iteration  10: log likelihood of observed labels = -36723.35757516
iteration  11: log likelihood of observed labels = -36717.63468309
iteration  12: log likelihood of observed labels = -36711.91774409
iteration  13: log likelihood of observed labels = -36706.20674154
iteration  14: log likelihood of observed labels = -36700.5016

In [128]:
scores = np.dot(feature_matrix, coefficients)

In [129]:
scores

array([ 0.05104641, -0.02939436,  0.02411638, ..., -0.40987632,
        0.01411511, -0.06758789])

In [130]:
# Decision rule: score > 0 -> +1, anything else (including exactly 0) -> -1.
# `boundry` (sic) and `vfunc` stay defined in case a later cell uses them.
boundry = lambda x: 1 if x>0 else -1 
vfunc = np.vectorize(boundry)
# np.where applies the same rule in one vectorised step; np.vectorize would
# call the lambda once per element in a Python-level loop.
predict = np.where(scores > 0, 1, -1)

In [131]:
(predict == 1).sum()

25127

In [132]:
correct_predict = (predict == sentiment).sum()
correct_predict

39902

In [133]:
accuracy = correct_predict/len(products)
accuracy

0.7518465480856196

In [134]:
# Pair each important word with its learned weight, largest weight first.
# coefficients[0] belongs to the 'constant' (intercept) column, so it is
# skipped. `coefficients` is deliberately not reassigned: rebinding it here
# would make a second run of this cell drop one more weight and misalign the
# words, and would break re-running the scoring cell.
word_coefficient_tuples = sorted(
    zip(important_words, list(coefficients[1:])),
    key=lambda pair: pair[1],
    reverse=True,
)

In [135]:
word_coefficient_tuples[0:10]

[('great', 0.06654611812696795),
 ('love', 0.06589085630451577),
 ('easy', 0.06479467430583867),
 ('little', 0.04543575641799635),
 ('loves', 0.04497640499058413),
 ('well', 0.03013496861741666),
 ('perfect', 0.029739964816839565),
 ('old', 0.020077453090480573),
 ('nice', 0.018408789054742972),
 ('daughter', 0.017703241856832343)]

In [136]:
word_coefficient_tuples[-10:]

[('monitor', -0.024482031154393416),
 ('return', -0.026592754205541472),
 ('back', -0.027742584349641455),
 ('get', -0.02871141034622055),
 ('disappointed', -0.028978948955895437),
 ('even', -0.03005114996145573),
 ('work', -0.03306944956701275),
 ('money', -0.038982013325180455),
 ('product', -0.041510960132838635),
 ('would', -0.053860003347359055)]