In [1]:
import pandas as pd
import numpy as np

### Data analysis: remove punctuation, count word for important words

In [2]:
products = pd.read_csv('amazon_baby_subset.csv')

In [3]:
products.iloc[0:10,[0,3]]  # print first ten rows, name and sentiment column

Unnamed: 0,name,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,1
1,Nature's Lullabies Second Year Sticker Calendar,1
2,Nature's Lullabies Second Year Sticker Calendar,1
3,"Lamaze Peekaboo, I Love You",1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,1
5,Our Baby Girl Memory Book,1
6,Hunnt&reg; Falling Flowers and Birds Kids Nurs...,1
7,Blessed By Pope Benedict XVI Divine Mercy Full...,1
8,Cloth Diaper Pins Stainless Steel Traditional ...,1
9,Cloth Diaper Pins Stainless Steel Traditional ...,1


In [4]:
products = products.fillna({'review':''})

In [5]:
import json
with open('important_words.json') as important_words_file:    
    important_words = json.load(important_words_file)
print (important_words[:10])    # important_words is a list 

['baby', 'one', 'great', 'love', 'use', 'would', 'like', 'easy', 'little', 'seat']


In [6]:
def remove_punctuation(text):
    import string
    return text.translate(str.maketrans('','',string.punctuation))

products['review_clean'] = products['review'].apply(remove_punctuation)
products.head(3)

Unnamed: 0,name,review,rating,sentiment,review_clean
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...


In [7]:
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))   # lambda s  as a defined function

In [8]:
products.head(3)

Unnamed: 0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
def perfect_count (x):
    if x>=1:
        return 1
    else:
        return 0

In [10]:
products['contain_perfect']=products['perfect'].apply(perfect_count)

In [11]:
np.sum(products['contain_perfect'])

2955

In [12]:
#Equivalent to : 
#products['contains_perfect'] = products['perfect'] >=1  # return value is True/False
#print (products['contains_perfect'].sum())              # sum up the number of True

### Convert data frame to multi-dimensional array

In [13]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant'] + features
    features_frame = dataframe[features]
    feature_matrix = features_frame.to_numpy()
    label_sarray = dataframe[label]
    label_array = label_sarray.to_numpy()
    return(feature_matrix, label_array)

In [14]:
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')  #feature_matrix=important_words columns

In [15]:
print (feature_matrix.shape)

(53072, 194)


### Define functions used in GD: predict_probability, feature_derivative, compute_log_likelihood, GD algorithm

In [16]:
def predict_probability (feature_matrix,coefficients):
    score=np.dot(feature_matrix,coefficients)  # feature_matrix: N*D  coefficients: D*1 score: N*1
    predictions=1.0/(1+np.exp(-score))         #prediction: N*1
    return predictions

In [17]:
def feature_derivative(errors,feature):
    derivative= np.dot(np.transpose(errors), feature) # errors: N*1 feature: N*1 
    return derivative                                 # derivative: 1*1

In [45]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator=(sentiment==+1)   
    scores= np.dot(feature_matrix, coefficients)     # scores: N*1
    #lp = np.sum((np.transpose(np.array([indicator]))-1)*scores - np.log(1. + np.exp(-scores)))
    lp=np.sum((indicator-1)*score-np.log(1+np.exp(-scores))) # (indicator-1)*scores--> element-wise product
    return lp 

In [46]:
### checkpoint:
dummy_feature_matrix = np.array([[1.,2.,3.], [1.,-1.,-1]])
dummy_coefficients = np.array([1., 3., -1.])
dummy_sentiment = np.array([-1, 1])

correct_indicators  = np.array( [ -1==+1,                                       1==+1 ] )
correct_scores      = np.array( [ 1.*1. + 2.*3. + 3.*(-1.),                     1.*1. + (-1.)*3. + (-1.)*(-1.) ] )
correct_first_term  = np.array( [ (correct_indicators[0]-1)*correct_scores[0],  (correct_indicators[1]-1)*correct_scores[1] ] )
correct_second_term = np.array( [ np.log(1. + np.exp(-correct_scores[0])),      np.log(1. + np.exp(-correct_scores[1])) ] )

correct_ll          =      sum( [ correct_first_term[0]-correct_second_term[0], correct_first_term[1]-correct_second_term[1] ] ) 

print('The following outputs must match ')
print('------------------------------------------------')
print('correct_log_likelihood           =', correct_ll)
print('output of compute_log_likelihood =', compute_log_likelihood(dummy_feature_matrix, dummy_sentiment, dummy_coefficients))

The following outputs must match 
------------------------------------------------
correct_log_likelihood           = -5.331411615436032
output of compute_log_likelihood = -5.331411615436032


In [54]:
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    
    for itr in range(max_iter):

        predictions = predict_probability(feature_matrix, coefficients)
        indicator = (sentiment==+1)
        errors = np.transpose(np.array([indicator])) - predictions     # errors calculated ahead to ensure not use the updated coeff from the nested loop

        for j in range(len(coefficients)): 
            
            derivative = feature_derivative(errors, feature_matrix[:,j])
            coefficients[j] += step_size*derivative
 
        # Checking whether log likelihood is increasing
        #if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        #or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            # lplist.append(compute_log_likelihood(feature_matrix, sentiment, coefficients))
            #lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            #print ('iteration %*d: log likelihood of observed labels = %.8f' % \
                #(int(np.ceil(np.log10(max_iter))), itr, lp))
            
    return coefficients

In [55]:
initial_coefficients = np.zeros((194,1))   # set all initial coefficients to 0; dim: 194*1
step_size = 1e-7
max_iter = 301

In [56]:
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)

In [72]:
scores = np.dot(feature_matrix,coefficients) #N*D
np.sum(scores>0)

In [115]:
print(type(scores))
print(scores.shape)
F = scores>0
F

<class 'numpy.ndarray'>
(53072, 1)


array([[ True],
       [False],
       [ True],
       ...,
       [False],
       [ True],
       [False]])

In [93]:
B = scores.flatten()
print(B.shape)
C = B>0
C

(53072,)


array([ True, False,  True, ..., False,  True, False])

In [97]:
type(products["sentiment"])
A = np.array(products["sentiment"])
D = A>0
D

array([ True,  True,  True, ..., False, False, False])

#### Compute Accuracy--way1

In [116]:
correct_num = np.sum((scores.flatten()> 0) == (np.array(products['sentiment'])>0))
total_num = len(products['sentiment'])
print ("correct_num: {}, total_num: {}".format(correct_num, total_num))
accuracy = correct_num * 1./ total_num
print (accuracy)

correct_num: 39903, total_num: 53072
0.7518653904130238


#### Compute Accuracy--way2    drawback: change the true scores value

In [29]:
#for i in range(len(scores)):
    #if scores[i]<0:
        #scores[i]= -1
    #else:
        #scores[i]= 1

In [30]:
#print(scores[:5])
#print(np.transpose(scores)[:5])

[[ 1.]
 [-1.]
 [ 1.]
 [ 1.]
 [ 1.]]
[[ 1. -1.  1. ... -1.  1. -1.]]


In [31]:
#print(products['sentiment'][:5])
#print(np.array(products['sentiment'])[:5])

0    1
1    1
2    1
3    1
4    1
Name: sentiment, dtype: int64
[1 1 1 1 1]


In [32]:
#corr=np.sum(np.transpose(scores) == np.array(products['sentiment']))
#print(corr)

39903


### Find the words contribute most to positive and negative review

In [117]:
coefficients

array([[ 5.16220157e-03],
       [ 1.55656966e-02],
       [-8.50204675e-03],
       [ 6.65460842e-02],
       [ 6.58907629e-02],
       [ 5.01743882e-03],
       [-5.38601484e-02],
       [-3.50488413e-03],
       [ 6.47945868e-02],
       [ 4.54356263e-02],
       [ 3.98353364e-03],
       [ 2.00775410e-02],
       [ 3.01350011e-02],
       [-2.87115530e-02],
       [ 1.52161964e-02],
       [ 2.72592062e-04],
       [ 1.19448177e-02],
       [-1.82461935e-02],
       [-1.21706420e-02],
       [-4.15110334e-02],
       [ 2.76820391e-03],
       [ 1.77031999e-02],
       [-4.39700067e-03],
       [ 4.49764014e-02],
       [ 9.90916464e-03],
       [ 8.99239081e-04],
       [-1.36219516e-03],
       [ 1.26859357e-02],
       [ 8.26466695e-03],
       [-2.77426972e-02],
       [ 6.10128809e-04],
       [ 1.54084501e-02],
       [-1.32134753e-02],
       [-3.00512492e-02],
       [ 2.97399371e-02],
       [ 1.84087080e-02],
       [ 2.86178752e-03],
       [-1.05768015e-02],
       [-6.5

In [118]:
print(coefficients[:10])

[[ 0.0051622 ]
 [ 0.0155657 ]
 [-0.00850205]
 [ 0.06654608]
 [ 0.06589076]
 [ 0.00501744]
 [-0.05386015]
 [-0.00350488]
 [ 0.06479459]
 [ 0.04543563]]


In [119]:
print(important_words[:5])
print(coefficients.flatten()[:5])

['baby', 'one', 'great', 'love', 'use']
[ 0.0051622   0.0155657  -0.00850205  0.06654608  0.06589076]


In [126]:
coef_table = pd.DataFrame({'word':important_words, 'coefficients': coefficients.flatten()[1:]})  # exclude coef of constant
coef_table_sort=coef_table.sort_values(by=['coefficients'], ascending=False)
print(coef_table[:10])
print(coef_table_sort[:10])

     word  coefficients
0    baby      0.015566
1     one     -0.008502
2   great      0.066546
3    love      0.065891
4     use      0.005017
5   would     -0.053860
6    like     -0.003505
7    easy      0.064795
8  little      0.045436
9    seat      0.003984
        word  coefficients
2      great      0.066546
3       love      0.065891
7       easy      0.064795
8     little      0.045436
22     loves      0.044976
11      well      0.030135
33   perfect      0.029740
10       old      0.020078
34      nice      0.018409
20  daughter      0.017703


In [37]:
coef_table = pd.DataFrame({'word':important_words, 'coefficients': coefficients.flatten()[1:]})  # exclude coef of constant
coef_table_sort=coef_table.sort_values(by=['coefficients'], ascending=True)
print(coef_table_sort[:10])

             word  coefficients
5           would     -0.053860
18        product     -0.041511
96          money     -0.038982
77           work     -0.033070
32           even     -0.030051
105  disappointed     -0.028979
12            get     -0.028712
28           back     -0.027743
113        return     -0.026593
97        monitor     -0.024482
