In [1]:
import sys

import numpy as np 
import pandas as pd
# Show full cell contents instead of truncating long strings.
try:
    pd.set_option('display.max_colwidth', None)  # pandas >= 1.0 spelling of "no limit"
except ValueError:
    pd.set_option('display.max_colwidth', -1)    # older pandas only accepts the -1 sentinel
# Print arrays in full. NaN is not a valid threshold on newer NumPy releases;
# sys.maxsize is accepted by every version.
np.set_printoptions(threshold=sys.maxsize)

In [2]:
# Load the review dataset and the JSON list of words used as features.
import json

products = pd.read_csv('amazon_baby_subset.csv')

# The file handle gets its own name so it is not confused with the loaded list.
with open('important_words.json') as words_file:
    important_words = json.load(words_file)
    


In [3]:
# display first 10 names in the dataframe
# (.head avoids the tuple key produced by the stray trailing comma in `[0:10,]`)
products['name'].head(10)

0    Stop Pacifier Sucking without tears with Thumbuddy To Love's Binky Fairy Puppet and Adorable Book
1    Nature's Lullabies Second Year Sticker Calendar                                                  
2    Nature's Lullabies Second Year Sticker Calendar                                                  
3    Lamaze Peekaboo, I Love You                                                                      
4    SoftPlay Peek-A-Boo Where's Elmo A Children's Book                                               
5    Our Baby Girl Memory Book                                                                        
6    Hunnt&reg; Falling Flowers and Birds Kids Nursery Home Decor Vinyl Mural Art Wall Paper Stickers 
7    Blessed By Pope Benedict XVI Divine Mercy Full Color Medal                                       
8    Cloth Diaper Pins Stainless Steel Traditional Safety Pin (Black)                                 
9    Cloth Diaper Pins Stainless Steel Traditional Safety Pin (Black)    

In [4]:
# Number of non-null entries in every column, per sentiment class.
sentiment_groups = products.groupby('sentiment')
sentiment_groups.count()

Unnamed: 0_level_0,name,review,rating
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,26461,26393,26493
1,26521,26438,26579


In [5]:
# Replace missing (n/a) values in the review column with the empty string.

products = products.fillna(value={'review': ''})

# remove punctuation

def remove_punctuation(text):
    """Return `text` with every ASCII punctuation character removed.

    Works for byte and unicode strings on Python 2 and for `str` on Python 3
    (the two-argument `translate(None, chars)` form only exists on Python 2 `str`).

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        `text` without any character listed in `string.punctuation`.
    """
    import string
    punctuation = frozenset(string.punctuation)
    return ''.join(char for char in text if char not in punctuation)

products['review_clean'] = products['review'].apply(remove_punctuation)


# For each word in important_words, count how many times it occurs in the cleaned
# review and store that count in a column named after the word, so products ends
# up with one count column per important word.

# Split every review into tokens once, rather than once per important word.
review_tokens = products['review_clean'].str.split()

for word in important_words:
    products[word] = review_tokens.apply(lambda tokens: tokens.count(word))

In [6]:
# Flag each review: 1 if the word 'perfect' occurs at least once, 0 otherwise.

products['contains_perfect'] = (products['perfect'] >= 1).astype(int)

# Number of reviews that contain 'perfect'.

products['contains_perfect'].sum()

2955

In [19]:
# write a function that accepts three parameters: dataframe, features, label

def get_numpy_data(dataframe, features, label):
    """Extract a feature matrix (with intercept column) and a label array.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table. It is not modified by this call.
    features : sequence of str
        Names of the columns to use as features (list, tuple, ...).
    label : str
        Name of the column holding the target values.

    Returns
    -------
    features_matrix : numpy.ndarray
        2-D array whose first column is the constant 1, followed by one column
        per entry of `features`, in the given order.
    label_array : numpy.ndarray
        The values of `dataframe[label]`.
    """
    columns = ['constant'] + list(features)
    # assign() works on a copy, so the intercept column never leaks into the caller's frame.
    with_intercept = dataframe.assign(constant=1)
    features_matrix = with_intercept[columns].values
    label_array = dataframe[label].values
    return features_matrix, label_array


# Write a function to produce a probabilistic estimate for p(y_i = +1|x,w)

def predict_probability(feature_matrix, coefficients):
    """Apply the logistic (sigmoid) link to the linear scores.

    Computes 1 / (1 + exp(-score)) element-wise, where
    score = feature_matrix.dot(coefficients).
    """
    linear_scores = feature_matrix.dot(coefficients)
    return 1 / (1 + np.exp(-linear_scores))
                     
# write a function to produce derivative 
  
def feature_derivative(errors,feature):
    """Return the dot product of the transposed `feature` with `errors`.

    For a 1-D `feature` this is a scalar; for a 2-D feature matrix it is one
    value per column.
    """
    return feature.T.dot(errors)
# write a function that computes the log likelihood
                     
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Log likelihood of the observed labels under the logistic model.

    Evaluates  sum_i [ (1[y_i = +1] - 1) * score_i - log(1 + exp(-score_i)) ]
    with score = feature_matrix.dot(coefficients).

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        2-D array with one row per observation.
    sentiment : array-like
        Labels; entries equal to +1 are treated as the positive class.
    coefficients : numpy.ndarray
        One weight per column of `feature_matrix`.

    Returns
    -------
    float
        The log likelihood (always <= 0).
    """
    indicator = (np.asarray(sentiment) == +1)
    scores = feature_matrix.dot(coefficients)
    # log(1 + exp(-s)) via logaddexp: exp(-s) alone overflows to inf for very
    # negative scores, which would turn the whole sum into -inf.
    log_one_plus_exp = np.logaddexp(0., -scores)
    lp = np.sum((indicator - 1) * scores - log_one_plus_exp)
    return lp
                     
# write a function to fit a logistic regression model using gradient ascent
                     
def logistic_regression(feature_matrix,sentiment, initial_coefficients,step_size,max_iter):
    """Fit logistic-regression coefficients by batch gradient ascent.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        2-D array with one row per observation and one column per feature.
    sentiment : array-like
        Labels; entries equal to +1 are treated as the positive class.
    initial_coefficients : array-like
        Starting weights, one per column of `feature_matrix`. Not modified.
    step_size : float
        Multiplier applied to the gradient at every update.
    max_iter : int
        Number of gradient-ascent iterations to run.

    Returns
    -------
    numpy.ndarray
        The coefficients after `max_iter` updates.
    """
    # Work on a float copy: the caller's array stays untouched and integer
    # starting values cannot truncate the updates.
    coefficients = np.array(initial_coefficients, dtype=float)
    # The labels never change, so the indicator is computed once.
    indicator = (np.asarray(sentiment) == +1)
    # Field width of the iteration counter in the progress line.
    counter_width = int(np.ceil(np.log10(max(max_iter, 1))))

    for itr in range(max_iter):
        # predict the probability using the current coefficients
        predictions = predict_probability(feature_matrix, coefficients)
        # compute the errors
        errors = indicator - predictions
        # One update for all coefficients at once: every partial derivative
        # uses the same `errors`, so this matches a per-coefficient loop.
        coefficients += step_size * feature_derivative(errors, feature_matrix)

        # checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            # Single parenthesised argument: valid on both Python 2 and 3.
            print('iteration %*d: log likelihood of observed labels = %.8f' % (counter_width, itr, lp))
    return coefficients

In [20]:
# Feature matrix (intercept + one column per important word) and the label vector.
feature_matrix, sentiment = get_numpy_data(
    dataframe=products,
    features=important_words,
    label='sentiment',
)

In [21]:
# One starting weight per column of the feature matrix (intercept included),
# derived from the data instead of a hard-coded width.
initial_coefficients = np.zeros(feature_matrix.shape[1])
step_size = 1e-7
max_itr = 301

In [45]:
# Pass the iteration count defined above (`max_itr`); no visible cell defines `max_iter`.
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_itr)

iteration   0: log likelihood of observed labels = -35264.09713332
iteration   1: log likelihood of observed labels = -35259.68571379
iteration   2: log likelihood of observed labels = -35255.27786368
iteration   3: log likelihood of observed labels = -35250.87357844
iteration   4: log likelihood of observed labels = -35246.47285354
iteration   5: log likelihood of observed labels = -35242.07568446
iteration   6: log likelihood of observed labels = -35237.68206667
iteration   7: log likelihood of observed labels = -35233.29199568
iteration   8: log likelihood of observed labels = -35228.90546699
iteration   9: log likelihood of observed labels = -35224.52247612
iteration  10: log likelihood of observed labels = -35220.14301858
iteration  11: log likelihood of observed labels = -35215.76708993
iteration  12: log likelihood of observed labels = -35211.39468570
iteration  13: log likelihood of observed labels = -35207.02580145
iteration  14: log likelihood of observed labels = -35202.6604

In [47]:
# Raw linear scores and the matching class probabilities for every review.
scores = feature_matrix.dot(coefficients)
predictions = predict_probability(feature_matrix, coefficients)

In [48]:
# Collect the true label, predicted probability and raw score side by side.
prediction_columns = {
    'sentiment': sentiment,
    'prediction': predictions,
    'scores': scores,
}
prediction_df = pd.DataFrame(prediction_columns)

In [60]:
# Despite its name, 'correct_flag' holds the predicted class: +1 for a positive score, -1 otherwise.
prediction_df['correct_flag'] = np.where(prediction_df['scores'] > 0, 1, -1)

In [62]:
# correctly predicted
# 'flag' is zero exactly where the predicted class equals the true sentiment.
prediction_df['flag'] = prediction_df['sentiment'] - prediction_df['correct_flag']

num_correct = int((prediction_df['flag'] == 0).sum())
num_reviews = feature_matrix.shape[0]
print(num_correct)
print(num_reviews)

# Accuracy computed from the data rather than typed in by hand;
# float() keeps true division on Python 2.
num_correct / float(num_reviews)

40059
53072


0.7548047934880916

In [37]:
# Drop the intercept so the remaining weights line up with important_words.
# NOTE: this rebinds `coefficients`, so re-running the cell drops one more entry each time.
coefficients = list(coefficients[1:])

# (word, coefficient) pairs ordered from the largest coefficient to the smallest.
word_coefficients_tuples = sorted(
    zip(important_words, coefficients),
    key=lambda pair: pair[1],
    reverse=True
)