In [1]:
#import modules.
import numpy as np
import pandas as pd
from time import time

#load dataset.
data = pd.read_csv("amazon_data/Reviews.csv")

DATA EXPLORATION

In [2]:
#view of labels
data.head(1)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...


In [3]:
#total number of reviews in dataset
print('There are {} reviews in this dataset.'.format(data['Id'].count()))

There are 568454 reviews in this dataset.


In [4]:
#total/mean/median number of products reviewed in dataset
print('There are {} unique products reviewed.'.format(data['ProductId'].nunique()))
print('The mean number of reviews for each product is {0:.2f}.'.format(data['ProductId'].value_counts().mean()))
print('The median number of reviews for each product is {0:.2f}.'.format(data['ProductId'].value_counts().median()))


There are 74258 unique products reviewed.
The mean number of reviews for each product is 7.66.
The median number of reviews for each product is 2.00.


In [5]:
#total/mean/median number of users who contribtued reviews in dataset
print('There are {} unique users who contributed to the reviews.'.format(data['UserId'].nunique()))
print('The mean number of reviews contributed by each user is {0:.2f}.'.format(data['UserId'].value_counts().mean()))
print('The median number of reviews contributed by each user is {0:.2f}.'.format(data['UserId'].value_counts().median()))


There are 256059 unique users who contributed to the reviews.
The mean number of reviews contributed by each user is 2.22.
The median number of reviews contributed by each user is 1.00.


In [6]:
#breakdown of reviews by scores
print('The mean Score is {0:.2f}.'.format(data['Score'].mean()))
print('The median Score is {0:.2f}.'.format(data['Score'].median()))
print('There are {} ({}%) reviews with a Score of 5.'.format(len(data[data['Score']==5]), len(data[data['Score']==5])*100/len(data['Score'])))
print('There are {} ({}%) reviews with a Score of 4.'.format(len(data[data['Score']==4]), len(data[data['Score']==4])*100/len(data['Score'])))
print('There are {} ({}%) reviews with a Score of 3.'.format(len(data[data['Score']==3]), len(data[data['Score']==3])*100/len(data['Score'])))
print('There are {} ({}%) reviews with a Score of 2.'.format(len(data[data['Score']==2]), len(data[data['Score']==2])*100/len(data['Score'])))
print('There are {} ({}%) reviews with a Score of 1.'.format(len(data[data['Score']==1]), len(data[data['Score']==1])*100/len(data['Score'])))


The mean Score is 4.18.
The median Score is 5.00.
There are 363122 (63%) reviews with a Score of 5.
There are 80655 (14%) reviews with a Score of 4.
There are 42640 (7%) reviews with a Score of 3.
There are 29769 (5%) reviews with a Score of 2.
There are 52268 (9%) reviews with a Score of 1.


In [7]:
#breakdown of reviews by HelpfulnessNumerator and HelfulnessDenominator
data['Unhelpful'] = data['HelpfulnessDenominator'] - data['HelpfulnessNumerator']

print('{} ({}%) of all reviews have at least one mention of being helpful.'.format(np.count_nonzero(data['HelpfulnessNumerator']), 100*np.count_nonzero(data['HelpfulnessNumerator'])/np.count_nonzero(data['ProductId'])))
print('{} ({}%) of all reviews have at least one mention of being helpful or unhelpful.'.format(np.count_nonzero(data['HelpfulnessDenominator']), 100*np.count_nonzero(data['HelpfulnessDenominator'])/np.count_nonzero(data['ProductId'])))
print('{} ({}%) of all reviews were indicated as more helpful than unhelpful.'.format(len(data[data['HelpfulnessNumerator'] > data['Unhelpful']]),100*len(data[data['HelpfulnessNumerator'] > data['Unhelpful']])/data['Id'].count())) 
print('{} ({}%) of all reviews were indicated as more unhelpful than helpful.'.format(len(data[data['HelpfulnessNumerator'] < data['Unhelpful']]),100*len(data[data['HelpfulnessNumerator'] < data['Unhelpful']])/data['Id'].count())) 


264628 (46%) of all reviews have at least one mention of being helpful.
298402 (52%) of all reviews have at least one mention of being helpful or unhelpful.
226666 (39%) of all reviews were indicated as more helpful than unhelpful.
50113 (8%) of all reviews were indicated as more unhelpful than helpful.


DATA PREPROCESSING

In [8]:
#assuming a Score of 4 or 5 is a positive review, while a score of 1, 2 or 3 is a negative review
good_score = len(data[data['Score']==5]) + len(data[data['Score']==4])
bad_score = len(data[data['Score']==3]) + len(data[data['Score']==2]) + len(data[data['Score']==1])

#breakdown of reviews by sentiment based on good or bad scores.
print('There are {} ({}%) positive reviews in the dataset.'.format(good_score, good_score*100/(good_score+bad_score)))
print('There are {} ({}%) negative reviews in the dataset.'.format(bad_score, bad_score*100/(good_score+bad_score)))

#classifying scores into positive('0') and negative('1') sentiments
data['Sentiment'] = data.Score.map({5:0, 4:0, 3:1, 2:1, 1:1})

There are 443777 (78%) positive reviews in the dataset.
There are 124677 (21%) negative reviews in the dataset.


In [9]:
#extract hold-out set for testing final model
data1 = data.iloc[:397917]
data2 = data.iloc[397917:]
print('There are {} reviews in the train/test dataset.'.format(data1['Id'].count()))
print('There are {} reviews in the hold out dataset.'.format(data2['Id'].count()))

There are 397917 reviews in the train/test dataset.
There are 170537 reviews in the hold out dataset.


In [10]:
#split into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data1['Text'], data1['Sentiment'], test_size=0.3, random_state=7)

print('Number of samples in the dataset: {}'.format(data1.shape[0]))
print('Number of samples in the training set: {}'.format(X_train.shape[0]))
print('Number of samples in the test set: {}'.format(X_test.shape[0]))


Number of samples in the dataset: 397917
Number of samples in the training set: 278541
Number of samples in the test set: 119376


In [11]:
#create vectors for neural network
import keras
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(nb_words=10000)

X_train_nn = tokenizer.sequences_to_matrix(X_train)
X_test_nn = tokenizer.sequences_to_matrix(X_test)

# One-hot encode the target variable
from keras.utils import np_utils
num_classes = 2
y_train_nn = np_utils.to_categorical(y_train, num_classes)
y_test_nn = np_utils.to_categorical(y_test, num_classes)


Using TensorFlow backend.


In [12]:
#build CountVectorizer object for NB, AdaBoost and XGBoost classifiers
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()

#transform and fit training data and return frequency matrix
X_train = count_vector.fit_transform(X_train)

#transform testing data and return frequency matrix
X_test = count_vector.transform(X_test)


IMPLEMENTING NAIVE BAYES, ADABOOST AND XGBOOST CLASSIFIERS

In [13]:
#build classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

nb = MultinomialNB()
ada = AdaBoostClassifier()
xgb = XGBClassifier()

#create function to fit, predict and evaluate models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def train_pred_eval(clf):
    #fit
    train_start = time()
    clf.fit(X_train, y_train)
    train_end = time()
    print('Trained {} model in {:.4f} seconds.'.format(clf, train_end - train_start))
    #predict
    pred_start = time()
    pred_test = clf.predict(X_test)
    pred_end = time()
    pred_train = clf.predict(X_train)
    print('Made {} predictions in {:.4f} seconds.'.format(clf, pred_end - pred_start))
    print('*******')
    #evaluate
    train_accuracy = accuracy_score(y_train, pred_train)
    train_precision = precision_score(y_train, pred_train)
    train_recall = recall_score(y_train, pred_train)
    train_f1 = f1_score(y_train, pred_train)
    test_accuracy = accuracy_score(y_test, pred_test)
    test_precision = precision_score(y_test, pred_test)
    test_recall = recall_score(y_test, pred_test)
    test_f1 = f1_score(y_test, pred_test)
    print('SCORES FOR {} CLASSIFIER'.format(clf))
    print('Training Accuracy score: {}'.format(train_accuracy))
    print('Training Precision score: {}'.format(train_precision))
    print('Training Recall score: {}'.format(train_recall))
    print('Training F1 score: {}'.format(train_f1))
    print('-------')
    print('Testing Accuracy score: {}'.format(test_accuracy))
    print('Testing Precision score: {}'.format(test_precision))
    print('Testing Recall score: {}'.format(train_recall))
    print('Testing F1 score: {}'.format(test_f1))
    print('------------------------------')
    print('------------------------------')

#fit, predict and evaluate models
#nb is the last classifier in loop
for e in [ada, xgb, nb]:
    train_pred_eval(e)
    

Trained AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None) model in 85.0817 seconds.
Made AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None) predictions in 2.6566 seconds.
*******
SCORES FOR AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None) CLASSIFIER
Training Accuracy score: 0.827881712208
Training Precision score: 0.67730054985
Training Recall score: 0.423832245918
Training F1 score: 0.521393630828
-------
Testing Accuracy score: 0.828767926551
Testing Precision score: 0.676046961995
Testing Recall score: 0.423832245918
Testing F1 score: 0.522216768343
------------------------------
------------------------------
Trained XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
 

IMPLEMENTING NEURAL NETWORK

In [14]:
#build neural network model architecture
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

model = Sequential()
model.add(Dense(32, activation='relu', input_dim=X_train_nn.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(2, activation='softmax'))

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_1 (Dense)                  (None, 32)            320032      dense_input_1[0][0]              
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 64)            2112        dense_1[0][0]                    
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 64)            0           dense_2[0][0]                    
____________________________________________________________________________________________________
dense_3 (Dense)                  (None, 2)             130         dropout_1[0][0]                  
Total params: 322,274
Trainable params: 322,274
Non-trainable params: 0
___________________

In [15]:
#compile neural network model 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'precision', 'recall'])

In [16]:
#create function to fit and evaluate neural network 
def train_eval_nn(epoch,batch):
    #fit model
    train_start = time()
    model.fit(X_train_nn, y_train_nn, nb_epoch=epoch, batch_size=batch)
    train_end = time()
    print('Trained Neural Network in {:.4f} seconds.'.format(train_end - train_start))
    #evaluate model
    pred_start = time()
    scores_nn = model.evaluate(X_test_nn, y_test_nn)
    pred_end = time()
    print('Made Neural Network predictions in {:.4f} seconds.'.format(pred_end - pred_start))
    print('*******')
    print('SCORES FOR NEURAL NETWORK')
    print('Testing Accuracy score: {0:.4f}'.format(scores_nn[1]))
    print('Testing Rrecision score: {0:.4f}'.format(scores_nn[2]))
    print('Testing Recall score: {0:.4f}'.format(scores_nn[3]))

#train and evaluate neural network
train_eval_nn(3,32)
    

Epoch 1/3
Epoch 2/3
Epoch 3/3
Trained Neural Network in 172.7064 seconds.
Made Neural Network predictions in 14.2308 seconds.
*******
SCORES FOR NEURAL NETWORK
Testing Accuracy score: 0.7800
Testing Rrecision score: 0.7800
Testing Recall score: 0.7800


REFINEMENT(I) - INCREASE FEATURES BY COMBINING 'SUMMARY' AND 'TEXT'

In [17]:
#create new column with combined text from 'Summary' and 'Text'
data1['Summary'] = data1['Summary'].astype(str)
data1['text_all'] = data1['Summary'] + " " + data1['Text']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
#split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data1['text_all'], data1['Sentiment'], test_size=0.3, random_state=7)


In [19]:
#transform and fit training data and return frequency matrix
X_train = count_vector.fit_transform(X_train)

#transform testing data and return frequency matrix
X_test = count_vector.transform(X_test)

In [20]:
#implement NB classifier after refinement(I)
train_pred_eval(nb)


Trained MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) model in 0.1844 seconds.
Made MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) predictions in 0.0574 seconds.
*******
SCORES FOR MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) CLASSIFIER
Training Accuracy score: 0.901472314668
Training Precision score: 0.764138400173
Training Recall score: 0.802187814458
Training F1 score: 0.782700956483
-------
Testing Accuracy score: 0.891577871599
Testing Precision score: 0.742461762564
Testing Recall score: 0.802187814458
Testing F1 score: 0.759061039856
------------------------------
------------------------------


REFINEMENT(II) - ACCENTUATE FEATURES BY INCREASING 'SUMMARY' to 'TEXT' RATIO AND REPLICATING DATASET

In [21]:
#accentuate words in 'Summary' by creating a new column with combined text from 'Summary' and 'Text' with the ratio of 6:1
data1['text_all'] = data1['Summary'] + " " + data1['Summary'] + " " + data1['Summary'] + " " + data1['Summary'] + " " + data1['Summary'] + " " + data1['Summary'] + " " + data1['Text']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [22]:
#accentuate most frequently occuring words by replicating "text_all"
data1['text_all'] = data1['text_all'] + " " + data1['text_all'] + " " + data1['text_all'] + " " + data1['text_all'] + " " + data1['text_all'] + " " + data1['text_all'] + " " + data1['text_all'] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [23]:
#split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data1['text_all'], data1['Sentiment'], test_size=0.3, random_state=7)


In [24]:
#transform and fit training data and return frequency matrix
X_train = count_vector.fit_transform(X_train)

#transform testing data and return frequency matrix
X_test = count_vector.transform(X_test)

In [25]:
#implement NB classifier after refinement(II)
train_pred_eval(nb)


Trained MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) model in 0.1631 seconds.
Made MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) predictions in 0.0500 seconds.
*******
SCORES FOR MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) CLASSIFIER
Training Accuracy score: 0.903479200549
Training Precision score: 0.72948577319
Training Recall score: 0.895867822248
Training F1 score: 0.804160808852
-------
Testing Accuracy score: 0.890890966358
Testing Precision score: 0.703662665436
Testing Recall score: 0.895867822248
Testing F1 score: 0.778294098623
------------------------------
------------------------------


REFINEMENT(III) - FILTER SAMPLES BY HELPFULNESSNUMERATOR AND HELPFULNESSDENOMINATOR

In [26]:
#remove reviews without a single HelpfulnessNumerator
data_helpful = data1[data1['HelpfulnessNumerator'] > 0]

In [27]:
#split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_helpful['text_all'], data_helpful['Sentiment'], test_size=0.3, random_state=7)

print('Number of samples in the dataset: {}'.format(data_helpful.shape[0]))
print('Number of samples in the training set: {}'.format(X_train.shape[0]))
print('Number of samples in the test set: {}'.format(X_test.shape[0]))


Number of samples in the dataset: 186054
Number of samples in the training set: 130237
Number of samples in the test set: 55817


In [28]:
#transform and fit training data and return frequency matrix
X_train = count_vector.fit_transform(X_train)

#transform testing data and return frequency matrix
X_test = count_vector.transform(X_test)

In [29]:
#implement NB classifier after refinement(III)
train_pred_eval(nb)

Trained MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) model in 0.0848 seconds.
Made MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) predictions in 0.0263 seconds.
*******
SCORES FOR MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) CLASSIFIER
Training Accuracy score: 0.915377350523
Training Precision score: 0.788936011309
Training Recall score: 0.904913418589
Training F1 score: 0.842954244268
-------
Testing Accuracy score: 0.894404930398
Testing Precision score: 0.750171671141
Testing Recall score: 0.904913418589
Testing F1 score: 0.803060678963
------------------------------
------------------------------


TESTING FINAL MODEL WITH HOLD OUT SET

In [30]:
#transform hold out testing data and return frequency matrix
holdout_test = count_vector.transform(data2['Text'])

#predict 
pred_start = time()
pred_test = nb.predict(holdout_test)
pred_end = time()
print('Made hold out predictions in {:.4f} seconds.'.format(pred_end - pred_start))
print('*******')

#evaluate
test_accuracy = accuracy_score(data2['Sentiment'], pred_test)
test_precision = precision_score(data2['Sentiment'], pred_test)
test_recall = recall_score(data2['Sentiment'], pred_test)
test_f1 = f1_score(data2['Sentiment'], pred_test)
print('SCORES FOR HOLD OUT DATASET')
print('Testing Accuracy score: {}'.format(test_accuracy))
print('Testing Precision score: {}'.format(test_precision))
print('Testing Recall score: {}'.format(test_recall))
print('Testing F1 score: {}'.format(test_f1))

Made hold out predictions in 0.0652 seconds.
*******
SCORES FOR HOLD OUT DATASET
Testing Accuracy score: 0.856705582953
Testing Precision score: 0.63839215774
Testing Recall score: 0.775024453864
Testing F1 score: 0.700104313677


ADDENDUM: REFINEMENT FOR NEURAL NETWORK

In [31]:
#remove unhelpful reviews from dataset
data_helpful = data1[data1['HelpfulnessNumerator'] >= data1['Unhelpful']]


In [32]:
#split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_helpful['text_all'], data_helpful['Sentiment'], test_size=0.3, random_state=7)

print('Number of samples in the dataset: {}'.format(data_helpful.shape[0]))
print('Number of samples in the training set: {}'.format(X_train.shape[0]))
print('Number of samples in the test set: {}'.format(X_test.shape[0]))


Number of samples in the dataset: 362853
Number of samples in the training set: 253997
Number of samples in the test set: 108856


In [33]:
#create vectors for neural network
X_train_nn = tokenizer.sequences_to_matrix(X_train)
X_test_nn = tokenizer.sequences_to_matrix(X_test)

# One-hot encoding the output
y_train_nn = np_utils.to_categorical(y_train, num_classes)
y_test_nn = np_utils.to_categorical(y_test, num_classes)


In [34]:
#fit and evaluate neural network after refinement
train_eval_nn(3,32)


Epoch 1/3
Epoch 2/3
Epoch 3/3
Trained Neural Network in 155.9739 seconds.
*******
SCORES FOR NEURAL NETWORK
Testing Accuracy score: 0.8161
Testing Rrecision score: 0.8161
Testing Recall score: 0.8161


TESTING NEURAL NETWORK WITH HOLDOUT SET

In [36]:
#create vectors for neural network
holdout_test_X_nn = tokenizer.sequences_to_matrix(data2['Text'])

# One-hot encode the target variable
num_classes = 2
holdout_test_y_nn = np_utils.to_categorical(data2['Sentiment'], num_classes)



In [37]:
#evaluate model
pred_start = time()
scores_nn = model.evaluate(holdout_test_X_nn, holdout_test_y_nn)
pred_end = time()
print('Made Neural Network predictions in {:.4f} seconds.'.format(pred_end - pred_start))
print('*******')
print('SCORES FOR NEURAL NETWORK')
print('Testing Accuracy score: {0:.4f}'.format(scores_nn[1]))
print('Testing Rrecision score: {0:.4f}'.format(scores_nn[2]))
print('Testing Recall score: {0:.4f}'.format(scores_nn[3]))



*******
SCORES FOR NEURAL NETWORK
Testing Accuracy score: 0.7842
Testing Rrecision score: 0.7842
Testing Recall score: 0.7842
