In [1]:
import numpy as np
import os
import pandas as pd

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

#### First, load data and encode probabilities for int dense matrices ####

In [2]:
# Obtain final dataset.
my_data = pd.read_csv('../../data/results/amazon_final.csv')

In [3]:
def replace_probabilities(prob):
    """
    replace_probabilities encodes raw probabilities as integers.
    """
    prob *= 10
    return int(prob)

In [4]:
# Obtain a copy of data with probabilities replaced with integers.
data = my_data.copy()

data['joy'] = data['joy'].apply(lambda x: replace_probabilities(x))
data['sadness'] = data['sadness'].apply(lambda x: replace_probabilities(x))
data['fear'] = data['fear'].apply(lambda x: replace_probabilities(x))
data['anger'] = data['anger'].apply(lambda x: replace_probabilities(x))
data['neutral'] = data['neutral'].apply(lambda x: replace_probabilities(x))

# Encode outputs.
data['PREDICTION'] = data['PREDICTION'].apply({'neutral':0, 'fear':1, 'sadness':2, 'joy':3,'anger':4}.get)
data['VERACITY'] = data['VERACITY'].apply({'Fake':0,'Real':1}.get)

data

Unnamed: 0,index,REVIEW_TEXT,VERACITY,PREDICTION,joy,fear,neutral,anger,sadness
0,0,"When least you think so, this product will sav...",0,0,1,0,6,1,0
1,1,Lithium batteries are something new introduced...,0,4,1,1,0,6,0
2,2,I purchased this swing for my baby. She is 6 m...,0,0,2,0,4,1,1
3,3,I was looking for an inexpensive desk calcolat...,0,1,1,4,1,2,0
4,5,I m not sure what this is supposed to be but I...,0,0,2,0,6,0,0
...,...,...,...,...,...,...,...,...,...
13980,20964,These shoes are absolutely amazing. They are t...,1,3,7,0,1,0,0
13981,20966,"My son had ordered it but when it arrived, he ...",1,4,0,0,0,7,0
13982,20967,These shoes fit well. I purchased to wear to ...,1,3,6,0,1,0,0
13983,20968,"These slippers are soft, warm, and very cozy. ...",1,3,5,0,2,1,0


In [5]:
# Store main output separately.
y = data['VERACITY']

### Define the pipeline ###

In [6]:
# Define a pipeline for feature extraction and apply to review texts.
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer())])
x = pipe.fit_transform(data['REVIEW_TEXT'])

#### Create a matrix with added integer emotion classification results ####

In [7]:
# Concatenate emotion classification results with textual features.
dense_matrix_int = x.todense()
dense_matrix_int = np.insert(dense_matrix_int,dense_matrix_int.shape[1],[data['PREDICTION'], data['joy'], data['sadness'], data['fear'], data['anger'], data['neutral']],axis=1)
dense_matrix_int

matrix([[0., 0., 0., ..., 0., 1., 6.],
        [0., 0., 0., ..., 1., 6., 0.],
        [0., 0., 0., ..., 0., 1., 4.],
        ...,
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 1., 2.],
        [0., 0., 0., ..., 0., 1., 5.]])

#### Create a matrix with added float emotion classification results ####

In [8]:
# Concatenate emotion classification results with textual features.
dense_matrix_f = x.todense()
dense_matrix_float = dense_matrix_f.astype(float)
dense_matrix_float = np.insert(dense_matrix_float,dense_matrix_float.shape[1],[data['PREDICTION'], my_data['joy'], my_data['sadness'], my_data['fear'], my_data['anger'], my_data['neutral']],axis=1)

### Classify based on textual features using pipeline ###

In [9]:
# Extract features.
x = pipe.fit_transform(data['REVIEW_TEXT'])

In [10]:
# Split text into training and test sets. Pass random state to keep same split across models.
x_train, x_test, y_train, y_test =  train_test_split(x, y, test_size = 0.15, random_state = 7)

In [11]:
# Fit Multinomial Naive Bayes model.
naive_no_emo = MultinomialNB()
naive_no_emo.fit(x_train, y_train)
print('Trainining accuracy: ', naive_no_emo.score(x_train, y_train))
# Make predictions.
y_pred_no_emo = naive_no_emo.predict(x_test)
print('Test accuracy: ', naive_no_emo.score(x_test, y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_no_emo))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_no_emo, digits=4))

Trainining accuracy:  0.800622528812989
Test accuracy:  0.6124880838894184

Confusion matrix:
[[947 165]
 [648 338]]

Classification report:
              precision    recall  f1-score   support

           0     0.5937    0.8516    0.6997      1112
           1     0.6720    0.3428    0.4540       986

    accuracy                         0.6125      2098
   macro avg     0.6328    0.5972    0.5768      2098
weighted avg     0.6305    0.6125    0.5842      2098



In [12]:
# Fit Bernoulli Naive Bayes model.
naive_b_no_emo = BernoulliNB()
naive_b_no_emo.fit(x_train, y_train)
print('Training accuracy: ', naive_b_no_emo.score(x_train, y_train))
# Make predictions.
y_pred_b_no_emo = naive_b_no_emo.predict(x_test)
print('Test accuracy: ', naive_b_no_emo.score(x_test, y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_b_no_emo))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_b_no_emo, digits=4))

Training accuracy:  0.7005972911584083
Test accuracy:  0.6005719733079123

Confusion matrix:
[[905 207]
 [631 355]]

Classification report:
              precision    recall  f1-score   support

           0     0.5892    0.8138    0.6835      1112
           1     0.6317    0.3600    0.4587       986

    accuracy                         0.6006      2098
   macro avg     0.6104    0.5869    0.5711      2098
weighted avg     0.6092    0.6006    0.5778      2098



In [13]:
# Fit Gaussian Naive Bayes model.
naive_g_no_emo = GaussianNB()
naive_g_no_emo.fit(x_train.todense(), y_train)
print('Training accuravy: ', naive_g_no_emo.score(x_train.todense(), y_train))
# Make predictions.
y_pred_g_no_emo = naive_g_no_emo.predict(x_test.todense())
print('Test accuracy: ', naive_g_no_emo.score(x_test.todense(), y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_g_no_emo))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_g_no_emo, digits=4))

Training accuravy:  0.8286363253974931
Test accuracy:  0.5638703527168732

Confusion matrix:
[[828 284]
 [631 355]]

Classification report:
              precision    recall  f1-score   support

           0     0.5675    0.7446    0.6441      1112
           1     0.5556    0.3600    0.4369       986

    accuracy                         0.5639      2098
   macro avg     0.5615    0.5523    0.5405      2098
weighted avg     0.5619    0.5639    0.5467      2098



In [14]:
# Fit a SVM.
naive_svm_no_emo = LinearSVC()
naive_svm_no_emo.fit(x_train, y_train)
print('Training accuracy: ', naive_svm_no_emo.score(x_train, y_train))
# Make predictions.
y_pred_svm_no_emo = naive_svm_no_emo.predict(x_test)
print('Test accuracy: ', naive_svm_no_emo.score(x_test, y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_svm_no_emo))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_svm_no_emo, digits=4))

Training accuracy:  0.9256330445023976
Test accuracy:  0.6320305052430887

Confusion matrix:
[[724 388]
 [384 602]]

Classification report:
              precision    recall  f1-score   support

           0     0.6534    0.6511    0.6523      1112
           1     0.6081    0.6105    0.6093       986

    accuracy                         0.6320      2098
   macro avg     0.6308    0.6308    0.6308      2098
weighted avg     0.6321    0.6320    0.6321      2098



In [15]:
# Fit a LR model.
lr_no_emo = LogisticRegression()
lr_no_emo.fit(x_train, y_train)
print('Training accuracy: ', lr_no_emo.score(x_train, y_train))
y_pred_lr_no_emo = lr_no_emo.predict(x_test)
# Make predictions.
print('Test accuracy: ', lr_no_emo.score(x_test, y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_lr_no_emo))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_lr_no_emo, digits=4))

Training accuracy:  0.7934718600151426
Test accuracy:  0.6406101048617732

Confusion matrix:
[[736 376]
 [378 608]]

Classification report:
              precision    recall  f1-score   support

           0     0.6607    0.6619    0.6613      1112
           1     0.6179    0.6166    0.6173       986

    accuracy                         0.6406      2098
   macro avg     0.6393    0.6393    0.6393      2098
weighted avg     0.6406    0.6406    0.6406      2098



### Classify based on textual features with integer-encoded emotions using pipeline ###

In [16]:
x_train, x_test, y_train, y_test =  train_test_split(dense_matrix_int, y, test_size = 0.15, random_state = 7)

In [17]:
# Fit Multinomial Naive Bayes model.
naive_extra = MultinomialNB()
naive_extra.fit(x_train, y_train)
print('Training accuracy: ', naive_extra.score(x_train, y_train))
# Make predictions.
y_pred_extra = naive_extra.predict(x_test)
print('Test accuracy: ', naive_extra.score(x_test, y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_extra))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_extra, digits=4))

Training accuracy:  0.6063767140573736
Test accuracy:  0.5838894184938036

Confusion matrix:
[[770 342]
 [531 455]]

Classification report:
              precision    recall  f1-score   support

           0     0.5919    0.6924    0.6382      1112
           1     0.5709    0.4615    0.5104       986

    accuracy                         0.5839      2098
   macro avg     0.5814    0.5770    0.5743      2098
weighted avg     0.5820    0.5839    0.5781      2098



In [18]:
# Fit Bernoulli Naive Bayes model.
naive_bern = BernoulliNB()
naive_bern.fit(x_train,y_train)
print('Training accuracy: ', naive_bern.score(x_train, y_train))
# Make predictions.
y_pred_bern = naive_bern.predict(x_test)
print('Test accuracy: ', naive_bern.score(x_test, y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_bern))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_bern, digits=4))

Training accuracy:  0.6993354084293766
Test accuracy:  0.5967588179218303

Confusion matrix:
[[898 214]
 [632 354]]

Classification report:
              precision    recall  f1-score   support

           0     0.5869    0.8076    0.6798      1112
           1     0.6232    0.3590    0.4556       986

    accuracy                         0.5968      2098
   macro avg     0.6051    0.5833    0.5677      2098
weighted avg     0.6040    0.5968    0.5744      2098



In [19]:
# Fit Gaussian Naive Bayes model.
naive_g = GaussianNB()
naive_g.fit(x_train,y_train)
print('Training accuracy: ', naive_g.score(x_train, y_train))
# Make predictions.
y_pred_g = naive_g.predict(x_test)
print('Test accuracy: ', naive_g.score(x_test, y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_g))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_g, digits=4))

Training accuracy:  0.8463026836039371
Test accuracy:  0.5610104861773118

Confusion matrix:
[[802 310]
 [611 375]]

Classification report:
              precision    recall  f1-score   support

           0     0.5676    0.7212    0.6352      1112
           1     0.5474    0.3803    0.4488       986

    accuracy                         0.5610      2098
   macro avg     0.5575    0.5508    0.5420      2098
weighted avg     0.5581    0.5610    0.5476      2098



In [20]:
# Fit a SVM.
svm = LinearSVC()
svm.fit(x_train,y_train)
print('Training accuracy: ', svm.score(x_train, y_train))
y_pred_svm = svm.predict(x_test)
# Make predictions.
print('Test accuracy: ', svm.score(x_test, y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_svm))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_svm, digits=4))



Training accuracy:  0.9261377975940103
Test accuracy:  0.6358436606291706

Confusion matrix:
[[729 383]
 [381 605]]

Classification report:
              precision    recall  f1-score   support

           0     0.6568    0.6556    0.6562      1112
           1     0.6123    0.6136    0.6130       986

    accuracy                         0.6358      2098
   macro avg     0.6346    0.6346    0.6346      2098
weighted avg     0.6359    0.6358    0.6359      2098



In [21]:
# Fit a LR model.
lr_extra = LogisticRegression()
lr_extra.fit(x_train, y_train)
print('Training accuracy: ', lr_extra.score(x_train, y_train))
# Make predictions.
y_pred_lr_extra = lr_extra.predict(x_test)
print('Test accuracy: ', lr_extra.score(x_test, y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_lr_extra))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_lr_extra, digits=4))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training accuracy:  0.7724404811979474
Test accuracy:  0.6434699714013346

Confusion matrix:
[[737 375]
 [373 613]]

Classification report:
              precision    recall  f1-score   support

           0     0.6640    0.6628    0.6634      1112
           1     0.6204    0.6217    0.6211       986

    accuracy                         0.6435      2098
   macro avg     0.6422    0.6422    0.6422      2098
weighted avg     0.6435    0.6435    0.6435      2098



### Classify based on textual features with float emotions using pipeline ###

In [22]:
x_train, x_test, y_train, y_test =  train_test_split(dense_matrix_float, y, test_size = 0.15, random_state = 7)

In [23]:
# Fit Multinomial Naive Bayes model.
naive_extra_float = MultinomialNB()
naive_extra_float.fit(x_train, y_train)
print('Training accuracy: ', naive_extra_float.score(x_train, y_train))
# Make predictions.
y_pred_extra_float = naive_extra_float.predict(x_test)
print('Test accuracy: ', naive_extra_float.score(x_test, y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_extra_float))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_extra_float, digits=4))

Training accuracy:  0.7669723227054766
Test accuracy:  0.6091515729265967

Confusion matrix:
[[977 135]
 [685 301]]

Classification report:
              precision    recall  f1-score   support

           0     0.5878    0.8786    0.7044      1112
           1     0.6904    0.3053    0.4233       986

    accuracy                         0.6092      2098
   macro avg     0.6391    0.5919    0.5639      2098
weighted avg     0.6360    0.6092    0.5723      2098



In [24]:
# Fit Bernoulli Naive Bayes model.
naive_bern_float = BernoulliNB()
naive_bern_float.fit(x_train,y_train)
print('Training accuracy: ', naive_bern_float.score(x_train, y_train))
# Make predictions.
y_pred_bern_float = naive_bern_float.predict(x_test)
print('Test accuracy: ', naive_bern_float.score(x_test, y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_bern_float))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_bern_float, digits=4))

Training accuracy:  0.6996719104904517
Test accuracy:  0.600095328884652

Confusion matrix:
[[904 208]
 [631 355]]

Classification report:
              precision    recall  f1-score   support

           0     0.5889    0.8129    0.6830      1112
           1     0.6306    0.3600    0.4584       986

    accuracy                         0.6001      2098
   macro avg     0.6097    0.5865    0.5707      2098
weighted avg     0.6085    0.6001    0.5774      2098



In [25]:
# Fit Gaussian Naive Bayes model.
naive_g_float = GaussianNB()
naive_g_float.fit(x_train,y_train)
print('Training accuracy: ', naive_g_float.score(x_train, y_train))
# Make predictions.
y_pred_g_float = naive_g_float.predict(x_test)
print('Test accuracy: ', naive_g_float.score(x_test, y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_g_float))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_g_float, digits=4))

Training accuracy:  0.8382266341381341
Test accuracy:  0.5591039084842707

Confusion matrix:
[[807 305]
 [620 366]]

Classification report:
              precision    recall  f1-score   support

           0     0.5655    0.7257    0.6357      1112
           1     0.5455    0.3712    0.4418       986

    accuracy                         0.5591      2098
   macro avg     0.5555    0.5485    0.5387      2098
weighted avg     0.5561    0.5591    0.5445      2098



In [26]:
# Fit a SVM.
naive_svm_float = LinearSVC()
naive_svm_float.fit(x_train,y_train)
print('Training accuracy: ', naive_svm_float.score(x_train, y_train))
# Make predictions.
y_pred_svm_float = naive_svm_float.predict(x_test)
print('Test accuracy: ', naive_svm_float.score(x_test, y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_svm_float))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_svm_float, digits=4))

Training accuracy:  0.9259695465634727
Test accuracy:  0.6334604385128694

Confusion matrix:
[[725 387]
 [382 604]]

Classification report:
              precision    recall  f1-score   support

           0     0.6549    0.6520    0.6534      1112
           1     0.6095    0.6126    0.6110       986

    accuracy                         0.6335      2098
   macro avg     0.6322    0.6323    0.6322      2098
weighted avg     0.6336    0.6335    0.6335      2098



In [27]:
# Fit a LR model.
lr_float = LogisticRegression()
lr_float.fit(x_train, y_train)
print('Training accuracy: ', lr_float.score(x_train, y_train))
# Make predictions.
y_pred_lr_float = lr_float.predict(x_test)
print('Test accuracy: ', lr_float.score(x_test, y_test))
print()
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred_lr_float))
print()
print('Classification report:')
print(classification_report(y_test, y_pred_lr_float, digits=4))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training accuracy:  0.7907798435265416
Test accuracy:  0.6429933269780743

Confusion matrix:
[[740 372]
 [377 609]]

Classification report:
              precision    recall  f1-score   support

           0     0.6625    0.6655    0.6640      1112
           1     0.6208    0.6176    0.6192       986

    accuracy                         0.6430      2098
   macro avg     0.6416    0.6416    0.6416      2098
weighted avg     0.6429    0.6430    0.6429      2098

