# NLP classification with Amazon Product Data

## Imports

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import random
import pickle

# sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score

## Data Classes

In [18]:
# Sentiment Enum
class Sentiment:
    """Namespace of the string labels used for review sentiment."""

    # Plain strings (not enum members) so they can be compared and passed
    # around as ordinary label values.
    NEGATIVE = 'NEGATIVE'
    POSITIVE = 'POSITIVE'
    NEUTRAL = 'NEUTRAL'


# Review object
class Review:
    """One product review: its text, its numeric score and the derived sentiment."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Derived once, at construction time, from the score stored above.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the score onto a Sentiment label.

        A score of 2 or lower is NEGATIVE, a score of exactly 3 is NEUTRAL,
        and every other score is POSITIVE.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE

    def __str__(self):
        """Return a three-line, tab-aligned summary of the review."""
        template = 'Text:\t\t{0}\nScore:\t\t{1}\nSentiment:\t{2}'
        return template.format(self.text, self.score, self.sentiment)


# Review Container
class ReviewsContainer:
    """Wraps a list of Review objects and exposes their texts and labels as lists."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every held review, in the current order."""
        return [review.text for review in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every held review, in the current order."""
        return [review.sentiment for review in self.reviews]

    def evenly_distribute(self, seed=None):
        """Rebalance the held reviews to equal POSITIVE and NEGATIVE counts.

        Reviews whose sentiment is neither POSITIVE nor NEGATIVE (e.g. NEUTRAL)
        are dropped. Both classes are cut down to the size of the smaller one,
        keeping the earliest reviews of each class, and the result is shuffled.
        ``self.reviews`` is replaced by a new list; the list originally passed
        in is not modified.

        Args:
            seed: Optional seed for a private shuffle. When None (the default)
                the module-level ``random`` generator is used, so the order
                follows whatever global seeding the caller has done.
        """
        negatives = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positives = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]

        # Trim BOTH classes: trimming only the positives leaves the set
        # unbalanced whenever negatives outnumber positives.
        per_class = min(len(negatives), len(positives))
        positive_shrunk = positives[:per_class]
        negative_shrunk = negatives[:per_class]

        self.reviews = positive_shrunk + negative_shrunk
        if seed is None:
            random.shuffle(self.reviews)
        else:
            random.Random(seed).shuffle(self.reviews)

        print('Positive reviews: {0}'.format(len(positive_shrunk)))
        print('Negative reviews: {0}'.format(len(negative_shrunk)))


## Load Data

In [19]:
data_path = 'datasets/amazon_book_data.json'

# Collect reviews from the file, which holds one JSON object per line.
reviews = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(data_path, encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Preview of a single review (fixed index, not a random pick)
print(reviews[5])

Text:		I hoped for Mia to have some peace in this book, but her story is so real and raw.  Broken World was so touching and emotional because you go from Mia's trauma to her trying to cope.  I love the way the story displays how there is no "just bouncing back" from being sexually assaulted.  Mia showed us how those demons come for you every day and how sometimes they best you. I was so in the moment with Broken World and hurt with Mia because she was surrounded by people but so alone and I understood her feelings.  I found myself wishing I could give her some of my courage and strength or even just to be there for her.  Thank you Lizzy for putting a great character's voice on a strong subject and making it so that other peoples story may be heard through Mia's.
Score:		5.0
Sentiment:	POSITIVE


## Data Prep

In [20]:
# Split data into training & testing subsets

# evenly_distribute() shuffles with Python's global `random` generator; seed it
# so the balanced sets (and the sample reviews printed later) are identical on
# every Restart & Run All.
random.seed(42)

train_data, test_data = train_test_split(reviews, test_size = 0.33, random_state = 42)
print('Length of training data: {0}'.format(len(train_data)))

train_container = ReviewsContainer(train_data)
test_container = ReviewsContainer(test_data)

# Balance each subset to equal POSITIVE / NEGATIVE counts (prints the counts).
train_container.evenly_distribute()
test_container.evenly_distribute()

Length of training data: 6700
Positive reviews: 436
Negative reviews: 436
Positive reviews: 208
Negative reviews: 208


## Splitting the review data

In [21]:
# Features (review text) and labels (sentiment) for each subset.
train_x, train_y = train_container.get_text(), train_container.get_sentiment()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

# Show one training pair to confirm that text and label line up.
print('Text:\t\t{0}\nSentiment:\t{1}'.format(train_x[10], train_y[10]))

Text:		I haven't read the previous tales from the Clifton Chronicles so i was pleasantly suprised. The villains are picturesque, the plot is unusual and the book is vintage Jeffrey Archer. Without reading the previous editions I don't have a full picture of the legacy of the Clifton family but enjoyable reading nevertheless.
Sentiment:	POSITIVE


## Bag of Words Vectorization

In [22]:
# TF-IDF weighted bag of words; CountVectorizer() can be swapped in here instead.
vectorizer = TfidfVectorizer()

# Fit the vocabulary on the training text only, then reuse it for the test text.
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

# First training review followed by its dense vector form.
print(train_x[0], train_x_vectors[0].toarray(), sep = '\n')

I always look forward to Amanda Quick novels, I know it will be an afternoon spent enjoying myself.  I have read others that I found more enjoyable, ie Crystal Gardens and any of the Arcane series, but this was a good read.  I would think it an ideal read for the beach or summer leisure.
[[0. 0. 0. ... 0. 0. 0.]]


## Classification

### Linear SVM

In [23]:
# Fit a linear-kernel support vector classifier on the training vectors.
clf_svc = SVC(kernel = 'linear').fit(train_x_vectors, train_y)

# Mean accuracy on the test set, plus the per-review predictions that the
# Evaluation section feeds into f1_score.
svc_score = clf_svc.score(test_x_vectors, test_y)
svc_predictions = clf_svc.predict(test_x_vectors)

print('SVM Score: {0}'.format(svc_score))

SVM Score: 0.8076923076923077


### Decision Tree

In [24]:
# Fix random_state: without it the tree's tie-breaking between candidate splits
# is random, so the fitted model and its score change from run to run.
clf_dec = DecisionTreeClassifier(random_state = 42)

clf_dec.fit(train_x_vectors, train_y)
dec_predictions = clf_dec.predict(test_x_vectors)

# Mean accuracy on the test set.
dec_score = clf_dec.score(test_x_vectors, test_y)
print('Decision Tree Score: {0}'.format(dec_score))

Decision Tree Score: 0.6490384615384616


### Logistic Regression

In [25]:
# Fit a logistic-regression classifier with default settings on the training vectors.
clf_log = LogisticRegression().fit(train_x_vectors, train_y)

# Mean accuracy on the test set, plus the per-review predictions that the
# Evaluation section feeds into f1_score.
log_score = clf_log.score(test_x_vectors, test_y)
log_predictions = clf_log.predict(test_x_vectors)

print('Logistic Regression Score: {0}'.format(log_score))

Logistic Regression Score: 0.8052884615384616


## Evaluation

In [26]:
# Evaluating F1 Score
# average=None returns one F1 value per label, in the order given here:
# [POSITIVE, NEGATIVE].
f1_labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE]

svc_f1_score = f1_score(test_y, svc_predictions, average = None, labels = f1_labels)
dec_f1_score = f1_score(test_y, dec_predictions, average = None, labels = f1_labels)
log_f1_score = f1_score(test_y, log_predictions, average = None, labels = f1_labels)

print('SVC F1 Score:\t\t\t {0}'.format(svc_f1_score))
print('Decision Tree F1 Score:\t\t {0}'.format(dec_f1_score))
print('Logistic Regression F1 Score:\t {0}'.format(log_f1_score))

# Side-by-side accuracy table; left as the last expression so the notebook
# renders it as rich output.
pd.DataFrame({'Linear SVM': [svc_score],
              'Decision Tree': [dec_score],
              'Logistic Regression': [log_score]},
             index = ['Mean Accuracy'])

SVC F1 Score:			 [0.80582524 0.80952381]
Decition Tree F1 Score:		 [0.64563107 0.65238095]
Logistic Regression F1 Score:	 [0.80291971 0.80760095]


Unnamed: 0,Linear SVM,Decision Tree,Logistic Regression
Mean Accuracy,0.807692,0.649038,0.805288


## Tuning Parameters

In [27]:
# Candidate hyper-parameters: every kernel is tried with every C value.
parameters = dict(
    kernel = ('linear', 'rbf'),
    C = (1, 4, 8, 16, 32),
)

# Exhaustive search over the grid above with 5-fold cross-validation.
tuned_svc = SVC()
clf = GridSearchCV(estimator = tuned_svc, param_grid = parameters, cv = 5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

## Saving the model

In [28]:
# Persist the fitted grid-search object (`clf`) to disk in binary pickle form.
with open('svc_sentiment_classifier_model.pckle', mode = 'wb') as file:
    pickle.dump(clf, file)

In [31]:
# Reload the classifier from the pickle file written by the "Saving the model" cell.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you
# created yourself or otherwise fully trust.
with open('svc_sentiment_classifier_model.pckle', 'rb') as file:
    loaded_model = pickle.load(file)

In [37]:
# Spot-check the reloaded model: print one test review, then predict its label
# from the matching row of the test vectors.
sample_index = 5
print(test_x[sample_index])
loaded_model.predict(test_x_vectors[sample_index])

The strong characters introduced in the beginning of book 1 took on a whiney, pathetic quality. The story was repetitive and drawn out unnecessarily.  The author took a strong foundation and destroyed it, I'm beyond disappointed especially because I will never know how it ends (if this author ever lets it) because I will avoid this author at all costs in the future.


array(['NEGATIVE'], dtype='<U8')