# Create Classes

In [441]:
import random

# Building a class for the data

class Sentiment:
    NEGATIVE = "NEGATIVE"
    POSITIVE = "POSITIVE"
    NEUTRAL = "NEUTRAL"

class Review():
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
            
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #Score 4 oe 5
            return Sentiment.POSITIVE

# This class created to call an even distribute method
class ReviewContainer():
        # The training and test sets have 2 outputs        
        def __init__(self, reviews):
            self.reviews = reviews
        
        def get_text(self):
            return [x.text for x in self.reviews]
    
        def get_sentiment(self):
            return [x.sentiment for x in self.reviews]
    
    
        def evenly_distribute(self):
            negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
            positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
            neutral = list(filter(lambda x: x.sentiment == Sentiment.NEUTRAL, self.reviews))
            positive_shrunk = positive[:len(negative)]
            neutral_shrunk = neutral[:len(negative)]
            self.reviews = negative + positive_shrunk + neutral_shrunk
            random.shuffle(self.reviews)
                

# Load Data and Libraries

In [442]:
import numpy as np
from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import json

# Amazon data from http://jmcauley.ucsd.edu/data/amazon/
file_name = "./books_small_10000.json"

# Empty list, then itrate through books file
reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        
reviews[5].sentiment

'POSITIVE'

# Prepare Data

In [443]:
len(reviews)

10000

In [444]:
len(training)

6700

In [445]:
len(test)

3300

In [446]:
training, test = train_test_split(reviews,test_size=0.33, random_state=42)

Added in after we created the new ReviewContainer class, we see that each Sentiment has an equal value now

In [447]:
train_cont = ReviewContainer(training)
train_cont.evenly_distribute()

In [448]:
len(training_cont.reviews)

1308

In [449]:
test_cont = ReviewContainer(test)
test_cont.evenly_distribute()

In [450]:
len(test_cont.reviews)

624

In [451]:
train_x = train_cont.get_text()
train_y = train_cont.get_sentiment()

In [452]:
test_x = test_cont.get_text()
test_y = test_cont.get_sentiment()

Now we can check the counts of the sentiments with our new ReviewContainer class

In [453]:
print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEUTRAL))

436
436


In [454]:
print(test_y.count(Sentiment.POSITIVE))
print(test_y.count(Sentiment.NEUTRAL))

208
208


#### Bag of words vectorization

In [455]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Problem with Countvectorizer is it weighs each word equally
# Good and Bad should be rated higher than a common word
# TO SOLVE THIS PROBLEM WE DECIDED TO IMPORT Tfidvectorizer


# Using the countVectorizer function for this example
# countVector = CountVectorizer()

# OR

# Using the TfidfVectorizer function for this example, hoping for more accurate results
countVector = TfidfVectorizer()

# Fits and transform the data into vectors
train_x_vectors = countVector.fit_transform(train_x)
train_x_vectors
# All of our training values, rows of 0s and non 0s



<1308x11465 sparse matrix of type '<class 'numpy.float64'>'
	with 83190 stored elements in Compressed Sparse Row format>

In [456]:
print(train_x[0])

Title: Regine&#8217;s Book  - A Teen Girl&#8217;s Last WordsAuthor: Regine StokkePublisher: Zest Books LLCISBN: 978-1-936976-01-0&#8220;I&#8217;ve fought and fought; I&#8217;ve done everything possible, and at this point there&#8217;s nothing left to do. If the disease doesn&#8217;t loosen its grip, it won&#8217;t be long before it sucks me down. I&#8217;m scared to death, and sad,&#8221; Regine writes in her blog-adapted book, &#34;Regine&#8217;s Book &#8211; A Teen Girl&#8217;s Last Words.&#34;At three hundred and thirty-six glossy pages, this paperback targets those who loved the girl behind her worldwide blog about her illness, those facing the daunting disease of leukemia, or those curious about dying at a young age. With minor profanity and topics discussing effects of medications, medical issues, and death, it would be geared toward mature readers. Many colored photographs of the beautiful girl, her family, and friends, along with her artistic creations and poems grace the pages

In [457]:
print(train_x_vectors[0])

  (0, 7116)	0.033045509860454125
  (0, 4933)	0.028652406549289303
  (0, 8226)	0.025494903981626935
  (0, 8562)	0.025839494318072774
  (0, 3649)	0.032547649635980655
  (0, 2121)	0.041860828795461866
  (0, 4252)	0.0502769388924082
  (0, 6073)	0.0502769388924082
  (0, 8311)	0.04755300860701105
  (0, 4096)	0.023629408958746097
  (0, 4866)	0.023995816969548698
  (0, 8967)	0.040963759732926985
  (0, 1060)	0.02012210607659219
  (0, 6240)	0.04755300860701105
  (0, 6505)	0.03720423921572126
  (0, 10682)	0.03720423921572126
  (0, 462)	0.015914202483340918
  (0, 7370)	0.040963759732926985
  (0, 10203)	0.023761568162512084
  (0, 7420)	0.04412126230058936
  (0, 4150)	0.022550690397730718
  (0, 1394)	0.04412126230058936
  (0, 4957)	0.03823982944752984
  (0, 7090)	0.01509831916646157
  (0, 377)	0.0502769388924082
  :	:
  (0, 7669)	0.02731651878587493
  (0, 10258)	0.027525975544242364
  (0, 808)	0.06373762850488811
  (0, 7747)	0.040172488741873305
  (0, 3616)	0.02699399099370517
  (0, 3093)	0.02851937

In [458]:
# Only need to transform because its only test data, only train data should be fitted
test_x_vectors = countVector.transform(test_x)


## Classification

In [459]:
# This is where we use the classifier function from sklearn and test out different models

#### Linear SVM

In [460]:
from sklearn.svm import SVC

clf_svc = SVC(kernel = 'linear')

clf_svc.fit(train_x_vectors, train_y)

test_x[0]
test_x_vectors[0]

clf_svc.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [461]:
from sklearn.tree import DecisionTreeClassifier

dec_tree_clf = DecisionTreeClassifier()

dec_tree_clf.fit(train_x_vectors, train_y)
dec_tree_clf.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

In [462]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()

# Had to dense the X vectors into an array
clf_gnb.fit(train_x_vectors.toarray(), train_y)
clf_gnb.predict(test_x_vectors[[0]].toarray())

array(['NEUTRAL'], dtype='<U8')

#### Logistic Regression

In [463]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()

clf_log.fit(train_x_vectors, train_y)
clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

## Evaluation

#### Linear SV

In [464]:
# Went into documentation, use score feature to check accuracy, 
# this is more specifically the MEAN accuract for the labels

clf_svc.score(test_x_vectors, test_y)


0.6169871794871795

#### Decision Tree

In [465]:
dec_tree_clf.score(test_x_vectors, test_y)

0.4342948717948718

#### Naive Bayes

In [466]:
clf_gnb.score(train_x_vectors.toarray(), train_y)

0.963302752293578

#### Logistic Regression

In [467]:
clf_log.score(test_x_vectors, test_y)

0.6217948717948718

There is a catch, we only looked at accuracy, now we have to look into the f1score

In [468]:
from sklearn.metrics import f1_score

#### Linear SV

In [469]:
f1_score(test_y, clf_svc.predict(test_x_vectors), average = None, labels = [Sentiment.POSITIVE, 
                                                                            Sentiment.NEUTRAL, Sentiment.NEGATIVE])
# As we see for the scores, its good for the positive reviews, but not good at all for neutral or negative reviews

# First comment was before we evenly distributed the reviews, now they are all okay in terms of f1_score

array([0.69417476, 0.5450237 , 0.61352657])

#### Decision Tree

In [470]:
f1_score(test_y, clf_log.predict(test_x_vectors), average = None, labels = [Sentiment.POSITIVE, 
                                                                                Sentiment.NEUTRAL, Sentiment.NEGATIVE])
# Same trend from Linear SV

# First comment was before we evenly distributed the reviews, now they are all okay in terms of f1_score

array([0.70117647, 0.5410628 , 0.62102689])

#### Naive Bayes

In [471]:
f1_score(test_y, clf_gnb.predict(test_x_vectors.toarray()), average = None, labels = [Sentiment.POSITIVE, 
                                                                                Sentiment.NEUTRAL, Sentiment.NEGATIVE])
# Same trend again

# First comment was before we evenly distributed the reviews, now they are all okay in terms of f1_score

array([0.46786632, 0.40089087, 0.43902439])

#### Logistic Regression

In [472]:
f1_score(test_y, clf_log.predict(test_x_vectors.toarray()), average = None, labels = [Sentiment.POSITIVE, 
                                                                                Sentiment.NEUTRAL, Sentiment.NEGATIVE])
# Once again, only good at predicing when Sentiment is POSITIVE

# First comment was before we evenly distributed the reviews, now they are all okay in terms of f1_score

array([0.70117647, 0.5410628 , 0.62102689])

How can we make our models better for NEGATIVE and NEUTRAL reviews?

Since the trend is the same for all models, it might have to do with the data and not with the models

In [473]:
train_y.count(Sentiment.POSITIVE)
# Heavily biased towards positive reviews because more of them

# THIS WAS BEFORE OUT REVIEW_CONTAINER CLASS, before there was 8 - 10 times as many positives 
# compared to the negative and neutral values

436

In [474]:
train_y.count(Sentiment.NEUTRAL)

436

In [475]:
train_y.count(Sentiment.NEGATIVE)

436

Need to balance our data, we are now going to download a larger data set. We swtich data set from Book_smalls to Book_smalls_10000

## Tuning our Model (with Grid Search)

In [476]:
from sklearn.model_selection import GridSearchCV
# GridSearchCV will allow us to test out different models, in this case SVC, and allow us to compare

# SVC() - Used this to compare syntax, then choose what parameters we want to look at

parameters = {'kernel': ("linear", "rbf"), 'C' : (1, 4, 8, 16, 32, 64)}

clif_svc = SVC()

# Splits data 5 times, to cross-validate
tuned_SVC = GridSearchCV(clif_svc, parameters, cv = 5)
tuned_SVC.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32, 64),
                         'kernel': ('linear', 'rbf')})

In [477]:
tuned_SVC.score(test_x_vectors, test_y)

0.6169871794871795

Don't see much of a change, this might still have to do with the data set

## Saving Model

In [478]:
# Something we can do differently is try to strip out all the punctuation, 
# since it might not have an actual effect on the sentament

#### Saving Model

In [479]:
# Use this to save our model
import pickle 

with open("./sentiment_classifer.pkl", "wb") as f:
    pickle.dump(tuned_SVC, f)

#### Loading Model

In [480]:
import pickle 

with open("./sentiment_classifer.pkl", "rb") as f:
    loaded_clf = pickle.load(f)

In [481]:
print(test_x[0])

loaded_clf.predict(test_x_vectors[0])

I gave this book a five star because I think out of all of the three football stars, Sam is my favorite.  He has had a lot of women problems and Dellina is just the perfect match for him.  I liked the storyline of Sam's parents.  It sort of made it funny that this big football player has a mom who tells him what to try and is always keeping an eye on him (whether it is checking his closets, his dresser or even telling him what kind of condoms to use.I think that anyone who has read all the Fool's Gold books would enjoy this one as much.  I hope that Susan will write a book about Kenny and Bailey.  I think Chloe will have a big impact on him.


array(['POSITIVE'], dtype='<U8')