### __Data Class__

In [87]:
import random

class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if (self.score <= 2):
            return Sentiment.NEGATIVE
        elif (self.score == 3):
            return Sentiment.NEUTRAL
        else: # Score of 4 or 5
            return Sentiment.POSITIVE

class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        
        

### __Load Data__

In [66]:
import json

file_name = './data/books_small_10000.json'

reviews = []

with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))

### __Prep Data__

In [88]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size = 0.33, random_state = 47)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

len(cont.reviews)

852

In [106]:
train_container.evenly_distribute()

train_x = train_container.get_text() # Retrieves the text
train_y = train_container.get_sentiment() # Retrieves the sentiment


test_container.evenly_distribute()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()

train_y.count(Sentiment.POSITIVE)
train_y.count(Sentiment.NEGATIVE)

426

### __Bag of Words Vectorization__

In [118]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# This book is great !
# This book was so bad

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

# Don't want to fit since test set
test_x_vectors = vectorizer.transform(test_x)

### __Classification__

#### __Linear SVM__

In [119]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

clf_svm.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

#### __Decision Tree__

In [120]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### __Logistic Regression__

In [121]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### __Evaluation__

In [122]:
# Mean Accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8211009174311926
0.6536697247706422
0.8509174311926605


In [123]:
# F1 Scores
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE])
# f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE])
# f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE])

array([0.8202765 , 0.82191781])

In [136]:
test_set = ['I thoroughly enjoyed this, 5 stars', 'bad book do not buy', 'horrible waste of time', 'this was fascinating', 'i would recommend this to a friend']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE'],
      dtype='<U8')

### __Tuning our model (with Grid Search)__

In [128]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [129]:
print(clf.score(test_x_vectors, test_y))

0.8211009174311926


### __Saving Model__

#### __Save Classifier__

In [142]:
import pickle

with open ('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

with open ('./models/sentiment_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

#### __Load Classifier__

In [139]:
with open ('./models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

with open ('./models/sentiment_vectorizer.pkl', 'wb') as f:
    vectorizer = pickle.load(f)

In [143]:
print(test_x[6])

loaded_clf.predict(test_x_vectors[6])

This has to be the best book I have read this year !!! I am in love with Sawyer. To really appreciate the story of Sawyer, you need to read the first two books of the series to see how he use to be and how he is now. This book made me get misty eyed and bust out laughing at times. I have moved S E HALL up to the top of my Favorite Author list !!!  A must read in my opinion.


array(['POSITIVE'], dtype='<U8')