### Data Class

In [1]:
import random

class Review:
    def __init__(self, text, stars):
        self.text = text
        self.stars = stars
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.stars <= 2:
            return "NEGATIVE"
        elif self.stars == 3:
            return "NEUTRAL"
        else:
            return "POSITIVE"

class Container:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
    def evenly_distribute(self):
        neg = list(filter(lambda x: x.sentiment == "NEGATIVE", self.reviews))
        pos = list(filter(lambda x: x.sentiment == "POSITIVE", self.reviews))
        pos_shrunk = pos[:len(neg)]
        self.reviews = neg + pos_shrunk
        random.shuffle(self.reviews)

### Load Data

In [2]:
import json

file_name = './Books_small_10000.json'
reviews = []

with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

print(reviews[4].text)
print(reviews[4].stars)
print(reviews[4].sentiment)

It was a decent read.. typical story line. Nothing unsavory as so many are. Just a slice of life, plausible.
3.0
NEUTRAL


### Data Prep

In [3]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_cont = Container(train)
test_cont = Container(test)

In [4]:
# The data in not evenly distrubute for positives and negatives

train_cont.evenly_distribute()
train_x = train_cont.get_text()
train_y = train_cont.get_sentiment()

test_cont.evenly_distribute()
test_x = train_cont.get_text()
test_y = train_cont.get_sentiment()

print(train_y.count("POSITIVE"))
print(train_y.count("NEGATIVE"))

436
436


### Vectorization

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

print(train_x[0])
print(train_x_vectors[0].toarray())

Grisham worked his magic once again and brought his characters to life.  A thoughtful and compelling story although not particularly gripping.
[[0. 0. 0. ... 0. 0. 0.]]


## Clssification
### Linear SVM

In [6]:
from sklearn import svm

clf_svm = svm.SVC()
clf_svm.fit(train_x_vectors, train_y)

clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [8]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(), train_y)

clf_gnb.predict(test_x_vectors.toarray())[0]

'POSITIVE'

### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Evaluation

In [10]:
# Mean Accuracy

print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors.toarray(), test_y))
print(clf_log.score(test_x_vectors, test_y))

0.9988532110091743
1.0
0.9839449541284404
0.9655963302752294


In [11]:
# F1 Scores
from sklearn.metrics import f1_score

print(f1_score(clf_svm.predict(test_x_vectors), test_y, average=None, labels=["POSITIVE", "NEGATIVE"]))
print(f1_score(clf_dec.predict(test_x_vectors), test_y, average=None, labels=["POSITIVE", "NEGATIVE"]))
print(f1_score(clf_gnb.predict(test_x_vectors.toarray()), test_y, average=None, labels=["POSITIVE", "NEGATIVE"]))
print(f1_score(clf_log.predict(test_x_vectors), test_y, average=None, labels=["POSITIVE", "NEGATIVE"]))

[0.99885452 0.99885189]
[1. 1.]
[0.98368298 0.98419865]
[0.96527778 0.96590909]


In [12]:
test_set = ['very fun', "bad book do not buy", 'horrible waste of time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test) # I use the 'svm' model here, but we can use other models as well

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

### Tuning our model (with Grid Search)

In [13]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C':(1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [14]:
clf.score(test_x_vectors, test_y)

1.0

### Saving Model

In [15]:
import pickle

with open('./sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

### Load Model

In [16]:
with open('./sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [17]:
print(test_x[15])

loaded_clf.predict(test_x_vectors[15])

I cannot get enough of this man's writing, intense action, vivid location descriptions and amazing dialogue. I read all15 Gabriel Allon books in less than 15 weeks. The only down side is trying to find a fiction writer to match Silva's enormous gift of story-telling! I have urged him to meet talented Director Ridley Scott and make one of a dozen Allon novels into a movie.


array(['POSITIVE'], dtype='<U8')