In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Data Class

In [2]:
class Sentiment : 
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review :
    def __init__(self, text, score) :
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
    
    def get_sentiment(self) : 
        if self.score <= 2 : 
            return Sentiment.NEGATIVE
        elif self.score == 3 : 
            return Sentiment.NEUTRAL
        else :
            return Sentiment.POSITIVE

# Load Data

In [10]:
file_name = "data/Books_small.json"

reviews = []
with open(file_name) as f : 
    for line in f : 
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

reviews[5].sentiment

'POSITIVE'

# Prep Data

In [11]:
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [12]:
train_x = [x.text for x in training]
train_y = [y.sentiment for y in training]

test_x = [x.text for x in test]
test_y = [y.sentiment for y in test]

## Bag of words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

train_x_vectors

<670x7372 sparse matrix of type '<class 'numpy.int64'>'
	with 41455 stored elements in Compressed Sparse Row format>

In [14]:
print(train_x[0])
print(train_x_vectors[0])
print(train_x_vectors[0].toarray())

Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down.
  (0, 7086)	1
  (0, 1148)	1
  (0, 350)	2
  (0, 1800)	1
  (0, 6595)	1
  (0, 562)	1
  (0, 3054)	1
  (0, 1558)	1
  (0, 6475)	1
  (0, 6593)	1
  (0, 2895)	1
  (0, 7353)	1
  (0, 539)	1
  (0, 1515)	1
  (0, 5197)	1
  (0, 3545)	1
  (0, 2007)	1
[[0 0 0 ... 0 0 0]]


## Classification

### Linear SVM

In [15]:
from sklearn import svm

clf_svm = svm.SVC(kernel="linear")
clf_svm.fit(train_x_vectors, train_y)

print(test_x[0])
print(test_x_vectors[0])

Every new Myke Cole book is better than the last, and this is no exception. If you haven't read the Shadow Ops series before start with Control Point, but go ahead and order Fortress Frontier and Breach Zone as well - you're going to want them.
  (0, 261)	1
  (0, 350)	3
  (0, 483)	1
  (0, 688)	1
  (0, 752)	1
  (0, 841)	1
  (0, 989)	1
  (0, 1296)	1
  (0, 1463)	1
  (0, 2304)	1
  (0, 2332)	1
  (0, 2860)	1
  (0, 2868)	1
  (0, 3065)	1
  (0, 3297)	1
  (0, 3534)	2
  (0, 3787)	1
  (0, 4478)	1
  (0, 4503)	1
  (0, 4640)	1
  (0, 4648)	1
  (0, 4938)	1
  (0, 5270)	1
  (0, 5281)	1
  (0, 5844)	1
  (0, 5881)	1
  (0, 6212)	1
  (0, 6588)	1
  (0, 6595)	2
  (0, 6600)	1
  (0, 6632)	1
  (0, 6709)	1
  (0, 7114)	1
  (0, 7182)	1
  (0, 7260)	1
  (0, 7348)	2


In [16]:
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [27]:
type(test_x_vectors[0])

scipy.sparse.csr.csr_matrix

In [28]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(), train_y)

clf_gnb.predict(test_x_vectors[0].toarray())

array(['POSITIVE'], dtype='<U8')

### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['POSITIVE'], dtype='<U8')

## Evaluation

### Mean Accuracy

In [29]:
clf_svm.score(test_x_vectors, test_y)

0.8242424242424242

In [30]:
clf_dec.score(test_x_vectors, test_y)

0.7424242424242424

In [33]:
clf_gnb.score(test_x_vectors.toarray(), test_y)

0.8121212121212121

In [34]:
clf_log.score(test_x_vectors, test_y)

0.8303030303030303

### F1-score

In [36]:
from sklearn.metrics import f1_score

print("clf_svm = " , f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("clf_dec = " , f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("clf_gnb = " , f1_score(test_y, clf_gnb.predict(test_x_vectors.toarray()), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("clf_log = " , f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

clf_svm =  [0.91319444 0.21052632 0.22222222]
clf_dec =  [0.86120996 0.09375    0.        ]
clf_gnb =  [0.89678511 0.08510638 0.09090909]
clf_log =  [0.91370558 0.12244898 0.1       ]


This is only good in determining positive reviews. This is not the issue of the model itself, but the training data

In [37]:
print(f"positive_count = {train_y.count(Sentiment.POSITIVE)}")
print(f"negative_count = {train_y.count(Sentiment.NEGATIVE)}")

positive_count = 552
negative_count = 47


There are a lot of positive, very less in negative. Too little training data! <br>
We need more data

# Load more data

In [38]:
# Create a new class so that data distribution is more even
import random
class ReviewContainer : 
    def __init__(self, reviews) :
        self.reviews = reviews
    
    def get_text(self) : 
        return [x.text for x in self.reviews]
    
    def get_sentiment(self) : 
        return [x.sentiment for x in self.reviews]
    
    def evenly_distribute(self) : 
        negative = list(filter(lambda x : x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x : x.sentiment == Sentiment.POSITIVE, self.reviews))
        
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

In [39]:
file_name = "data/Books_small_10000.json"

reviews2 = []
with open(file_name) as f : 
    for line in f : 
        review = json.loads(line)
        reviews2.append(Review(review['reviewText'], review['overall']))

reviews2[5].sentiment

'POSITIVE'

In [40]:
training, test = train_test_split(reviews2, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

In [53]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

436
436


In [54]:
vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

Fit the model with the new data

SVM

In [55]:
clf_svm2 = svm.SVC(kernel="linear")
clf_svm2.fit(train_x_vectors, train_y)
clf_svm2.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

Decision Tree

In [56]:
clf_dec2 = DecisionTreeClassifier()
clf_dec2.fit(train_x_vectors, train_y)
clf_dec2.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

Naive Bayes

In [57]:
clf_gnb2 = GaussianNB()
clf_gnb2.fit(train_x_vectors.toarray(), train_y)
clf_gnb2.predict(test_x_vectors[0].toarray())

array(['NEGATIVE'], dtype='<U8')

Logistic Regression

In [58]:
clf_log2 = LogisticRegression()
clf_log2.fit(train_x_vectors, train_y)
clf_log2.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [59]:
# Mean accuracy
print(clf_svm2.score(test_x_vectors, test_y))
print(clf_dec2.score(test_x_vectors, test_y))
print(clf_gnb2.score(test_x_vectors.toarray(), test_y))
print(clf_log2.score(test_x_vectors, test_y))

0.7980769230769231
0.6370192307692307
0.6346153846153846
0.8149038461538461


The average accuracy decreases, but let's see the F1-score

In [60]:
print("clf_svm2 = " , f1_score(test_y, clf_svm2.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("clf_dec2 = " , f1_score(test_y, clf_dec2.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("clf_gnb2 = " , f1_score(test_y, clf_gnb2.predict(test_x_vectors.toarray()), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("clf_log2 = " , f1_score(test_y, clf_log2.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

clf_svm2 =  [0.8028169  0.         0.79310345]
clf_dec2 =  [0.643026   0.         0.63080685]
clf_gnb2 =  [0.59574468 0.         0.66666667]
clf_log2 =  [0.82051282 0.         0.808933  ]


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


The negative one is better, but still not good

After adding "test_container.evenly_distribute()", the F1-score really got better

In [70]:
# Now test with new data
test_set = ["I thoroughly enjoy this, 5 stars", "Bad book, do not buy", "Horrible waste of time", "great", "not great", "very good book", "very fun"]
new_test = vectorizer.transform(test_set)
new_test

clf_svm2.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE',
       'POSITIVE', 'POSITIVE'], dtype='<U8')

## Use TFIDF Vectorizer

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

In [72]:
clf_svm3 = svm.SVC(kernel="linear")
clf_svm3.fit(train_x_vectors, train_y)
clf_svm3.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [73]:
clf_dec3 = DecisionTreeClassifier()
clf_dec3.fit(train_x_vectors, train_y)
clf_dec3.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [74]:
clf_gnb3 = GaussianNB()
clf_gnb3.fit(train_x_vectors.toarray(), train_y)
clf_gnb3.predict(test_x_vectors[0].toarray())

array(['POSITIVE'], dtype='<U8')

In [75]:
clf_log3 = LogisticRegression()
clf_log3.fit(train_x_vectors, train_y)
clf_log3.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [76]:
# Mean accuracy
print(clf_svm3.score(test_x_vectors, test_y))
print(clf_dec3.score(test_x_vectors, test_y))
print(clf_gnb3.score(test_x_vectors.toarray(), test_y))
print(clf_log3.score(test_x_vectors, test_y))

0.8076923076923077
0.6586538461538461
0.6610576923076923
0.8052884615384616


Only logistic regression went down, which is normal

In [77]:
# F1-score
print("clf_svm3 = " , f1_score(test_y, clf_svm3.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("clf_dec3 = " , f1_score(test_y, clf_dec3.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("clf_gnb3 = " , f1_score(test_y, clf_gnb3.predict(test_x_vectors.toarray()), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("clf_log3 = " , f1_score(test_y, clf_log3.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

clf_svm3 =  [0.80582524 0.         0.80952381]
clf_dec3 =  [0.64676617 0.         0.66976744]
clf_gnb3 =  [0.65693431 0.         0.66508314]
clf_log3 =  [0.80291971 0.         0.80760095]


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


### Tuning the model (with Grid Search)

In [79]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel' : ('linear', 'rbf'), 'C' : (1,4,8,16,32)}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

We can change the model's parameter depending on the estimator

In this case, it's 'rbf' and C=1

In [80]:
clf.score(test_x_vectors, test_y)

0.8076923076923077

Not a big of a different, well it happens. But there is room for improvements <br>
Possible problem : <br>
<ol>
    <li> "Good" vs "Good!", they are treated as different words. </li>
    <li> Using other vectorizers </li>
</ol>

## Save the model

In [83]:
import pickle

with open('./models/sentiment_classifier.pkl', 'wb') as f :
    pickle.dump(clf, f)

## Load the model

In [84]:
with open('./models/sentiment_classifier.pkl', 'rb') as f : 
    loaded_clf = pickle.load(f)

In [86]:
print(test_x[0])
loaded_clf.predict(test_x_vectors[0])

I knew most of stuff. The book is for the very beginner. I really learned nada.Oh well. Another learning experience.


array(['POSITIVE'], dtype='<U8')