### Data Class

In [8]:
import random

class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #Score of 4 or 5
            return Sentiment.POSITIVE    

        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        
        

### Load Data

In [9]:
import json

file_name = './data/sentiment/books_small_10000.json'

reviews = []
with open(file_name) as f:
    #print(f)
    for line in f:
        #print(line)
        #break
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        
print(reviews[5].text)
print(reviews[5].score)
print(reviews[5].sentiment)

I hoped for Mia to have some peace in this book, but her story is so real and raw.  Broken World was so touching and emotional because you go from Mia's trauma to her trying to cope.  I love the way the story displays how there is no "just bouncing back" from being sexually assaulted.  Mia showed us how those demons come for you every day and how sometimes they best you. I was so in the moment with Broken World and hurt with Mia because she was surrounded by people but so alone and I understood her feelings.  I found myself wishing I could give her some of my courage and strength or even just to be there for her.  Thank you Lizzy for putting a great character's voice on a strong subject and making it so that other peoples story may be heard through Mia's.
5.0
POSITIVE


In [10]:
len(reviews)

10000

### Prep Data

In [12]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)

test_container = ReviewContainer(test)


In [13]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))



436
436


#### Bag of words vectorization

In [32]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#TfidfVectorizer=>Term Frequency inverse document frequency

# This book is great !
# This book was so bad
#This book is great was so bad
#[1,    1    1   1   0    0  0]=>stmt1
#[1,    1    0   0   1    1  1]=>stmt2

vectorizer = TfidfVectorizer()#CountVectorizer()#
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

print(train_x[0])
print(train_x_vectors[0].toarray())




Planning, packing for and setting up a proper camp site is getting to be a dying art. There's a lot to it that modern folks never think about, and as a result, head out for a week of camping with a cheap tent, flimsy air mattress and grossly inadequate food and equipment and are perfectly miserable. My Dad, old curmudgeon that he was, used to take us cross-country with a cardboard box with a couple frying pans, Dutch oven and a coffee pot, and a couple air mattresses. That was HIS idea of &#34;camping&#34;, when he was too cheap to pay $11 a night for a motel room. Mr. Hall gives you the real-lowdown on everything you need to know about proper equipment, planning for your trip, what food to take, how to choose the proper equipment, and gives some good, easy recipes. This is all given in down-to-earth language,, spoken from one whose learned it all the hard way, apparently. As someone who finally got the knack of cooking everything from a respectable beef stew to a pretty tasty dump cak

In [17]:
vectorizer.get_feature_names()


['00',
 '000',
 '01',
 '04',
 '10',
 '100',
 '101',
 '11',
 '114',
 '115',
 '119',
 '12',
 '120the',
 '128532',
 '13',
 '130',
 '14',
 '140',
 '143',
 '15',
 '150',
 '154',
 '157',
 '16',
 '164',
 '16th',
 '17',
 '175',
 '18',
 '1800s',
 '1885',
 '1887',
 '1896',
 '19',
 '1920',
 '1923',
 '1936',
 '1948',
 '1954',
 '1960s',
 '1962',
 '1967',
 '1982',
 '1987',
 '1992',
 '1998',
 '1999',
 '19th',
 '1st',
 '20',
 '200',
 '2000',
 '2001',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2016',
 '203',
 '20th',
 '21',
 '210',
 '21st',
 '23',
 '233',
 '237',
 '24',
 '25',
 '250',
 '254',
 '26',
 '268',
 '27',
 '288',
 '2nd',
 '30',
 '300',
 '30yrs',
 '33',
 '34',
 '345',
 '35',
 '37',
 '390',
 '39clues',
 '3can',
 '3rd',
 '3so',
 '40',
 '400',
 '40am',
 '421',
 '45pm',
 '4th',
 '50',
 '50s',
 '53',
 '5color',
 '5th',
 '5that',
 '60',
 '62',
 '630',
 '65',
 '70',
 '700',
 '75',
 '7am',
 '7th',
 '80',
 '8211',
 '8212',
 '8216',
 '8217',
 '8220',
 '8221',
 '8230',
 '85',
 '8th',
 '90',
 

## Classification

#### Linear SVM



















In [26]:
from sklearn import svm
#SVM=>Support Vector Machine

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

clf_svm.predict(test_x_vectors[0])




len(train_y),train_x_vectors.toarray().shape

(872, (872, 8906))

#### Decision Tree

In [22]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])


array(['NEGATIVE'], dtype='<U8')

#### Naive Bayes

In [38]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.todense(), train_y)

clf_gnb.predict(test_x_vectors[0])

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

#### Logistic Regression

In [39]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])



array(['NEGATIVE'], dtype='<U8')

## Evaluation

In [40]:
# Mean Accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
#print(clf_gnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.6754807692307693
0.5048076923076923
0.8052884615384616


In [42]:
# F1 Scores
from sklearn.metrics import f1_score

#f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])
f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, 
                                                                        Sentiment.NEGATIVE])




  average, "true nor predicted", 'F-score is', len(true_sum)


array([0.80291971, 0.        , 0.80760095])

In [45]:
test_set = ['very fun', "bad book do not buy", 'horrible waste of time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)


array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

### Tuning our model (with Grid Search)

GridSearchCV implements a “fit” and a “score” method. It also implements “predict”, “predict_proba”, “decision_function”, “transform” and “inverse_transform” if they are implemented in the estimator used.

In [46]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)
















GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [66]:
print(clf.score(test_x_vectors, test_y))

0.8076923076923077


## Saving Model

#### Save model

In [68]:
import pickle

with open('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

#### Load model

In [70]:
with open('./models/entiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [72]:
print(test_x[0])

loaded_clf.predict(test_x_vectors[0])

I loved this book and the previous books in this series. It brings out every emotion you can think of. I look forward to reading more books by this author.


array(['POSITIVE'], dtype='<U8')

In [48]:
from sklearn.linear_model import Perceptron

clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(train_x_vectors, train_y)  

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)