In [1]:
import json
import os
import pickle
import random

from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

### Classes

In [2]:
class Sentiment:
    """String labels for the three sentiment classes assigned to reviews."""

    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'
    
class Review:
    """A single review: its text, its numeric score and the derived sentiment label."""

    def __init__(self, text, score):
        self.text, self.score = text, score
        # Derived once at construction; it is not refreshed if score changes later.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the score to a label.

        A score of exactly 3 is neutral, 4 or more is positive, and every
        other value is negative.
        """
        if self.score == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE if self.score >= 4 else Sentiment.NEGATIVE
        
class ReviewContainer:
    """Holds a list of review objects and exposes them as parallel text/label lists."""

    def __init__(self, reviews):
        """Store the list of reviews (objects exposing .text and .sentiment)."""
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in container order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self, seed=None):
        """Balance the container to equal numbers of positive and negative reviews.

        Neutral reviews are dropped. The larger class is truncated to the size
        of the smaller one (keeping its first items), and the result is
        shuffled in place of the previous review list.

        Args:
            seed: optional seed for a private random generator, making the
                shuffle reproducible. When None (the default) the module-level
                ``random`` state is used, as before.
        """
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]

        # Truncate BOTH classes to the smaller count; truncating only the
        # positives leaves the data unbalanced when negatives are the majority.
        keep = min(len(positive), len(negative))
        balanced = negative[:keep] + positive[:keep]

        rng = random if seed is None else random.Random(seed)
        rng.shuffle(balanced)
        self.reviews = balanced

### Load data

In [3]:
file_name = './data/sentiment/Books_1.json'

# Each line of the file is parsed as its own JSON document; only the
# 'reviewText' and 'overall' fields are used.
reviews = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(file_name, encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank or trailing empty lines instead of crashing json.loads
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Peek at one review's text as a sanity check.
reviews[5].text

'I hoped for Mia to have some peace in this book, but her story is so real and raw.  Broken World was so touching and emotional because you go from Mia\'s trauma to her trying to cope.  I love the way the story displays how there is no "just bouncing back" from being sexually assaulted.  Mia showed us how those demons come for you every day and how sometimes they best you. I was so in the moment with Broken World and hurt with Mia because she was surrounded by people but so alone and I understood her feelings.  I found myself wishing I could give her some of my courage and strength or even just to be there for her.  Thank you Lizzy for putting a great character\'s voice on a strong subject and making it so that other peoples story may be heard through Mia\'s.'

### Data preparation

In [4]:
# Hold out 33% of the reviews for testing; the fixed random_state keeps the split repeatable.
training, test = train_test_split(reviews, random_state=42, test_size=0.33)

# Wrap each split so it can be balanced and unpacked into text/label lists.
train_container, test_container = ReviewContainer(training), ReviewContainer(test)

In [5]:
# evenly_distribute() shuffles with the module-level `random` generator.
# Seed it here so the sample order (and anything order-dependent downstream,
# such as cross-validation folds) is the same on every Restart & Run All.
random.seed(42)

# Balance the training split and pull out parallel text / label lists.
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

# The test split is balanced the same way.
test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# Sanity check: both classes should have the same count after balancing.
print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

436
436


### Bag of words (TF-IDF)

In [6]:
# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text.
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

# Show one raw training review next to its dense vector row.
sample_text, sample_row = train_x[5], train_x_vectors[5]
print(sample_text)
print(sample_row.toarray())

One of Francine Rivers best series books!
[[0. 0. 0. ... 0. 0. 0.]]


### Classification

#### Linear SVM

In [7]:
# Train a linear-kernel SVM on the vectorized training reviews
# (fit() returns the estimator itself, so it can be chained).
clf_svm = svm.SVC(kernel='linear').fit(train_x_vectors, train_y)

# Spot check: predicted label for the first test review.
clf_svm.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

#### Decision tree

In [8]:
# Train a decision tree on the vectorized training reviews.
# random_state is fixed because the tree's split search is randomized;
# without it the fitted tree and its score can change between runs.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Spot check: predicted label for the first test review.
clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Logistic Regression

In [9]:
# Train a logistic-regression classifier with default settings
# (fit() returns the estimator itself, so it can be chained).
clf_log = LogisticRegression().fit(train_x_vectors, train_y)

# Spot check: predicted label for the first test review.
clf_log.predict(test_x_vectors[0])


array(['NEGATIVE'], dtype='<U8')

### Evaluation

In [10]:
# Mean accuracy on the test set, printed in the order:
# linear SVM, decision tree, logistic regression.
for model in (clf_svm, clf_dec, clf_log):
    print(model.score(test_x_vectors, test_y))

0.8076923076923077
0.6322115384615384
0.8052884615384616


#### F1 scores

In [11]:
# Per-class F1 for the linear SVM (average=None returns one score per entry
# in `labels`, in that order). To score another model, predict with clf_dec
# or clf_log instead.
svm_predictions = clf_svm.predict(test_x_vectors)
f1_score(
    test_y,
    svm_predictions,
    average=None,
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
)


array([0.80582524, 0.80952381])

In [12]:
# A few hand-written snippets to try the classifier on.
test_set = [
    'very fun',
    'bad book do not buy',
    'horrible waste of time',
]
# New text must go through the already-fitted vectorizer (transform, not fit).
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

### Tuning model

In [13]:
# Candidate hyper-parameters: two kernels crossed with five values of C.
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': (1, 4, 8, 16, 32),
}

# Search the grid with 5-fold cross-validation on the training data
# (fit() returns the search object itself, so it can be chained).
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5).fit(train_x_vectors, train_y)

# Accuracy of the tuned search object on the test set.
print(clf.score(test_x_vectors, test_y))



0.8197115384615384


### Saving Model

#### Save model

In [14]:
# Persist the tuned grid-search object.
# NOTE(review): only the classifier is saved. Predicting on raw text also
# needs the fitted `vectorizer`, so a fresh session cannot use this file alone.
model_path = './models/sentiment_1/sentiment_classifier.plk'

# Create the target directory if needed; open(..., 'wb') does not create
# missing parent directories and would raise FileNotFoundError.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as f:
    pickle.dump(clf, f)

#### Load model

In [15]:
# Read the pickled classifier back from disk.
# pickle.load can execute arbitrary code, so only load files you trust
# (here: the file written by the save cell of this notebook).
with open('./models/sentiment_1/sentiment_classifier.plk', 'rb') as model_file:
    loaded_clf = pickle.load(model_file)

In [16]:
# Show the first test review, then the reloaded model's prediction for its
# already-vectorized form.
first_review, first_vector = test_x[0], test_x_vectors[0]
print(first_review)

loaded_clf.predict(first_vector)

All my annoyance melted. "You dumb-a@#," I crooned, kissing her on the forehead. "You don't share me. You own me."This book owned me. I couldn't even think about putting it down and if I had to then all I could think about was getting back to it. Honestly I was quite content and happy with how If I Stay ended. It was an epic ending if you ask me. But of course if Gayle Forman is going to offer more of Adam and Mia's story, then I'm going to take it. And I did and loved it. It crushed me and then made me whole again.It was very hard for me to not flip to the end to see how this story was going to conclude but I kept control and just moved forward with the story. It was just that I really didn't know how this was going to end and it was ripping me apart. This year I've read some books with unhappy endings and have actually enjoyed them, kind of like a breath of fresh-air, something different, but there's no way I could have handled an ending like that with this book.I fell in love with G

array(['NEGATIVE'], dtype='<U8')