In [1]:
import random

class Sentiment:
    """String labels used as sentiment classes for reviews.

    ``NEUTRAL`` is the correctly spelled label. ``NUETRAL`` is the original,
    misspelled attribute name; it is kept as an alias of ``NEUTRAL`` so code
    that still refers to ``Sentiment.NUETRAL`` keeps resolving. Note that the
    label *value* is now ``'NEUTRAL'`` for both names.
    """
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    NUETRAL = NEUTRAL  # deprecated misspelling; prefer Sentiment.NEUTRAL
    POSITIVE = 'POSITIVE'
    
class Review:
    """One review: its text, its numeric score, and the sentiment label derived from the score."""

    def __init__(self, text, score):
        self.text, self.score = text, score
        # The label is computed once, at construction time, from the score.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        A score of 2 or less is NEGATIVE, a score equal to 3 is
        ``Sentiment.NUETRAL`` (the attribute name as spelled on ``Sentiment``),
        and every other score is POSITIVE.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NUETRAL
        return Sentiment.POSITIVE

class ReviewContainer:
    """Holds a list of reviews and exposes their texts and labels as parallel lists."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every review, in the current order of ``self.reviews``."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` of every review, in the same order as ``get_text``."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        """Rebalance ``self.reviews`` to equal numbers of NEGATIVE and POSITIVE reviews.

        Both classes are truncated to the size of the smaller one (keeping the
        earliest reviews of each class). Reviews with any other sentiment are
        dropped. ``self.reviews`` is replaced by the balanced list, which is
        then shuffled in place with the module-level ``random`` generator, so
        call ``random.seed(...)`` beforehand if a reproducible order is needed.

        Returns:
            None. The container is modified in place.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]
        # Cap BOTH classes at the smaller size. Trimming only the positives
        # leaves the set unbalanced whenever negatives outnumber positives.
        per_class = min(len(negative), len(positive))
        self.reviews = negative[:per_class] + positive[:per_class]
        random.shuffle(self.reviews)
        

### Load Data

In [2]:
import json

filename = './data/sentiment/Books_small_10000.json'

reviews = []

# The file is read as one JSON object per line. Decode it explicitly as UTF-8
# so loading does not depend on the platform's default text encoding.
with open(filename, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of letting json.loads raise on an empty string.
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
# Spot-check: sentiment label derived for the sixth loaded review.
reviews[5].sentiment

'POSITIVE'

### Prepare Data

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; random_state pins the shuffle used for the split.
training, test = train_test_split(
    reviews,
    test_size=0.33,
    random_state=42,
)

# Wrap each split so texts and labels can be pulled out as parallel lists.
train_container, test_container = ReviewContainer(training), ReviewContainer(test)

In [4]:
# evenly_distribute() shuffles with the global `random` generator. Seed it here
# so a fresh Restart & Run All yields the same review order every time.
# Note: this cell mutates the containers in place, so re-running it on its own
# (without re-creating the containers) reshuffles the already balanced data.
random.seed(42)

train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# Number of positive training reviews after balancing.
train_y.count(Sentiment.POSITIVE)

436

### Bag Of Words Vectorization

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words: the vocabulary is learned from the training texts only, and the
# test texts are then encoded against that same fitted vocabulary.
vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

# Show the first training review followed by its count vector.
print(train_x[0], train_x_vectors[0].toarray(), sep='\n')




There is no way I could read these stories in order.  I read a few, then take a break for a few weeks. Each  story is very short, tightly packed, and you can read it multiple times before it becomes clear what she is saying.Highly recommended.
[[0 0 0 ... 0 0 0]]


### Classification using Linear SVM

In [6]:
from sklearn  import svm

# Linear-kernel support vector classifier trained on the bag-of-words vectors.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

# Sanity check: predicted label for the first test review.
# A bare `test_x[0]` expression used to precede this line; in the middle of a
# cell its value is discarded, so it was removed. To read the review text,
# evaluate `test_x[0]` as the last expression of its own cell.
clf_svm.predict(test_x_vectors[0])


array(['NEGATIVE'], dtype='<U8')

### Evaluation of the SVM Classifier

In [7]:
# Mean accuracy of the SVM on the balanced test set.
clf_svm.score(X=test_x_vectors, y=test_y)

0.7980769230769231

In [8]:
# Per-class F1: with average=None one score is returned per entry of `labels`,
# here POSITIVE first and NEGATIVE second.
from sklearn.metrics import f1_score

f1_score(
    test_y,
    clf_svm.predict(test_x_vectors),
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
    average=None,
)

array([0.8028169 , 0.79310345])

### Test New Data

In [11]:
# Hand-written phrases used to spot-check the trained classifier.
test_set = [
    'this movie',
    'i had a terrible night',
    'good book',
    'bad book',
]

In [12]:

# Encode the new phrases with the already fitted vocabulary (transform only,
# no refit), then predict a label for each phrase.
new_test = vectorizer.transform(raw_documents=test_set)
clf_svm.predict(X=new_test)

array(['NEGATIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE'], dtype='<U8')