## Data Loading Classes

In [76]:
import random 

In [80]:
class Sentiment:
    NEGATIVE = "Negative"
    NEUTRAL = "Neutral"
    POSITIVE = "Positive"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
    
    def evenly_distribute(self):
        neg = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        pos = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        pos_small = pos[:len(neg)]
        self.reviews = neg + pos_small
        random.shuffle(self.reviews)
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]

## Load Data

In [18]:
import json

file_name = 'book_reviews.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

In [20]:
reviews[0].text

"I bought both boxed sets, books 1-5.  Really a great series!  Start book 1 three weeks ago and just finished book 5.  Sloane Monroe is a great character and being able to follow her through both private life and her PI life gets a reader very involved!  Although clues may be right in front of the reader, there are twists and turns that keep one guessing until the last page!  These are books you won't be disappointed with."

## Prep Data

In [114]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [88]:
train, test = train_test_split(reviews, test_size=0.33, random_state=123)

train_cont = ReviewContainer(train)
train_cont.evenly_distribute()
train_x = train_cont.get_text()
train_y = train_cont.get_sentiment()

test_cont = ReviewContainer(test)
test_cont.evenly_distribute()
test_x = test_cont.get_text()
test_y = test_cont.get_sentiment()

In [96]:
from collections import Counter

print(Counter(train_y))
print(Counter(test_y))

Counter({'Negative': 418, 'Positive': 418})
Counter({'Positive': 226, 'Negative': 226})


In [168]:
vect = TfidfVectorizer(max_df=0.8)
train_x_vect = vect.fit_transform(train_x)
test_x_vect = vect.transform(test_x)

## Classification

In [169]:

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [170]:
clf_svc = SVC(kernel='linear')
clf_svc.fit(train_x_vect, train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [171]:
clf_tree = DecisionTreeClassifier()
clf_tree.fit(train_x_vect, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [172]:
clf_log = LogisticRegression()
clf_log.fit(train_x_vect, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## Evaluation

In [191]:
from sklearn.metrics import confusion_matrix, f1_score

### Mean accuracy

In [173]:
print(clf_svc.score(test_x_vect, test_y))
print(clf_tree.score(test_x_vect, test_y))
print(clf_log.score(test_x_vect, test_y))

0.8429203539823009
0.6238938053097345
0.8429203539823009


### F1 Score

In [174]:
print(f1_score(
    test_y, clf_svc.predict(test_x_vect), average=None, 
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE, Sentiment.NEUTRAL]))

print(f1_score(
    test_y, clf_tree.predict(test_x_vect), average=None, 
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE, Sentiment.NEUTRAL]))

print(f1_score(
    test_y, clf_log.predict(test_x_vect), average=None, 
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE, Sentiment.NEUTRAL]))

[0.84463895 0.84116331 0.        ]
[0.61711712 0.63043478 0.        ]
[0.84116331 0.84463895 0.        ]


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### Confusion Matrix

In [192]:
cm = confusion_matrix(
    test_y, clf_svc.predict(test_x_vect), 
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE, Sentiment.NEUTRAL])

In [193]:
cm

array([[193,  33,   0],
       [ 38, 188,   0],
       [  0,   0,   0]])

## Parameter Tuning

In [175]:
from sklearn.model_selection import GridSearchCV

In [186]:
parameters = {
    'kernel': ('linear', 'rbf', 'poly'),
    'C': (0.1, 0.5, 1.0),
    'degree': (2, 3, 4, 5)
}

svc = SVC(gamma='auto')
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vect, train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': (0.1, 0.5, 1.0), 'degree': (2, 3, 4, 5),
                         'kernel': ('linear', 'rbf', 'poly')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [187]:
clf.best_estimator_

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=2, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [190]:
print(clf.score(test_x_vect, test_y))

0.8429203539823009


In [188]:
print(f1_score(
    test_y, clf.predict(test_x_vect), average=None, 
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE, Sentiment.NEUTRAL]))

[0.84463895 0.84116331 0.        ]


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


## Try own sentence

In [113]:
example = "aaaa"
example = [example]
clf_log.predict(vect.transform(example))

array(['Negative'], dtype='<U8')

## Next steps
- consider punctuations -> good! and good should be equal
- back of n_words -> not ugly will be split into not and ungly but also the combination should be considered thus a back of words with n word combinations
- meta data like number of words
- check confusion matrix
- what about predicting neutral?
- doesn't have sklearn a build in evenly distribution?

## Saving Model

In [None]:
#import pickle

#with open('/models/sentiment_classifier.pkl', 'wb') as f:
#    pickle.dump(clf, fb)

In [None]:
#with open('/models/sentiment_classifier.pkl', 'rb') as f:
#    loaded_clf = pickle.load(f)
#loaded_clf.predict(test_x_vect[0])