#### Data Class

In [60]:
import random
class Sentiment:
    """String labels for the three sentiment classes a review can be assigned."""
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"
    # Backward-compatible alias for the original misspelled attribute name, so
    # existing references to Sentiment.NETURAL keep resolving to the same label.
    NETURAL = NEUTRAL

class Review:
    """One review: its raw text, its numeric score, and the sentiment label derived from the score."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # The label is computed once, at construction time, from the score.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the Sentiment label that corresponds to this review's score."""
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NETURAL
        # any other score, i.e. 4 and 5 per the original comment
        return Sentiment.POSITIVE
        
class ReviewContainer:
    """Wraps a list of Review objects and exposes their texts and labels as parallel lists."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in container order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        """Replace self.reviews with a shuffled, class-balanced NEGATIVE/POSITIVE subset.

        Reviews with any other sentiment are dropped. Both classes are truncated
        to the size of the smaller one, so the result always holds the same
        number of NEGATIVE and POSITIVE reviews.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]
        # Truncate BOTH classes to the smaller count. Truncating only the
        # positives leaves the set unbalanced whenever negatives outnumber them.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)

#### Load Data

In [56]:
import json
file_name = './data/sentiment/Books_small_10000.json'
reviews = []
# The file is parsed line by line, one JSON object per line. The encoding is
# pinned to UTF-8 so the result does not depend on the platform's default.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # skip blank/trailing lines, which json.loads would reject
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Spot-check the score of one parsed record.
reviews[3].score

4.0

#### Prep Data

In [66]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the reviews for testing; the fixed seed keeps the split repeatable.
training, test = train_test_split(reviews, random_state=42, test_size=0.33)

# Wrap each split so its texts and labels can be pulled out as lists.
train_container, test_container = ReviewContainer(training), ReviewContainer(test)

In [69]:
# Rebalance the training split, then pull texts (x) and labels (y) out of both splits.
# NOTE(review): test_container is not rebalanced or filtered here, so test_y may
# still contain labels the balanced training set no longer has — confirm intended.
train_container.evenly_distribute()
train_x, train_y = train_container.get_text(), train_container.get_sentiment()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

In [71]:
# Class-balance check on the training labels: positives first, then negatives.
for label in (Sentiment.POSITIVE, Sentiment.NEGATIVE):
    print(train_y.count(label))

436
436


#### Bag of Words

In [72]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
# Learn the vocabulary from the training text and encode it in the same call.
train_x_vectors = vectorizer.fit_transform(raw_documents=train_x)
# Encode the test text with that same fitted vocabulary (no re-fitting).
test_x_vectors = vectorizer.transform(raw_documents=test_x)

#### Classification

##### Linear SVM

In [75]:
from sklearn import svm
# Linear-kernel support vector classifier trained on the bag-of-words vectors.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(X=train_x_vectors, y=train_y)

SVC(kernel='linear')

In [76]:
# Sanity check: predict the label for row 0 of the vectorized test set.
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

##### Decision Tree

In [77]:
from sklearn.tree import DecisionTreeClassifier
# random_state pins the tree's internal randomness so reruns build the same
# model; 42 matches the seed already used for the train/test split.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)
# Sanity check: predict the label for row 0 of the vectorized test set.
clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

##### Naive Bayes

In [78]:
# Display the repr of the training feature matrix.
train_x_vectors

<872x8906 sparse matrix of type '<class 'numpy.int64'>'
	with 53647 stored elements in Compressed Sparse Row format>

In [79]:
# from sklearn.naive_bayes import MultinomialNB
# clf_gnb = MultinomialNB()
# clf_gnb.fit(train_x_vectors, train_y)
# clf_gnb.predict(test_x_vectors[0])

##### Logistic Regression

In [80]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, trained on the bag-of-words vectors.
clf_lr = LogisticRegression()
clf_lr.fit(train_x_vectors, train_y)
# Sanity check: predict the label for row 0 of the vectorized test set.
clf_lr.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Evaluation

In [81]:
# Mean accuracy on the test split, printed in the order: SVM, decision tree, logistic regression.
for clf in (clf_svm, clf_dec, clf_lr):
    print(clf.score(test_x_vectors, test_y))

0.7124242424242424
0.6115151515151516
0.7448484848484849


In [82]:
#F1 Score
from sklearn.metrics import f1_score

# Per-class F1 (average=None) for each classifier; one value per entry of f1_labels, in that order.
f1_labels = [Sentiment.POSITIVE, Sentiment.NETURAL, Sentiment.NEGATIVE]
for fitted in (clf_svm, clf_dec, clf_lr):
    print(f1_score(test_y, fitted.predict(test_x_vectors), average=None, labels=f1_labels))

[0.85363477 0.         0.28146853]
[0.77478957 0.         0.18660969]
[0.8783008  0.         0.31077216]
