In [9]:
import random 

class Sentiment:
        NEGATIVE= "NEGATIVE"
        NEUTRAL = "NEUTRAL"
        POSITIVE = "POSITIVE"
class Review:
    def __init__(self,text,score):
        self.text=text
        self.score=score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2 : 
            return Sentiment.NEGATIVE
        elif self.score == 3 :
            return Sentiment.NEUTRAL
        else :
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
        
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment==Sentiment.NEGATIVE,self.reviews))
        positive = list(filter (lambda x: x.sentiment== Sentiment.POSITIVE,self.reviews))
        
        positive_shrink = positive[:len(negative)]
        self.reviews=negative +positive_shrink
        
        random.shuffle(self.reviews)
        

In [10]:
import json 

file_name = 'Books.json'


reviews=[]
with open(file_name) as f:
    for line in f :
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))
        

reviews[5].sentiment

'POSITIVE'

# Prep Data

In [6]:
len(reviews)

10000

In [15]:
from sklearn.model_selection import train_test_split

train,test=train_test_split(reviews,test_size=0.2,train_size=0.8,random_state=42)

train_container = ReviewContainer(train)
test_container = ReviewContainer(test)

In [34]:
train_container.evenly_distribute()

train_x= train_container.get_text()
train_y= train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print (train_y.count(Sentiment.POSITIVE))
print (train_y.count(Sentiment.NEGATIVE))

print (test_y.count(Sentiment.POSITIVE))
print (test_y.count(Sentiment.NEGATIVE))

#Even distribution so that our model is not "biased"

513
513
131
131


# Bag of words vectorization

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
train_x_vectors=vectorizer.fit_transform(train_x)

test_x_vectors=vectorizer.transform(test_x)
#print(train_x[0])
#print(train_x_vectors[0].toarray())



# Classification

### Linear SVM Method

In [36]:
from sklearn import svm

clf_svm=svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors,train_y)

test_x[0]

clf_svm.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### Decision Tree Method

In [37]:
from sklearn.tree import DecisionTreeClassifier
clf_dec=DecisionTreeClassifier()
clf_dec.fit(train_x_vectors,train_y)

clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### Evaluation

In [None]:
#Accuracy scores

In [38]:
clf_svm.score(test_x_vectors,test_y)

0.7557251908396947

In [39]:
clf_dec.score(test_x_vectors,test_y)

0.683206106870229

In [None]:
#F1 scores

In [40]:
from sklearn.metrics import f1_score

print(f1_score(test_y,clf_svm.predict(test_x_vectors),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
print(f1_score(test_y,clf_dec.predict(test_x_vectors),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))

[0.76642336 0.         0.744     ]
[0.68199234 0.         0.68441065]


  _warn_prf(


In [42]:
test1 = ['I enjoyed it very much','A really bad book', 'not so good']
new_test = vectorizer.transform(test1)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')