In [56]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 
import json 
import random

In [57]:
class Sentiment:
    negative="NEGATIVE"
    positive="POSITIVE"
    
class Review:
    def __init__(self,text,score):
        self.text=text
        self.score=score
        self.sentiment=self.get_sentiment()
    def get_sentiment(self):
        if self.score<3:
            return Sentiment.negative
        else:#when score is 3 or 4 or 5
            return Sentiment.positive

class ReviewContainer:
        def __init__(self,reviews):
            self.reviews=reviews
        def evenly_distribute(self):
            negative=list(filter(lambda x:x.sentiment==Sentiment.negative,self.reviews))
            positive=list(filter(lambda x:x.sentiment==Sentiment.positive,self.reviews))
            #as data contains more positive data sets,we can reduce it for negative for more better accuracy.
            postive_shrunk=positive[:len(negative)]
            self.reviews=negative+postive_shrunk
            random.shuffle(self.reviews)

In [58]:
reviews=[]
with open("Books_small_10000.json") as f:
   for line in f:
        review_json=json.loads(line)
        review=Review(review_json['reviewText'], review_json['overall'])
        reviews.append(review)

In [59]:
reviews[7].text
reviews[7].score
reviews[7].sentiment

'POSITIVE'

In [13]:
from sklearn.model_selection import train_test_split
training,testing=train_test_split(reviews,test_size=0.33)

In [60]:
len(training)
len(testing)

3300

In [61]:
#evenly distributing data to get better accuracy
traincontainer=ReviewContainer(training)
traincontainer.evenly_distribute()
testcontainer=ReviewContainer(testing)
testcontainer.evenly_distribute()



In [66]:
#data split
x_train=[x.text for x in traincontainer.reviews]
x_test=[x.text for x in testcontainer.reviews]
y_train=[y.sentiment for y in traincontainer.reviews]
y_test=[y.sentiment for y in testcontainer.reviews]

In [67]:
y_train.count(Sentiment.positive)
y_train.count(Sentiment.negative)

434

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer()
x_train_vector=vectorizer.fit_transform(x_train)
x_test_vector=vectorizer.transform(x_test)

In [70]:
from sklearn.svm import SVC
clf_svm=SVC()
clf_svm.fit(x_train_vector,y_train)
y_pred=clf_svm.predict(x_test_vector)

from sklearn.metrics import accuracy_score,f1_score
clf_svm_score=accuracy_score(y_test,y_pred)
print(accuracy_score(y_test,y_pred))
print(f1_score(y_test,y_pred,average=None,labels=[Sentiment.positive,Sentiment.negative]))



0.8261904761904761
[0.81975309 0.83218391]


In [71]:
from sklearn.tree import DecisionTreeClassifier
clf_dec=DecisionTreeClassifier()
clf_dec.fit(x_train_vector,y_train)
y_pred=clf_dec.predict(x_test_vector)

from sklearn.metrics import accuracy_score,f1_score
clf_dec_score=accuracy_score(y_test,y_pred)
print(accuracy_score(y_test,y_pred))
print(f1_score(y_test,y_pred,average=None,labels=[Sentiment.positive,Sentiment.negative]))


0.6547619047619048
[0.67268623 0.63476071]


In [72]:
from sklearn.linear_model import LogisticRegression
clf_log=LogisticRegression()
clf_log.fit(x_train_vector,y_train)
y_pred=clf_log.predict(x_test_vector)

from sklearn.metrics import accuracy_score,f1_score
clf_log_score=accuracy_score(y_test,y_pred)
print(accuracy_score(y_test,y_pred))
print(f1_score(y_test,y_pred,average=None,labels=[Sentiment.positive,Sentiment.negative]))




0.819047619047619
[0.81642512 0.82159624]


In [73]:
#we can clearly see that logistic regression best suits for the problem

clf_log.predict(vectorizer.transform(["very bad","awesome",]))

array(['NEGATIVE', 'POSITIVE'], dtype='<U8')