# Preparing Classes

In [110]:
import random

class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'

class Review:
    def __init__(self,text,score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_adjusted = positive[:len(negative)+20]
        self.reviews = positive_adjusted + negative
        random.shuffle(self.reviews)
        

In [2]:
import json

# Load Data

In [105]:
file_name = 'Books_small_10000.json'

reviews = []
with open(file_name) as file:
    for line in file:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall'],))

In [106]:
reviews[10].sentiment, reviews[10].text

('POSITIVE',
 "My only complaint about this book is that it is much too short. I love this author and this series, and I can't wait for the next installment.")

# Prep Data

In [107]:
len(reviews)

10000

In [63]:
from sklearn.model_selection import train_test_split

In [111]:
train,test = train_test_split(reviews,test_size=0.33,random_state=5)

In [65]:
len(train), len(test)

(6700, 3300)

In [67]:
train[0].sentiment, train[0].text

('POSITIVE',
 'This is a good part of the trilogy about three sisters and their inheritance. I also enjoyed it as a standalone.  I found it different from the Brodie series in that there was part of the paranormal and the past in all the books. This seems to be a new direction for Kat Martin, or perhaps it is because I am finished with the Brodie series and reading different series Kat has written.')

In [68]:
x_train = [x.text for x in train]
y_train = [x.sentiment for x in train]

In [69]:
y_train[0:5]

['POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE']

In [70]:
x_train[5], y_train[5]

("Ellington Darden provides a comprehensive and balanced discussion of high intensity training (HIT).  Some form of HIT is really beneficial for strength and size gains.  A lot of training modalities don't work as well when you reach 67 years old (like me) but modified HIT does work, even for an old f_ _k like me with structural issues.",
 'POSITIVE')

In [71]:
x_test = [x.text for x in test]
y_test = [x.sentiment for x in test]

In [73]:
type(x_test), len(x_test)

(list, 3300)

### Bags of words vectorization

In [74]:
from sklearn.feature_extraction.text import CountVectorizer

In [75]:
vectorizer = CountVectorizer()
x_train_vector = vectorizer.fit_transform(x_train)

In [76]:
print(x_train[0],'\n', x_train_vector.shape)

This is a good part of the trilogy about three sisters and their inheritance. I also enjoyed it as a standalone.  I found it different from the Brodie series in that there was part of the paranormal and the past in all the books. This seems to be a new direction for Kat Martin, or perhaps it is because I am finished with the Brodie series and reading different series Kat has written. 
 (6700, 26986)


In [77]:
x_test_vector = vectorizer.transform(x_test)

# Classification

#### Linear SVM

In [78]:
from sklearn.svm import SVC

clf_svm = SVC(kernel = 'linear')

clf_svm.fit(x_train_vector,y_train)

clf_svm.predict(x_test_vector[2])

array(['NEGATIVE'], dtype='<U8')

In [84]:
x_test[73], type(x_test[73])

("I have read the trilogy now. What a brilliant story. Ms Pitts Caine has been painstaking in her research which delivers a detailed and richly engrossing story. I immediately recognised the characters from Cairo and The Tempering Agent and was excited to be reunited with them again and to learn more of their backgrounds.I particularly enjoyed the descriptions of Texas countryside - the lush greenery, the hills slightly rolled, the wide open skies; the 'Blackland Prairie'. I really felt I was on that journey with Mel and Addie.",
 str)

In [30]:
random_text_neg = ["Don't like this book at all. I will not buy this book"]
text_neg = 'Disgusting'

vectorizer.transform(random_text_neg).toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

#### Decision Tree

In [85]:
from sklearn.tree import DecisionTreeClassifier

clf_tree = DecisionTreeClassifier()

clf_tree.fit(x_train_vector,y_train)

clf_tree.predict(x_test_vector[87])


array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

In [86]:
from sklearn.naive_bayes import GaussianNB
import numpy as np

clf_NB = GaussianNB()

clf_NB.fit(x_train_vector.toarray(),y_train)

clf_NB.predict(x_test_vector[87].toarray())

array(['NEGATIVE'], dtype='<U8')

#### Logistic Regression

In [87]:
from sklearn.linear_model import LogisticRegression

clf_lr = LogisticRegression()

clf_lr.fit(x_train_vector,y_train)

clf_lr.predict(x_test_vector[87])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['POSITIVE'], dtype='<U8')

## Evaluation

In [88]:
#Mean Accuracy Score

clf_svm.score(x_test_vector,y_test), clf_tree.score(x_test_vector,y_test), clf_lr.score(x_test_vector,y_test), clf_NB.score(x_test_vector.toarray(),y_test)

(0.8081818181818182,
 0.7706060606060606,
 0.8396969696969697,
 0.6509090909090909)

In [89]:
#F1 Score

from sklearn.metrics import f1_score

print(' SVM: ',f1_score(y_test,clf_svm.predict(x_test_vector),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]),'\n',
      'Decision Tree: ',f1_score(y_test,clf_tree.predict(x_test_vector),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]),'\n',
      'Logistic Regression ',f1_score(y_test,clf_lr.predict(x_test_vector),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]),'\n',
     'Naive_Bayes: ',f1_score(y_test,clf_NB.predict(x_test_vector.toarray()),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))

 SVM:  [0.90228841 0.28125    0.40969163] 
 Decision Tree:  [0.87830876 0.13804714 0.15915119] 
 Logistic Regression  [0.92134039 0.3016158  0.40214477] 
 Naive_Bayes:  [0.7933757  0.11292719 0.13623978]


###### So as we can see the model works very well in case of positive sentiments, however it works poorly in case of neutral and especially in negative cases.

## Improving our model

In [51]:
type(y_train)

list

In [59]:
elements = np.asarray(y_train)

np.unique(elements,return_counts="True")

(array(['NEGATIVE', 'NEUTRAL', 'POSITIVE'], dtype='<U8'),
 array([ 39,  71, 560], dtype=int64))

##### So in order to resolve the issue a little bit, we need to have more data, so that the models will have more values for neutral and negative cases. I downloaded a bigger version of the same file.

In [90]:
elements_new = np.asarray(y_train)

np.unique(elements_new,return_counts="True")

(array(['NEGATIVE', 'NEUTRAL', 'POSITIVE'], dtype='<U8'),
 array([ 432,  643, 5625], dtype=int64))

#### As we can see the data still contains large number of positive review cases, so we need to evenly distribute our data so that our models don't get biased with positive reviews.

In [133]:
train_container = ReviewContainer(train)
test_container = ReviewContainer(test)

train_container.evenly_distribute()

x_train_cont = train_container.get_text()
y_train_cont = train_container.get_sentiment()

x_test_cont = test_container.get_text()
y_test_cont = test_container.get_sentiment()

y_train_cont.count(Sentiment.NEGATIVE), y_train_cont.count(Sentiment.POSITIVE)

(432, 452)

In [134]:
len(train_container.reviews)

884

In [135]:
x_train_vector_cont = vectorizer.fit_transform(x_train_cont)
x_test_vector_cont = vectorizer.transform(x_test_cont)

In [125]:
len(x_test),len(y_test)

(3300, 3300)

In [138]:
def model_fit(model,x_train_vector_cont,y_train_cont):
    model.fit(x_train_vector_cont,y_train_cont)
    
def calculate_f1_score(model,x_test_vector,y_test):
    model_fit(model,x_train_vector_cont,y_train_cont)
    f1_score(y_test,model.predict(x_test_vector_cont),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE])

model_fit(clf_svm,x_train_vector_cont,y_train_cont)
model_fit(clf_lr,x_train_vector_cont,y_train_cont)
model_fit(clf_tree,x_train_vector_cont,y_train_cont)


print('SVM score: ',clf_svm.score(x_test_vector_cont,y_test_cont), 'DT score: ',clf_tree.score(x_test_vector_cont,y_test_cont), 'LR score: ',clf_lr.score(x_test_vector_cont,y_test_cont),'\n',sep='\n')
# print('SVM F1: ',calculate_f1_score(clf_svm,x_test_vector,y_test_cont), 'DT F1: ',calculate_f1_score(clf_tree,x_test_vector,y_test),
#       'LR F1: ',calculate_f1_score(clf_lr,x_test_vector,y_test))

SVM score: 
0.7427272727272727
DT score: 
0.610909090909091
LR score: 
0.7533333333333333


