In [2]:
#Traditional,Algorithmic Models - Scikit-learn
#Neural Network Models - TensorFlow/Keras, PyTorch

In [3]:
import random

class Sentiment:#this is an enumeration class used for defining constants(this is optional)
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'

class Review: 
    def __init__(self, text , score): #it runs when u create a Review(... ) object
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()#this will call the function below
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        
class ReviewContainer: #ReviewContainer holds a list of Review objects.
                       #self.reviews = [Review(), Review(), ...].
    def __init__(self,reviews):
        self.reviews = reviews

    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiments(self):
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        negative = list(filter(lambda x : x.sentiment == Sentiment.NEGATIVE , self.reviews))
        positive = list(filter(lambda x : x.sentiment == Sentiment.POSITIVE , self.reviews))

        positive_shrunk = positive[:len(negative)]

        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)#shuffled the list of reviews to mix negative and positive reviews randomly

In [4]:
'''reviews   → [Review(), Review(), Review()]  (a plain Python list holding many Review objects.)

ReviewContainer(reviews) → 
    - reviews stored inside
    - get_text()
    - get_sentiments()
    - evenly_distribute()
'''
'''Review(text = "Great book",score = 5,sentiment = "POSITIVE")#but this is not added into reviews list yet.
'''

'Review(text = "Great book",score = 5,sentiment = "POSITIVE")#but this is not added into reviews list yet.\n'

## Load data

In [5]:
import json

file_name = 'Books_small_10000.json'

reviews = []
with open(file_name) as file:
    for line in file:
        review = json.loads(line)#this will convert json string to python dictionary
        reviews.append(Review(review['reviewText'], review['overall']))
#reviews is now a list of Review objects
#reviews[5].text
#reviews[5].score
#reviews[5].sentiment


## Prep Data

In [6]:
from sklearn.model_selection import train_test_split

training, testing = train_test_split(reviews,test_size = 0.33,random_state=42)
#print(training[0].text)


train_container = ReviewContainer(training)#object of ReviewContainer class
#train_container.evenly_distribute() this will balance the dataset by reducing the number of positive reviews

test_container = ReviewContainer(testing)


In [7]:
#x is what we feed to the model
#y is what we expect the model to predict
#for training we use train_x and train_y together to train the model
#for testing we use test_x and see if the model predicts test_y correctly.
train_container.evenly_distribute()

train_x = train_container.get_text()
train_y = train_container.get_sentiments()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiments()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))
print(test_y.count(Sentiment.POSITIVE))
print(test_y.count(Sentiment.NEGATIVE))

436
436
208
208


In [8]:
print(f'Training size: {len(training)}')
print(f'Testing size: {len(testing)}')

Training size: 6700
Testing size: 3300


#### Bag of words Vectorization

In [23]:
#converting the text data into numerical data using Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
'''vectorizer = CountVectorizer()'''


vectorizer = TfidfVectorizer()
#fit_transform will learn the vocabulary and return the document-term matrix
train_x_vectors = vectorizer.fit_transform(train_x)#u can do in 2 steps(fit + transform) also this is a 2 in 1 function
test_x_vectors = vectorizer.transform(test_x)#here we only transform using the vocabulary learned from training data.we dont fit because we want to use the same vocabulary as training data

#print(train_x_vectors)
#train_y



#print(train_x[0]) #gives the original text
#print(train_x_vectors[0]) #gives the vectorized representation

## Classification

#### Linear SVM Classifier

In [24]:
from sklearn import svm


clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)#svm model is trained here

#print(test_x[0]) 
clf_svm.predict(test_x_vectors[0])#predicting the sentiment of the given review in the test set

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [25]:
from sklearn.tree import DecisionTreeClassifier


clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)#decision tree model is trained here

'''print(test_x[0])
print(test_y[0])'''
clf_dec.predict(test_x_vectors[0])#predicting the sentiment of the given review in the test set

array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

In [26]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(), train_y)#naive bayes model is trained here

clf_gnb.predict(test_x_vectors[0].toarray())#predicting the sentiment of the given review in the test set

#'.toarray()' is used to convert sparse matrix to dense matrix because GaussianNB does not accept sparse matrix as input

array(['POSITIVE'], dtype='<U8')

#### Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

### Evaluation

In [28]:
#Mean Accuracy on the test set
print(clf_svm.score(test_x_vectors,test_y))
print(clf_dec.score(test_x_vectors,test_y))
print(clf_gnb.score(test_x_vectors.toarray(),test_y))
print(clf_log.score(test_x_vectors,test_y))

0.8076923076923077
0.6394230769230769
0.6610576923076923
0.8028846153846154


In [29]:
#F1 Score
from sklearn.metrics import f1_score
print(f1_score(test_y,clf_svm.predict(test_x_vectors),average = None,labels = [Sentiment.NEGATIVE,Sentiment.POSITIVE]))
print(f1_score(test_y,clf_dec.predict(test_x_vectors),average = None,labels = [Sentiment.NEGATIVE,Sentiment.POSITIVE]))
print(f1_score(test_y,clf_gnb.predict(test_x_vectors.toarray()),average = None,labels = [Sentiment.NEGATIVE,Sentiment.POSITIVE]))
print(f1_score(test_y,clf_log.predict(test_x_vectors),average = None,labels = [Sentiment.NEGATIVE,Sentiment.POSITIVE]))

[0.80952381 0.80582524]
[0.64285714 0.63592233]
[0.66508314 0.65693431]
[0.8047619  0.80097087]


In [16]:
print(test_y.count(Sentiment.POSITIVE))
print(test_y.count(Sentiment.NEGATIVE))
print(test_y.count(Sentiment.NEUTRAL))
#the reason the f1 score is low on the negative side is 
# beacuse we trained the model on a balanced dataset 
# but the test dataset is imbalanced with more positive reviews than negative reviews

208
208
0


In [30]:
test_set = ['brilliant ','bad book do not buy' , 'horrible waste of time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

### Tuning our model (with Grid search)

In [32]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel':('linear','rbf'),'C':(1,4,8,16,32)}
svc = svm.SVC()
clf = GridSearchCV(svc,parameters, cv=5)#cv is cross validation folds
clf.fit(train_x_vectors,train_y)

In [33]:
print(clf.best_params_)

{'C': 4, 'kernel': 'rbf'}


In [34]:
print(clf_svm.score(test_x_vectors,test_y))

0.8076923076923077


## Saving Model

In [None]:
import pickle
with open('sentiment_classifier.pkl','wb') as f:
    pickle.dump(clf,f)
    
#wb means write binary.if the file does not exist it will be created and if it exists it will be overwritten.
#dumping the clf object into the file f.clf is the model after grid search.
#we are saving the model as a pickle file so that we can use it later without building and training it again.

### Load data

In [42]:
with open('sentiment_classifier.pkl','rb') as f:
    loaded_clf = pickle.load(f)

In [43]:
loaded_clf.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')