<a href="https://colab.research.google.com/github/kssmp/NLP_basic_skl/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP based on Amazon Reviews

Dataset from: http://jmcauley.ucsd.edu/data/amazon/


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import json

## Loading and Cleaning Data

In [None]:
# Books_small.json is heavily biased towards positive reviews, which was hurting the models,
# so a larger and more even dataset is used instead to give the training set more to learn from.
# Previous choice, kept for reference: file_name = "Books_small.json"
file_name = "Books_small_10000.json"

In [None]:
# Data class
class Review:
  """One review: its text, its numeric score and the sentiment label derived from that score."""

  def __init__(self, text, score):
    self.text = text
    self.score = score
    # Computed once here; changing `score` afterwards does not update `sentiment`.
    self.sentiment = self.get_sentiment()

  def get_sentiment(self):
    """Return the Sentiment label for the score: <= 2 is negative, exactly 3 is neutral, anything else is positive."""
    rating = self.score
    if rating <= 2:
      return Sentiment.NEGATIVE
    if rating == 3:
      return Sentiment.NEUTRAL
    return Sentiment.POSITIVE

class Sentiment:
  """String labels used as the classification targets."""
  NEGATIVE = "NEGATIVE"
  POSITIVE = "POSITIVE"
  NEUTRAL = "NEUTRAL"

class ReviewContainer:
  """Wraps a list of reviews and exposes their texts and sentiment labels as parallel lists."""

  def __init__ (self,reviews):
    self.reviews = reviews

  def evenly_distribute(self):
    """Rebalance the container to hold equally many NEGATIVE and POSITIVE reviews.

    NEUTRAL reviews are dropped. Whichever class is larger is truncated to the
    size of the smaller one, so the result is balanced regardless of which
    sentiment dominates the input. The kept reviews are then shuffled in place
    using the global `random` generator, and `self.reviews` is replaced.
    """
    negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
    positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
    # Truncate BOTH classes: shrinking only the positives leaves the set
    # unbalanced whenever there are more negatives than positives.
    keep = min(len(negative), len(positive))
    self.reviews = negative[:keep] + positive[:keep]
    random.shuffle(self.reviews)

  def get_text(self):
    """Return the text of every review, in container order."""
    return [x.text for x in self.reviews]

  def get_sentiment(self):
    """Return the sentiment label of every review, in container order."""
    return [x.sentiment for x in self.reviews]

In [None]:
# Parse the JSON-lines file: one review object per line.
reviews = []
# Explicit UTF-8 so parsing does not depend on the platform's default encoding.
with open(file_name, encoding="utf-8") as f:
  for line in f:
    # Skip blank lines (e.g. a trailing empty line), which json.loads would reject.
    if not line.strip():
      continue
    review = json.loads(line)
    reviews.append(Review(review["reviewText"],review["overall"]))

# Each Review exposes .text, .sentiment and .score

In [None]:
# Spot-check the score of one loaded review.
reviews[5].score

5.0

## Splitting Data into training and testing datasets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the reviews for testing; the fixed random_state keeps the split repeatable.
training, test = train_test_split(
  reviews,
  test_size=0.33,
  random_state=42,
)

# Wrap each split in a ReviewContainer.
train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

# Sizes of the two splits.
print(f"{len(training)} {len(test)}")

6700 3300


In [None]:
# evenly_distribute() shuffles with the global `random` generator. Seed it so the
# balanced sets, and every result computed from them, are the same on each run.
random.seed(42)

# Balance the training reviews, then pull out texts and labels.
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

# Same treatment for the held-out reviews.
test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

## Bag of Words Vectorization

In [None]:
# Bag of words: convert the review texts into numerical feature vectors a model can work with.

from sklearn.feature_extraction.text import CountVectorizer
# Earlier approach, kept for reference: vectorizer = CountVectorizer()

# CountVectorizer weighs all the words in a sentence equally, so "This" and "amazing" hold the same value,
# which fails us when working out the sentiment attached to the sentence.
# Instead we use Term Frequency - Inverse Document Frequency (TF-IDF), which gives weighted values for the words:
# the more widely a word is used, the lower its importance.

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit on the training texts only, then apply the same fitted vectorizer to the test texts.
train_x_vector = vectorizer.fit_transform(train_x)
test_x_vector = vectorizer.transform(test_x)


## Classification and Different possible models

#### Linear SVM

In [None]:
from sklearn import svm

# Support vector classifier with a linear kernel, trained on the vectorized training reviews.
clf_svm = svm.SVC(kernel="linear").fit(train_x_vector, train_y)

# Quick check: predicted label for the first test review.
clf_svm.predict(test_x_vector[0])

array(['NEGATIVE'], dtype='<U8')

####  Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fix random_state so the fitted tree, and therefore its predictions and accuracy,
# are the same on every run; without it the tree can differ between runs.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vector,train_y)

# Quick check: predicted label for the first test review.
clf_dec.predict(test_x_vector[0])


array(['NEGATIVE'], dtype='<U8')

#### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# The sparse feature matrices are converted to dense arrays before being handed to GaussianNB.
clf_gnb = GaussianNB().fit(train_x_vector.toarray(), train_y)

# Quick check: predicted label for the first test review.
clf_gnb.predict(test_x_vector[0].toarray())

array(['NEGATIVE'], dtype='<U8')

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the vectorized training reviews.
clf_logreg = LogisticRegression().fit(train_x_vector, train_y)

# Quick check: predicted label for the first test review.
clf_logreg.predict(test_x_vector[0])


array(['NEGATIVE'], dtype='<U8')

## Evaluation

In [None]:
# Mean accuracy on the test set, one line per model:
# linear SVM, decision tree, Gaussian naive Bayes, logistic regression.
print(
  clf_svm.score(test_x_vector, test_y),
  clf_dec.score(test_x_vector, test_y),
  clf_gnb.score(test_x_vector.toarray(), test_y),  # GaussianNB is given a dense array
  clf_logreg.score(test_x_vector, test_y),
  sep="\n",
)

0.8076923076923077
0.6177884615384616
0.6610576923076923
0.8052884615384616


In [None]:
from sklearn.metrics import f1_score

# Per-class F1 for the linear SVM, reported in the order [POSITIVE, NEGATIVE].
f1_score(
  y_true=test_y,
  y_pred=clf_svm.predict(test_x_vector),
  labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
  average=None,
)

array([0.80582524, 0.80952381])

## Tuning Model using Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the SVC: every kernel is combined with every value of C.
parameters = {
  'kernel': ('linear', 'rbf'),
  'C': (1, 4, 8, 16, 32),
}

svc = svm.SVC()
# Search over the grid with 5-fold cross-validation.
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)
clf.fit(train_x_vector, train_y)

## Saving Model

In [None]:
import pickle

In [None]:
# Write the fitted grid-search object to ./classifier.pkl.
# NOTE(review): only `clf` is saved here; the fitted `vectorizer` used to transform text
# before prediction is not — confirm whether it should be persisted as well.
with open('./classifier.pkl','wb') as f:
  pickle.dump(clf,f)

In [None]:
# Read the classifier back from ./classifier.pkl.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created or trust.
with open('./classifier.pkl','rb') as f:
  loaded_clf = pickle.load(f)

In [None]:
# A hand-written sentence to try the classifier on.
test_set = ["Data science is very interesting"]

In [None]:
# Predict with the classifier restored from disk (not the in-memory `clf`), so this
# cell also confirms that the save/load round trip produced a usable model.
loaded_clf.predict(vectorizer.transform(test_set))

array(['POSITIVE'], dtype='<U8')