In [1]:
# @itsvenu_
# NLP - this notebook - bag-of-words model for Sentiment analysis

# NLP parts
# if/else rules
# audio frequency components analysis
# bag-of-words model

# DeepNLP
# CNN for text recognition/classification
# seq2seq

## bag-of-words ##
# 20k most frequently used English words - a vector
# each position is for a specific word
# SOS EOS <- beginning 2 elements; StartOfSentence, EndOfSentence
# special words <- last elements
# logistic regression to bag-of-words
# can also use DeepNN as input is a vector of numbers

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk # to remove stop words, e.g. the; she; he...
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # stemming - to round up the context e.g. loved=love
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package stopwords to /Users/venu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Load the restaurant reviews from a tab-separated file.
# quoting=3 is csv.QUOTE_NONE: quote characters inside a review are kept as
# ordinary text instead of being interpreted as field delimiters.
dat = pd.read_csv('../../data/Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
# show the first rows (columns: Review text, Liked label)
dat.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [20]:
# cleaning the text
#
# Every review is turned into a space-joined string of stemmed, lower-case
# words with English stop words removed. The cleaned strings are collected
# in `corpus`, one entry per row of `dat`, in row order.

# The stemmer and the stop-word list are identical for every review, so they
# are built once here rather than once per review (and the lookup set once
# per word, as before).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# keep 'not' in the text: dropping it would turn "not good" into "good"
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

corpus = []
# Iterate over the column itself instead of a hard-coded range(0, 1000), so
# `corpus` always has exactly as many entries as `dat` has rows (the labels
# are later taken from all rows of `dat`).
for raw_review in dat['Review']:

    # replace everything that is not an ASCII letter (punctuation, digits, ...)
    # with a space
    review = re.sub('[^a-zA-Z]', ' ', raw_review)

    # all capital letters into lower
    review = review.lower()

    # split each review into words
    review = review.split()

    # stemming, skipping stop words
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)

In [22]:
# peek at the first five cleaned reviews
corpus[:5]

['wow love place',
 'crust not good',
 'not tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

In [27]:
# Bag-of-words features.
# CountVectorizer tokenizes the cleaned reviews and counts word occurrences:
#   rows    -> one per review
#   columns -> one per distinct word seen across all reviews
cv = CountVectorizer()
sparse_counts = cv.fit_transform(corpus)

# dense array version of the sparse count matrix
X = sparse_counts.toarray()

# labels: values of the last column of the data frame
Y = dat.iloc[:, -1].values

In [29]:
# (number of reviews, vocabulary size)
X.shape 

(1000, 1566)

In [30]:
# The unrestricted vocabulary has 1566 words.
# Capping it at the 1500 most frequent words is meant to drop the rarest
# ones (e.g. names) that are unlikely to help the classifier.
# NOTE(review): the vocabulary is learned from all reviews, including the
# ones that are held out for testing further down.
MAX_FEATURES = 1500

cv = CountVectorizer(max_features=MAX_FEATURES)
capped_counts = cv.fit_transform(corpus)
X = capped_counts.toarray()

# labels: values of the last column of the data frame
Y = dat.iloc[:, -1].values

X.shape

(1000, 1500)

In [32]:
# first five rows of the count matrix (mostly zeros: each review uses few words)
X[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [35]:
# Hold out 20% of the reviews for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [37]:
# NaiveBayes model (Gaussian variant, default parameters)
# NOTE(review): GaussianNB models each feature as normally distributed;
# for word-count features MultinomialNB is the variant usually suggested —
# worth comparing.
nbClassifier = GaussianNB()

In [38]:
# fit the Naive Bayes model on the training split only
nbClassifier.fit(X_train, Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [40]:
# training model accuracy
# (measured on the same data the model was fit on, so it is an optimistic figure)
accuracy_score(Y_train, nbClassifier.predict(X_train))

0.91875

In [41]:
# test data accuracy
# (measured on the held-out 20%; this is the figure to compare models on)
accuracy_score(Y_test, nbClassifier.predict(X_test))

0.67

In [43]:
# can we beat this accuracy with LogisticRegression?
# NOTE(review): only random_state is set; the solver is left at the library
# default (the recorded repr shows solver='warn'), and that default differs
# between scikit-learn versions — pin it explicitly when reproducing the
# numbers below.

lrClassifer = LogisticRegression(random_state=42)

In [44]:
# fit the logistic regression model on the training split only
lrClassifer.fit(X_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
# LR - training accuracy
# (measured on the same data the model was fit on, so it is an optimistic figure)
accuracy_score(Y_train, lrClassifer.predict(X_train))

0.975

In [47]:
# LR - test data accuracy
# (measured on the held-out 20%)
accuracy_score(Y_test, lrClassifer.predict(X_test))

0.765

In [None]:
# Here logistic regression performed better than Naive Bayes on the
# test data (0.765 vs 0.67 accuracy). The accuracy might be improved
# further by trying other classifier algorithms.