# Natural Language Processing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#Importing the dataset
dataset = pd.read_csv('Dataset\\Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
#Our file is a tsv file but we are reading it as csv file so we need to pass a delimiter as "\t"
#In future we don't need a trouble due to " "(quotes) so will ignore using command 'qoting'
# 3 is for ignoring quotes

In [3]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
import re 
import nltk

In [5]:
nltk.download('stopwords')
review = re.sub('[^a-zA-Z]',' ',dataset['Review'][0],)
#Here we are working only for first review.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Samir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
print("Text with only alphabets left, numbers and punctuation being removed : ",review)

Text with only alphabets left, numbers and punctuation being removed :  Wow    Loved this place 


In [7]:
review = review.lower()

In [8]:
print('Case changed to lower case of the above text : ',review)

Case changed to lower case of the above text :  wow    loved this place 


In [9]:
review =review.split()

In [10]:
print('string being converted into list ',review)

string being converted into list  ['wow', 'loved', 'this', 'place']


In [11]:
from nltk.corpus import stopwords
review = [word for word in review if not word in set(stopwords.words('english'))]# List comprehensions

In [12]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]#This step can be done 
#in the above line while we were implementing stopwords on different wordsby making a small change in the code above.
#(repeated for learning process only.)

In [13]:
print('After aplying stemming on the above colection of words ',review)

After aplying stemming on the above colection of words  ['wow', 'love', 'place']


In [14]:
#Converting list to string back again
review = ' '.join(review)

In [15]:
print('List converted back to string : ',review)

List converted back to string :  wow love place


In [16]:
corpus=[]
for i in range(0,1000):
    review = re.sub('[^a-zA-Z]',' ',dataset['Review'][i],)
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)#We need to append clean review to corpus

In [17]:
corpus[:5]#Getting first 5 results

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
#for starting of tokenization
cv = CountVectorizer(max_features=1500)# Number of relevent words that we want to keep
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:,1].values#Taking the dependent variable

In [19]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
y[:10]#displaying top 10 dependent variables.

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1], dtype=int64)

In [21]:
X.shape

(1000, 1500)

**Trying different clasification Models**

In [22]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Naive Bayes

In [23]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [24]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[55, 42],
       [12, 91]], dtype=int64)

In [25]:
total = cm[0,0]+cm[0,1]+cm[1,0]+cm[1,1]
correct = cm[0,0]+cm[1,1]
incorrect = cm[0,1]+cm[1,0]
accuracy = correct/total
precision = cm[0,0]/(cm[0,0]+cm[0,1])
recall = cm[0,0]/(cm[0,0]+cm[1,0])
f1_score = 2*((precision*recall)/(precision+recall))

In [26]:
print('Our model made',cm[0,0],'correct prediction of negative reviews,',cm[1,1],'correct prediction of postive reviews',
     cm[0,1],'incorrect predictions of positive reviews and',cm[1,0],'incorrect prediction of negative reviews out of total',
     total,'reviews.\n')
print("So number of correct prediction is ",correct,'\nand number of incorrect predictions is ',incorrect)

print('\nAccuracy = ',accuracy)
print('\nPrecision = ',precision)
print('\nRecall = ',recall)
print('\nF1 Score = ',f1_score)

Our model made 55 correct prediction of negative reviews, 91 correct prediction of postive reviews 42 incorrect predictions of positive reviews and 12 incorrect prediction of negative reviews out of total 200 reviews.

So number of correct prediction is  146 
and number of incorrect predictions is  54

Accuracy =  0.73

Precision =  0.5670103092783505

Recall =  0.8208955223880597

F1 Score =  0.6707317073170731


# Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)



In [28]:
cm

array([[76, 21],
       [37, 66]], dtype=int64)

In [29]:
total = cm[0,0]+cm[0,1]+cm[1,0]+cm[1,1]
correct = cm[0,0]+cm[1,1]
incorrect = cm[0,1]+cm[1,0]
accuracy = correct/total
precision = cm[0,0]/(cm[0,0]+cm[0,1])
recall = cm[0,0]/(cm[0,0]+cm[1,0])
f1_score = 2*((precision*recall)/(precision+recall))

print('Our model made',cm[0,0],'correct prediction of negative reviews,',cm[1,1],'correct prediction of postive reviews',
     cm[0,1],'incorrect predictions of positive reviews and',cm[1,0],'incorrect prediction of negative reviews out of total',
     total,'reviews.\n')
print("So number of correct prediction is ",correct,'\nand number of incorrect predictions is ',incorrect)

print('\nAccuracy = ',accuracy)
print('\nPrecision = ',precision)
print('\nRecall = ',recall)
print('\nF1 Score = ',f1_score)

Our model made 76 correct prediction of negative reviews, 66 correct prediction of postive reviews 21 incorrect predictions of positive reviews and 37 incorrect prediction of negative reviews out of total 200 reviews.

So number of correct prediction is  142 
and number of incorrect predictions is  58

Accuracy =  0.71

Precision =  0.7835051546391752

Recall =  0.672566371681416

F1 Score =  0.7238095238095238


# K-NN

In [30]:
# Fitting K-NN to the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[74, 23],
       [55, 48]], dtype=int64)

In [31]:
total = cm[0,0]+cm[0,1]+cm[1,0]+cm[1,1]
correct = cm[0,0]+cm[1,1]
incorrect = cm[0,1]+cm[1,0]
accuracy = correct/total
precision = cm[0,0]/(cm[0,0]+cm[0,1])
recall = cm[0,0]/(cm[0,0]+cm[1,0])
f1_score = 2*((precision*recall)/(precision+recall))

print('Our model made',cm[0,0],'correct prediction of negative reviews,',cm[1,1],'correct prediction of postive reviews',
     cm[0,1],'incorrect predictions of positive reviews and',cm[1,0],'incorrect prediction of negative reviews out of total',
     total,'reviews.\n')
print("So number of correct prediction is ",correct,'\nand number of incorrect predictions is ',incorrect)

print('\nAccuracy = ',accuracy)
print('\nPrecision = ',precision)
print('\nRecall = ',recall)
print('\nF1 Score = ',f1_score)

Our model made 74 correct prediction of negative reviews, 48 correct prediction of postive reviews 23 incorrect predictions of positive reviews and 55 incorrect prediction of negative reviews out of total 200 reviews.

So number of correct prediction is  122 
and number of incorrect predictions is  78

Accuracy =  0.61

Precision =  0.7628865979381443

Recall =  0.5736434108527132

F1 Score =  0.654867256637168


# SVM

In [32]:
# Fitting SVM to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[74, 23],
       [33, 70]], dtype=int64)

In [33]:
total = cm[0,0]+cm[0,1]+cm[1,0]+cm[1,1]
correct = cm[0,0]+cm[1,1]
incorrect = cm[0,1]+cm[1,0]
accuracy = correct/total
precision = cm[0,0]/(cm[0,0]+cm[0,1])
recall = cm[0,0]/(cm[0,0]+cm[1,0])
f1_score = 2*((precision*recall)/(precision+recall))

print('Our model made',cm[0,0],'correct prediction of negative reviews,',cm[1,1],'correct prediction of postive reviews',
     cm[0,1],'incorrect predictions of positive reviews and',cm[1,0],'incorrect prediction of negative reviews out of total',
     total,'reviews.\n')
print("So number of correct prediction is ",correct,'\nand number of incorrect predictions is ',incorrect)

print('\nAccuracy = ',accuracy)
print('\nPrecision = ',precision)
print('\nRecall = ',recall)
print('\nF1 Score = ',f1_score)

Our model made 74 correct prediction of negative reviews, 70 correct prediction of postive reviews 23 incorrect predictions of positive reviews and 33 incorrect prediction of negative reviews out of total 200 reviews.

So number of correct prediction is  144 
and number of incorrect predictions is  56

Accuracy =  0.72

Precision =  0.7628865979381443

Recall =  0.6915887850467289

F1 Score =  0.7254901960784315


# Kernal SVM

In [34]:
# Fitting classifier to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf',random_state = 0)
classifier.fit(X_train,y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm



array([[ 97,   0],
       [103,   0]], dtype=int64)

In [35]:
total = cm[0,0]+cm[0,1]+cm[1,0]+cm[1,1]
correct = cm[0,0]+cm[1,1]
incorrect = cm[0,1]+cm[1,0]
accuracy = correct/total
precision = cm[0,0]/(cm[0,0]+cm[0,1])
recall = cm[0,0]/(cm[0,0]+cm[1,0])
f1_score = 2*((precision*recall)/(precision+recall))

print('Our model made',cm[0,0],'correct prediction of negative reviews,',cm[1,1],'correct prediction of postive reviews',
     cm[0,1],'incorrect predictions of positive reviews and',cm[1,0],'incorrect prediction of negative reviews out of total',
     total,'reviews.\n')
print("So number of correct prediction is ",correct,'\nand number of incorrect predictions is ',incorrect)

print('\nAccuracy = ',accuracy)
print('\nPrecision = ',precision)
print('\nRecall = ',recall)
print('\nF1 Score = ',f1_score)

Our model made 97 correct prediction of negative reviews, 0 correct prediction of postive reviews 0 incorrect predictions of positive reviews and 103 incorrect prediction of negative reviews out of total 200 reviews.

So number of correct prediction is  97 
and number of incorrect predictions is  103

Accuracy =  0.485

Precision =  1.0

Recall =  0.485

F1 Score =  0.6531986531986532


# Decision Tree

In [36]:
# Fitting classifier to the Training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion= "entropy",random_state=0)
classifier.fit(X_train,y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[74, 23],
       [35, 68]], dtype=int64)

In [37]:
total = cm[0,0]+cm[0,1]+cm[1,0]+cm[1,1]
correct = cm[0,0]+cm[1,1]
incorrect = cm[0,1]+cm[1,0]
accuracy = correct/total
precision = cm[0,0]/(cm[0,0]+cm[0,1])
recall = cm[0,0]/(cm[0,0]+cm[1,0])
f1_score = 2*((precision*recall)/(precision+recall))

print('Our model made',cm[0,0],'correct prediction of negative reviews,',cm[1,1],'correct prediction of postive reviews',
     cm[0,1],'incorrect predictions of positive reviews and',cm[1,0],'incorrect prediction of negative reviews out of total',
     total,'reviews.\n')
print("So number of correct prediction is ",correct,'\nand number of incorrect predictions is ',incorrect)

print('\nAccuracy = ',accuracy)
print('\nPrecision = ',precision)
print('\nRecall = ',recall)
print('\nF1 Score = ',f1_score)

Our model made 74 correct prediction of negative reviews, 68 correct prediction of postive reviews 23 incorrect predictions of positive reviews and 35 incorrect prediction of negative reviews out of total 200 reviews.

So number of correct prediction is  142 
and number of incorrect predictions is  58

Accuracy =  0.71

Precision =  0.7628865979381443

Recall =  0.6788990825688074

F1 Score =  0.7184466019417477


# Random Forest

In [38]:
# Fitting classifier to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10,criterion="entropy",random_state=0)
classifier.fit(X_train,y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[87, 10],
       [46, 57]], dtype=int64)

In [39]:
total = cm[0,0]+cm[0,1]+cm[1,0]+cm[1,1]
correct = cm[0,0]+cm[1,1]
incorrect = cm[0,1]+cm[1,0]
accuracy = correct/total
precision = cm[0,0]/(cm[0,0]+cm[0,1])
recall = cm[0,0]/(cm[0,0]+cm[1,0])
f1_score = 2*((precision*recall)/(precision+recall))

print('Our model made',cm[0,0],'correct prediction of negative reviews,',cm[1,1],'correct prediction of postive reviews',
     cm[0,1],'incorrect predictions of positive reviews and',cm[1,0],'incorrect prediction of negative reviews out of total',
     total,'reviews.\n')
print("So number of correct prediction is ",correct,'\nand number of incorrect predictions is ',incorrect)

print('\nAccuracy = ',accuracy)
print('\nPrecision = ',precision)
print('\nRecall = ',recall)
print('\nF1 Score = ',f1_score)

Our model made 87 correct prediction of negative reviews, 57 correct prediction of postive reviews 10 incorrect predictions of positive reviews and 46 incorrect prediction of negative reviews out of total 200 reviews.

So number of correct prediction is  144 
and number of incorrect predictions is  56

Accuracy =  0.72

Precision =  0.8969072164948454

Recall =  0.6541353383458647

F1 Score =  0.7565217391304349


From here we can conclude the various accuracy and f1 scores :<br><br>
Naive Bayes : Accuracy = 73% F1-Score = 67%<br>
Logistic Regression : Accuracy = 71% F1-Score = 72%<br>
K-NN : Accuracy = 61% F1-Score = 65% <br>
SVM : Accuracy = 72% F1-Score = 72% <br>
kernel-SVM : Accuracy = 48% F1-Score = 65% <br>
Decision Tree: Accuracy = 71% F1-Score = 71% <br>
Random Forest: Accuracy = 72% F1-Score = 75% <br><br>
So as we can see that the best model was the Random Forest Model.