# Natural Language Processing

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Load the tab-separated reviews file. quoting=3 is csv.QUOTE_NONE, so double
# quotes inside a review are kept as literal characters instead of being
# interpreted as field delimiters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

## Cleaning the texts

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stop-word set once: they do not change between
# reviews, and a set gives O(1) membership tests in the filter below.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Keep negations in the text; dropping them would flip a review's meaning.
all_stopwords.remove('not')
all_stopwords.remove("isn't")
# NOTE(review): the regex below replaces apostrophes with spaces, so the token
# "isn't" never reaches the filter in one piece - confirm the removal above
# has the intended effect.
stopword_set = set(all_stopwords)

# One cleaned, stemmed, space-joined string per review, in dataset order.
# Iterating the column itself (rather than a fixed range(0, 1000)) keeps the
# corpus the same length as the dataset whatever its size.
corpus = []
for raw_review in dataset['Review']:
    # Replace every non-letter with a space, then normalise case and tokenise.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating the Bag of Words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned corpus and count token occurrences
# per review; the sparse result is expanded into a dense array.
cv = CountVectorizer()
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# Target vector: the last column of the dataset.
y = dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Per-model result slots, indexed by model number: confusion-matrix counts
# (FP, FN, TP, TN) and the raw precision / recall ratios (P, R).
# The generator yields a fresh list each time, so the six lists are independent.
FP, FN, TP, TN, P, R = ([0] * 10 for _ in range(6))

In [None]:
# Accuracy calculation
def accuracy(TP, TN, FP, FN):
    """Return the percentage of correct predictions.

    TP, TN, FP and FN are the four confusion-matrix counts of one model.
    """
    correct = TP + TN
    total = correct + FN + FP
    return correct * 100 / total

# Precision calculation
def precision(TP, FP):
    """Return precision, TP / (TP + FP), as a percentage.

    Side effect: the raw ratio (0..1) is stored in the module-level list
    ``P`` at the position given by the module-level index ``i``, so the
    caller must set ``i`` to the model's index before calling.

    When nothing was predicted positive (TP + FP == 0) the precision is
    reported as 0.0 instead of dividing by zero.
    """
    predicted_positive = TP + FP
    P[i] = TP / predicted_positive if predicted_positive else 0.0
    return P[i] * 100

# Recall calculation
def recall(TP, FN):
    """Return recall, TP / (TP + FN), as a percentage.

    Side effect: the raw ratio (0..1) is stored in the module-level list
    ``R`` at the position given by the module-level index ``i``, so the
    caller must set ``i`` to the model's index before calling.

    When there are no actual positives (TP + FN == 0) the recall is
    reported as 0.0 instead of dividing by zero.
    """
    actual_positive = TP + FN
    R[i] = TP / actual_positive if actual_positive else 0.0
    return R[i] * 100

# F1 score calculation
def F1_score(P, R):
    """Return the F1 score (harmonic mean of P and R) as a percentage.

    P and R are the precision and recall as ratios in the range 0..1.
    When both are zero the harmonic mean is undefined; 0.0 is returned
    instead of dividing by zero.
    """
    denominator = P + R
    if denominator == 0:
        return 0.0
    return (2 * P * R * 100) / denominator

## Training the Logistic Regression Model on the Training set

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Fit logistic regression on the training split.
classifier_LR = LogisticRegression(random_state = 0)
classifier_LR.fit(x_train, y_train)

# Rows of the confusion matrix are actual classes, columns are predicted
# classes, with labels in sorted order. For 0/1 labels the layout is
# [[TN, FP], [FN, TP]], so ravel() yields the counts in exactly that order.
# (Assumes the target is binary 0/1 with 1 as the positive class - confirm.)
cm = confusion_matrix(y_test, classifier_LR.predict(x_test))
TN[0], FP[0], FN[0], TP[0] = cm.ravel()

## Training the KNN Model on the Training set


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

# Fit a 5-nearest-neighbours classifier; Minkowski distance with p = 2 is the
# Euclidean distance.
classifier_KNN = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_KNN.fit(x_train, y_train)

# Rows of the confusion matrix are actual classes, columns are predicted
# classes, with labels in sorted order. For 0/1 labels the layout is
# [[TN, FP], [FN, TP]], so ravel() yields the counts in exactly that order.
# (Assumes the target is binary 0/1 with 1 as the positive class - confirm.)
cm = confusion_matrix(y_test, classifier_KNN.predict(x_test))
TN[1], FP[1], FN[1], TP[1] = cm.ravel()

## Training the SVM Model on the Training set

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

# Fit a support vector classifier with a linear kernel.
classifier_SVC = SVC(kernel = 'linear', random_state = 0)
classifier_SVC.fit(x_train, y_train)

# Rows of the confusion matrix are actual classes, columns are predicted
# classes, with labels in sorted order. For 0/1 labels the layout is
# [[TN, FP], [FN, TP]], so ravel() yields the counts in exactly that order.
# (Assumes the target is binary 0/1 with 1 as the positive class - confirm.)
cm = confusion_matrix(y_test, classifier_SVC.predict(x_test))
TN[2], FP[2], FN[2], TP[2] = cm.ravel()

## Training the Kernel SVM Model on the Training set

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

# Fit a support vector classifier with an RBF (Gaussian) kernel.
classifier_KSVC = SVC(kernel = 'rbf', random_state = 0)
classifier_KSVC.fit(x_train, y_train)

# Rows of the confusion matrix are actual classes, columns are predicted
# classes, with labels in sorted order. For 0/1 labels the layout is
# [[TN, FP], [FN, TP]], so ravel() yields the counts in exactly that order.
# (Assumes the target is binary 0/1 with 1 as the positive class - confirm.)
cm = confusion_matrix(y_test, classifier_KSVC.predict(x_test))
TN[3], FP[3], FN[3], TP[3] = cm.ravel()

## Training the Naive Bayes Model on the Training set

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

# Fit a Gaussian naive Bayes classifier.
classifier_NB = GaussianNB()
classifier_NB.fit(x_train, y_train)

# Rows of the confusion matrix are actual classes, columns are predicted
# classes, with labels in sorted order. For 0/1 labels the layout is
# [[TN, FP], [FN, TP]], so ravel() yields the counts in exactly that order.
# (Assumes the target is binary 0/1 with 1 as the positive class - confirm.)
cm = confusion_matrix(y_test, classifier_NB.predict(x_test))
TN[4], FP[4], FN[4], TP[4] = cm.ravel()

## Training the Decision Tree Classification Model on the Training set

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

# Fit a decision tree that chooses splits by entropy (information gain).
classifier_DTC = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier_DTC.fit(x_train, y_train)

# Rows of the confusion matrix are actual classes, columns are predicted
# classes, with labels in sorted order. For 0/1 labels the layout is
# [[TN, FP], [FN, TP]], so ravel() yields the counts in exactly that order.
# (Assumes the target is binary 0/1 with 1 as the positive class - confirm.)
cm = confusion_matrix(y_test, classifier_DTC.predict(x_test))
TN[5], FP[5], FN[5], TP[5] = cm.ravel()

## Training the Random Forest Classification Model on the Training set

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Fit a random forest of 300 entropy-criterion trees.
classifier_RFC = RandomForestClassifier(n_estimators = 300, criterion = 'entropy', random_state = 0)
classifier_RFC.fit(x_train, y_train)

# Rows of the confusion matrix are actual classes, columns are predicted
# classes, with labels in sorted order. For 0/1 labels the layout is
# [[TN, FP], [FN, TP]], so ravel() yields the counts in exactly that order.
# (Assumes the target is binary 0/1 with 1 as the positive class - confirm.)
cm = confusion_matrix(y_test, classifier_RFC.predict(x_test))
TN[6], FP[6], FN[6], TP[6] = cm.ravel()

## Results

In [None]:
# False-negative count per model, keyed by display name. Insertion order
# matches the slot each model uses in the TP / TN / FP / FN lists.
r2_dict = {
    'Logistic Regression' : FN[0],
    'KNN' : FN[1],
    'SVM': FN[2],
    'Kernel SVM' : FN[3],
    'Naive Bayes' : FN[4],
    'Decision Tree' : FN[5],
    'Random Forest' : FN[6]
}

# Printing table of results. The last column holds the false-negative count
# (the values of r2_dict), so it is labelled accordingly.
row_format = "{: <40} {: <20} {: <20} {: <20} {: <20} {: <20}"
print(row_format.format('Classification Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'False Negative'))
print('-' * 130)

# The loop index must be the module-level name `i`: precision() and recall()
# use it to store their raw ratios in P[i] and R[i], which F1_score() then
# reads. They therefore have to be called before F1_score() for each row.
for i, (key, val) in enumerate(r2_dict.items()):
    accuracy_pct = accuracy(TP[i], TN[i], FP[i], FN[i])
    precision_pct = precision(TP[i], FP[i])
    recall_pct = recall(TP[i], FN[i])
    f1_pct = F1_score(P[i], R[i])
    print(row_format.format(key, accuracy_pct, precision_pct, recall_pct, f1_pct, val))

# The best model is the one with the fewest false negatives; on a tie the
# first model in table order wins. Taking the minimum directly avoids relying
# on an arbitrary starting threshold.
best_name = min(r2_dict, key=r2_dict.get)
best = r2_dict[best_name]

# Printing the best model
print('-' * 130)
print(best_name, 'model gave the best results as it had ', best, ' number of False Negatives which are considered as Dangerous for any model.')

Classification Model                     Accuracy             Precision            Recall               F1 Score             False Positive      
----------------------------------------------------------------------------------------------------------------------------------
Logistic Regression                      77.5                 82.4742268041237     74.07407407407408    78.04878048780486    28                  
KNN                                      64.0                 68.04123711340206    61.6822429906542     64.70588235294117    41                  
SVM                                      80.0                 81.44329896907216    78.21782178217822    79.79797979797979    22                  
Kernel SVM                               77.5                 91.75257731958763    70.63492063492063    79.8206278026906     37                  
Naive Bayes                              73.0                 56.70103092783505    82.08955223880598    67.07317073170731    12            