# Natural Language Processing

## Importing the libraries

In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [20]:
# The reviews file is tab-separated (TSV). quoting=3 is csv.QUOTE_NONE, so
# double quotes inside a review are read as ordinary characters instead of
# being treated as field quoting.
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)

## Cleaning the texts

In [21]:
import re
# NLTK provides the English stop-word list: very common words such as "the"
# or "a" that are filtered out before building the model.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Stemming reduces each word to its root (e.g. "liked" -> "like"), so the
# different forms of a word share one column in the bag-of-words matrix.
from nltk.stem.porter import PorterStemmer

# The stemmer and the stop-word lookup are identical for every review, so
# build them once instead of once per row (and once per word for the set).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Take "not" out of the stop words so that negations survive the cleaning.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

corpus = []
# Walk over every review actually present in the dataset rather than a
# hard-coded row count, so corpus always has one entry per dataset row.
for raw_review in dataset['Review']:
  # Replace every non-letter character with a space
  review = re.sub('[^a-zA-Z]', ' ', raw_review)
  review = review.lower()
  review = review.split()
  # Drop stop words, then stem what is left
  review = [ps.stem(word) for word in review if word not in stopword_set]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /Users/jaber/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Display the full list of cleaned reviews
print(corpus)

## Creating the Bag of Words model

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
# max_features caps the vocabulary at the 1500 most frequent terms in the corpus
cv = CountVectorizer(max_features=1500)
# fit learns the vocabulary from the cleaned reviews; transform counts each
# vocabulary term per review (one row per review, one column per term)
term_counts = cv.fit_transform(corpus)
# Convert the sparse count matrix into a dense array
X = term_counts.toarray()
# Target vector: the last column of the dataset
y = dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [24]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

## Training the Naive Bayes model on the Training set

In [25]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes fitted on the training split.
# NOTE: `classifier` is rebound by the CART and logistic-regression cells
# further down, so on a top-to-bottom run this model is not the one that
# gets evaluated.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

# Testing CART (decision tree)

In [66]:
from sklearn.tree import DecisionTreeClassifier
# Depth-limited decision tree (CART). random_state is fixed because the tree
# builder permutes the features when searching for splits, so without a seed
# repeated fits on the same data can produce different trees.
# NOTE: this rebinds `classifier`, replacing the model fitted in the Naive
# Bayes cell.
classifier = DecisionTreeClassifier(max_depth=10, random_state=0)
classifier.fit(X_train, y_train)

# Testing max entropy (logistic regression)

In [76]:
from sklearn.linear_model import LogisticRegression
# Logistic regression (the "max entropy" model), allowed up to 1000 solver
# iterations.
# NOTE: this rebinds `classifier` again; on a top-to-bottom run this is the
# model used by the prediction and evaluation cells below.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X=X_train, y=y_train)

## Predicting the Test set results

In [77]:
# Predict a label for every row of the held-out test set
y_pred = classifier.predict(X_test)
# Debug aid: uncomment to print predictions beside the true labels, one row per test sample
#print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

## Making the Confusion Matrix

In [78]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Confusion matrix: rows are the true labels, columns the predicted labels
cm = confusion_matrix(y_test, y_pred)
# Per-class precision / recall / f1 summary
report = classification_report(y_test, y_pred)
# Output order: matrix, overall accuracy, then the per-class report
print(cm)
print(accuracy_score(y_test, y_pred))
print(report)

[[80 17]
 [28 75]]
0.775
              precision    recall  f1-score   support

           0       0.74      0.82      0.78        97
           1       0.82      0.73      0.77       103

    accuracy                           0.78       200
   macro avg       0.78      0.78      0.77       200
weighted avg       0.78      0.78      0.77       200



# Predicting whether a single review is positive or negative

In [43]:
# Applies the same cleaning steps that were used to build the training corpus
def PredictReview(singleReview):
    """Clean one raw review, vectorise it and print the predicted label.

    The text goes through the same steps as the training corpus: non-letter
    characters become spaces, the text is lower-cased and split into words,
    English stop words (except "not") are dropped and the remaining words
    are stemmed.

    Relies on two notebook-level names: `cv` (the fitted CountVectorizer)
    and `classifier` (whichever model was fitted most recently).

    Parameters
    ----------
    singleReview : str
        Raw review text.

    Returns
    -------
    None
        The cleaned one-element corpus and the predicted label array are
        printed, not returned.
    """
    # Non-letters become spaces, then lower-case and split into words
    words = re.sub('[^a-zA-Z]' , ' ', singleReview).lower().split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    # Take "not" out of the stop words so that negations survive the cleaning
    all_stopwords.remove('not')
    # Build the lookup set once rather than once per word
    stopword_set = set(all_stopwords)
    cleaned = ' '.join(ps.stem(word) for word in words if word not in stopword_set)
    # The vectoriser expects an iterable of documents, so wrap the single review
    singleCorpus = [cleaned]
    print(singleCorpus)
    new_X_test = cv.transform(singleCorpus).toarray()
    new_y_pred = classifier.predict(new_X_test)
    print(new_y_pred)

In [44]:
# Try the pipeline on two sample reviews: one using "love", one using "hate"
for review in ("I love this restaurant so much",
               'I hate this restaurant so much'):
    PredictReview(review)

['love restaur much']
[0]
['hate restaur much']
[0]
