<a href="https://colab.research.google.com/github/kevinegan31/Udemy_Machnine_Learning_A-Z/blob/main/Python/Natural_Language_Processing/natural_language_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing

## Importing the libraries

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [10]:
# Setting quoting parameter to three to ignore quotes
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = "\t", quoting=3)

## Cleaning the texts

In [11]:
# re library used to simplify the reviews
import re
# nltk library allows us to download the ensemble of stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Removing conjugation of the verbs and just keeping
# the present tense of the reviews. This simplifies the sparse
# matrix so that we can more easily create our bag of words model.
from nltk.stem.porter import PorterStemmer
# Corpus will simply contain all cleaned reviews.
# For loop will iterate through reviews, put all letters in lowerase,
# remove punctuation, remove stopwords, etc.
corpus = []
for i in range(0,len(dataset)):
  # Replace any non-letter from A-Z by a space.
  # ^ = not anything that follows
  review = re.sub('\[^a-zA-Z]', ' ', dataset['Review'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
print(corpus)

['wow... love place.', 'crust not good.', 'not tasti textur nasty.', 'stop late may bank holiday rick steve recommend love it.', 'select menu great prices.', 'get angri want damn pho.', 'honeslti tast fresh.)', 'potato like rubber could tell made ahead time kept warmer.', 'fri great too.', 'great touch.', 'servic prompt.', 'would not go back.', 'cashier care ever say still end wayyy overpriced.', 'tri cape cod ravoli, chicken, cranberry...mmmm!', 'disgust pretti sure human hair.', 'shock sign indic cash only.', 'highli recommended.', 'waitress littl slow service.', 'place not worth time, let alon vegas.', 'not like all.', 'burritto blah!', 'food, amazing.', 'servic also cute.', 'could care less... interior beautiful.', 'performed.', "that' right....th red velvet cake.....ohhh stuff good.", '- never brought salad ask for.', 'hole wall great mexican street tacos, friendli staff.', 'took hour get food 4 tabl restaur food luke warm, sever run around like total overwhelmed.', 'worst salmon 

## Creating the Bag of Words model

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1800)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

In [21]:
len(X[0])

1800

## Splitting the dataset into the Training set and Test set

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## Training the Naive Bayes model on the Training set

In [23]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

## Predicting the Test set results

In [24]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]

## Making the Confusion Matrix

In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[ 65  52]
 [ 15 118]]


0.732