# Natural Language Processing

## Importing the libraries

In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [65]:
# Load the tab-separated reviews file. quoting=3 is csv.QUOTE_NONE, so double
# quotes inside a review are kept as ordinary characters instead of being
# interpreted as field delimiters.
dataset = pd.read_csv("Restaurant_Reviews.tsv", sep="\t", quoting=3)

## Cleaning the texts

In [66]:
import re
import nltk
nltk.download("stopwords") # download the NLTK stopword lists
from nltk.corpus import stopwords # import downloaded stopwords
from nltk.stem.porter import PorterStemmer

# Loop-invariant helpers: built once here instead of once per review
# (stemmer, stopword list) or once per word (stopword set).
non_letters = re.compile("[^a-zA-Z]")
ps = PorterStemmer()
all_stopwords = stopwords.words("english")
all_stopwords.remove("not") # keep "not" so negations survive the stopword filter
stopword_set = set(all_stopwords) # set gives constant-time membership tests


def clean_review(text):
    """Normalise one raw review into a space-joined string of stemmed tokens.

    Steps:
      1. replace every character that is not a-z / A-Z with a space,
      2. lower-case the text and split it into words,
      3. drop English stopwords (except "not"),
      4. reduce each remaining word to its Porter stem (e.g. loved -> love),
         which shrinks the vocabulary.

    Example: ['wow', 'loved', 'this', 'place'] -> 'wow love place'

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        The cleaned review, words separated by single spaces.
    """
    words = non_letters.sub(' ', text).lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in stopword_set)


# corpus collects one cleaned string per review,
# e.g. ['wow love place', 'crust good'] for two reviews.
corpus = []
N = dataset.shape[0]
for i in range(0, N):
    review = clean_review(dataset["Review"][i])
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kavinda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating the Bag of Words model

In [67]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep only the most frequent terms so that rare tokens (for instance a name
# that appears a single time) do not become feature columns.
# According to the original note, an uncapped CountVectorizer() was run first
# to find the full vocabulary size (1566 terms) before choosing the cap below.
cv = CountVectorizer(max_features=1500)

# fit_transform learns the vocabulary and returns one row of term counts per
# review; toarray() turns the sparse result into a dense array.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# The target is taken from the last column of the dataset.
y = dataset.iloc[:, -1].to_numpy()

In [68]:
# Number of feature columns in X, i.e. how many distinct terms the vectorizer kept.
print(X.shape[1])

1500


## Splitting the dataset into the Training set and Test set

In [69]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

## Training the Naive Bayes model on the Training set

In [70]:
from sklearn.naive_bayes import GaussianNB

# Build a Gaussian Naive Bayes model and fit it on the training split.
# The fit call is left as the cell's final expression so the notebook
# displays the fitted estimator.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

GaussianNB()

## Predicting the Test set results

In [71]:
y_pred = classifier.predict(X_test)

# Show predictions next to the true labels:
# column 0 = predicted label, column 1 = actual label.
pred_vs_actual = np.column_stack((y_pred, y_test))
print(pred_vs_actual)

[[1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]

## Making the Confusion Matrix

In [72]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true labels against predicted labels on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test samples classified correctly; kept as the cell's final
# expression so the notebook displays the value.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[55 42]
 [12 91]]


0.73