# Natural Language Processing

## Importing libraries

In [33]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

## Importing dataset

In [34]:
# Load the tab-separated reviews file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as ordinary text instead of being parsed.
dataset = pd.read_csv('Augmented_Reviews.tsv', sep='\t', quoting=3)

## Cleaning the dataset

In [35]:
import re
import nltk
nltk.download('stopwords')
# Stopwords are very common words that carry little meaning on their own,
# e.g. "he", "she", "it", "as", "was".
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# The stemmer and the stopword collection do not depend on the review being
# processed, so they are built once here instead of on every loop iteration.
# PorterStemmer reduces words to a common stem, e.g. "loved" and "loving" both
# become "love" (stems need not be dictionary words: "tasty" becomes "tasti").
ps = PorterStemmer()
all_stopwords = stopwords.words('english')  # chosen language -> English
# Keep 'not' out of the stopwords because it can flip the sentiment of a review
# (tasty -> positive, not tasty -> negative).
all_stopwords.remove('not')
# A set gives constant-time membership tests in the filter below.
stopword_set = set(all_stopwords)

corpus = []
# Iterate over every review actually present in the dataset rather than a
# hard-coded count, so the corpus always has one entry per row (and therefore
# stays aligned with labels taken from the same rows).
for review in dataset['Review']:
  # '[^a-zA-Z]' matches any character that is not an English letter; each such
  # character is replaced by a space before lower-casing and tokenising.
  ratings = re.sub('[^a-zA-Z]', ' ', review)
  ratings = ratings.lower()
  ratings = ratings.split()

  # Drop stopwords and stem the words that remain.
  ratings = [ps.stem(word) for word in ratings if word not in stopword_set]
  ratings = ' '.join(ratings)
  corpus.append(ratings)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# Inspect the cleaned reviews, then display how many there are.
# NOTE(review): this prints every review in the corpus; a slice such as
# corpus[:10] would keep the notebook output readable.
print(corpus)
len(corpus)

['wow love place', 'crust not good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would not go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place not worth time let alon vega', 'not like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'name', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid could not happier', 'seem li

10000

## Bag of words creation

In [37]:
# CountVectorizer turns a collection of text documents into a matrix of token counts.
from sklearn.feature_extraction.text import CountVectorizer

# max_features=3000 keeps at most the 3000 most frequent terms of the corpus.
cv = CountVectorizer(max_features=3000)

# fit_transform learns the vocabulary and returns a sparse document-term matrix;
# toarray() then densifies it so that each row of X is one review and each
# column is one vocabulary term.
# NOTE(review): the vocabulary is learned from the whole corpus, before the
# train/test split further down, so test-set reviews influence the feature set.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# Target labels: every row of the last dataset column, as a NumPy array.
# This assumes the label is stored in the last column.
y = dataset.iloc[:, -1].values

In [38]:
# Preview the dense bag-of-words matrix (one row per review, one column per term).
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [39]:
# Preview the target labels taken from the last column of the dataset.
print(y)

[1 0 0 ... 0 0 0]


## Splitting data into test and training sets

In [40]:
# Sanity check: X should have exactly one row per label in y.
for array in (X, y):
  print(array.shape)


(10000, 2860)
(10000,)


In [41]:
# The cleaned corpus and the label vector must be the same length.
review_count = len(corpus)
label_count = len(y)
print("Number of reviews:", review_count)
print("Number of target labels:", label_count)

Number of reviews: 10000
Number of target labels: 10000


In [42]:
# Hold out part of the data so the model is evaluated on reviews it never saw.
#   X            -> feature matrix (bag-of-words counts)
#   y            -> target labels
#   test_size    -> 0.2 sends 20% of the rows to the test set, 80% to training
#   random_state -> fixed seed (21) so the same split is produced on every run
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

## Training the classifier (Logistic Regression)

In [43]:
#Train the sentiment classifier on the training split.
#An earlier baseline used GaussianNB, a Naive Bayes classifier that assumes the features follow a Gaussian (normal) distribution; it is kept below, commented out, for reference only.
# from sklearn.naive_bayes import GaussianNB
# classifier = GaussianNB()

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=1000)

#This line fits the logistic regression classifier (allowed up to 1000 solver iterations) on the training features and labels.
classifier.fit(X_train, y_train)

## Prediction on the test set

In [44]:
# Predict a label for every review in the held-out test set.
#   X_test -> feature matrix of the test reviews
#   y_pred -> labels predicted by the trained classifier
y_pred = classifier.predict(X_test)

# Show the predictions (left column) next to the actual labels (right column).
print(np.column_stack((y_pred, y_test)))

[[0 0]
 [0 0]
 [1 1]
 ...
 [1 1]
 [0 0]
 [1 1]]


## Making Confusion Matrix

In [48]:
#Evaluate the trained classifier on the test set using a confusion matrix and the accuracy score.
#y_test: The actual labels for the test data.
#y_pred: The predicted labels for the test data.
from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(y_test, y_pred)
#cm: For 0/1 labels the confusion matrix (rows = actual, columns = predicted) is displayed in the format:
# [[TrueNegative FalsePositive]
#  [FalseNegative TruePositive]]
print(cm)

#This line computes the accuracy of the classifier on the test set, i.e. the fraction of test reviews
#whose predicted label matches the actual label. As the last expression in the cell it is shown as the cell output.
accuracy_score(y_test, y_pred)

[[946  28]
 [ 38 988]]


0.967

Naive Bayes [GaussianNB()] => 88.65%

Logistic Regression => 96.7%

In [49]:
# Persist the trained classifier so it can be reloaded later without retraining.
with open('Review_model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [50]:
# Persist the fitted vectorizer too: new text has to be encoded with the same
# vocabulary the classifier was trained on.
with open('CountVectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)