In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [88]:
# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as ordinary text.
df = pd.read_csv(
    '/Restaurant_Reviews.tsv',
    delimiter='\t',
    quoting=3,
)
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [89]:
# Count missing values in each column.
df.isnull().sum()

Unnamed: 0,0
Review,0
Liked,0


In [90]:
# Number of reviews for each value of the 'Liked' label.
df['Liked'].value_counts()

Unnamed: 0_level_0,count
Liked,Unnamed: 1_level_1
1,500
0,500


The dataset is balanced: 500 liked and 500 not-liked reviews.

- Cleaning Text Data

In [91]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [92]:
# Fetch the NLTK resources used below: the stop-word lists and the 'punkt'
# tokenizer data.
# NOTE(review): newer NLTK releases may also require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [93]:
# Look at the first raw review (row label 0) before any cleaning.
df.loc[0, 'Review']

'Wow... Loved this place.'

In [94]:
# Walk through the cleaning steps on one review: replace every non-letter
# with a space, lower-case the text, then split on whitespace.
review = re.sub('[^a-zA-Z]', ' ', df['Review'][0]).lower().split()
review

['wow', 'loved', 'this', 'place']

In [95]:
# Load the English stop-word list once and keep it as a set; calling
# stopwords.words('english') inside the comprehension would re-read the
# list for every token.
english_stopwords = set(stopwords.words('english'))
review = [word for word in review if word not in english_stopwords]
review

['wow', 'loved', 'place']

In [96]:
# Porter stemmer instance used to reduce each word to its stem.
ps = PorterStemmer()

In [97]:
# Stem each remaining token and glue the stems back into one
# space-separated string.
review = ' '.join(ps.stem(word) for word in review)
review

'wow love place'

In [98]:
ps = PorterStemmer()

# Build the stop-word set once. Calling stopwords.words('english') for every
# token re-reads the list each time and does a linear membership scan.
ENGLISH_STOPWORDS = set(stopwords.words('english'))

# Shared buffer that clean_text appends to; later cells reset it and read it.
corpus = []


def clean_text(text):
    """Normalise one review and append the result to the global ``corpus``.

    Steps: replace every non-letter character with a space, lower-case,
    tokenize with ``word_tokenize``, drop English stop words, Porter-stem the
    remaining tokens and join them with single spaces.

    Note that NLTK's English stop-word list contains 'not', so negations are
    removed by this function.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review (also appended to ``corpus`` as a side effect, which
        is what the rest of the notebook relies on).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()
    tokens = word_tokenize(letters_only)
    cleaned = ' '.join(ps.stem(tok) for tok in tokens if tok not in ENGLISH_STOPWORDS)
    corpus.append(cleaned)
    return cleaned


# Clean every review in order. `review` holds the cleaned texts used to fit the
# vectorizer; `corpus` is reset afterwards so it is empty for the ad-hoc
# single-review predictions further down.
review = [clean_text(text) for text in df['Review']]
corpus = []

In [99]:
# First two cleaned reviews, and confirmation that the shared buffer was reset.
review[:2], corpus[:2]

(['wow love place', 'crust good'], [])

Bag of Words Model

In [100]:
from sklearn.feature_extraction.text import CountVectorizer

In [101]:
# Bag-of-words term counts, capped at 1500 vocabulary entries.
# NOTE(review): the vectorizer is fitted on all cleaned reviews before the
# train/test split below, so the vocabulary is learned from test rows too.
cv = CountVectorizer(max_features=1500)
review_counts = cv.fit_transform(review)
x = review_counts.toarray()

In [102]:
# (number of reviews, number of vocabulary terms)
x.shape

(1000, 1500)

In [103]:
# Target labels. Select the 'Liked' column by name rather than by position so
# a change in column order cannot silently pick the wrong column.
y = df['Liked'].values

In [104]:
# One label per review.
y.shape

(1000,)

Naive Bayes

In [105]:
from sklearn.model_selection import train_test_split

In [106]:
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [107]:
# Shapes of the training features and labels.
X_train.shape, y_train.shape

((800, 1500), (800,))

In [108]:
# Shapes of the held-out test features and labels.
X_test.shape, y_test.shape

((200, 1500), (200,))

In [109]:
from sklearn.naive_bayes import GaussianNB

In [110]:
# Gaussian Naive Bayes with default settings.
classifier = GaussianNB()

In [111]:
# Fit the classifier on the training split.
classifier.fit(X_train, y_train)

In [147]:
# Predicted labels for the held-out test rows.
y_pred = classifier.predict(X_test)

In [113]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [114]:
# Test-set accuracy expressed as a percentage.
accuracy_score(y_test, y_pred) * 100

73.0

In [115]:
# Confusion matrix: rows are true labels, columns are predicted labels.
confusion_matrix(y_test, y_pred)

array([[55, 42],
       [12, 91]])

In [116]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.57      0.67        97
           1       0.68      0.88      0.77       103

    accuracy                           0.73       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.75      0.73      0.72       200



- Real time testing

In [121]:
myReview1 = "Food is delicious!!"

# Reset the shared buffer so it holds only this review once cleaned, then
# vectorize it with the already-fitted vocabulary and classify it.
corpus = []
clean_text(myReview1)
print(corpus)
review1_label = classifier.predict(cv.transform(corpus).toarray())[0]
print(myReview1, "  ", review1_label)


['food delici']
Food is delicious!!    1


In [143]:

myReview2 = "I am going to visit again"

# Same flow as the previous example: clean into an empty buffer, vectorize,
# classify.
corpus = []
clean_text(myReview2)
print(corpus)
review2_label = classifier.predict(cv.transform(corpus).toarray())[0]
print(myReview2, "  ", review2_label)


['go visit']
I am going to visit again    1


In [139]:
myReview3 = "Food is not good!!"

# A negated review: the cleaning step drops 'not', so only 'food good'
# reaches the classifier.
corpus = []
clean_text(myReview3)
print(corpus)
review3_label = classifier.predict(cv.transform(corpus).toarray())[0]
print(myReview3, "  ", review3_label)

['food good']
Food is not good!!    1


In [162]:
# Check whether 'not' is part of NLTK's English stop-word list.
print('not' in stopwords.words('english'))


True


In the scenario above, the word 'not' is removed because it is treated as a stop word during cleaning, which leads the model to classify the sentence as positive.

In [158]:
# Rows whose raw review text contains the case-sensitive substring 'not good'.
has_not_good = df['Review'].str.contains('not good')
data = df[has_not_good]
len(data)

5

We don't have sufficient data to train the model on "not good" scenarios (only 5 of the 1,000 reviews contain the phrase). To mitigate the issue,


Negation handling: modify the text preprocessing to handle negations more effectively. For example, replace phrases like "not good" with "bad" during the cleaning process.

In [181]:
def handle_negations(text):
    """Clean a review like ``clean_text`` but rewrite known negation phrases first.

    The text is reduced to lower-case letters and spaces, then each phrase in
    the negation map (e.g. "not good") is replaced by a single word carrying
    the negated meaning (e.g. "bad"). This happens before stop-word removal,
    which would otherwise drop "not". The result is tokenized, filtered for
    English stop words, Porter-stemmed and joined with single spaces.

    Only the phrases listed in the map are handled; any other negation is
    still lost when stop words are removed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review. It is also appended to the global ``corpus`` list,
        which is what the calling cells read.
    """
    review = re.sub('[^a-zA-Z]', ' ', text).lower()

    # Negation phrases and their replacements. The text is already lower-case,
    # so no case-insensitive flag is needed when matching.
    negations = {
        r'\bnot good\b': 'bad',
        r'\bnot great\b': 'bad',
        r'\bnot satisfied\b': 'unsatisfied',
        r'\bnot happy\b': 'unhappy',
        r'\bnot like\b': 'dislike',
    }
    for negation, replacement in negations.items():
        review = re.sub(negation, replacement, review)

    # Load the stop-word list once per call instead of once per token.
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(review)
    review = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)

    corpus.append(review)
    return review
# Re-run the earlier negated example through the negation-aware cleaner.
corpus = []
handle_negations(myReview3)
print(myReview3, "--->  ", corpus)
negated_label = classifier.predict(cv.transform(corpus).toarray())[0]
print("Prediction: ", myReview3, "  ", negated_label)

Food is not good!! --->   ['food bad']
Prediction:  Food is not good!!    0


In [173]:
myReview4 = "I am not satisfied with the food"
print(myReview4)

# Classify the same review twice: once with the plain cleaner and once with
# the negation-aware one. The shared buffer is reset before each run so it
# holds a single cleaned review.
for stage_label, cleaner in (
    ("before handling: ", clean_text),
    ("after handling: ", handle_negations),
):
    corpus = []
    cleaner(myReview4)
    print(stage_label, corpus, ' ', classifier.predict(cv.transform(corpus).toarray())[0])

I am not satisfied with the food
before handling:  ['satisfi food']   1
after handling:  ['unsatisfi food']   0


The 'not' scenario above is now handled correctly for the phrases listed in the negation map.