# Restaurant Review prediction

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [8]:
data = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)

In [9]:
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [10]:
data.tail()

Unnamed: 0,Review,Liked
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [11]:
data['Liked'].value_counts()

0    500
1    500
Name: Liked, dtype: int64

In [12]:
#cleaning the text data
import nltk
import re

In [13]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
from nltk.corpus import stopwords

In [15]:
data['Review'][0]

'Wow... Loved this place.'

In [18]:
review = re.sub('[^a-zA-Z]', ' ', data['Review'][0])

In [19]:
review

'Wow    Loved this place '

In [20]:
review.lower()

'wow    loved this place '

In [21]:
review = review.split()

In [22]:
review

['Wow', 'Loved', 'this', 'place']

In [23]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [24]:
preview = []
for word in review:
    if word not in stopwords.words('english'):
        preview.append(word)

In [25]:
preview

['Wow', 'Loved', 'place']

In [33]:
review = [word.lower() for word in review if word not in stopwords.words('english')]

In [34]:
review

['wow', 'loved', 'place']

In [35]:
from nltk.stem.porter import PorterStemmer

In [36]:
ps = PorterStemmer()

In [37]:
review = [ps.stem(word) for word in review]

In [38]:
review

['wow', 'love', 'place']

In [39]:
review = " ".join(review)

In [41]:
print(review)

wow love place


In [43]:
#preprocessing for the whole text of review

corpus =[]
ps = PorterStemmer()
for i in range(len(data)):
    review = re.sub('[^a-zA-Z]', ' ', data['Review'][i])
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = " ".join(review)
    corpus.append(review)
    

In [44]:
print(corpus)

['wow love place', 'crust good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'the select menu great price', 'now i get angri i want damn pho', 'honeslti tast that fresh', 'the potato like rubber could tell made ahead time kept warmer', 'the fri great', 'a great touch', 'servic prompt', 'would go back', 'the cashier care ever i say still end wayyy overpr', 'i tri cape cod ravoli chicken cranberri mmmm', 'i disgust i pretti sure human hair', 'i shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'thi place worth time let alon vega', 'like', 'the burritto blah', 'the food amaz', 'servic also cute', 'i could care less the interior beauti', 'so perform', 'that right red velvet cake ohhh stuff good', 'they never brought salad ask', 'thi hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm our sever run around like total overwhelm', 'the worst salmon sashimi', 'also combo like burger fr

In [45]:
#words cannot be fed to the model therefore we will convert into number using algorithm
#bag of word model

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer( max_features= 1500)


In [46]:
x = cv.fit_transform(corpus).toarray()

In [51]:
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [48]:
y = data.iloc[:,1].values

In [49]:
y.shape

(1000,)

In [50]:
y[:10]

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1], dtype=int64)

In [52]:
#train and test data split

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

In [53]:
x_train.shape, x_test.shape

((800, 1500), (200, 1500))

In [54]:
y_train.shape, y_test.shape

((800,), (200,))

In [55]:
from sklearn.naive_bayes import GaussianNB

In [56]:
classifier = GaussianNB()

In [57]:
classifier.fit(x_train, y_train)

GaussianNB()

In [58]:
y_pred = classifier.predict(x_test)

In [59]:
y_pred

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1], dtype=int64)

In [60]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [61]:
accuracy_score(y_test, y_pred)

0.735

In [63]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.57      0.67        97
           1       0.69      0.89      0.78       103

    accuracy                           0.73       200
   macro avg       0.76      0.73      0.73       200
weighted avg       0.76      0.73      0.73       200



In [64]:
confusion_matrix(y_test, y_pred)

array([[55, 42],
       [11, 92]], dtype=int64)