In [1]:
# Import basics packages

import os 
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting = 3)
data.head(7)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0


In [3]:
data['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [4]:
# Cleaning Text Data

import nltk
import re

nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
data['Review'][0]

'Wow... Loved this place.'

In [6]:
review = re.sub('[^a-zA-Z]', ' ', data['Review'][0])

In [7]:
review

'Wow    Loved this place '

In [8]:
review = review.lower()

In [9]:
review

'wow    loved this place '

In [10]:
review = review.split()

In [11]:
review

['wow', 'loved', 'this', 'place']

In [12]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [13]:
preview = []

for word in review:
    if word not in stopwords.words('english'):
        preview.append(word)

In [14]:
preview

['wow', 'loved', 'place']

In [15]:
review = [word for word in review if word not in stopwords.words('english')]
review

['wow', 'loved', 'place']

In [16]:
# stemming 

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

review = [ps.stem(word) for word in review]
review

['wow', 'love', 'place']

In [17]:
# Join
review = ' '.join(review)
review

'wow love place'

In [18]:
# gather all data together

corpus = []

for i in range (len(data)):
    print (i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [19]:

corpus = []

for i in range (len(data)):
    review = re.sub('[^a-zA-Z]', ' ', data['Review'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    
    corpus.append(review)
    
print (corpus)

['wow love place', 'crust good', 'tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place worth time let alon vega', 'like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid could happier', 'seem like go

# Bag of word model

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 1500)

x = cv.fit_transform(corpus).toarray()

x.shape

(1000, 1500)

In [21]:
y = data.iloc[:, 1].values
y.shape, y[:10]

((1000,), array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1], dtype=int64))

In [22]:
# Splitting the data into train and test for model building

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size= 0.75, random_state = 1150)

print (x_train.shape, x_test.shape)
print (y_train.shape, y_test.shape)

(750, 1500) (250, 1500)
(750,) (250,)


# Naive Bayes Theorem

In [23]:
from sklearn.naive_bayes import GaussianNB
classfier = GaussianNB()
classfier.fit(x_train, y_train)

In [24]:
# Predict

y_pred_train = classfier.predict(x_train)
y_pred_test = classfier.predict(x_test)

In [25]:
# Evaluaton - performance matrix

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [26]:
print (confusion_matrix(y_train, y_pred_train))
print ('#####'*50)
print (confusion_matrix(y_test, y_pred_test))

[[313  56]
 [  0 381]]
##########################################################################################################################################################################################################################################################
[[ 70  61]
 [ 16 103]]


In [27]:
print (classification_report(y_train, y_pred_train))
print ('#####'*50)
print (classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92       369
           1       0.87      1.00      0.93       381

    accuracy                           0.93       750
   macro avg       0.94      0.92      0.92       750
weighted avg       0.93      0.93      0.92       750

##########################################################################################################################################################################################################################################################
              precision    recall  f1-score   support

           0       0.81      0.53      0.65       131
           1       0.63      0.87      0.73       119

    accuracy                           0.69       250
   macro avg       0.72      0.70      0.69       250
weighted avg       0.73      0.69      0.68       250



In [28]:
print (accuracy_score(y_train, y_pred_train))
print ('#####'*50)
print (accuracy_score(y_test, y_pred_test))

0.9253333333333333
##########################################################################################################################################################################################################################################################
0.692


In [31]:
# cross validation (k-fold method)

from sklearn.model_selection import cross_val_score
accuracy = cross_val_score(classfier, x_test, y_test, cv = 10)
accuracy[7]

0.8