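## Business Problem

Predict from the text of a restaurant review whether the customer liked the restaurant (`Liked` = 1) or not (`Liked` = 0).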

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
# The file is tab-separated. quoting=3 is csv.QUOTE_NONE, so quote characters
# inside a review are kept as ordinary text rather than parsed as field quoting.
data = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [5]:
# Preview the first rows to confirm the file parsed into review text plus a label column.
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [6]:
# Preview the last rows as well, to check that the end of the file was read cleanly.
data.tail()

Unnamed: 0,Review,Liked
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [9]:
# Class balance check: count how many reviews carry each label.
data['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

## Cleaning Text Data

In [11]:
import nltk 
import re

In [12]:
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') is used
# during text cleaning to drop stop words from each review.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


In [16]:
# Turn every raw review into a cleaned, stemmed, space-joined string and
# collect the results in `corpus` (one entry per review, in row order).
ps = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words('english') per token
# rebuilds the word list each time and then scans it linearly; a set gives
# constant-time membership tests with identical filtering results.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly so the loop does not depend on the
# DataFrame having a default 0..n-1 index.
for raw_review in data['Review']:
  # Replace everything that is not an ASCII letter with a space. The upper-case
  # range must be 'A-Z': 'A-z' also covers the characters [ \ ] ^ _ ` that lie
  # between 'Z' and 'a' in ASCII, which would let them leak into the tokens.
  review = re.sub('[^a-zA-Z]', ' ', raw_review)
  review = review.lower()
  review = review.split()
  # Drop stop words, then reduce each remaining token to its Porter stem.
  review = [ps.stem(word) for word in review if word not in english_stopwords]
  review = " ".join(review)

  corpus.append(review)

## Bag-of-Words Model

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
# Bag-of-words features: the vocabulary is capped at 2000 terms.
cv = CountVectorizer(max_features=2000)

# Learn the vocabulary from the cleaned corpus and count term occurrences per
# review, then convert the result to a dense array for the classifier.
term_counts = cv.fit_transform(corpus)
x = term_counts.toarray()

# (number of reviews, number of vocabulary terms)
x.shape

(1000, 1565)

In [30]:
# Target vector taken from the 'Liked' label column. Selecting the column by
# name (as the value_counts() check earlier does) keeps this correct even if
# the column order of the input file changes, unlike a positional iloc lookup.
y = data['Liked'].values

## Apply Naive Bayes Algorithm

In [31]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; the fixed random_state makes the
# shuffled split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [32]:
# Sanity check: with test_size=0.2 the training split should hold 80% of the rows.
X_train.shape

(800, 1565)

In [33]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian Naive Bayes model on the training split; fit() returns the
# estimator itself, so construction and training can be chained.
# NOTE(review): the features are term counts; MultinomialNB may suit them
# better than a Gaussian likelihood. Worth comparing before relying on this model.
classifier = GaussianNB().fit(X_train, y_train)
classifier

GaussianNB(priors=None, var_smoothing=1e-09)

In [34]:
# Predict a label for every review in the held-out test split.
y_pred = classifier.predict(X_test)

In [35]:
from sklearn.metrics import accuracy_score

In [36]:
# Share of test reviews whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.73