## Import libraries

In [185]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('stopwords')  
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load dataset

In [186]:
# Load the tab-separated reviews file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the review text are read as ordinary characters.
dataset = pd.read_csv(
    "Restaurant_Reviews.tsv",
    delimiter='\t',
    quoting=3,
)

In [187]:
# Preview the first rows of the loaded reviews.
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Cleaning data

## Example on one row

In [188]:
# Show the raw text of the first review; the cleaning steps below are demonstrated on it.
dataset['Review'][0]

'Wow... Loved this place.'

### 1- Keep only letters (upper- and lowercase) and replace every other character with a space

In [189]:
# Every character that is not an ASCII letter (punctuation, digits, ...) becomes
# a single space, so runs of punctuation turn into runs of spaces.
review = re.compile('[^a-zA-Z]').sub(' ', dataset['Review'][0])
review

'Wow    Loved this place '

### 2- convert letters to lowercase 

In [190]:
# Lowercase the text so that words differing only by case become identical.
review = review.lower()
review

'wow    loved this place '

### 3- Split the review into a list of words, so it can later be turned into the bag-of-words matrix

In [191]:
# Split on whitespace into a list of words; repeated spaces produce no empty items.
review = review.split()

In [192]:
# Show the resulting list of words.
review

['wow', 'loved', 'this', 'place']

### 4- Drop the unwanted words (English stop words), then stem the remaining words

In [193]:
# Porter stemmer used to reduce each word to its stem (e.g. "loved" -> "love").
ps = PorterStemmer()

In [194]:
# Build the stop-word set once (it does not change between words), then drop
# the stop words and reduce every remaining word to its Porter stem.
english_stop_words = set(stopwords.words('english'))
review = [ps.stem(word) for word in review if word not in english_stop_words]

### 5- Join the words back into a single string

In [195]:
# Join with a single space so the stems stay separate words; an empty separator
# would fuse them into one token (e.g. "wowloveplace").
review = " ".join(review)

## Loop to apply the above steps to all rows

In [196]:
# Will hold one cleaned review string per processed dataset row.
corpus = []

In [197]:
# The stemmer and the stop-word set do not change between rows, so build them once.
ps = PorterStemmer()
english_stop_words = set(stopwords.words('english'))


def clean_review(text):
    """Return `text` as a space-separated string of lowercase word stems.

    Steps: replace every non-letter character with a space, lowercase, split
    on whitespace, drop English stop words, Porter-stem what remains, and
    re-join the stems with single spaces.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in english_stop_words)


# Clean every review in the dataset rather than a hard-coded 1000 rows, so the
# corpus always has exactly one entry per row (and therefore per label).
# Rebinding `corpus` (instead of appending) keeps this cell safe to re-run.
# NOTE: the stop-word filter also removes negations such as "not", so
# "Crust is not good." is reduced to "crust good".
corpus = [clean_review(text) for text in dataset['Review']]

In [198]:
# Preview the first cleaned reviews. A bare last expression lets the notebook
# render the DataFrame with its rich display instead of plain printed text.
pd.DataFrame(corpus).head()

                                                   0
0                                     wow love place
1                                         crust good
2                                 tasti textur nasti
3  stop late may bank holiday rick steve recommen...
4                            select menu great price


## Creating the Bag of Words model

Convert the cleaned reviews into numeric word-count vectors.

In [199]:
# Bag-of-words vectorizer; max_features caps the vocabulary at the 1500 most
# frequent terms across the corpus.
cv = CountVectorizer( max_features= 1500) 

In [200]:
# Learn the vocabulary from the corpus and build the count matrix (one row per
# review), converted to a dense array.
X = cv.fit_transform(corpus).toarray()

## Data Preprocessing

In [201]:
# Labels: the column at position 1 (`Liked` in the preview above), as a NumPy array.
y = dataset.iloc[:, 1].values

In [202]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Apply Random Forest Classification

In [203]:
# Random forest of 10 trees using the entropy split criterion; the fixed seed
# keeps the fitted model repeatable across runs.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [204]:
# Predict labels for the held-out test rows with the random forest.
y_pred = classifier.predict(X_test)

In [205]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[87, 10],
       [46, 57]], dtype=int64)

In [206]:
# Mean accuracy of the random forest on the test set.
classifier.score(X_test,y_test)

0.72

### Apply Decision Tree Classification

In [207]:
# Single decision tree with default settings; the fixed seed keeps it repeatable.
# Note: this rebinds `classifier`, replacing the previously fitted model.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X=X_train, y=y_train)

DecisionTreeClassifier(random_state=0)

In [208]:
# Predict labels for the held-out test rows with the decision tree.
y_pred = classifier.predict(X_test)

In [209]:
# Confusion matrix for the decision tree (rows: true labels, columns: predictions).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[71, 26],
       [44, 59]], dtype=int64)

In [210]:
# Mean accuracy of the decision tree on the test set.
classifier.score(X_test,y_test)

0.65

### Apply Naive Bayes Classification

In [211]:
# Gaussian Naive Bayes with default settings, fitted on the dense count matrix.
# Note: this rebinds `classifier`, replacing the previously fitted model.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

In [212]:
# Predict labels for the held-out test rows with Naive Bayes.
y_pred = classifier.predict(X_test)

In [213]:
# Confusion matrix for Naive Bayes (rows: true labels, columns: predictions).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[55, 42],
       [12, 91]], dtype=int64)

In [214]:
# Mean accuracy of Naive Bayes on the test set.
classifier.score(X_test,y_test)

0.73