## Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Import dataset

In [2]:
dataset=pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

## Cleaning the texts

In [3]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus=[]

all_stopwords=stopwords.words('english')
word_to_remove=['not', 'awfully', 'best','better','like','liked','reasonably','unfortunately','useful',  "don't", "wasn't", "shouldn't"]
# word_to_remove=[]
new_stopwords=[element for element in all_stopwords if element not in word_to_remove]
# all_stopwords.remove('not')
    
    
for i in range (0,1000):
    review=re.sub('[^a-zA-Z]',' ', dataset['Review'][i]) #remove all thing (e.g punctuation) except letter/alphabet
    review=review.lower() #change to lowercase
    review=review.split() #prepare for stemming
    ps=PorterStemmer()
    
    review=[ps.stem(word) for word in review if not word in set (new_stopwords)]
    review=' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating Bag of Words model

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=1500)
x=cv.fit_transform(corpus).toarray()
y=dataset.iloc[:,-1].values


In [5]:
print(x)
print(len(x[0]))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
1500


## Splitting dataset into training & test set

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.2, random_state=0)

## Training the Naives Bayes model on the training set

In [7]:
from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.linear_model import LogisticRegression
classifierLR=LogisticRegression(random_state=0)
classifierLR.fit(x_train, y_train)
weightLR=accuracy_score(y_test, classifierLR.predict(x_test))

from sklearn.neighbors import KNeighborsClassifier
classifierKNN = KNeighborsClassifier(n_neighbors=5, metric='minkowski',p =2)
classifierKNN.fit(x_train, y_train)
weightKNN=accuracy_score(y_test, classifierKNN.predict(x_test))

from sklearn.naive_bayes import GaussianNB
classifierNB=GaussianNB()
classifierNB.fit(x_train,y_train)
weightNB=accuracy_score(y_test, classifierNB.predict(x_test))

from sklearn.ensemble import RandomForestClassifier
classifierRF = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)
classifierRF.fit(x_train, y_train)
weightRF=accuracy_score(y_test, classifierRF.predict(x_test))

from sklearn.svm import SVC
classifierSVM=SVC(kernel='rbf',  random_state=0)
classifierSVM.fit(x_train, y_train)
weightSVM=accuracy_score(y_test, classifierSVM.predict(x_test))

In [8]:

weightAll = weightKNN + weightLR + weightNB + weightRF + weightSVM 
tresshold = .6
y_pred = 1 * (weightNB * classifierNB.predict(x_test) + 
              weightRF * classifierRF.predict(x_test) + 
              weightLR * classifierLR.predict(x_test) + 
              weightKNN * classifierKNN.predict(x_test) + 
              weightSVM * classifierSVM.predict(x_test) > tresshold * weightAll)

## Predicting the Test set results

In [9]:
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 1]
 [0 0]
 [1 1]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]

## Making the confusion matrix

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm=confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[89  8]
 [35 68]]


0.785