In [2]:
# Importing necessary Libraries

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Importing Dataset

In [5]:
# Load the tab-separated reviews file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are treated as ordinary text rather than as
# field quoting.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [6]:
# Bare last expression: the notebook renders the DataFrame as the cell output
# (the middle rows of a long frame are elided in that display).
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [7]:
# Cleaning the texts

In [8]:
import re
import nltk
# Side effect: fetches the NLTK 'stopwords' corpus into the local nltk_data
# directory; the corpus is read later via stopwords.words('english').
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anshuman.kundu1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Build the cleaned corpus: one normalised, stemmed string per review.
#
# 'not' is removed from the stop-word list so it survives the filtering step
# (presumably because negation flips the sentiment of a review).
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
# Build the lookup set once; it is loop-invariant, and set membership is O(1).
stopword_set = set(all_stopwords)
ps = PorterStemmer()
corpus = []
# Iterate over every review actually present instead of a hard-coded row
# count, so corpus always has exactly one entry per dataset row.
for raw_review in dataset['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)  # anything that is not an ASCII letter becomes a space
    review = review.lower()
    review = review.split()
    # Drop stop words, then reduce each remaining word to its Porter stem.
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)

In [21]:
# Creating the bag of words model

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary is capped at 1500 terms.
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the
# train/test split that follows, so test-set reviews contribute to it.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()        # dense document-term matrix
y = dataset.iloc[:, -1].values   # last column of the frame holds the labels

In [24]:
# Splitting the dataset into training and test sets

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [26]:
# Classification Model Evaluation

In [28]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# Maps a model's display name to its accuracy on the held-out test split;
# filled in by the per-model cells that follow.
model_accuracies = dict()

In [29]:
# Logistic Regression: train on the training split, score on the test split.
logistic_model = LogisticRegression().fit(X_train, y_train)  # fit() returns the fitted estimator
logistic_predictions = logistic_model.predict(X_test)
logistic_accuracy = accuracy_score(y_true=y_test, y_pred=logistic_predictions)
model_accuracies['Logistic Regression'] = logistic_accuracy

In [30]:
# Decision Tree: train on the training split, score on the test split.
# random_state is pinned (same value as the train/test split) because an
# unseeded tree can give a different accuracy on every run, which makes the
# model comparison below non-reproducible.
tree_model = DecisionTreeClassifier(random_state=0)
tree_model.fit(X_train, y_train)
tree_predictions = tree_model.predict(X_test)
tree_accuracy = accuracy_score(y_test, tree_predictions)
model_accuracies['Decision Tree'] = tree_accuracy

In [31]:
# Random Forest: train on the training split, score on the test split.
# random_state is pinned (same value as the train/test split) because the
# forest's bootstrap and feature sampling are random; without a seed its
# accuracy, and potentially the "best model" choice, changes between runs.
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
model_accuracies['Random Forest'] = rf_accuracy

In [32]:
# Support Vector Machine (SVM): train on the training split, score on the
# test split. svm_model is the object pickled at the end of the notebook.
svm_model = SVC().fit(X_train, y_train)  # fit() returns the fitted estimator
svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_predictions)
model_accuracies['SVM'] = svm_accuracy

In [33]:
# k-Nearest Neighbors (KNN): train on the training split, score on the test
# split, using the estimator's default settings.
knn_model = KNeighborsClassifier().fit(X_train, y_train)  # fit() returns the fitted estimator
knn_predictions = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_true=y_test, y_pred=knn_predictions)
model_accuracies['KNN'] = knn_accuracy

In [34]:
# Naive Bayes (Gaussian): train on the training split, score on the test split.
nb_model = GaussianNB().fit(X_train, y_train)  # fit() returns the fitted estimator
nb_predictions = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_true=y_test, y_pred=nb_predictions)
model_accuracies['Naive Bayes'] = nb_accuracy

In [35]:
# Report every model's test accuracy, rounded to two decimals, in the order
# the models were evaluated.
for model in model_accuracies:
    accuracy = model_accuracies[model]
    print(f'{model} Accuracy: {accuracy:.2f}')

Logistic Regression Accuracy: 0.78
Decision Tree Accuracy: 0.71
Random Forest Accuracy: 0.78
SVM Accuracy: 0.78
KNN Accuracy: 0.65
Naive Bayes Accuracy: 0.73


In [36]:
# Select the name with the highest stored accuracy. The comparison uses the
# unrounded values, so models that print the same two-decimal figure are not
# necessarily tied; on an exact tie max() keeps the first name it encounters.
best_model = max(model_accuracies, key=lambda name: model_accuracies[name])
print(f'The best model is: {best_model} with an accuracy of {model_accuracies[best_model]:.2f}')

The best model is: SVM with an accuracy of 0.78


In [37]:
# Store the SVM classifier in a pickle file.
# NOTE(review): svm_model is hard-coded here rather than looked up from
# best_model, so this only saves "the best model" when the SVM wins.
# NOTE(review): the fitted CountVectorizer (cv) is not saved in this cell;
# confirm it is persisted elsewhere if the model is to score new text.
import pickle

svm_model_file = 'svm_model_file.pkl'
with open(svm_model_file, mode='wb') as model_out:
    pickle.dump(svm_model, model_out)