In [226]:
import numpy as np
import pandas as pd

In [227]:
reviews = pd.read_csv("Restaurant_Reviews.tsv", delimiter= "\t")
reviews.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [228]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [212]:
import re

In [213]:
#Removing punctuation and numbers
reviews = re.sub("[^A-Za-z]", " ", reviews["Review"][0])
reviews

'Wow    Loved this place '

In [214]:
#turning all chracters into small letters
reviews = reviews.lower()
reviews

'wow    loved this place '

In [215]:
#splitting each word into a cell
reviews = reviews.split()
reviews

['wow', 'loved', 'this', 'place']

In [216]:
"""
removing stop words, words like "this", "is", "and", 'i', 'me', 'my', 'myself', 'we', 'our', 'ours' etc 
that do not contribute to the reviews and turning all words into present tense
"""
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
reviews = [ps.stem(word) for word in reviews if word not in stopwords.words("english")]
reviews

['wow', 'love', 'place']

In [217]:
#turninng each word back into a cell
reviews = " ".join(reviews)
reviews

'wow love place'

In [229]:
#Using the for loop to go through all rows in the data set
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
clean_text = []
for i in range(0, 1000):
    corpus = re.sub("[^A-Za-z]", " ", reviews["Review"][i])
    corpus = corpus.lower().split()
    corpus = [ps.stem(word) for word in corpus if word not in stopwords.words("english")]
    corpus = " ".join(corpus)
    clean_text.append(corpus)

In [235]:
#creating sparse matrix
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)

In [247]:
X = cv.fit_transform(clean_text).toarray()
y = reviews["Liked"]

In [256]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [257]:
#splitting data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [258]:
#Using the logistic regression model to classify as a good or bad review
model = LogisticRegression()

In [259]:
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [260]:
pred = model.predict(X_test)

In [261]:
print(confusion_matrix(y_test, pred))
print("\n")
print(classification_report(y_test, pred))

[[80 16]
 [36 68]]


              precision    recall  f1-score   support

           0       0.69      0.83      0.75        96
           1       0.81      0.65      0.72       104

    accuracy                           0.74       200
   macro avg       0.75      0.74      0.74       200
weighted avg       0.75      0.74      0.74       200



In [None]:
#Using the random forest

In [276]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=500)

In [277]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [278]:
pred2 = rfc.predict(X_test)

In [279]:
print(confusion_matrix(y_test, pred2))
print("\n")
print(classification_report(y_test, pred2))

[[83 13]
 [46 58]]


              precision    recall  f1-score   support

           0       0.64      0.86      0.74        96
           1       0.82      0.56      0.66       104

    accuracy                           0.70       200
   macro avg       0.73      0.71      0.70       200
weighted avg       0.73      0.70      0.70       200



In [281]:
#Using the support vector machine 
from sklearn.svm import SVC
svc = SVC()

In [282]:
svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [283]:
pred3 = svc.predict(X_test)

In [284]:
print(confusion_matrix(y_test, pred3))
print("\n")
print(classification_report(y_test, pred3))

[[86 10]
 [49 55]]


              precision    recall  f1-score   support

           0       0.64      0.90      0.74        96
           1       0.85      0.53      0.65       104

    accuracy                           0.70       200
   macro avg       0.74      0.71      0.70       200
weighted avg       0.75      0.70      0.70       200



In [293]:
#Using Grid Search to find best parameters for the support vector machine
from sklearn.model_selection import GridSearchCV

In [294]:
param_grid = {"C" : [0.1,1,10,100,1000], "gamma" : [1, 0.1, 0.01, 0.001,0.0001]}

In [295]:
gs = GridSearchCV(SVC(), param_grid)

In [296]:
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [297]:
pred5 = gs.predict(X_test)

In [298]:
print(confusion_matrix(y_test, pred5))
print("\n")
print(classification_report(y_test, pred5))

[[85 11]
 [42 62]]


              precision    recall  f1-score   support

           0       0.67      0.89      0.76        96
           1       0.85      0.60      0.70       104

    accuracy                           0.73       200
   macro avg       0.76      0.74      0.73       200
weighted avg       0.76      0.73      0.73       200

