In [2]:
# Importing the libraries
import numpy as np
import pandas as pd

In [3]:
# Importing the dataset
import csv

# NOTE(review): absolute, machine-specific location - point this at your own
# copy of Restaurant_Reviews.tsv before running the notebook elsewhere.
DATA_PATH = r"C:\Users\Gautam\Downloads\ibm\Restaurant_Reviews.tsv"

# The file is tab-separated. csv.QUOTE_NONE (the named form of the former
# magic number 3) makes the parser treat quote characters inside the review
# text as ordinary characters instead of field quoting.
dataset = pd.read_csv(DATA_PATH, delimiter='\t', quoting=csv.QUOTE_NONE)

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stop-word set once. The set used to be rebuilt
# for every single word and the stemmer for every review, which is pure
# repeated work with no effect on the result.
# NOTE(review): the NLTK stop-word corpus must already be available locally;
# NLTK's English list also contains negations such as "not", so they are
# dropped from the reviews here - confirm that is intended for sentiment work.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

# Clean every review in the dataset (previously hard-coded to the first 1000
# rows, which would leave `corpus` out of step with the labels on any other
# dataset size).
corpus = []
for raw_review in dataset['Review']:
    # Keep letters only, lower-case, and tokenise on whitespace.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()
    review = review.split()
    # Drop stop words, then reduce each remaining token to its Porter stem.
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    # Store the cleaned review as a single space-separated string.
    review = ' '.join(review)
    corpus.append(review)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser whose vocabulary is capped at 1500 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=1500)

# Learn the vocabulary and IDF weights from the cleaned reviews, then expand
# the resulting sparse matrix into a dense array for the classifiers below.
# NOTE(review): the vectoriser is fitted on the full corpus before the
# train/test split, so the IDF weights are computed with the test reviews
# included - consider fitting on the training reviews only.
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
X = tfidf_matrix.toarray()

# Target vector: the second column of the dataset.
y = dataset.iloc[:, 1].values


In [6]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Multinomial NB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB

# Train on the training split and predict labels for the held-out reviews
# (fit() returns the fitted estimator, so the calls can be chained).
classifier = MultinomialNB(alpha=0.1)
y_pred = classifier.fit(X_train, y_train).predict(X_test)

# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Accuracy, precision and recall on the test split.
score1 = accuracy_score(y_test, y_pred)
score2 = precision_score(y_test, y_pred)
score3 = recall_score(y_test, y_pred)

print("\n")
print("Accuracy is ", round(score1 * 100, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))



Confusion Matrix:
 [[114  38]
 [ 40 108]]


Accuracy is  74.0 %
Precision is  0.74
Recall is  0.73


In [8]:
# Bernoulli NB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.naive_bayes import BernoulliNB

# Fit Bernoulli Naive Bayes on the training split and predict the test split.
# With its default binarize=0.0 threshold the model only looks at whether a
# term's TF-IDF weight is above zero, i.e. whether the term is present.
classifier = BernoulliNB(alpha=0.8)
y_pred = classifier.fit(X_train, y_train).predict(X_test)

# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Accuracy, precision and recall on the test split.
score1 = accuracy_score(y_test, y_pred)
score2 = precision_score(y_test, y_pred)
score3 = recall_score(y_test, y_pred)

print("\n")
print("Accuracy is ", round(score1 * 100, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))

Confusion Matrix:
 [[115  37]
 [ 32 116]]


Accuracy is  77.0 %
Precision is  0.76
Recall is  0.78


In [9]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
# Import the metrics here as well, so this cell no longer depends on an
# earlier model cell having been run first.
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Create a RandomForestClassifier instance (100 trees, fixed seed so the
# forest is the same on every run)
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model to the training data
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Accuracy, precision and recall on the test split
score1 = accuracy_score(y_test, y_pred)
score2 = precision_score(y_test, y_pred)
score3 = recall_score(y_test, y_pred)

print("\n")
print("Accuracy is ", round(score1 * 100, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))



Accuracy is  74.33 %
Precision is  0.83
Recall is  0.61


In [10]:
#xgboost
import xgboost as xgb
# Import the metrics here as well, so this cell no longer depends on an
# earlier model cell having been run first.
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Create an XGBoost classifier instance (library default hyper-parameters)
classifier = xgb.XGBClassifier()

# Fit the model to the training data
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Accuracy, precision and recall on the test split
score1 = accuracy_score(y_test, y_pred)
score2 = precision_score(y_test, y_pred)
score3 = recall_score(y_test, y_pred)

print("\n")
print("Accuracy is ", round(score1 * 100, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))




Accuracy is  73.33 %
Precision is  0.79
Recall is  0.62


In [20]:
#svm
from sklearn.svm import SVC
# Import the metrics here as well, so this cell no longer depends on an
# earlier model cell having been run first (the recorded execution counts
# show the cells were run out of order).
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Create an SVM classifier instance with a linear kernel
classifier = SVC(kernel='linear', C=1.0, random_state=42)

# Fit the model to the training data
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Accuracy, precision and recall on the test split
score1 = accuracy_score(y_test, y_pred)
score2 = precision_score(y_test, y_pred)
score3 = recall_score(y_test, y_pred)

print("\n")
print("Accuracy is ", round(score1 * 100, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))



Accuracy is  76.67 %
Precision is  0.8
Recall is  0.7


In [14]:
# Logistic Regression
from sklearn import linear_model
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

# Fit logistic regression (C=1.5) on the training split and predict the
# labels of the held-out reviews.
classifier = linear_model.LogisticRegression(C=1.5)
y_pred = classifier.fit(X_train, y_train).predict(X_test)

# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Accuracy, precision and recall on the test split.
score1 = accuracy_score(y_test, y_pred)
score2 = precision_score(y_test, y_pred)
score3 = recall_score(y_test, y_pred)

print("\n")
print("Accuracy is ", round(score1 * 100, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))

Confusion Matrix:
 [[125  27]
 [ 43 105]]


Accuracy is  76.67 %
Precision is  0.8
Recall is  0.71


### Analysis and Conclusion

In this study, an attempt has been made to classify the sentiment of restaurant reviews using machine learning techniques. Six algorithms are implemented: Multinomial Naive Bayes, Bernoulli Naive Bayes, Random Forest, XGBoost, SVM and Logistic Regression.

Evaluation metrics used here are accuracy, precision and recall.

Using Multinomial Naive Bayes,

* Accuracy of prediction is 74.0%.
* Precision of prediction is 0.74.
* Recall of prediction is 0.73.

Using Bernoulli Naive Bayes,

* Accuracy of prediction is 77.0%.
* Precision of prediction is 0.76.
* Recall of prediction is 0.78.

Using Logistic Regression,

* Accuracy of prediction is 76.67%.
* Precision of prediction is 0.8.
* Recall of prediction is 0.71.

Using Random Forest,

* Accuracy of prediction is 74.33%.
* Precision of prediction is 0.83.
* Recall of prediction is 0.61.

Using xgboost,

* Accuracy of prediction is 73.33%.
* Precision of prediction is 0.79.
* Recall of prediction is 0.62.

Using SVM,

* Accuracy of prediction is 76.67%.
* Precision of prediction is 0.8.
* Recall of prediction is 0.7.

From the above results, Bernoulli Naive Bayes is a slightly better method compared to Multinomial Naive Bayes, Random Forest, SVM, XGBoost and Logistic Regression, with 77.0% accuracy, which means the model built to predict the sentiment of a restaurant review gives the right prediction for 77.0% of the test reviews.