# Sentiment Analysis of Resturant Reviews
Sentiment analysis of customer reviews has a crucial impact on a business's development strategy. because if our customers is not satisfied with our services, so may be we are doing something wrong. we need to listen what customer wants and improve our services based on their feedback. So, why not turn all that feedback into insights and learn how to improve both the customer experience and your business?

***The purpose of this analysis is to build a prediction model to predict whether a review on the restaurant given by customers is positive or negative.***

# Importing libraries

In [119]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Loading the raw data

In [120]:
data = pd.read_csv('Restaurant_Reviews.tsv',sep='\t')

In [121]:
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [122]:
# Information of the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


# Data Cleaning and Preprocessing

In [123]:
# Importing essential libraries for performing Natural Language Processing
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [143]:
# we are removing the words from the stop words list: 'no', 'nor', 'not',isn't,"doesn't", "won't"
stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn','hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn','ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won','wouldn', "wouldn't"])

In [144]:
# Cleaning the reviews
corpus = []
for i in range(len(data)):
    # Cleaning special character from the reviews
    review = re.sub('[^a-zA-Z]',' ',data['Review'][i])
    
    # Converting the entire review into lower case
    review = review.lower()
    
    # Tokenizing the review by words
    review = review.split()
    
    # Removing the stop words and Stemming the words
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if word not in stopwords]
    
    # Joining the stemmed words
    review = ' '.join(review)
    
    # Creating a corpus
    corpus.append(review)

In [145]:
corpus[:10]

['wow love place',
 'crust not good',
 'not tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

In [146]:
# Creating the Bag of Words using CountVectorizer (we can also use tf-idf to convertinf text into vector)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus).toarray()
y = data['Liked'].values

# Model Building

In [147]:
# split the data set into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test  = train_test_split(x, y, test_size=0.2, random_state=1)

# split the train data set into cross validation train and cross validation test
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1)

In [148]:
# Training model using Naive bayes classifier
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train, y_train)

MultinomialNB()

In [149]:
# Model Summary
y_pred = model.predict(x_test)

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
print("Model Accuracy :",accuracy_score(y_test,y_pred))

Model Accuracy : 0.755


In [150]:
# Confusion matix
confusion_matrix(y_test,y_pred)

array([[81, 27],
       [22, 70]], dtype=int64)

In [151]:
# Classification Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.75      0.77       108
           1       0.72      0.76      0.74        92

    accuracy                           0.76       200
   macro avg       0.75      0.76      0.75       200
weighted avg       0.76      0.76      0.76       200



# Hyperparameter Tuning
We are performing hyperparameter tuning on crossvalidation data.

In [152]:
best_accuracy = 0.0
alpha_val = 0.0
for i in np.arange(0.0,1.1,0.1):
    temp_classifier = MultinomialNB(alpha=i)
    temp_classifier.fit(x_train, y_train)
    # evaluate CV accuracy
    temp_y_pred = temp_classifier.predict(x_val)
    score = accuracy_score(y_val, temp_y_pred)
    print("Accuracy score for alpha={} is: {}%".format(round(i,1), round(score*100,2)))
    if score>best_accuracy:
        best_accuracy = score
        alpha_val = i
print('--------------------------------------------')
print('Best accuracy is {}% with alpha value as {}'.format(round(best_accuracy*100, 2), round(alpha_val,1)))

Accuracy score for alpha=0.0 is: 81.25%
Accuracy score for alpha=0.1 is: 82.5%
Accuracy score for alpha=0.2 is: 83.12%
Accuracy score for alpha=0.3 is: 83.12%
Accuracy score for alpha=0.4 is: 83.75%
Accuracy score for alpha=0.5 is: 84.38%
Accuracy score for alpha=0.6 is: 83.12%
Accuracy score for alpha=0.7 is: 83.12%
Accuracy score for alpha=0.8 is: 83.12%
Accuracy score for alpha=0.9 is: 83.75%
Accuracy score for alpha=1.0 is: 84.38%
--------------------------------------------
Best accuracy is 84.38% with alpha value as 0.5


In [153]:
# Train model with tuned parameter.
model = MultinomialNB(alpha=0.5)
model.fit(x_train,y_train)
pred = model.predict(x_test)

# Final accuracy check on test data
print("Model Accuracy :",accuracy_score(y_test,pred))

Model Accuracy : 0.755


# Predictions

In [154]:
def predict_sentiment(sample_review):
    sample_review = re.sub('[^a-zA-Z]',' ',string = sample_review)
    sample_review = sample_review.lower()
    sample_review = sample_review.split()
    
    ps = PorterStemmer()
    sample_review = [ps.stem(word) for word in sample_review if word not in stopwords]
    sample_review = ' '.join(sample_review)
    
    temp = cv.transform([sample_review]).toarray()
    return model.predict(temp)

In [166]:
# 1 means Positive review
predict_sentiment('Wow... Loved this place')

array([1], dtype=int64)

In [167]:
# 0 means Negative Review
predict_sentiment('The food quality is very very bad had order some soup it was so terrible could eat more than a spoonful.')

array([0], dtype=int64)

In [168]:
sample = 'The food quality is very very bad had order some soup it was so terrible could eat more than a spoonful.'
if predict_sentiment(sample)==True:
    print('Review is Positive')
else:
    print('Review is Negative')

Review is Negative


In [23]:
'''
# Creating a pickle file for the CountVectorizer
import pickle
pickle.dump(cv, open('cv-transform', 'wb'))

import pickle
pickle.dump(cv, open('cv-transform.pkl', 'wb'))

# save the model to disk
import pickle
pickle.dump(model, open('Sentiment_Prediction_model', 'wb'))

# load the model from disk
loaded_model = pickle.load(open('Sentiment_Prediction_model','rb'))
'''