In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


Read data from dataset

In [2]:
# Location of the raw restaurant-review CSV.
url = 'https://raw.githubusercontent.com/kaiwenm3/CS410/main/reviews.csv'
# Read the file and discard the column labelled "7514" in one chained expression.
data = pd.read_csv(url).drop(columns=["7514"])
data.head()

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5,"2 Reviews , 3 Followers",5/24/2019 22:54,0
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5,"1 Review , 1 Follower",5/24/2019 22:11,0
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5,"3 Reviews , 2 Followers",5/24/2019 21:37,0


Extract the ratings from the data to derive the sentiment labels

In [3]:
# Build the numeric rating series in one expression: the literal 'Like' is
# mapped to 5, every value is coerced to a number, and missing ratings are
# filled with 1.
y = pd.to_numeric(data['Rating'].replace({'Like': 5})).fillna(1)
y.describe()

Unnamed: 0,Rating
count,10000.0
mean,3.5913
std,1.489257
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [4]:
# Binarise the ratings in a single vectorised step: a rating of 3 or more
# becomes 1, anything lower becomes 0. This replaces an element-by-element
# Python loop that relied on the index labels being exactly 0..n-1.
# NOTE: re-running this cell on its own maps the existing 0/1 labels to all
# zeros (because 1 < 3); re-run the rating-extraction cell first.
y = (y >= 3).astype(int)

In [5]:
# Summary statistics of the binarised labels; with 0/1 values the mean is
# the share of rows labelled 1.
y.describe()

Unnamed: 0,Rating
count,10000.0
mean,0.7515
std,0.432165
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


NLP preprocessing of the reviews

In [6]:
# Make sure the NLTK stop-word corpus is available so the cell also works on
# a fresh kernel/environment (the downloader skips data that is already
# up to date).
nltk.download('stopwords', quiet=True)

ps = PorterStemmer()
# Build the stop-word set once: calling stopwords.words('english') for every
# single word re-reads the list each time and does a linear scan of it.
english_stopwords = set(stopwords.words('english'))

corpus = []
for raw_review in data['Review']:
    # Replace every non-letter with a space, lower-case, and tokenise on
    # whitespace. str() turns missing reviews into the literal text 'nan'.
    words = re.sub('[^a-zA-Z]', ' ', str(raw_review)).lower().split()
    # Drop stop words and stem what is left, keeping the words that carry
    # the signal used for feature extraction.
    stems = [ps.stem(word) for word in words if word not in english_stopwords]
    # Join the stems back into one space-separated document.
    corpus.append(' '.join(stems))

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: the vocabulary is capped at 1000 terms
# (max_features) and the fitted count matrix is converted to a dense array.
# `cv` is kept because the prediction helper further down needs the fitted
# vectorizer.
cv = CountVectorizer(max_features=1000)
X = cv.fit_transform(corpus).toarray()

Split the data into a training set and a test set

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; random_state=0 fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

Linear Regression Model:

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix

# Fit ordinary least squares directly on the 0/1 sentiment labels.
lm_model = LinearRegression().fit(X_train, y_train)
# The regression outputs continuous scores, so threshold them at 0.5 to get
# class labels. np.where does this in one vectorised step instead of a
# Python loop; float 1.0/0.0 keeps the same dtype the loop produced.
y_pred = np.where(lm_model.predict(X_test) >= 0.5, 1.0, 0.0)
# Rows are true classes, columns are predicted classes.
confusion_m = confusion_matrix(y_test, y_pred)
print(confusion_m)

[[ 370  222]
 [  75 1833]]


In [10]:
from sklearn.metrics import accuracy_score
# Fraction of test reviews whose predicted label equals the true label, for
# whichever predictions are currently stored in y_pred.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.8812


Logistic Regression Model:

In [11]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings.
lr_model = LogisticRegression().fit(X_train, y_train)
# predict() already returns class labels (the 0/1 values seen in y_train),
# so no 0.5 thresholding pass is needed afterwards.
y_pred = lr_model.predict(X_test)
# Rows are true classes, columns are predicted classes.
confusion_m = confusion_matrix(y_test, y_pred)
print(confusion_m)

[[ 439  153]
 [ 102 1806]]


In [12]:
# Accuracy of the predictions currently in y_pred (the logistic-regression
# cell directly above, when the notebook is run in order).
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.898


Logistic Regression with Cross Validation:

In [13]:
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression whose regularisation strength is chosen by 10-fold
# cross-validation on the training set.
lrcv_model = LogisticRegressionCV(cv=10).fit(X_train, y_train)
# predict() already returns class labels (the 0/1 values seen in y_train),
# so no 0.5 thresholding pass is needed afterwards.
y_pred = lrcv_model.predict(X_test)
# Rows are true classes, columns are predicted classes.
confusion_m = confusion_matrix(y_test, y_pred)
print(confusion_m)

[[ 434  158]
 [  88 1820]]


In [14]:
# Accuracy of the predictions currently in y_pred (the cross-validated
# logistic-regression cell directly above, when the notebook is run in order).
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.9016


MultinomialNB Model:

In [15]:
from sklearn.naive_bayes import MultinomialNB
# Fit multinomial Naive Bayes on the word-count features; the predicted
# labels are passed to the confusion matrix as-is, with no thresholding step.
mtnb_model = MultinomialNB().fit(X_train, y_train)
y_pred = mtnb_model.predict(X_test)
# Rows are true classes, columns are predicted classes.
confusion_m = confusion_matrix(y_test, y_pred)
print(confusion_m)

[[ 470  122]
 [ 126 1782]]


In [16]:
# Accuracy of the predictions currently in y_pred (the MultinomialNB cell
# directly above, when the notebook is run in order).
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.9008


Logistic Regression with Cross Validation has the best overall accuracy, so save that model to predict the sentiment of future reviews

In [17]:
#import pickle
#from google.colab import drive
#drive.mount('/content/drive')

#pickle.dump(lrcv_model, open('drive/My Drive/model.pkl', 'wb'))

Predict the sentiment of a new review interactively with `interact_manual`:

In [18]:
%pip install -q ipywidgets
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from ipywidgets import FloatSlider


In [19]:
def f(Your_review):
    """Classify a free-text review as 'positive' or 'negative'.

    The text gets the same cleaning as the training corpus (letters only,
    lower-cased, English stop words removed, Porter-stemmed). It is then
    encoded with the already-fitted vectorizer ``cv`` and scored by
    ``lrcv_model``.

    Parameters
    ----------
    Your_review : object
        The review text; it is passed through ``str()`` before cleaning.

    Returns
    -------
    str
        ``'positive'`` when the predicted label is >= 0.5, otherwise
        ``'negative'``.
    """
    words = re.sub('[^a-zA-Z]', ' ', str(Your_review)).lower().split()
    # Build the stop-word set once per call rather than once per word.
    stop_words = set(stopwords.words('english'))
    stems = [ps.stem(word) for word in words if word not in stop_words]
    cleaned = ' '.join(stems)

    # Use transform(), not fit_transform(): refitting on a corpus that now
    # includes the user's text can change the vocabulary and column order, so
    # the features would no longer line up with what the model was trained
    # on. The training corpus is deliberately left untouched.
    features = cv.transform([cleaned]).toarray()

    # predict() returns one label per input row; there is exactly one row.
    prediction = lrcv_model.predict(features)[0]
    return 'positive' if prediction >= 0.5 else 'negative'

In [21]:
# Render a text box (pre-filled with 'Enter your review') and a run button for
# calling f on the entered text; the trailing ';' suppresses the cell's
# return-value repr.
interact_manual(f, Your_review = 'Enter your review');

interactive(children=(Text(value='Enter your review', description='Your_review'), Button(description='Run Inte…