In [1]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report



In [2]:
df = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting = 3)


In [3]:
df.head(4)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1


In [4]:
df.isnull().sum()

Review    0
Liked     0
dtype: int64

In [5]:
df.shape

(1000, 2)

In [6]:
df.sum()

Review    Wow... Loved this place.Crust is not good.Not ...
Liked                                                   500
dtype: object

In [7]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jamshaid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def preprocess_review(review):
    # Convert to lowercase
    review = review.lower()
    # Remove non-alphabetic characters
    review = re.sub('[^a-zA-Z]', ' ', review)
    # Split the review into words
    review = review.split()
    # Stemming and removing stopwords
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')       
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    # Join the processed words back into a string
    review = ' '.join(review)
    return review


In [9]:
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['Review'])
y = df['Liked']

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [11]:
model = LogisticRegression()
model.fit(X_train, y_train)



In [12]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[85 12]
 [24 79]]


0.82

In [13]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.78      0.88      0.83        97
           1       0.87      0.77      0.81       103

    accuracy                           0.82       200
   macro avg       0.82      0.82      0.82       200
weighted avg       0.83      0.82      0.82       200



In [21]:
# Prediction
def predict_sentiment(review, vectorizer, model):
    review_tfidf = vectorizer.transform([review])
    sentiment = model.predict(review_tfidf)[0]
    probability = model.predict_proba(review_tfidf)[0][1] if sentiment == 1 else 1 - model.predict_proba(review_tfidf)[0][1]
    return sentiment, probability





new_review =   "this resturant has bad food"
sentiment, probability = predict_sentiment(new_review, tfidf_vectorizer, model)
if sentiment == 1:
    print("POSITIVE REVIEW. Probability of customer being happy:", probability * 100, "%")
else:
    print("NEGATIVE REVIEW. Probability of customer being happy:", (1 - probability) * 100, "%")


NEGATIVE REVIEW. Probability of customer being happy: 35.659662094881504 %
