In [1]:
# just some imports to be used later
import os
from zipfile import ZipFile

import numpy as np
import seaborn
import pandas as pd
import math

import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# The Dataset

In [2]:
# Get dataset
# Shell escape: uses the Kaggle CLI to download mcdonalds-store-reviews.zip,
# which the next cell opens by relative path.
# NOTE(review): assumes the Kaggle CLI is installed and authenticated — confirm setup.
!kaggle datasets download "nelgiriyewithana/mcdonalds-store-reviews"

Downloading mcdonalds-store-reviews.zip to D:\Work\Learning\NaturalLanguage\SentimentAnalysis




  0%|          | 0.00/1.78M [00:00<?, ?B/s]
 56%|#####6    | 1.00M/1.78M [00:00<00:00, 3.92MB/s]
100%|##########| 1.78M/1.78M [00:00<00:00, 2.56MB/s]
100%|##########| 1.78M/1.78M [00:00<00:00, 2.72MB/s]


In [3]:
# Unpack the downloaded archive into the current working directory,
# then remove the archive itself.
archive_name = 'mcdonalds-store-reviews.zip'

with ZipFile(archive_name, mode='r') as archive:
    archive.extractall()

os.remove(archive_name)

In [4]:
# Load the raw reviews; bytes that cannot be decoded are silently skipped
dataset = pd.read_csv('McDonald_s_Reviews.csv', encoding_errors='ignore')

# Keep only the review text and its rating
dataset = dataset.loc[:, ['review', 'rating']].copy()

# The rating is text whose first whitespace-separated token is the star
# count; keep that token as an integer
dataset['rating'] = dataset['rating'].map(lambda stars: int(stars.split()[0]))
dataset.head()

Unnamed: 0,review,rating
0,Why does it look like someone spit on my food?...,1
1,It'd McDonalds. It is what it is as far as the...,4
2,Made a mobile order got to the speaker and che...,1
3,My mc. Crispy chicken sandwich was �����������...,5
4,"I repeat my order 3 times in the drive thru, a...",1


In [5]:
# Discard rows that have a rating but no review text
dataset = dataset.dropna(subset=['review'])

# Discard neutral 3-star reviews
is_neutral = dataset['rating'] == 3
dataset = dataset[~is_neutral]

# Change rating to label, 1: positive (more than 3 stars) and 0: negative
dataset = dataset.assign(Sentiment=dataset['rating'].gt(3).astype('int64'))
dataset = dataset[['review', 'Sentiment']].copy()
dataset.head()

Unnamed: 0,review,Sentiment
0,Why does it look like someone spit on my food?...,0
1,It'd McDonalds. It is what it is as far as the...,1
2,Made a mobile order got to the speaker and che...,0
3,My mc. Crispy chicken sandwich was �����������...,1
4,"I repeat my order 3 times in the drive thru, a...",0


# Data Preprocessing

In [6]:
def preprocessing(ipString,tokenizer,stopwords,lemmatizer):
    """Turn a raw text string into a list of normalised tokens.

    The text is lower-cased, tokenized, stripped of stopword tokens and the
    remaining tokens are lemmatized, in that order.

    Parameters
    ----------
    ipString : str
        Raw input text.
    tokenizer
        Object exposing ``tokenize(text)`` and returning an iterable of tokens.
    stopwords
        Collection of tokens to discard. Tokens are compared after
        lower-casing, so entries should be lower-case.
    lemmatizer
        Object exposing ``lemmatize(token)``.

    Returns
    -------
    list
        The lemmatized, non-stopword tokens in their original order.
    """
    # Membership tests against a list are linear in its length and would be
    # repeated for every token; build a hash set once per call instead
    # (skipped when the caller already passes a set).
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = frozenset(stopwords)

    tokens = tokenizer.tokenize(ipString.lower())
    kept = [token for token in tokens if token not in stopword_set]
    return [lemmatizer.lemmatize(token) for token in kept]

In [7]:
# The stopword list and the WordNet lemmatizer both read NLTK corpora from
# disk and raise LookupError when they are missing. Fetch them only if they
# are not installed yet so the notebook also runs on a fresh environment.
for resource_name, resource_path in (('stopwords', 'corpora/stopwords'),
                                     ('wordnet', 'corpora/wordnet')):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(resource_name, quiet=True)

tokenizer = nltk.tokenize.WordPunctTokenizer()
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.stem.WordNetLemmatizer()

# Replace every review string with its list of preprocessed tokens
dataset['review'] = dataset['review'].apply(lambda review: preprocessing(review, tokenizer, stopwords, lemmatizer))
dataset.head()

Unnamed: 0,review,Sentiment
0,"[look, like, someone, spit, food, ?, normal, t...",0
1,"[', mcdonalds, ., far, food, atmosphere, go, ....",1
2,"[made, mobile, order, got, speaker, checked, ....",0
3,"[mc, ., crispy, chicken, sandwich, �����������...",1
4,"[repeat, order, 3, time, drive, thru, ,, still...",0


# Feature Extraction

In [8]:
# Split before feature extraction so that the learned features come from
# the training portion only
HOLDOUT_FRACTION = 0.3
SPLIT_SEED = 64
train, test = train_test_split(dataset, random_state=SPLIT_SEED, test_size=HOLDOUT_FRACTION)

In [9]:
# CountVectorizer is fed strings here, so glue each token list back
# together with single spaces
train_documents = train['review'].map(' '.join)
test_documents = test['review'].map(' '.join)

# The vocabulary is learned from the training documents only and then
# reused unchanged for the test documents
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_documents)
X_test = vectorizer.transform(test_documents)

# Training

In [10]:
# Fit a multinomial Naive Bayes model on the training count matrix
y_train = train['Sentiment']
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [11]:
# Score the model on the data it was fitted on
preds_train = classifier.predict(X_train)
train_report = classification_report(train['Sentiment'], preds_train)

print('Training results')
print(train_report)

Training results
              precision    recall  f1-score   support

           0       0.92      0.89      0.91      8760
           1       0.92      0.94      0.93     11239

    accuracy                           0.92     19999
   macro avg       0.92      0.92      0.92     19999
weighted avg       0.92      0.92      0.92     19999



In [12]:
# Score the model on the held-out split
preds_test = classifier.predict(X_test)
test_report = classification_report(test['Sentiment'], preds_test)

print('Testing results')
print(test_report)

Testing results
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      3755
           1       0.91      0.92      0.91      4817

    accuracy                           0.90      8572
   macro avg       0.90      0.90      0.90      8572
weighted avg       0.90      0.90      0.90      8572



# Inference

In [13]:
inferenceStrings = [
    'I hated the food here, it is so bad that I will never come back.',
    'It is so good, i go there every week.',
]

# New text goes through the same preprocessing and the already-fitted
# vectorizer before it reaches the classifier
preprocessedinferenceStrings = [
    ' '.join(preprocessing(inferenceString, tokenizer, stopwords, lemmatizer))
    for inferenceString in inferenceStrings
]
X_inference = vectorizer.transform(preprocessedinferenceStrings)
preds_inference = classifier.predict(X_inference)

# Label 1 is positive, label 0 is negative
for text, prediction in zip(inferenceStrings, preds_inference):
    verdict = "positive" if prediction > 0 else "negative"
    print(f'{text}\tis\t{verdict}')

I hated the food here, it is so bad that I will never come back.	is	negative
It is so good, i go there every week.	is	positive
