In [1]:
# just some imports to be used later
import os
import pandas as pd
from zipfile import ZipFile
import nltk

# Getting the Dataset
Dataset sourced from Kaggle, [McDonald's Store Reviews](https://www.kaggle.com/datasets/nelgiriyewithana/mcdonalds-store-reviews)

<b>For first-time usage:</b>
    
1. Install the Kaggle API with <i>pip install kaggle</i> (or the equivalent conda installation)
1. Go to Kaggle > Settings > API > Create New Token
1. Copy the downloaded JSON to the appropriate location (on Windows: C:\Users\your_name\\\.kaggle)


In [8]:
# Get dataset: download the McDonald's Store Reviews archive with the Kaggle CLI.
# Requires the `kaggle` package to be installed and an API token to be configured.
# The following cells open 'mcdonalds-store-reviews.zip' by relative path, so the
# archive is expected to land in the notebook's working directory.
!kaggle datasets download "nelgiriyewithana/mcdonalds-store-reviews"

Downloading mcdonalds-store-reviews.zip to D:\Work\Learning\NaturalLanguage\SentimentAnalysis




  0%|          | 0.00/1.78M [00:00<?, ?B/s]
 56%|#####6    | 1.00M/1.78M [00:00<00:00, 1.77MB/s]
100%|##########| 1.78M/1.78M [00:00<00:00, 2.81MB/s]
100%|##########| 1.78M/1.78M [00:00<00:00, 2.55MB/s]


In [9]:
# Unpack every member of the downloaded archive into the current working directory
with ZipFile('mcdonalds-store-reviews.zip') as archive:
    archive.extractall()

In [10]:
# The archive has been extracted, so remove the zip file from disk
os.unlink('mcdonalds-store-reviews.zip')

# The Dataset

In [11]:
# Load the extracted CSV into a DataFrame.
# encoding_errors='ignore' means bytes that cannot be decoded are dropped silently.
reviews_csv = 'McDonald_s_Reviews.csv'
og_data = pd.read_csv(reviews_csv, encoding_errors='ignore')
og_data.head()

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating
0,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star
1,2,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars
2,3,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,Made a mobile order got to the speaker and che...,1 star
3,4,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,a month ago,My mc. Crispy chicken sandwich was �����������...,5 stars
4,5,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star


The dataset has multiple columns but for my experiment, I only need the review and the rating. Dropping everything else.

In [12]:
# Keep only the two columns used downstream; .copy() detaches the result from og_data
kept_columns = ['review', 'rating']
dropcols_data = og_data.loc[:, kept_columns].copy()
dropcols_data.head()

Unnamed: 0,review,rating
0,Why does it look like someone spit on my food?...,1 star
1,It'd McDonalds. It is what it is as far as the...,4 stars
2,Made a mobile order got to the speaker and che...,1 star
3,My mc. Crispy chicken sandwich was �����������...,5 stars
4,"I repeat my order 3 times in the drive thru, a...",1 star


In [13]:
# Change star rating ("1 star", "4 stars", ...) to its leading integer.
# The value is parsed from the untouched og_data column (restricted to the rows
# still present in dropcols_data) rather than from dropcols_data['rating'] itself,
# so this cell can be re-run safely: applying str.split to the already-converted
# integer column would raise AttributeError on a second execution.
dropcols_data['rating'] = (
    og_data.loc[dropcols_data.index, 'rating']
    .str.split()       # "4 stars" -> ["4", "stars"]
    .str[0]            # keep the numeric token
    .astype('int64')   # explicit width so the dtype is the same on every platform
)
dropcols_data.head()

Unnamed: 0,review,rating
0,Why does it look like someone spit on my food?...,1
1,It'd McDonalds. It is what it is as far as the...,4
2,Made a mobile order got to the speaker and che...,1
3,My mc. Crispy chicken sandwich was �����������...,5
4,"I repeat my order 3 times in the drive thru, a...",1


In [14]:
# Raw dataset: non-null entries per column.
# A lower count for 'review' than for 'rating' means some ratings have no review text.
dropcols_data.count()

review    33385
rating    33396
dtype: int64

In [15]:
# Find the index labels of rows whose review text is missing
missing_review = dropcols_data['review'].isna()
no_rev = dropcols_data.index[missing_review].tolist()
print(f'Total ratings with no reviews: {len(no_rev)}')

# Remove those rows and show the remaining non-null counts per column
dropcols_data = dropcols_data.drop(index=no_rev)
print(dropcols_data.count())

Total ratings with no reviews: 11
review    33385
rating    33385
dtype: int64


In [16]:
# Neutral reviews: collect the index labels of every 3-star rating
is_neutral = dropcols_data['rating'].eq(3)
ratings_3star = dropcols_data.index[is_neutral].tolist()
print(f'Number of neutral ratings: {len(ratings_3star)}')

# Remove the neutral rows so only clearly positive/negative ratings remain
dropcols_data = dropcols_data.drop(index=ratings_3star)
dropcols_data.count()

Number of neutral ratings: 4814


review    28571
rating    28571
dtype: int64

In [17]:
# Change rating to a binary label: 1 (positive) when rating > 3, otherwise 0 (negative)
is_positive = dropcols_data['rating'].gt(3)
dropcols_data['Sentiment'] = is_positive.astype('int64')

# Working frame for the rest of the notebook: review text plus its label
dataset = dropcols_data.loc[:, ['review', 'Sentiment']].copy()
dataset.head()

Unnamed: 0,review,Sentiment
0,Why does it look like someone spit on my food?...,0
1,It'd McDonalds. It is what it is as far as the...,1
2,Made a mobile order got to the speaker and che...,0
3,My mc. Crispy chicken sandwich was �����������...,1
4,"I repeat my order 3 times in the drive thru, a...",0


# Preprocessing the dataset

In [37]:
# Lower case: start the preprocessed column from a lower-cased copy of the raw review
dataset['preprocessed_review'] = dataset['review'].str.lower()
dataset.head()

Unnamed: 0,review,Sentiment,preprocessed_review
0,Why does it look like someone spit on my food?...,0,why does it look like someone spit on my food?...
1,It'd McDonalds. It is what it is as far as the...,1,it'd mcdonalds. it is what it is as far as the...
2,Made a mobile order got to the speaker and che...,0,made a mobile order got to the speaker and che...
3,My mc. Crispy chicken sandwich was �����������...,1,my mc. crispy chicken sandwich was �����������...
4,"I repeat my order 3 times in the drive thru, a...",0,"i repeat my order 3 times in the drive thru, a..."


In [38]:
# Tokenization: split each lower-cased review into a list of whitespace-separated tokens.
# NOTE(review): whitespace splitting keeps punctuation attached to words
# (e.g. 'food?', 'thru,'), so such tokens will not match plain dictionary words
# in later steps — consider a punctuation-aware tokenizer.
nltk.download('punkt') # if error
tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokenized_reviews = dataset['preprocessed_review'].apply(tokenizer.tokenize)
dataset['preprocessed_review'] = tokenized_reviews
dataset.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jatin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,review,Sentiment,preprocessed_review
0,Why does it look like someone spit on my food?...,0,"[why, does, it, look, like, someone, spit, on,..."
1,It'd McDonalds. It is what it is as far as the...,1,"[it'd, mcdonalds., it, is, what, it, is, as, f..."
2,Made a mobile order got to the speaker and che...,0,"[made, a, mobile, order, got, to, the, speaker..."
3,My mc. Crispy chicken sandwich was �����������...,1,"[my, mc., crispy, chicken, sandwich, was, ����..."
4,"I repeat my order 3 times in the drive thru, a...",0,"[i, repeat, my, order, 3, times, in, the, driv..."


In [39]:
# Handle Removal
# Not needed in this use case, but might be useful for tweets or similar text.

In [40]:
# Stop word removal: drop every token that appears in NLTK's English stop-word list.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

# Testing membership against the list is a linear scan for every token of every
# review; a set gives the same answers with constant-time lookups.
# `stopwords` itself is left as the original list for any other cell that uses it.
stopword_set = set(stopwords)

dataset['preprocessed_review'] = dataset['preprocessed_review'].apply(
    lambda tokens: [t for t in tokens if t not in stopword_set]
)
dataset.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jatin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,Sentiment,preprocessed_review
0,Why does it look like someone spit on my food?...,0,"[look, like, someone, spit, food?, normal, tra..."
1,It'd McDonalds. It is what it is as far as the...,1,"[it'd, mcdonalds., far, food, atmosphere, go.,..."
2,Made a mobile order got to the speaker and che...,0,"[made, mobile, order, got, speaker, checked, i..."
3,My mc. Crispy chicken sandwich was �����������...,1,"[mc., crispy, chicken, sandwich, �������������..."
4,"I repeat my order 3 times in the drive thru, a...",0,"[repeat, order, 3, times, drive, thru,, still,..."


In [47]:
# Lemmatization (chosen over stemming): map each token to its WordNet lemma
nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return a new list with every token passed through lemmatizer.lemmatize()."""
    return [lemmatizer.lemmatize(token) for token in tokens]


dataset['preprocessed_review'] = dataset['preprocessed_review'].apply(_lemmatize_tokens)
dataset.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jatin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,review,Sentiment,preprocessed_review
0,Why does it look like someone spit on my food?...,0,"[look, like, someone, spit, food?, normal, tra..."
1,It'd McDonalds. It is what it is as far as the...,1,"[it'd, mcdonalds., far, food, atmosphere, go.,..."
2,Made a mobile order got to the speaker and che...,0,"[made, mobile, order, got, speaker, checked, i..."
3,My mc. Crispy chicken sandwich was �����������...,1,"[mc., crispy, chicken, sandwich, �������������..."
4,"I repeat my order 3 times in the drive thru, a...",0,"[repeat, order, 3, time, drive, thru,, still, ..."


In [48]:
# Convert reviews to features

In [13]:
# Train Model

In [14]:
# Training and validation results

In [15]:
# Inference