In [1]:
# just some imports to be used later
import os
from zipfile import ZipFile

import numpy as np
import seaborn
import pandas as pd
import math

import nltk
from sklearn.model_selection import train_test_split
#from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# The Dataset

In [2]:
# Download the McDonald's store reviews archive from Kaggle into the current working directory.
# NOTE(review): presumably needs the `kaggle` CLI installed with API credentials configured — confirm.
!kaggle datasets download "nelgiriyewithana/mcdonalds-store-reviews"

Downloading mcdonalds-store-reviews.zip to D:\Work\Learning\NaturalLanguage\SentimentAnalysis




  0%|          | 0.00/1.78M [00:00<?, ?B/s]
 56%|#####6    | 1.00M/1.78M [00:00<00:00, 4.33MB/s]
100%|##########| 1.78M/1.78M [00:00<00:00, 4.46MB/s]
100%|##########| 1.78M/1.78M [00:00<00:00, 4.43MB/s]


In [3]:
# Unpack every member of the downloaded archive into the working directory
with ZipFile('mcdonalds-store-reviews.zip', mode='r') as archive:
    archive.extractall()

# The archive is no longer needed once its contents are on disk
os.remove('mcdonalds-store-reviews.zip')

In [4]:
# Load the raw reviews; decoding errors in the file are ignored instead of raising
dataset = pd.read_csv('McDonald_s_Reviews.csv',encoding_errors='ignore')

# Only the review text and its rating are used from here on
dataset = dataset.loc[:, ['review','rating']].copy()

# The first whitespace-separated field of each rating is parsed as an integer
# (presumably the raw values look like "4 stars" — confirm against the CSV)
dataset['rating'] = dataset['rating'].map(lambda stars: int(stars.split()[0]))
dataset.head()

Unnamed: 0,review,rating
0,Why does it look like someone spit on my food?...,1
1,It'd McDonalds. It is what it is as far as the...,4
2,Made a mobile order got to the speaker and che...,1
3,My mc. Crispy chicken sandwich was �����������...,5
4,"I repeat my order 3 times in the drive thru, a...",1


In [5]:
# Remove rows that carry a rating but no review text
missing_text = dataset['review'].isna()
dataset = dataset.drop(index=dataset.index[missing_text])

# Three-star reviews are treated as neutral and removed
neutral = dataset['rating'] == 3
dataset = dataset.drop(index=dataset.index[neutral])

# Change rating to label, 1: positive (more than 3 stars) and 0: negative
dataset = dataset.assign(Sentiment=dataset['rating'].apply(lambda stars: 1 if stars>3 else 0))
dataset = dataset.loc[:, ['review','Sentiment']].copy()
dataset.head()

Unnamed: 0,review,Sentiment
0,Why does it look like someone spit on my food?...,0
1,It'd McDonalds. It is what it is as far as the...,1
2,Made a mobile order got to the speaker and che...,0
3,My mc. Crispy chicken sandwich was �����������...,1
4,"I repeat my order 3 times in the drive thru, a...",0


# Data Preprocessing

In [6]:
def preprocessing(ipString,tokenizer,stopwords,lemmatizer):
    """Turn one raw text into a list of normalised tokens.

    The text is lower-cased and split with ``tokenizer.tokenize``; tokens found
    in ``stopwords`` are discarded and every remaining token is passed through
    ``lemmatizer.lemmatize``.

    Parameters
    ----------
    ipString : str
        Raw input text.
    tokenizer : object
        Anything exposing ``tokenize(str)`` that returns an iterable of tokens.
    stopwords : container
        Tokens to drop; only membership tests (``in``) are performed on it.
    lemmatizer : object
        Anything exposing ``lemmatize(token)``.

    Returns
    -------
    list
        The lemmatised, stop-word-free tokens in their original order.
    """
    rawTokens = tokenizer.tokenize(ipString.lower())
    return [lemmatizer.lemmatize(tok) for tok in rawTokens if tok not in stopwords]

In [7]:
# Build the NLP helpers once and reuse them for every review
tokenizer = nltk.tokenize.WordPunctTokenizer()
# Stored as a set: preprocessing() tests every token with `not in stopwords`,
# which would scan the whole NLTK list each time if it were left as a list
stopwords = set(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

# Replace each raw review string by its list of preprocessed tokens
dataset['review'] = dataset['review'].apply(lambda review: preprocessing(review, tokenizer, stopwords, lemmatizer))
dataset.head()

Unnamed: 0,review,Sentiment
0,"[look, like, someone, spit, food, ?, normal, t...",0
1,"[', mcdonalds, ., far, food, atmosphere, go, ....",1
2,"[made, mobile, order, got, speaker, checked, ....",0
3,"[mc, ., crispy, chicken, sandwich, �����������...",1
4,"[repeat, order, 3, time, drive, thru, ,, still...",0


# Calculating probabilities

In [8]:
# Hold out 30% of the reviews for testing (fixed seed for a repeatable split);
# learning features must be generated from the training part only
train, test = train_test_split(dataset, random_state=64, test_size=0.3)

In [9]:
# Count, for every word, the number of positive and of negative reviews it occurs in
def generateWordFreqTable(dataset):
    """Count in how many positive and negative reviews each word occurs.

    A word is counted at most once per review (document frequency), no matter
    how often it is repeated inside that review.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide a ``review`` column holding an iterable of hashable tokens
        per row and a ``Sentiment`` column where 1 marks a positive and 0 a
        negative review. Rows with any other label still contribute their
        words to the vocabulary but add nothing to either count.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct words, in the order they are first seen, with
        integer columns ``positiveFreq`` and ``negativeFreq``.
    """
    # Local import so the function does not depend on the notebook's import cell
    from collections import Counter

    vocabulary = {}              # dict used as an insertion-ordered set
    positiveCounts = Counter()
    negativeCounts = Counter()

    # Single pass over the data; counting in plain Python containers avoids
    # thousands of slow cell-by-cell DataFrame updates
    for tokens, sentiment in zip(dataset['review'], dataset['Sentiment']):
        vocabulary.update(dict.fromkeys(tokens))
        if sentiment == 1:
            positiveCounts.update(set(tokens))
        elif sentiment == 0:
            negativeCounts.update(set(tokens))

    distinctWords = list(vocabulary)
    # Counter returns 0 for words never seen in a class
    return pd.DataFrame(
        {
            'positiveFreq': [positiveCounts[word] for word in distinctWords],
            'negativeFreq': [negativeCounts[word] for word in distinctWords],
        },
        index=distinctWords,
    )

In [10]:
# Calculate conditional probabilities
def generateWordProbTable(listeDeTokens):
    """Build the per-word conditional probability table used for classification.

    Parameters
    ----------
    listeDeTokens : pandas.DataFrame
        Despite its name this is the review DataFrame; it is handed unchanged
        to ``generateWordFreqTable``, which reads its ``review`` and
        ``Sentiment`` columns.

    Returns
    -------
    pandas.DataFrame
        The word frequency table extended with ``positiveProb`` and
        ``negativeProb``, the Laplace-smoothed share of each word within the
        positive and negative counts respectively.
    """
    wordFreqTable = generateWordFreqTable(listeDeTokens)

    # Laplace (add-one) smoothing: without it a word never seen in one class
    # gets probability 0, which breaks the log-ratio taken at prediction time.
    # The vocabulary size must come from the table being built here, not from
    # a notebook-level variable that may not exist yet or may be stale.
    vocabularySize = len(wordFreqTable.index)
    positiveTotal = wordFreqTable['positiveFreq'].sum() + vocabularySize
    negativeTotal = wordFreqTable['negativeFreq'].sum() + vocabularySize

    wordFreqTable['positiveProb'] = (wordFreqTable['positiveFreq']+1) / positiveTotal
    wordFreqTable['negativeProb'] = (wordFreqTable['negativeFreq']+1) / negativeTotal
    return wordFreqTable

In [11]:
# The probability table is cached on disk and only rebuilt when the CSV is missing.
# NOTE: the cache is never invalidated automatically — delete the CSV after
# changing the preprocessing or the train/test split.
if not os.path.exists('./McDonald_s_Reviews_probtbl.csv'):
    wordProbTable = generateWordProbTable(train)
    wordProbTable.to_csv('./McDonald_s_Reviews_probtbl.csv')
else:
    # keep_default_na=False: vocabulary tokens such as "nan" or "null" must stay
    # strings; with the default NA handling read_csv turns them into NaN index
    # labels, so the cached table would differ from a freshly built one
    wordProbTable = pd.read_csv('./McDonald_s_Reviews_probtbl.csv',index_col=0,keep_default_na=False)
wordProbTable.head()

Unnamed: 0,positiveFreq,negativeFreq,positiveProb,negativeProb
fur,1,0,2e-05,6e-06
easy,112,9,0.001102,5.9e-05
dios,0,1,1e-05,1.2e-05
cheesesteaks,0,2,1e-05,1.8e-05
spouting,0,1,1e-05,1.2e-05


# Classification using conditional probabilities

In [12]:
def predict(tokens,wordProbTable):
    """Classify a tokenised text as positive (1) or negative (0).

    The score starts at the log of the ratio between the summed
    ``positiveFreq`` and summed ``negativeFreq`` columns, then adds
    ``log(positiveProb / negativeProb)`` for every token that appears in the
    table's index. Tokens missing from the table are skipped.

    Parameters
    ----------
    tokens : iterable
        Preprocessed tokens of one text.
    wordProbTable : pandas.DataFrame
        Indexed by word, with columns ``positiveFreq``, ``negativeFreq``,
        ``positiveProb`` and ``negativeProb``.

    Returns
    -------
    int
        1 when the accumulated score is strictly positive, otherwise 0.
    """
    positiveTotal = wordProbTable.positiveFreq.sum()
    negativeTotal = wordProbTable.negativeFreq.sum()
    score = math.log(positiveTotal/negativeTotal)

    knownWords = wordProbTable.index
    for word in tokens:
        if word not in knownWords:
            continue  # unseen words carry no evidence either way
        oddsRatio = wordProbTable.at[word,'positiveProb']/wordProbTable.at[word,'negativeProb']
        score += math.log(oddsRatio)

    if score > 0:
        return 1
    return 0

Results on training set.

In [13]:
# Classify every training review with the learned table and summarise the scores
preds_train = train['review'].apply(predict, args=(wordProbTable,))
print(classification_report(train['Sentiment'], preds_train))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91      8760
           1       0.94      0.91      0.92     11239

    accuracy                           0.92     19999
   macro avg       0.91      0.92      0.92     19999
weighted avg       0.92      0.92      0.92     19999



Results on test set.

In [14]:
# Classify every held-out review with the same table and summarise the scores
preds_test = test['review'].apply(predict, args=(wordProbTable,))
print(classification_report(test['Sentiment'], preds_test))

              precision    recall  f1-score   support

           0       0.86      0.91      0.88      3755
           1       0.93      0.88      0.90      4817

    accuracy                           0.89      8572
   macro avg       0.89      0.90      0.89      8572
weighted avg       0.90      0.89      0.90      8572



# Inference

In [19]:
# Two hand-written examples: the first clearly negative, the second clearly positive
inferenceStrings = [
    'I hated the food here, it is so bad that I will never come back.',
    'It is so good, i go there every week.',
]
# Run the same preprocessing as the training data before classifying
preprocessedinferenceStrings = [
    preprocessing(text,tokenizer,stopwords,lemmatizer)
    for text in inferenceStrings
]
[predict(tokenList,wordProbTable) for tokenList in preprocessedinferenceStrings]

[0, 1]