In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import contextily as ctx
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler, MaxAbsScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
import string
from nltk.stem import WordNetLemmatizer
import re
from collections import Counter

In [45]:
%run Data_Cleaning_Notebook.ipynb

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33396 entries, 0 to 33395
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   reviewer_id    33396 non-null  int64  
 1   store_name     33396 non-null  object 
 2   category       33396 non-null  object 
 3   store_address  33396 non-null  object 
 4   latitude       32736 non-null  float64
 5   longitude      32736 non-null  float64
 6   rating_count   33396 non-null  object 
 7   review_time    33396 non-null  object 
 8   review         33396 non-null  object 
 9   rating         33396 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 2.5+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33396 entries, 0 to 33395
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   store_address  33396 non-null  object 
 1   latitude       32736 non-null  float64
 2   longitude   

In [2]:
df_cleaned = pd.read_csv('data/cleaned.csv', encoding='latin-1', index_col=0)
df_cleaned

Unnamed: 0,store_address,latitude,longitude,rating_count,review_time,review,rating
0,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1.0
1,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,It'd McDonalds. It is what it is as far as the...,4.0
2,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,Made a mobile order got to the speaker and che...,1.0
3,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,a month ago,My mc. Crispy chicken sandwich was Ã¯Â¿Â½Ã¯Â¿Â...,5.0
4,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,"I repeat my order 3 times in the drive thru, a...",1.0
...,...,...,...,...,...,...,...
33391,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,4 years ago,They treated me very badly.,1.0
33392,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,a year ago,The service is very good,5.0
33393,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,a year ago,To remove hunger is enough,4.0
33394,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,5 years ago,"It's good, but lately it has become very expen...",5.0


In [46]:
class_distribution = df_cleaned['rating'].value_counts()
print(class_distribution)

class_proportion = class_distribution / len(df_cleaned)
print(class_proportion)


rating
5.0    10059
1.0     9305
4.0     5646
3.0     4706
2.0     3020
Name: count, dtype: int64
rating
5.0    0.307276
1.0    0.284244
4.0    0.172471
3.0    0.143756
2.0    0.092253
Name: count, dtype: float64


LabelEncoder for 'rating' column. Dataset imbalanced, will have better accuracy scores.

Change 'ratings' to binary of Negative (1, 2, 3) or Positive (4, 5)

In [3]:
le = LabelEncoder()

In [4]:
print(df_cleaned['rating'].value_counts())

rating
5.0    10059
1.0     9305
4.0     5646
3.0     4706
2.0     3020
Name: count, dtype: int64


In [5]:
df_cleaned['nps_rating'] = df_cleaned['rating'].apply(lambda x: 0 if x in [1.0, 2.0, 3.0] else 1)

In [6]:
df_cleaned['nps_rating'].value_counts()

nps_rating
0    17031
1    15705
Name: count, dtype: int64

In [7]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32736 entries, 0 to 33395
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   store_address  32736 non-null  object 
 1   latitude       32736 non-null  float64
 2   longitude      32736 non-null  float64
 3   rating_count   32736 non-null  object 
 4   review_time    32736 non-null  object 
 5   review         32736 non-null  object 
 6   rating         32736 non-null  float64
 7   nps_rating     32736 non-null  int64  
dtypes: float64(3), int64(1), object(4)
memory usage: 2.2+ MB


In [8]:
df_cleaned[['review', 'nps_rating']].head()

Unnamed: 0,review,nps_rating
0,Why does it look like someone spit on my food?...,0
1,It'd McDonalds. It is what it is as far as the...,1
2,Made a mobile order got to the speaker and che...,0
3,My mc. Crispy chicken sandwich was Ã¯Â¿Â½Ã¯Â¿Â...,1
4,"I repeat my order 3 times in the drive thru, a...",0


Pre-processing time. Make lowercase, remove punctuation, special characters, keep time sensitive text, example: 'food was 30 min late!'

In [9]:
df_cleaned['review'].head()

0    Why does it look like someone spit on my food?...
1    It'd McDonalds. It is what it is as far as the...
2    Made a mobile order got to the speaker and che...
3    My mc. Crispy chicken sandwich was Ã¯Â¿Â½Ã¯Â¿Â...
4    I repeat my order 3 times in the drive thru, a...
Name: review, dtype: object

In [10]:
df_cleaned['review'] = df_cleaned['review'].str.lower()
df_cleaned['review'].head()

0    why does it look like someone spit on my food?...
1    it'd mcdonalds. it is what it is as far as the...
2    made a mobile order got to the speaker and che...
3    my mc. crispy chicken sandwich was ã¯â¿â½ã¯â¿â...
4    i repeat my order 3 times in the drive thru, a...
Name: review, dtype: object

In [11]:
print(df_cleaned.iloc[3]['review'])

my mc. crispy chicken sandwich was ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ã¯â¿â½ customer service was quick and p


In [12]:
df_cleaned['review'] = df_cleaned['review'].str.replace(r'[^\w\s]', '', regex=True)
print(df_cleaned['review'].head())

0    why does it look like someone spit on my food\...
1    itd mcdonalds it is what it is as far as the f...
2    made a mobile order got to the speaker and che...
3    my mc crispy chicken sandwich was ãââ½ãââ½ãââ½...
4    i repeat my order 3 times in the drive thru an...
Name: review, dtype: object


In [13]:
print(df_cleaned.iloc[33]['review'])

just spent 10 minutes waiting at this mcdonalds  according to google theyre open 247 finally we pull up to the window to see if anyone was there sure enough one employee seated in the lobby and another at the window she told me they were closed whoever these two are they need to be replaced get it together mcdonalds youre a corporate power house and you have a reputation to keep one of the most unprofessional experiences ive ever had with fast food 010 would not recomend this location


Remove all nonsensical reviews with special characters

In [14]:
pattern = r'^[a-zA-Z0-9 .,?!:;\'\"]+$'

In [15]:
df_filtered = df_cleaned[df_cleaned['review'].str.match(pattern, na=False)]

In [16]:
print("Original number of rows:", df_cleaned.shape[0])
print("Number of rows after filtering:", df_filtered.shape[0])

Original number of rows: 32736
Number of rows after filtering: 28609


Leaves me with 28k+ reviews. Good size dataset.

In [17]:
df_filtered['review'] = df_filtered['review'].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['review'] = df_filtered['review'].astype('str')


In [18]:
def lowercase(text):
    return text.lower()

df_filtered['review'] = df_filtered['review'].apply(lowercase)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['review'] = df_filtered['review'].apply(lowercase)


In [19]:
def text_clean(text):
    # Normalize spaces around numbers to help with pattern matching
    text = re.sub(r'(\d+)', r' \1 ', text)
    
    # Remove URLs
    text = re.sub("https?://\S+|www\.\S+", "", text)
    
    # Attempt to preserve time expressions and similar numeric patterns
    # Define patterns to preserve, e.g., "X minutes", "24 7" (since punctuation is removed)
    time_patterns = [
        r'\b\d+\s*hours\b',  # Matches "2 hours"
        r'\b\d+\s*minutes\b',  # Matches "30 minutes"
        r'\b24\s*7\b',  # Matches "24 7"
        
    ]
    
    # Combine all patterns into a single regex
    combined_pattern = '|'.join(time_patterns)
    
    # Find all matches of the combined pattern in the text
    preserved_terms = re.findall(combined_pattern, text)
    
    # Remove standalone numbers and specific numeric patterns not part of preserved expressions
    text = re.sub(r'\b\d+\b', ' ', text)  # Remove standalone numbers
    text = re.sub(r'\b(19|20)\d{2}\b', ' ', text)  # Remove standalone years
    
    # Reinsert preserved terms back into text (optional based on your logic for handling overlaps)
    for term in preserved_terms:
        text += ' ' + term  # Append preserved terms at the end; consider more sophisticated merging if necessary
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    text = ' '.join(filtered_words)
    
    # Normalize whitespace
    text = re.sub('\s+', ' ', text).strip()
    
    return text

In [20]:
df_filtered.reset_index(drop=True, inplace=True)
df_filtered.head()

Unnamed: 0,store_address,latitude,longitude,rating_count,review_time,review,rating,nps_rating
0,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,itd mcdonalds it is what it is as far as the f...,4.0,1
1,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,i repeat my order 3 times in the drive thru an...,1.0,0
2,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 weeks ago,i work for door dash and they locked us all ou...,1.0,0
3,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,if i could give this location a zero on custo...,1.0,0
4,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,went thru drive thru ordered getting home noti...,1.0,0


In [21]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28609 entries, 0 to 28608
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   store_address  28609 non-null  object 
 1   latitude       28609 non-null  float64
 2   longitude      28609 non-null  float64
 3   rating_count   28609 non-null  object 
 4   review_time    28609 non-null  object 
 5   review         28609 non-null  object 
 6   rating         28609 non-null  float64
 7   nps_rating     28609 non-null  int64  
dtypes: float64(3), int64(1), object(4)
memory usage: 1.7+ MB


In [22]:
print(df_filtered.iloc[19]['review'])

just spent 10 minutes waiting at this mcdonalds  according to google theyre open 247 finally we pull up to the window to see if anyone was there sure enough one employee seated in the lobby and another at the window she told me they were closed whoever these two are they need to be replaced get it together mcdonalds youre a corporate power house and you have a reputation to keep one of the most unprofessional experiences ive ever had with fast food 010 would not recomend this location


Tokenize

In [23]:
first_review = df_filtered['review'].iloc[1]
first_review

'i repeat my order 3 times in the drive thru and she still manage to mess it up  it was suppose to be a large meal double filet of fish with large fries  no cheese  it was all wrong  they either need to pay close attention to the order being made  understand english or they need not to work at a drive thru'

In [24]:
tokens = word_tokenize(first_review, language='english')
print(tokens)

['i', 'repeat', 'my', 'order', '3', 'times', 'in', 'the', 'drive', 'thru', 'and', 'she', 'still', 'manage', 'to', 'mess', 'it', 'up', 'it', 'was', 'suppose', 'to', 'be', 'a', 'large', 'meal', 'double', 'filet', 'of', 'fish', 'with', 'large', 'fries', 'no', 'cheese', 'it', 'was', 'all', 'wrong', 'they', 'either', 'need', 'to', 'pay', 'close', 'attention', 'to', 'the', 'order', 'being', 'made', 'understand', 'english', 'or', 'they', 'need', 'not', 'to', 'work', 'at', 'a', 'drive', 'thru']


In [25]:
test_from_kash = "build built building"

In [26]:
print(word_tokenize(test_from_kash, language="english"))

['build', 'built', 'building']


Fix this process for more accurate data.

In [27]:
first_review

'i repeat my order 3 times in the drive thru and she still manage to mess it up  it was suppose to be a large meal double filet of fish with large fries  no cheese  it was all wrong  they either need to pay close attention to the order being made  understand english or they need not to work at a drive thru'

In [28]:
print(word_tokenize(first_review, language='english'))

['i', 'repeat', 'my', 'order', '3', 'times', 'in', 'the', 'drive', 'thru', 'and', 'she', 'still', 'manage', 'to', 'mess', 'it', 'up', 'it', 'was', 'suppose', 'to', 'be', 'a', 'large', 'meal', 'double', 'filet', 'of', 'fish', 'with', 'large', 'fries', 'no', 'cheese', 'it', 'was', 'all', 'wrong', 'they', 'either', 'need', 'to', 'pay', 'close', 'attention', 'to', 'the', 'order', 'being', 'made', 'understand', 'english', 'or', 'they', 'need', 'not', 'to', 'work', 'at', 'a', 'drive', 'thru']


In [29]:
df_filtered['tokens'] = df_filtered['review'].apply(lambda x: word_tokenize(x, language='english'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['tokens'] = df_filtered['review'].apply(lambda x: word_tokenize(x, language='english'))


In [30]:
df_filtered.head()

Unnamed: 0,store_address,latitude,longitude,rating_count,review_time,review,rating,nps_rating,tokens
0,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,itd mcdonalds it is what it is as far as the f...,4.0,1,"[itd, mcdonalds, it, is, what, it, is, as, far..."
1,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,i repeat my order 3 times in the drive thru an...,1.0,0,"[i, repeat, my, order, 3, times, in, the, driv..."
2,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 weeks ago,i work for door dash and they locked us all ou...,1.0,0,"[i, work, for, door, dash, and, they, locked, ..."
3,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,if i could give this location a zero on custo...,1.0,0,"[if, i, could, give, this, location, a, zero, ..."
4,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,went thru drive thru ordered getting home noti...,1.0,0,"[went, thru, drive, thru, ordered, getting, ho..."


Lemmanization - important for sentiment analysis

In [31]:
lemmatizer = WordNetLemmatizer()

In [32]:
# Function to map NLTK's POS tags to the format understood by WordNet Lemmatizer
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default case

# Enhanced lemmatization function considering POS tags
def lemmatize_words_with_pos(tokens):
    lemmatizer = WordNetLemmatizer()
    pos_tags = pos_tag(tokens)
    
    lemmatized_tokens = []
    for word, tag in pos_tags:
        wntag = get_wordnet_pos(tag)
        lemmatized_tokens.append(lemmatizer.lemmatize(word, wntag))
    return lemmatized_tokens

def lemmatize_text(text):
    tokens = word_tokenize(text, language='english')
    lemmatized_tokens = lemmatize_words_with_pos(tokens)
    return ' '.join(lemmatized_tokens)

# Tokenizing the test sentence
test_sentence = "build built building"
print("Lemmatized test sentence:", lemmatize_text(test_sentence))

Lemmatized test sentence: build build building


In [33]:
df_filtered['lemmatized_review'] = df_filtered['review'].apply(lemmatize_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['lemmatized_review'] = df_filtered['review'].apply(lemmatize_text)


In [34]:
df_filtered.head()

Unnamed: 0,store_address,latitude,longitude,rating_count,review_time,review,rating,nps_rating,tokens,lemmatized_review
0,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,itd mcdonalds it is what it is as far as the f...,4.0,1,"[itd, mcdonalds, it, is, what, it, is, as, far...",itd mcdonalds it be what it be as far a the fo...
1,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,i repeat my order 3 times in the drive thru an...,1.0,0,"[i, repeat, my, order, 3, times, in, the, driv...",i repeat my order 3 time in the drive thru and...
2,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 weeks ago,i work for door dash and they locked us all ou...,1.0,0,"[i, work, for, door, dash, and, they, locked, ...",i work for door dash and they lock u all out t...
3,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,if i could give this location a zero on custo...,1.0,0,"[if, i, could, give, this, location, a, zero, ...",if i could give this location a zero on custom...
4,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,went thru drive thru ordered getting home noti...,1.0,0,"[went, thru, drive, thru, ordered, getting, ho...",go thru drive thru order get home notice my 10...


In [35]:
df_filtered['lemmatized_review'][473]

'it be a good place for one to eat something quick and simple'

In [36]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28609 entries, 0 to 28608
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   store_address      28609 non-null  object 
 1   latitude           28609 non-null  float64
 2   longitude          28609 non-null  float64
 3   rating_count       28609 non-null  object 
 4   review_time        28609 non-null  object 
 5   review             28609 non-null  object 
 6   rating             28609 non-null  float64
 7   nps_rating         28609 non-null  int64  
 8   tokens             28609 non-null  object 
 9   lemmatized_review  28609 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 2.2+ MB


In [37]:
df_processed = df_filtered
df_processed

Unnamed: 0,store_address,latitude,longitude,rating_count,review_time,review,rating,nps_rating,tokens,lemmatized_review
0,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,itd mcdonalds it is what it is as far as the f...,4.0,1,"[itd, mcdonalds, it, is, what, it, is, as, far...",itd mcdonalds it be what it be as far a the fo...
1,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,i repeat my order 3 times in the drive thru an...,1.0,0,"[i, repeat, my, order, 3, times, in, the, driv...",i repeat my order 3 time in the drive thru and...
2,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 weeks ago,i work for door dash and they locked us all ou...,1.0,0,"[i, work, for, door, dash, and, they, locked, ...",i work for door dash and they lock u all out t...
3,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,if i could give this location a zero on custo...,1.0,0,"[if, i, could, give, this, location, a, zero, ...",if i could give this location a zero on custom...
4,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,went thru drive thru ordered getting home noti...,1.0,0,"[went, thru, drive, thru, ordered, getting, ho...",go thru drive thru order get home notice my 10...
...,...,...,...,...,...,...,...,...,...,...
28604,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,4 years ago,they treated me very badly,1.0,0,"[they, treated, me, very, badly]",they treat me very badly
28605,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,a year ago,the service is very good,5.0,1,"[the, service, is, very, good]",the service be very good
28606,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,a year ago,to remove hunger is enough,4.0,1,"[to, remove, hunger, is, enough]",to remove hunger be enough
28607,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,5 years ago,its good but lately it has become very expensive,5.0,1,"[its, good, but, lately, it, has, become, very...",it good but lately it have become very expensive


In [38]:
df_processed.to_csv('data/processed.csv')