In [54]:
import spacy
import pandas as pd
import time
from collections import Counter
from spellchecker import SpellChecker
import re
# https://spacy.io/models/en
nlp = spacy.load('en_core_web_sm')

In [55]:
df = pd.read_csv("McDonald_s_Reviews.csv", encoding="latin-1")
df.columns = df.columns.str.strip()

In [56]:
spell = SpellChecker()

In [57]:
def correct_spellings(text):
    
    
    # Remove all special characters
    text = re.sub(r'[^\w\s\'"-.,!?]', '', text)
    
    
    # spaCy
    nlp.max_length = len(text)


    doc = nlp(text.lower())
    
    corrected_text = []
    for token in doc:
        if not token.is_stop and not token.is_punct and token.is_alpha:
            corrected_word = spell.correction(token.lemma_.lower())
            if corrected_word is not None:  # Only add the word if it's not None
                corrected_text.append(corrected_word)
        elif token.text is not None:
            corrected_text.append(token.text)
    return ' '.join(corrected_text)



In [58]:
start = time.time()
df['review'] = df['review'].apply(correct_spellings)
end = time.time()
duration_seconds = end - start
duration_minutes = duration_seconds / 60

print("Time taken: {:.2f} minutes".format(duration_minutes))

Time taken: 7.14 minutes


In [59]:
df.to_csv('cleaned_reviews.csv', index=False)

0        Why does it look like someone spit on my food ...
1        It 'd mcdonald's . It is what it is as far as ...
2        Made a mobile order get to the speaker and che...
3        My my . crispy chicken sandwich was ï¿½ï¿½ï¿½ï...
4        I repeat my order 3 time in the drive thru , a...
                               ...                        
33391                           They treat me very badly .
33392                             The service is very good
33393                           To remove hunger is enough
33394    It 's good , but lately it has become very exp...
33395                            they take good care of me
Name: review, Length: 33396, dtype: object


In [15]:
for word, freq in word_freq.items():
    print(f'{word}: {freq}')

look: 925
like: 2694
spit: 26
food: 7912
normal: 151
transaction: 21
chill: 23
polite: 187
not: 703
want: 1233
eat: 1586
i: 966
try: 817
think: 579
milky: 8
white: 83
clear: 70
substance: 4
sure: 425
come: 1933
mcdonald's: 1220
far: 285
atmosphere: 132
staff: 2555
difference: 53
friendly: 1496
accommodate: 37
smile: 194
make: 418
pleasant: 164
experience: 1053
fast: 2880
place: 3754
mobile: 225
order: 9034
get: 2300
speaker: 122
check: 398
line: 1311
move: 165
leave: 789
late: 338
work: 1574
refund: 200
app: 484
call: 359
say: 1533
money: 483
person: 531
stick: 65
system: 190
go: 2381
day: 825
manager: 1749
tell: 1399
wasn: 2
my: 501
crispy: 126
chicken: 834
sandwich: 768
customer: 2441
service: 6402
quick: 1053
repeat: 102
time: 3453
drive: 3460
manage: 105
mess: 518
suppose: 173
large: 601
meal: 1117
double: 354
filet: 69
fish: 252
fry: 1982
cheese: 473
wrong: 1132
need: 1268
pay: 892
close: 880
attention: 286
understand: 292
english: 221
door: 370
dash: 19
lock: 129
wait: 3127
long:

In [10]:
spell = SpellChecker()

In [11]:
misspelled = spell.unknown(word_freq.keys())

In [71]:
correct_words_alpha = {}
correct_nonalpha_words = {}
incorrect_words = {} 

In [72]:
for word, freq in word_freq.items():
    if word in spell: 
        if word.isalpha():
            correct_words_alpha[word] = freq
        else: 
            correct_nonalpha_words[word] = freq
    elif word.isalpha():
        incorrect_words[word] = freq

In [74]:
corrections = {}
for word in incorrect_words:
    corrections[word] = spell.correction(word)

print(corrections)


{'m': 'i', 'd': 'i', 'mcdonalds': "mcdonald's", 'wasnï': 'wasn', 'mc': 'my', 'p': 'i', 'covid': 'couid', 'w': 'i', 'mcds': 'meds', 'lmao': 'lao', 'pm': 'am', 'ws': 'is', 'cinnabun': 'cinnamon', 'smh': 'suh', 'jamar': 'jamal', 'mcdonald': 'macdonald', 'pinballz': 'pinball', 'recomend': 'recommend', 'delgado': 'delano', 'spicey': 'spice', 'b': 'i', 'br': 'be', 't': 'i', 'mcflurry': 'flurry', 'mcdona': 'dona', 'nvr': 'nor', 'eleventeen': 'seventeen', 'ngl': 'nil', 'tx': 'to', 'didnt': "didn't", 'ackowledge': 'acknowledge', 'discription': 'description', 'ft': 'it', 'blondoish': 'blondish', 'thry': 'they', 'omg': 'om', 'bbq': 'be', 'mcys': 'mays', 'macdonalds': 'macdonald', 'cryin': 'crying', 'everytime': 'overtime', 'fyi': 'fbi', 'vs': 'is', 'hashbrown': None, 'frappes': 'trapped', 'canï': 'can', 'oick': 'pick', 'splenda': 'spend', 'f': 'i', 'doordash': 'doodah', 'whataburger': None, 'mcd': 'mad', 'hr': 'he', 'munchie': 'munchies', 'hrs': 'his', 'mcgriddle': 'griddle', 'sause': 'cause', 'm

In [55]:
# print('Correct words (alpha):', correct_words_alpha)
# print('Correct words (non-alpha):', correct_nonalpha_words)
# print('Incorrect words:', incorrect_words)

In [56]:
sorted_incorrect_words = {k: v for k, v in sorted(incorrect_words.items(), key=lambda item: item[1], reverse=True)}

In [57]:
sorted_correct_words = {k: v for k, v in sorted(correct_words_alpha.items(), key=lambda item: item[1], reverse=True)}

In [59]:
# print('Sorted incorrect words:', sorted_incorrect_words)

In [60]:
# print('Sorted correct words:', sorted_correct_words)

In [41]:
unique_words = list(incorrect_words.keys())
# print(unique_words)

In [42]:
sorted_word_freq = sorted(incorrect_words.items(), key=lambda x: x[1], reverse=True)

In [47]:
# for word, freq in incorrect_words:
#     print(f'{word}: {freq}')