In [21]:
import pandas as pd

# data taken from wikipedia.txt: https://www.kaggle.com/datasets/bittlingmayer/spelling?resource=download&select=birkbeck.txt
data = pd.read_csv('words.csv')

# data taken from Robert Heckendorn's List of Hard to Spell Words
data = pd.read_csv('words_1.csv')


# PySpellChecker

In [22]:
# !pip install pyspellchecker
# !pip install textblob
# !apt-get install -y hunspell

In [23]:
from spellchecker import SpellChecker

spell = SpellChecker()
corrected = []
i=0
for misspelled in data['Misspelling']:
    i+=1
    corrected.append(spell.correction(misspelled))
    print(f"{i} Misspelled: {misspelled} | Corrected: {spell.correction(misspelled)}")
    

data['pyspellchecker_correction'] = corrected

1 Misspelled: Austrailia | Corrected: australia
2 Misspelled: Carribean | Corrected: None
3 Misspelled: Checkoslovakia | Corrected: None
4 Misspelled: Conneticut | Corrected: None
5 Misspelled: Euclidian | Corrected: Euclidian
6 Misspelled: Febewary | Corrected: None
7 Misspelled: Kwanza | Corrected: Kwanza
8 Misspelled: Parmisian | Corrected: partisan
9 Misspelled: Presbaterian | Corrected: presbyterian
10 Misspelled: Tootonic | Corrected: tectonic
11 Misspelled: Tusday | Corrected: today
12 Misspelled: Wendsday | Corrected: None
13 Misspelled: Wisconson | Corrected: None
14 Misspelled: abanden | Corrected: abandon
15 Misspelled: abiss | Corrected: abyss
16 Misspelled: abizmal | Corrected: animal
17 Misspelled: abriviate | Corrected: abbreviate
18 Misspelled: abscound | Corrected: abscond
19 Misspelled: absorbant | Corrected: absorbant
20 Misspelled: absorbtion | Corrected: absorption
21 Misspelled: abstanence | Corrected: abstinence
22 Misspelled: abundence | Corrected: abundance
23 

# TextBlob

In [24]:
from textblob import TextBlob

corrected = []
i=0
for misspelled in data['Misspelling']:
    i+=1
    word = TextBlob(misspelled).correct()
    corrected.append(str(word))
    print(f"{i} Misspelled: {misspelled} | Corrected: {word}")

data['textblob_correction'] = corrected

1 Misspelled: Austrailia | Corrected: Australia
2 Misspelled: Carribean | Corrected: Carribean
3 Misspelled: Checkoslovakia | Corrected: Checkoslovakia
4 Misspelled: Conneticut | Corrected: Connecticut
5 Misspelled: Euclidian | Corrected: Euclidian
6 Misspelled: Febewary | Corrected: Febewary
7 Misspelled: Kwanza | Corrected: Stanza
8 Misspelled: Parmisian | Corrected: Parisian
9 Misspelled: Presbaterian | Corrected: Presbaterian
10 Misspelled: Tootonic | Corrected: Tootonic
11 Misspelled: Tusday | Corrected: Sunday
12 Misspelled: Wendsday | Corrected: Wendsday
13 Misspelled: Wisconson | Corrected: Wisconsin
14 Misspelled: abanden | Corrected: abandon
15 Misspelled: abiss | Corrected: amiss
16 Misspelled: abizmal | Corrected: animal
17 Misspelled: abriviate | Corrected: abriviate
18 Misspelled: abscound | Corrected: abound
19 Misspelled: absorbant | Corrected: absorbent
20 Misspelled: absorbtion | Corrected: absorption
21 Misspelled: abstanence | Corrected: abstanence
22 Misspelled: ab

# Aspell

In [25]:
import pandas as pd
import subprocess

def correct_with_aspell(word):
    process = subprocess.run(['aspell', '-a', '-d', 'en_GB'], input=word, text=True, capture_output=True)
    output = process.stdout.splitlines()
    
    if output and output[1].startswith('&'):
        suggestions = output[1].split(": ")[1].split(", ")
        return suggestions[0] if suggestions else word  
    return word  

corrected = []
i=0
for misspelled in data['Misspelling']:
    i+=1
    corrected_word = correct_with_aspell(misspelled)
    corrected.append(corrected_word)
    print(f"{i}Misspelled: {misspelled} | Corrected: {corrected_word}")

data['aspell_correction'] = corrected


1Misspelled: Austrailia | Corrected: Australia
2Misspelled: Carribean | Corrected: Caribbean
3Misspelled: Checkoslovakia | Corrected: Slovakia
4Misspelled: Conneticut | Corrected: Connecticut
5Misspelled: Euclidian | Corrected: Euclidean
6Misspelled: Febewary | Corrected: February
7Misspelled: Kwanza | Corrected: Kwanzaa
8Misspelled: Parmisian | Corrected: Parisian
9Misspelled: Presbaterian | Corrected: Presbyterian
10Misspelled: Tootonic | Corrected: Teutonic
11Misspelled: Tusday | Corrected: Tuesday
12Misspelled: Wendsday | Corrected: Wends day
13Misspelled: Wisconson | Corrected: Wisconsin
14Misspelled: abanden | Corrected: abandon
15Misspelled: abiss | Corrected: abyss
16Misspelled: abizmal | Corrected: abysmal
17Misspelled: abriviate | Corrected: abbreviate
18Misspelled: abscound | Corrected: abscond
19Misspelled: absorbant | Corrected: absorbent
20Misspelled: absorbtion | Corrected: absorption
21Misspelled: abstanence | Corrected: abstinence
22Misspelled: abundence | Corrected: a

# BERT

In [26]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForMaskedLM

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

def correct_misspellings(misspelled_words):
    corrected = []
    i=0
    for misspelled in misspelled_words:
        i+=1
        input_text = f"This is a word which was misspelled: {misspelled}. The right written word is [MASK]."
        input_ids = tokenizer.encode(input_text, return_tensors='pt')

        mask_token_index = input_ids[0].tolist().index(tokenizer.mask_token_id)

        with torch.no_grad():
            outputs = model(input_ids)
            predictions = outputs.logits

        predicted_token_id = predictions[0, mask_token_index].argmax(dim=-1).item()
        predicted_word = tokenizer.decode(predicted_token_id)

        corrected.append(predicted_word)
        print(f"{i} Misspelled: {misspelled} | Corrected: {predicted_word}")

    return corrected

data['bert_correction'] = correct_misspellings(data['Misspelling'])


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 Misspelled: Austrailia | Corrected: latin
2 Misspelled: Carribean | Corrected: english
3 Misspelled: Checkoslovakia | Corrected: [UNK]
4 Misspelled: Conneticut | Corrected: english
5 Misspelled: Euclidian | Corrected: greek
6 Misspelled: Febewary | Corrected: [UNK]
7 Misspelled: Kwanza | Corrected: [UNK]
8 Misspelled: Parmisian | Corrected: [UNK]
9 Misspelled: Presbaterian | Corrected: english
10 Misspelled: Tootonic | Corrected: missing
11 Misspelled: Tusday | Corrected: [UNK]
12 Misspelled: Wendsday | Corrected: [UNK]
13 Misspelled: Wisconson | Corrected: [UNK]
14 Misspelled: abanden | Corrected: [UNK]
15 Misspelled: abiss | Corrected: ab
16 Misspelled: abizmal | Corrected: [UNK]
17 Misspelled: abriviate | Corrected: omitted
18 Misspelled: abscound | Corrected: omitted
19 Misspelled: absorbant | Corrected: [UNK]
20 Misspelled: absorbtion | Corrected: omitted
21 Misspelled: abstanence | Corrected: [UNK]
22 Misspelled: abundence | Corrected: omitted
23 Misspelled: abundent | Correcte

In [27]:
data.to_csv('corrected_words_1.csv', index=False)

# Analysis of results

In [28]:
data = pd.read_csv('corrected_words.csv') # data from kaggle
data_1 = pd.read_csv('corrected_words_1.csv') # data from Robert Heckendorn's List of Hard to Spell Words

In [30]:
def ratio_correctly_predicted_words(corrected_words, original_words):
    '''
    Calculate the ratio of correctly predicted words in the list
    '''
    count = 0
    for i in range(len(corrected_words)):
        if str(corrected_words[i]).lower() == str(original_words[i]).lower():
            count += 1

    return count / len(corrected_words)*100

print(f"Correctly predicted words from kaggle ratio:")
print(f"aspell: {ratio_correctly_predicted_words(data['aspell_correction'], data['RealWord'])} %")
print(f"pyspellchecker: {ratio_correctly_predicted_words(data['pyspellchecker_correction'], data['RealWord'])} %")
print(f"textblob: {ratio_correctly_predicted_words(data['textblob_correction'], data['RealWord'])} %")
print(f"bert: {ratio_correctly_predicted_words(data['bert_correction'], data['RealWord'])} %")
print('\n')
print(f"Correctly predicted words from Robert Heckendorn's List of Hard to Spell Words ratio:")
print(f"aspell: {ratio_correctly_predicted_words(data_1['aspell_correction'], data_1['RealWord'])} %")
print(f"pyspellchecker: {ratio_correctly_predicted_words(data_1['pyspellchecker_correction'], data_1['RealWord'])} %")
print(f"textblob: {ratio_correctly_predicted_words(data_1['textblob_correction'], data_1['RealWord'])} %")
print(f"bert: {ratio_correctly_predicted_words(data_1['bert_correction'], data_1['RealWord'])} %")


Correctly predicted words from kaggle ratio:
aspell: 79.38900203665987 %
pyspellchecker: 73.76782077393077 %
textblob: 61.83299389002037 %
bert: 0.8961303462321792 %


Correctly predicted words from Robert Heckendorn's List of Hard to Spell Words ratio:
aspell: 58.90138980807412 %
pyspellchecker: 47.98146922567836 %
textblob: 34.348113831899404 %
bert: 0.3309066843150232 %


In [42]:
import nltk

def mean_levenshtein_distance(corrected_words, original_words):
    '''
    Calculate the mean Levenshtein distance between the corrected words and the original words
    We pressume that if the model predicts none, then the levenshtein distance is len(original_word)
    '''
    total_distance = 0
    for i in range(len(corrected_words)):
        if corrected_words[i] is not None and not pd.isna(corrected_words[i]) and original_words[i]!='unknown' and original_words[i]!='[UNK]':
            total_distance += nltk.edit_distance(corrected_words[i], original_words[i])
        else:
            total_distance += len(original_words[i])

    return total_distance / len(corrected_words)


print(f"Mean Levenshtein distance from kaggle:")
print(f"aspell: {mean_levenshtein_distance(data['aspell_correction'], data['RealWord'])}")
print(f"pyspellchecker: {mean_levenshtein_distance(data['pyspellchecker_correction'], data['RealWord'])}")
print(f"textblob: {mean_levenshtein_distance(data['textblob_correction'], data['RealWord'])}")
print(f"bert: {mean_levenshtein_distance(data['bert_correction'], data['RealWord'])}")
print('\n')

print(f"Mean Levenshtein distance from Robert Heckendorn's List of Hard to Spell Words:")
print(f"aspell: {mean_levenshtein_distance(data_1['aspell_correction'], data_1['RealWord'])}")
print(f"pyspellchecker: {mean_levenshtein_distance(data_1['pyspellchecker_correction'], data_1['RealWord'])}")
print(f"textblob: {mean_levenshtein_distance(data_1['textblob_correction'], data_1['RealWord'])}")
print(f"bert: {mean_levenshtein_distance(data_1['bert_correction'], data_1['RealWord'])}")


Mean Levenshtein distance from kaggle:
aspell: 0.5315682281059063
pyspellchecker: 0.7254582484725051
textblob: 0.7816700610997963
bert: 7.5979633401222


Mean Levenshtein distance from Robert Heckendorn's List of Hard to Spell Words:
aspell: 1.4202514890800795
pyspellchecker: 2.129715420251489
textblob: 1.9080079417604237
bert: 7.133686300463269
