## Building a Spell Checker with NLP

## Spelling Suggestion based on Edit Distance

In [1]:
import os, sys, gc, warnings
import logging, math, re, heapq
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from IPython.core.interactiveshell import InteractiveShell
from collections import Counter
from nltk.tokenize import word_tokenize

In [2]:
# These settings help in proper formatting and display of the output of code we run
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
InteractiveShell.ast_node_interactivity = "all"
display(HTML(data="""<style>div#notebook-container { width:95%; }</style>"""))

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# !wget https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt

In [12]:
import re


file_name = "sample_data/t8.shakespeare.txt"
word_list = []
word_count = 0
max_words = 1000

# Read the file and append words one by one until we reach 1000 words
with open(file_name, 'r') as file:
    for line in file:
        # Find all words in the current line
        words_in_line = re.findall(r"\w+", line)
        # Convert words to lower case and extend the word list
        word_list.extend(word.lower() for word in words_in_line)
        word_count += len(words_in_line)
        # Break the loop if we've reached or exceeded the word limit
        if word_count >= max_words:
            break

# This will be our new vocabulary
vocab = set(word_list)

In [13]:
# Initiating the word_count dictionary and populating it
word_count_dict = {}
word_count_dict = Counter(word_list)
print(f"There are {len(word_count_dict)} key values pairs")
print(f"The count for the word 'gutenberg' is {word_count_dict.get('gutenberg',0)}")

There are 382 key values pairs
The count for the word 'gutenberg' is 14


In [14]:
# Initalize the probability dictionary
probs = {}
total_words = sum(word_count_dict.values())

for word, word_count in word_count_dict.items():
    word_prob = word_count/total_words
    probs[word] = word_prob
print(f"Length of probs is {len(probs)}")

# Let us use both the dictionaries for both word counts and probabilities and display an example word.
print(f"P('gutenberg') is {probs['gutenberg']:.4f}")
print(word_count_dict['gutenberg'])

Length of probs is 382
P('gutenberg') is 0.0139
14


In [15]:
def delete_letter(word):
    '''delete_letter - When we give a word, this function will return all the possible strings that have one character removed.'''
    delete_list = []
    split_list = []
    split_list = [(word[:i], word[i:]) for i in range(len(word))]
    delete_list = [L+R[1:] for L, R in split_list]
    return delete_list

def switch_letter(word):
    '''switch_letter - When we give a word, this function will return all the possible strings that have two adjacent letters switched.'''
    switch_list = []
    split_list = []
    split_list = [(word[:i], word[i:]) for i in range(len(word))]
    switch_list = [L + R[1] + R[0] + R[2:] for L, R in split_list if len(R)>=2]
    return switch_list

def replace_letter(word):
    '''replace_letter - When we give a word, this function will return all the possible strings that have one character replaced by another different letter.'''
    letters = 'abcdefghijklmnopqrstuvwxyz'
    replace_list = []
    split_list = []
    split_list = [(word[0:i], word[i:]) for i in range(len(word))]
    replace_list = [L + letter + (R[1:] if len(R)>1 else '') for L, R in split_list if R for letter in letters]
    replace_set = set(replace_list)
    replace_list = sorted(list(replace_set))
    return replace_list

def insert_letter(word):
    '''insert_letter - When we give a word, this function will return all the possible strings that have an additional character inserted.'''
    letters = 'abcdefghijklmnopqrstuvwxyz'
    insert_list = []
    split_list = []
    split_list = [(word[0:i], word[i:]) for i in range(len(word)+1)]
    insert_list = [L + letter + R for L, R in split_list for letter in letters]
    return insert_list

def edit_one_letter(word, allow_switches = True):
    '''edit_one_letter - This function will give all possible edits that are one edit away from a word such that the edits consist of the replace, insert, delete, and optionally the switch operation.'''
    edit_one_set = set()
    edit_one_set.update(delete_letter(word))
    if allow_switches: edit_one_set.update(switch_letter(word))
    edit_one_set.update(replace_letter(word))
    edit_one_set.update(insert_letter(word))
    if word in edit_one_set: edit_one_set.remove(word)
    return edit_one_set

def edit_two_letter(word, allow_switches = True):
    '''edit_two_letters - We can then generalize the edit_one_letter function to implement to get two edits on a word. We will have to get all the possible edits on a single word and then, for each modified word, we would have to modify it again.'''
    edit_two_set = set()
    edit_one = edit_one_letter(word, allow_switches=allow_switches)
    for word in edit_one:
        if word:
            edit_two = edit_one_letter(word, allow_switches=allow_switches)
            edit_two_set.update(edit_two)

    return edit_two_set

In [16]:
def get_spelling_suggestions(word, probs, vocab, n=2):
    suggestions = []
    top_n_suggestions = []
    suggestions = list((word in vocab and word) or
                       edit_one_letter(word).intersection(vocab) or
                       edit_two_letter(word).intersection(vocab))
    top_n_suggestions = [[s, probs[s]] for s in list(suggestions)]
    return top_n_suggestions

In [17]:
my_words = ['dys', 'furthar', 'mercuryn', 'disdaain', 'tumtultous']
res = []

# Get spelling suggestions for each word
for word_c in my_words:
    res.append(get_spelling_suggestions(word_c, probs, vocab, 3))

# Print the results
for i, word in enumerate(my_words):
    print(' ')
    print(f'Word - {my_words[i]}')
    for j, word_prob in enumerate(res[i]):
        print(f"word - {j}: {word_prob[0]}, probability {word_prob[1]:.6f}")

 
Word - dys
word - 0: do, probability 0.003968
word - 1: s, probability 0.001984
word - 2: by, probability 0.017857
word - 3: dir, probability 0.000992
word - 4: does, probability 0.000992
word - 5: day, probability 0.000992
word - 6: as, probability 0.002976
word - 7: is, probability 0.021825
word - 8: has, probability 0.002976
 
Word - furthar
word - 0: further, probability 0.001984
 
Word - mercuryn
 
Word - disdaain
 
Word - tumtultous


## Contextual models for Spell Check

In [19]:
!pip install contextualSpellCheck

Collecting contextualSpellCheck
  Downloading contextualSpellCheck-0.4.4-py3-none-any.whl (128 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/128.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.1/128.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.4->contextualSpellCheck)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.4->contextualSpellCheck)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.4->contextualSpellCheck)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.4->contextualSpellCheck)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_

In [20]:
import contextualSpellCheck
import spacy


nlp = spacy.load('en_core_web_sm')
contextualSpellCheck.add_to_pipe(nlp)
doc = nlp('I came home so that as I would rather participate in the function the next dys.')

# This shows the number of corrections in the input text.
print(len(doc._.suggestions_spellCheck))

# This shows all the actual corrections that were made with the associated mapping.
print(doc._.suggestions_spellCheck)

# This displays the outcome after spelling correction
print(doc._.outcome_spellCheck)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

1
{dys: 'day'}
I came home so that as I would rather participate in the function the next day.
