In [5]:
import pandas as pd

# Load the word list as a single text column named 'word' (the file has no
# header row). keep_default_na=False stops pandas from turning entries such as
# "null", "nan" or "NA" into missing values, so every row stays a str.
df = pd.read_csv(
    'words.txt',
    header=None,
    names=['word'],
    dtype=str,
    keep_default_na=False,
)

df

Unnamed: 0,word
0,a
1,aa
2,aaa
3,aaron
4,ab
...,...
9995,zope
9996,zshops
9997,zu
9998,zum


In [6]:
# type_counts = df['word'].apply(type).value_counts()
# print(type_counts)

# df[df['word'].apply(lambda x: isinstance(x, float))]
# df = df.dropna()

In [7]:
# Index of the dictionary words keyed by their first character, e.g.
# {'a': ['a', 'aa', ...], 'z': [..., 'zum']}. The spell checkers use it to
# restrict candidates to words starting with the same letter as the query.
word_dict = {}

for word in df['word']:
    # An empty entry has no first character; skip it instead of letting
    # word[0] raise IndexError.
    if not word:
        continue
    # setdefault creates the bucket the first time a letter is seen and
    # returns the existing list afterwards.
    word_dict.setdefault(word[0], []).append(word)



In [8]:
def longest_common_subsequence(s1, s2):
    """Return the length of the longest common subsequence of ``s1`` and ``s2``.

    A subsequence keeps the original order of characters but does not have to
    be contiguous.

    Args:
        s1: First sequence (a string in this notebook).
        s2: Second sequence.

    Returns:
        int: Length of the longest subsequence present in both inputs; 0 when
        either input is empty.
    """
    # Only the previous DP row is needed to compute the next one, so keep two
    # rows instead of the full (len(s1)+1) x (len(s2)+1) table.
    previous = [0] * (len(s2) + 1)

    for ch1 in s1:
        current = [0]  # column 0: LCS against an empty prefix of s2
        for col, ch2 in enumerate(s2, start=1):
            if ch1 == ch2:
                # Matching characters extend the LCS of both shorter prefixes.
                current.append(previous[col - 1] + 1)
            else:
                # Otherwise drop one character from either side, keep the best.
                current.append(max(previous[col], current[col - 1]))
        previous = current

    return previous[-1]

def spell_checker(word, word_dict):
    """Suggest a correction for ``word`` using longest-common-subsequence length.

    Only dictionary words that share the first character of ``word`` are
    considered as candidates.

    Args:
        word: The (possibly misspelled) query string.
        word_dict: Mapping from a first character to the list of dictionary
            words starting with that character.

    Returns:
        The first candidate with the largest LCS length against ``word``, or
        ``None`` when ``word`` is empty, when there is no bucket for its first
        character, or when no candidate has a positive LCS length.
    """
    # An empty query has no first character to look up; return "no suggestion"
    # instead of raising IndexError on word[0].
    if not word:
        return None

    # Use the first letter to narrow down the words to check.
    possible_words = word_dict.get(word[0], [])

    max_lcs = 0
    best_match = None

    for w in possible_words:
        lcs_val = longest_common_subsequence(word, w)
        # Strict '>' means the earliest candidate wins on ties.
        if lcs_val > max_lcs:
            max_lcs = lcs_val
            best_match = w

    return best_match

# Sample usage: look up a misspelled word with the LCS-based checker.
word = "abrod"
corrected_word = spell_checker(word, word_dict)
print(f"Suggested correction for '{word}' is '{corrected_word}'")


Suggested correction for 'abrod' is 'abroad'


In [9]:
def edit_distance(s1, s2):
    """Return the edit distance between ``s1`` and ``s2``.

    Insertions, deletions and substitutions each cost 1; matching characters
    cost nothing.

    Args:
        s1: Source sequence (a string in this notebook).
        s2: Target sequence.

    Returns:
        int: Minimum number of single-character edits turning ``s1`` into ``s2``.
    """
    rows, cols = len(s1) + 1, len(s2) + 1

    # table[i][j] holds the distance between s1[:i] and s2[:j].
    table = [[0] * cols for _ in range(rows)]

    # Borders: reaching / leaving an empty prefix takes one edit per character.
    for i in range(rows):
        table[i][0] = i
    for j in range(cols):
        table[0][j] = j

    for i in range(1, rows):
        for j in range(1, cols):
            if s1[i - 1] == s2[j - 1]:
                table[i][j] = table[i - 1][j - 1]
            else:
                cheapest_neighbour = min(
                    table[i - 1][j],      # delete from s1
                    table[i][j - 1],      # insert into s1
                    table[i - 1][j - 1],  # substitute
                )
                table[i][j] = 1 + cheapest_neighbour

    return table[-1][-1]

def spell_checker_edit_distance(word, word_dict):
    """Suggest a correction for ``word`` using minimum edit distance.

    Only dictionary words that share the first character of ``word`` are
    considered as candidates.

    Args:
        word: The (possibly misspelled) query string.
        word_dict: Mapping from a first character to the list of dictionary
            words starting with that character.

    Returns:
        The first candidate with the smallest edit distance to ``word``, or
        ``None`` when ``word`` is empty or there are no candidates for its
        first character.
    """
    # An empty query has no first character to look up; return "no suggestion"
    # instead of raising IndexError on word[0].
    if not word:
        return None

    possible_words = word_dict.get(word[0], [])
    min_distance = float('inf')
    best_match = None

    for w in possible_words:
        distance = edit_distance(word, w)
        # Strict '<' means the earliest candidate wins on ties.
        if distance < min_distance:
            min_distance = distance
            best_match = w

    return best_match


# Sample usage: look up a misspelled word with the edit-distance checker.
word = "abrod"
corrected_word = spell_checker_edit_distance(word, word_dict)
print(f"Suggested correction for '{word}' is '{corrected_word}'")

Suggested correction for 'abrod' is 'abroad'


In [10]:
def jaccard_similarity(set1, set2):
    """Compute the Jaccard similarity of two sets.

    The similarity is ``|set1 & set2| / |set1 | set2|``.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        float: A value in ``[0, 1]``. Two empty sets are treated as identical
        and yield ``1.0`` (the usual convention), which avoids the 0/0
        division the plain formula would hit.
    """
    union_size = len(set1.union(set2))

    # Both sets empty: the ratio is undefined, so apply the convention above
    # rather than raising ZeroDivisionError.
    if union_size == 0:
        return 1.0

    intersection_size = len(set1.intersection(set2))
    return intersection_size / union_size

def ngram_similarity(word1, word2, n=2):
    """Compare two words by the Jaccard similarity of their character n-grams.

    Args:
        word1: First word.
        word2: Second word.
        n: Length of the character n-grams (default 2, i.e. bigrams).

    Returns:
        float: Jaccard similarity of the two n-gram sets. When both words are
        shorter than ``n`` (so neither has any n-gram), returns ``1.0`` if the
        words are equal and ``0.0`` otherwise.
    """
    ngrams_word1 = {word1[i:i + n] for i in range(len(word1) - n + 1)}
    ngrams_word2 = {word2[i:i + n] for i in range(len(word2) - n + 1)}

    # Words shorter than n produce no n-grams. With both sets empty the Jaccard
    # ratio would be 0/0, so fall back to comparing the raw words directly.
    if not ngrams_word1 and not ngrams_word2:
        return 1.0 if word1 == word2 else 0.0

    return jaccard_similarity(ngrams_word1, ngrams_word2)

def spell_checker_ngram(word, word_dict, n=2):
    """Suggest a correction for ``word`` using character n-gram similarity.

    Only dictionary words that share the first character of ``word`` are
    considered as candidates.

    Args:
        word: The (possibly misspelled) query string.
        word_dict: Mapping from a first character to the list of dictionary
            words starting with that character.
        n: Length of the character n-grams passed to ``ngram_similarity``.

    Returns:
        The first candidate with the highest similarity to ``word``, or
        ``None`` when ``word`` is empty, when there are no candidates for its
        first character, or when no candidate has a similarity above 0.
    """
    # An empty query has no first character to look up; return "no suggestion"
    # instead of raising IndexError on word[0].
    if not word:
        return None

    possible_words = word_dict.get(word[0], [])
    max_similarity = 0
    best_match = None

    for w in possible_words:
        similarity = ngram_similarity(word, w, n)
        # Strict '>' means the earliest candidate wins on ties and candidates
        # with zero similarity are never suggested.
        if similarity > max_similarity:
            max_similarity = similarity
            best_match = w

    return best_match

# Sample usage: look up a misspelled word with the n-gram (bigram by default) checker.
word = "abrod"
corrected_word = spell_checker_ngram(word, word_dict)
print(f"Suggested correction for '{word}' is '{corrected_word}'")


Suggested correction for 'abrod' is 'abroad'


In [12]:
from symspellpy import SymSpell, Verbosity

def setup_symspell(word_dict):
    """Build a SymSpell index containing every word stored in ``word_dict``.

    Args:
        word_dict: Mapping whose values are lists of dictionary words (the
            keys are not used here).

    Returns:
        A ``SymSpell`` instance created with ``max_dictionary_edit_distance=2``
        and ``prefix_length=7``, with each word added with a count of 1.
    """
    index = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

    # Flatten the per-letter buckets into one stream, preserving their order.
    every_word = (entry for bucket in word_dict.values() for entry in bucket)
    for entry in every_word:
        index.create_dictionary_entry(entry, 1)

    return index

def spell_checker_symspell(word, sym_spell):
    """Suggest a correction for ``word`` via a SymSpell lookup.

    Args:
        word: The (possibly misspelled) query string.
        sym_spell: Object exposing ``lookup`` (the index returned by
            ``setup_symspell`` in this notebook).

    Returns:
        The ``term`` of the first suggestion returned by
        ``lookup(word, Verbosity.CLOSEST, max_edit_distance=2)``, or ``None``
        when the lookup yields no suggestions.
    """
    matches = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
    return matches[0].term if matches else None

# Build the SymSpell index once from the first-letter word buckets.
sym_spell = setup_symspell(word_dict)

# Sample usage: look up a misspelled word through the SymSpell index.
word = "abrod"
corrected_word = spell_checker_symspell(word, sym_spell)
print(f"Suggested correction for '{word}' is '{corrected_word}'")

<symspellpy.symspellpy.SymSpell object at 0x1046b6ca0>
Suggested correction for 'abrod' is 'abroad'
