In [52]:
import re
import string
from collections import Counter
import numpy as np

In [53]:
def read_corpus(filename):
    with open(filename, 'r') as file:
        lines = file.readlines()
        words = []
        
        for line in lines:
            words += re.findall(r'\w+', line.lower())
            
        return words

In [54]:
words = read_corpus("c:/Users/karth/Onedrive/Desktop/Projects/ML projects/spell-check/spell-check/shakespeare.txt")
#print(len(words)

In [55]:
vocabs = set(words)
print(len(vocabs))

23902


In [56]:
word_count = Counter(words)
print(word_count["animal"])

3


In [57]:
total_word_count = float(sum(word_count.values()))
word_prob = {word: word_count[word] / total_word_count for word in word_count.keys()}


In [58]:
print(word_prob["smell"])

5.487434850160749e-05


In [59]:
def split(word):
    return [(word[:i], word[i:]) for i in range(len(word)+1)]

In [60]:
print(split("animal"))

[('', 'animal'), ('a', 'nimal'), ('an', 'imal'), ('ani', 'mal'), ('anim', 'al'), ('anima', 'l'), ('animal', '')]


In [61]:
def delete(word):
    return [l+r[1:] for l,r in split(word) if r]

In [62]:
print(delete("trash"))

['rash', 'tash', 'trsh', 'trah', 'tras']


In [63]:
def swap(word):
    return [l+r[1]+r[0]+r[2:] for l,r in split(word) if len(r)>1]

In [64]:
def replace(word):
    letters = string.ascii_lowercase
    return [l+c+r[1:] for l,r in split(word) if r for c in letters]

In [65]:
def insert(word):
    letters = string.ascii_lowercase
    return [l+c+r[1:] for l,r in split(word) for c in letters]

In [66]:
def lvl_1_edit(word):
    return set(delete(word) + swap(word) + replace(word) + insert(word))

In [67]:
print(lvl_1_edit("trash"))

{'lrash', 'ttash', 'qrash', 'trasw', 'trasf', 'trysh', 'tqash', 'trasj', 'frash', 'toash', 'trosh', 'trqsh', 'trsah', 'trnsh', 'traxh', 'nrash', 'tvash', 'trasp', 'trawh', 'trashv', 'trarh', 'trksh', 'trlsh', 'trasd', 'trhsh', 'trass', 'trasg', 'trasq', 'teash', 'tmash', 'yrash', 'trashq', 'trask', 'krash', 'trauh', 'trtsh', 'trasha', 'tpash', 'trashh', 'trashx', 'tbash', 'tuash', 'trasx', 'trazh', 'tash', 'tcash', 'trzsh', 'tdash', 'trashe', 'trasa', 'jrash', 'trashj', 'tralh', 'trsh', 'traso', 'trach', 'rtash', 'trasu', 'trashm', 'xrash', 'tresh', 'prash', 'trwsh', 'trfsh', 'trasz', 'trabh', 'tramh', 'trbsh', 'trashp', 'trashg', 'trasr', 'trish', 'trahs', 'trvsh', 'srash', 'trasl', 'traoh', 'trasc', 'wrash', 'trashf', 'trush', 'trasy', 'trashl', 'trjsh', 'trash', 'trafh', 'trasm', 'tlash', 'traqh', 'trashw', 'trasb', 'trashd', 'tnash', 'tarsh', 'trashb', 'trashz', 'traeh', 'trmsh', 'hrash', 'thash', 'rash', 'urash', 'trashi', 'orash', 'trayh', 'crash', 'tfash', 'twash', 'trcsh', 'tra

In [68]:
def lvl_2_edit(word):
    return set(e2 for e1 in lvl_1_edit(word) for e2 in lvl_1_edit(e1))

In [69]:
def corr_spell(word, vocab, prob):
    if word in vocab:
        print("word is correct")
        return
    
    suggestion = lvl_1_edit(word) or lvl_2_edit(word) or [word]
    best_guess = [w for w in suggestion if w in vocab]
    return [(w, prob[w]) for w in best_guess]

In [75]:
word = "foad"
guesses = corr_spell(word, vocabs, word_prob)
print(guesses)

[('foal', 1.0759676176785783e-06), ('fond', 6.240612182535754e-05), ('ford', 0.0003131065767444663), ('food', 6.99378951491076e-05), ('toad', 1.9367417118214412e-05), ('load', 2.1519352353571567e-05), ('foam', 4.303870470714313e-06), ('road', 1.9367417118214412e-05), ('goad', 1.0759676176785783e-06), ('fold', 2.3671287588928723e-05)]


In [None]:
class spellCheck(object):
    def __init__(self, corpus_file_path):
        with open(corpus_file_path, "r") as file:
            lines = file.readlines()
            words=[]
            for line in lines:
                words+=re.findall(r'\w+', line.lower())

            self.vocab = set(words)
            self.word_count = Counter(words)
            total_words = float(sum(self))