In [95]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
# Make sure the WordNet corpus needed by WordNetLemmatizer is available locally.
# The recorded output below shows NLTK reporting the package as already up to date.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kritarth\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [96]:
def create_wordlist(filename):
    """Read a one-word-per-line file and return the set of dictionary words.

    Each line is stripped and lowercased, reduced to its verb lemma with
    WordNetLemmatizer (pos='v'), and kept only if the result consists solely
    of the letters 'a'-'z'. That restriction matters because the DFA built
    from this set has exactly 26 columns per state.

    Args:
        filename: path to a text file with one word per line.

    Returns:
        set[str]: the normalised, lemmatized lowercase words.

    Raises:
        OSError: if the file cannot be opened.
        LookupError: propagated from NLTK if the WordNet corpus is missing
            (previously swallowed, which silently produced an empty set).
    """
    words = set()
    lem = WordNetLemmatizer()
    with open(filename) as f:
        for line in f:
            # Normalise case before lemmatizing so 'Running' and 'running'
            # both end up as the same dictionary entry.
            word = line.strip().lower()
            if not word:
                continue
            word = lem.lemmatize(word, pos='v').lower()
            # str.isalpha() also accepts non-ASCII letters; require a-z only.
            if word and all('a' <= char <= 'z' for char in word):
                words.add(word)
    return words

In [97]:
def create_dfa(words):
    """Build a trie-shaped DFA that accepts exactly the given words.

    State 0 is the start state. ``tran[s][k]`` holds the state reached from
    state ``s`` on the k-th lowercase letter ('a' -> 0 ... 'z' -> 25), or -1
    when there is no such transition. ``final[s]`` is 1 for accepting states
    and 0 otherwise.

    Args:
        words: iterable of strings made only of the letters 'a'-'z'.

    Returns:
        tuple: ``(tran, final)`` as described above.

    Raises:
        ValueError: if a word contains a character outside 'a'-'z'. Without
            this check, characters such as 'T' produce a negative column
            index that silently wraps around to another letter's column.
    """
    tran = [[-1]*26]
    final = [0]

    for word in words:
        curr_state = 0
        for char in word:
            column = ord(char) - 97
            if not 0 <= column < 26:
                raise ValueError(
                    "create_dfa only supports the letters a-z; got {!r} in {!r}".format(char, word)
                )
            next_state = tran[curr_state][column]
            if next_state == -1:
                # First time this prefix is seen: allocate a fresh state.
                next_state = len(tran)
                tran.append([-1]*26)
                final.append(0)
                tran[curr_state][column] = next_state
            curr_state = next_state
        final[curr_state] = 1
    return tran,final

In [98]:
def simulate(tran,final,word):
    """Run ``word`` through the DFA and report whether it is accepted.

    Args:
        tran: transition table; ``tran[s][k]`` is the next state from state
            ``s`` on the k-th lowercase letter, or -1 for no transition.
        final: list where ``final[s] == 1`` marks state ``s`` as accepting.
        word: the string to test.

    Returns:
        bool: True if every character has a transition and the run ends in
        an accepting state; False otherwise. Characters outside 'a'-'z' are
        never accepted (previously they indexed the row with a negative or
        out-of-range column, giving false matches or an IndexError).
    """
    curr_state = 0
    for char in word:
        column = ord(char) - 97
        if not 0 <= column < 26:
            return False
        curr_state = tran[curr_state][column]
        if curr_state == -1:
            return False
    return final[curr_state] == 1

In [99]:
def edit_distance(dp,i,j,word1,word2):
    """Return the Levenshtein distance between word1[i:] and word2[j:].

    Top-down recursion memoised in ``dp``: ``dp[i][j]`` must be -1 for
    sub-problems not solved yet and is overwritten with the computed
    distance. Callers start with ``i = j = 0`` and a table of
    ``len(word1)`` rows by ``len(word2)`` columns.
    """
    left_in_word1 = len(word1) - i
    left_in_word2 = len(word2) - j
    # Once one word is exhausted, the rest of the other must be inserted/deleted.
    if left_in_word1 <= 0:
        return left_in_word2
    if left_in_word2 <= 0:
        return left_in_word1

    cached = dp[i][j]
    if cached != -1:
        return cached

    if word1[i] != word2[j]:
        substitute = edit_distance(dp, i + 1, j + 1, word1, word2)
        drop_from_word1 = edit_distance(dp, i + 1, j, word1, word2)
        drop_from_word2 = edit_distance(dp, i, j + 1, word1, word2)
        result = 1 + min(substitute, drop_from_word1, drop_from_word2)
    else:
        # Matching characters cost nothing; move on in both words.
        result = edit_distance(dp, i + 1, j + 1, word1, word2)

    dp[i][j] = result
    return result

In [100]:
def gen_suggestions(words,word):
    """Build the "incorrect spelling" message with nearby dictionary words.

    Every candidate in ``words`` is compared to ``word`` by Levenshtein
    distance. Candidates at distance <= 1 are always listed; candidates at
    distance 2 are listed only when ``word`` has six or more characters.

    Args:
        words: iterable of dictionary words (suggestions keep its iteration
            order).
        word: the misspelled word.

    Returns:
        str: the formatted message containing the suggestion list(s).
    """
    # Distance-2 suggestions are reported only for words of 6+ characters,
    # so there is no point collecting them for shorter words.
    max_dist = 2 if len(word) >= 6 else 1
    suggestions1 = []
    suggestions2 = []
    for suggestion in words:
        # The edit distance is never smaller than the length difference, so
        # such candidates cannot qualify and the DP can be skipped entirely.
        if abs(len(word) - len(suggestion)) > max_dist:
            continue
        dp = [[-1] * len(suggestion) for _ in range(len(word))]
        levenshtein_dist = edit_distance(dp,0,0,word,suggestion)
        if levenshtein_dist<=1:
            suggestions1.append(suggestion)
        elif levenshtein_dist<=max_dist:
            suggestions2.append(suggestion)
    if len(word)>=6:
        return 'Incorect spelling!! \nSuggestions (distance = 1) : {} \nSuggestions (distance = 2) : {}'.format(suggestions1,suggestions2)
    else:
        return 'Incorect spelling!! \nSuggestions (distance = 1) : {}'.format(suggestions1)

In [101]:
def spellcheck(tran,final,words,word):
    """Check one word against the dictionary DFA and suggest fixes if needed.

    The input is stripped and lowercased, then reduced to its verb lemma
    with WordNetLemmatizer (pos='v'). Lowercasing is needed because the DFA
    is indexed by the letters 'a'-'z' only, so capitalised input would
    otherwise be mis-indexed.

    Args:
        tran: DFA transition table as returned by create_dfa.
        final: DFA accepting-state flags as returned by create_dfa.
        words: the dictionary words used to generate suggestions.
        word: the word to check.

    Returns:
        str: 'Spelling is correct' when the DFA accepts the normalised word,
        otherwise the suggestion message from gen_suggestions.
    """
    lem = WordNetLemmatizer()
    word = lem.lemmatize(word.strip().lower(),pos='v')
    if simulate(tran,final,word):
        return 'Spelling is correct'
    return gen_suggestions(words,word)

In [102]:
# Load the dictionary from the word-list file, then compile it into the DFA
# (transition table + accepting-state flags) used by spellcheck.
words = create_wordlist('3000_most_frequent.txt')
tran,final = create_dfa(words)

In [103]:
# Misspelled word shorter than six letters: only distance-1 suggestions are reported.
print(spellcheck(tran,final,words,'lidt'))

Incorect spelling!! 
Suggestions (distance = 1) : ['lift', 'list']


In [108]:
# A word the DFA accepts, so no suggestions are generated.
print(spellcheck(tran,final,words,'happy'))

Spelling is correct


In [113]:
# Misspelled six-letter word: both distance-1 and distance-2 suggestions are reported.
print(spellcheck(tran,final,words,'corect'))

Incorect spelling!! 
Suggestions (distance = 1) : ['correct'] 
Suggestions (distance = 2) : ['core', 'collect', 'direct', 'forest', 'connect']
