In [2]:
DATA_PATH = '.\Data'

In [3]:
import os
import json
from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer

from nltk.tokenize import WordPunctTokenizer
from nltk import edit_distance

In [24]:
class ProbablisticAutoCorrect():
    def __init__(self, data_path=DATA_PATH):
        with open(os.path.join(DATA_PATH,'info.json'),'r') as infoFP:
            self.info = json.load(infoFP)
        self.wordProbs = self.returnWordProbabilities()
        self. tokenizer = WordPunctTokenizer()
    
    def returnArticle(self, articleNameKey):
        articlePath = self.info[articleNameKey]['path']
        with open(articlePath,'r',encoding="utf-8") as articleFP:
            article = ''.join(articleFP.readlines())
        return article

    def returnArticles(self):
        articles = []
        vocab = {}
        print('Building vocabulary by reading articles...')
        for articleNameKey in tqdm(self.info.keys()):
            articles.append(self.returnArticle(articleNameKey))
        articles = ''.join(articles)
        return articles
    
    def returnVocabulary(self):
        vectorizer = CountVectorizer()
        vectorizer.fit([self.returnArticles()])
        return vectorizer.vocabulary_

    def returnWordProbabilities(self):
        vocab = self.returnVocabulary()
        totalWordCount = sum(vocab.values())
        wordProbs = {}
        for word in vocab.keys():
            wordProbs[word] = vocab[word]/totalWordCount
        return wordProbs

    def identifyErrors(self, targetText):
        tokens = self.tokenizer.tokenize(targetText)
        potentialErrors = [token for token in tokens if token not in self.wordProbs.keys()]
        return potentialErrors

    def nEditDistStrings(self,misspelledWord,n=2):
        return [(vocabWord,self.wordProbs[vocabWord]) for vocabWord in self.wordProbs.keys() if edit_distance(misspelledWord,vocabWord) <= n]

    def infer(self,targetString):
        resultString = targetString[:]
        incorrectWords = self.identifyErrors(targetString)
        print(incorrectWords)
        for word in incorrectWords:
            candidates = self.nEditDistStrings(word)
            candidates.sort()
            print(candidates[-1][0])
            resultString = resultString.replace(word, candidates[-1][0], 1) #last candidate is of highest probability
        return resultString

robotique = ProbablisticAutoCorrect()

Building vocabulary by reading articles...


100%|██████████████████████████████████████████████████████████████████████████████| 740/740 [00:00<00:00, 7913.68it/s]


In [25]:
testTarget = 'AU au fil du temps, il a acquisd une personnalité et une conscience prores.'
print("Did you meant: {}".format(robotique.infer(testTarget)))

['AU', ',', 'a', 'acquisd', 'prores', '.']
할래
할래
할래
acquises
spores
할래
Did you meant: 할래 할래u fil du temps할래 il a acquises une personnalité et une conscience spores할래
