In [1]:
import re
import os
import math
import nltk
import string

import numpy as np
import pandas as pd

from glob import glob
from collections import Counter
from tqdm.notebook import tqdm

In [2]:
class spellCorrect:
    """
    Shared machinery for the spelling correctors: corpus loading, unigram
    probability and edit-distance candidate generation.

    The helper methods read ``self.wordDict`` (a Counter of word frequencies),
    which subclasses are expected to assign in their ``__init__``. If a subclass
    also assigns ``self.totalWord`` (total token count), ``_prob`` reuses it
    instead of re-summing the vocabulary on every call.
    """

    def _loadData(self):
        """
        Load every ``*Words.npy`` file under ``<cwd>/Data/Processed/*/`` and count n-grams.

        Input - none
        Output - tuple of Counters: (words, bigrams, trigrams)
        """
        pattern = os.path.join(os.getcwd(), "Data", "Processed", "*", "*Words.npy")

        words = list()
        # glob() yields paths in arbitrary order; sorting keeps the n-grams that
        # span file boundaries identical from one run to the next.
        for path in sorted(glob(pattern)):
            # NOTE(review): allow_pickle=True can execute arbitrary code if the
            # .npy file comes from an untrusted source.
            words.extend(np.load(path, allow_pickle=True))

        # All files are treated as one continuous token stream.
        bgrms = nltk.bigrams(words)
        tgrms = nltk.trigrams(words)
        return Counter(words), Counter(bgrms), Counter(tgrms)

    def _prob(self, word):
        """
        Calculate word probability based on word frequency in the corpus.

        Input - word (str)
        Output - prob (float); 0.0 when the corpus is empty
        """
        # This method is used as a max() key, so avoid re-summing the whole
        # vocabulary on each call when the subclass already cached the total.
        total = getattr(self, "totalWord", None)
        if total is None:
            total = sum(self.wordDict.values())
        if not total:
            return 0.0
        return self.wordDict[word] / total

    def _edit1(self, word):
        """
        Generate strings with edit distance 1 from the source word.
        Edit distance used here is the Damerau-Levenshtein Distance.
        Only the lowercase letters a-z are used for replacements and insertions;
        the result can contain the source word itself (a letter replaced by itself).

        Input - word (str)
        Output - set of word (str)
        """
        letters    = 'abcdefghijklmnopqrstuvwxyz'
        splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
        deletes    = [L + R[1:]               for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
        replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
        inserts    = [L + c + R               for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def _edit2(self, word):
        """
        Generate strings with edit distance 2 from the source word.
        Calculated by applying a second single edit to every string that is one edit away from the source word.

        Input - word (str)
        Output - generator of word (str); duplicates are not removed
        """
        return (e2 for e1 in self._edit1(word) for e2 in self._edit1(e1))

    def _known(self, words):
        """
        Filter the strings down to those that exist in the list of known words.

        Input - iterable of word (str)
        Output - set of word (str)
        """
        return set(w for w in words if w in self.wordDict)

    # TODO: GUI entry point (_GUI) is not implemented.

In [3]:
class nonWord(spellCorrect):
    """
    Non-word error corrector: replaces a token that is not in the corpus
    vocabulary with the most frequent known word within edit distance 1,
    then within edit distance 2.
    """

    def __init__(self):
        # Only the unigram counts are needed for non-word correction.
        self.wordDict, _, _ = self._loadData()

        self.vocabSize = len(self.wordDict)
        self.totalWord = sum(self.wordDict.values())

        self.textOut  = list()

    def _candidate(self, word):
        """
        Generate the set of candidate words for the source word, using the first
        non-empty tier: the word itself, known words at edit distance 1, known
        words at edit distance 2, and finally the unchanged word.

        Input - word (str)
        Output - set of word (str)
        """
        # NOTE(review): _edit1 only generates lowercase a-z edits; if the corpus is
        # lowercased, capitalised tokens will be treated as unknown - confirm.
        return (self._known([word]) or            # if the word is known (no error),
                self._known(self._edit1(word)) or # if the word has edit distance 1,
                self._known(self._edit2(word)) or # if the word has edit distance 2,
                {word})                           # if the word cannot be found from any of the above.

    def _correct(self, word):
        """
        Pick the most probable candidate for the source word.

        Input - word (str)
        Output - word (str)
        """
        # max() returns the first maximal item, and set iteration order changes
        # between kernel sessions; sorting makes equal-frequency ties reproducible.
        return max(sorted(self._candidate(word)), key=self._prob)

    def correct(self, textInput):
        """
        Correct a single token. Written as an ``re.sub`` replacement callback,
        but a plain string is accepted as well.

        Input - textInput (re.Match or str)
        Output - corrected word (str)
        """
        word = textInput if isinstance(textInput, str) else textInput.group()
        return self._correct(word)

In [4]:
class realWord(spellCorrect):
    """
    Real-word error corrector: for every sentence, tries replacing one word at a
    time with a known word at edit distance 1 and keeps the variant that scores
    highest under the bigram/trigram frequency score.
    """

    def __init__(self):
        self.wordDict, self.bgrmDict, self.tgrmDict = self._loadData()

        self.vocabSize = len(self.wordDict)
        self.totalWord = sum(self.wordDict.values())

        self.textOut = list()

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    def _wordCandidate(self, word):
        """
        Generate set of candidate words edit distance 1 based on the source word.
        Unlike the non-word case, all tiers are merged: the word itself (known or
        not) plus every known word one edit away.

        Input - word (str)
        Output - set of word (str)
        """
        candidates = self._known(self._edit1(word))
        candidates.add(word)
        return candidates

    def _sentProb(self, sent):
        """
        Score a sentence from a combination of bigram and trigram frequency in the corpus.
        Starts from the unigram probability of the first word and multiplies in one
        mixed term for every interior word (first and last words have no term).

        Input - sent (str), words separated by whitespace
        Output - prob (float); 0.0 for an empty sentence
        """
        sent = sent.split()
        if not sent:
            return 0.0

        count = self.wordDict
        prob = self._prob(sent[0])
        for i in range(1, len(sent)-1):
            prev, cur, nxt = sent[i-1], sent[i], sent[i+1]
            # Counters return 0 for unseen words, so a neighbouring unknown word
            # would otherwise raise ZeroDivisionError; the n-gram count is 0 in
            # that case anyway, hence the term is 0.
            P1 = self._ratio(self.bgrmDict[(prev, cur)] * count[cur], count[prev])
            P2 = self._ratio(self.bgrmDict[(cur, nxt)] * count[cur], count[nxt])
            P3 = self._ratio(self.tgrmDict[(prev, cur, nxt)] * count[cur], count[prev] * count[nxt])
            P  = 0.25*P1 + 0.25*P2 + 0.5*P3
            prob *= P
        return prob

    def _sentenceCandidates(self, sentence):
        """
        Build the candidate variants of one sentence: the sentence itself followed
        by every variant that differs from it in exactly one word.

        Input - sentence (str)
        Output - list of sentence (str), duplicates removed, original first
        """
        words = sentence.split()
        if not words:
            return []

        variants = [" ".join(words)]
        for i, original in enumerate(words):
            # sorted() keeps the candidate order independent of set hashing.
            for c in sorted(self._wordCandidate(original)):
                # Copy so each substitution starts from the untouched sentence;
                # writing into the shared list would leak earlier substitutions
                # into every later candidate.
                variant = words.copy()
                variant[i] = c
                variants.append(" ".join(variant))
        return list(dict.fromkeys(variants))

    def _sentCandidate(self, para):
        """
        Generate the candidate sentences for every sentence found in the source text.

        Input - para (str)
        Output - array of unique candidate sentences (str), pooled over all sentences
        """
        sentCands = list()
        for s in nltk.tokenize.sent_tokenize(para):
            sentCands.extend(self._sentenceCandidates(s))
        # dtype=object keeps the result a plain object array even when empty.
        return pd.Series(sentCands, dtype=object).unique()

    def _correct(self, sent):
        """
        Correct a text sentence by sentence and join the results with single spaces.
        When no variant scores higher than the original sentence, the original
        wording is kept (whitespace-normalised).

        Input - sent (str), one or more sentences
        Output - corrected text (str)
        """
        sentences = nltk.tokenize.sent_tokenize(sent)
        if not sentences:
            return sent

        corrected = []
        for s in sentences:
            cands = self._sentenceCandidates(s)
            # max() keeps the first maximal item, i.e. the original sentence on ties.
            corrected.append(max(cands, key=self._sentProb) if cands else s)
        return " ".join(corrected)

    def correct(self, textInput):
        """
        Correct a piece of text. Written as an ``re.sub`` replacement callback,
        but a plain string is accepted as well.

        Input - textInput (re.Match or str)
        Output - corrected text (str)
        """
        text = textInput if isinstance(textInput, str) else textInput.group()
        return self._correct(text)

In [5]:
# Build both correctors once; each constructor loads the corpus via _loadData().
nonword = nonWord()
realword = realWord()

def correct(string):
    """
    Run the non-word corrector and then the real-word corrector over a text.

    Every run of four or more word characters is handed to the corrector's
    ``correct`` callback as a regex match, one word at a time; shorter tokens
    and everything between words are left untouched.

    Input - string (str)
    Output - corrected text (str)
    """
    wordPattern = r'\b\w{3}\w+\b'
    for corrector in (nonword, realword):
        string = re.sub(wordPattern, corrector.correct, string)
    return string

In [10]:
# Sample paragraph passed to correct() below; note it contains the misspelling 'cnference'.
sentence = 'Two NASA astronauts are preparing for a spacewalk on Tuesday to replace a faulty antenna system on the International Space Station. Flight Engineers Thomas Marshburn and Kayla Barron will exit the orbiting lab tomorrow after setting their U.S. spacesuits to battery power at 7:10 a.m. EST signifying the start of their spacewalk. The duo was joined on Monday by three of their fellow Expedition 66 flight engineers collecting tools and reviewing procedures planned for the six-and-a-half-hour spacewalk. NASA astronaut Raja Chari partnered with Marshburn and Barron gathering and organizing tethers, cameras, and pistol grip tools. The three astronauts then joined NASA astronaut Mark Vande Hei and ESA (European Space Agency) astronaut Matthias Maurer for a procedures cnference with spacewalk specialists on the ground. Chari and Vande Hei will be on duty throughout Tuesday monitoring the two astronauts during the spacewalk and helping them in and out of their spacesuits. Maurer will be at the controls of the Canadarm2 robotic arm assisting the spacewalkers at the Port-1 truss structure worksite. NASA TV begins its live coverage on Tuesday at 5:30 a.m. on the agency’s website, and the NASA app. The station’s two cosmonauts, Flight Engineer Pyotr Dubrov and Commander Anton Shkaplerov, spent their day on a variety of space research and maintenance tasks in the orbiting lab’s Russian segment. Dubrov photographed the condition of the Nauka multipurpose laboratory module following the Prichal module’s docking on Friday. Shkaplerov swapped out life support hardware and began unpacking cargo from the newly arrived Prichal docking port.'

In [7]:
# Apply correct() to the sample paragraph and keep the result for inspection.
corrected = correct(sentence)

In [11]:
# Number of whitespace-separated tokens in the input paragraph.
len(sentence.split())

252