In [4]:
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import pickle
import os

In [5]:
import nltk

# Set to True to use every book of the NLTK Gutenberg corpus instead of a
# single one. Using the whole corpus means too much work to later compute the
# words at a distance for each word in the vocabulary, so by default we load
# just one book.
USE_WHOLE_CORPUS = False

# Number of leading tokens of the book to keep. Using all the words in the
# book makes the close-words computation below take 25-30 min, so we limit it:
# 10000 tokens -> about 1.5 min, 100000 tokens -> about 15 min.
MAX_WORDS = 10000

nltk.download('gutenberg')
if USE_WHOLE_CORPUS:
    words = nltk.corpus.gutenberg.words()
    # NOTE: with the whole corpus the sorted vocabulary was observed to start
    # with 803 and end with 2 non-words (punctuation signs, numbers and
    # underscored words like _home_). The vocabulary does not exist yet at
    # this point, so that trimming has to happen where the vocabulary is built.
else:
    words = nltk.corpus.gutenberg.words('austen-emma.txt')
    words = words[:MAX_WORDS]

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [6]:
# Tokens that sort at or after 'A' but are annoying non-words.
NON_WORDS = {'[', ']', '`', 'II'}

# Sorted unique tokens. Everything that sorts before 'A' (',', '--', digits,
# ...) is dropped, together with the explicit non-words above. Filtering
# instead of list.index()/list.remove() gives the same result but does not
# raise ValueError when one of those tokens is absent from the corpus slice.
vocabulary = np.array(
    [w for w in sorted(set(words)) if w >= 'A' and w not in NON_WORDS]
)
print(vocabulary)
print('Vocabulary size {} words'.format(len(vocabulary)))

['A' 'Abbey' 'After' ... 'yourself' 'youth' 'youthful']
Vocabulary size 1711 words


For each word in the vocabulary, find its nearest words, i.e. those at Levenshtein distance at most ``MAX_DIST``. This is a long computation, $O(n^2)$ where $n$ is the size of the vocabulary. We try to speed it up a little: if $\text{dist}(w_1, w_2) \leq d$ then $|\text{len}(w_1) - \text{len}(w_2)| \leq d$. So, for each word, we only need to compute the distance to the vocabulary words whose length differs from its own by at most ``MAX_DIST``, which reduces the number of candidates.
We save the resulting dictionary to avoid recomputing it each time.

In [7]:
from nltk.metrics.distance import edit_distance

def levenshtein(s1, s2):
    """Return the edit distance between the strings ``s1`` and ``s2``.

    Delegates to NLTK's ``edit_distance`` with a substitution cost of 1 and
    transposition edits enabled, so strictly speaking this is the
    Levenshtein distance extended with transpositions.
    """
    options = dict(substitution_cost=1, transpositions=True)
    return edit_distance(s1, s2, **options)

# Bucket the vocabulary by word length: dict_lengths[l] holds every word of
# length l (possibly none), so that for a given word only the buckets whose
# length differs by at most MAX_DIST have to be scanned.
word_lengths = np.array([len(w) for w in vocabulary])
min_length = int(word_lengths.min())
max_length = int(word_lengths.max())
dict_lengths = {
    l: vocabulary[word_lengths == l]  # boolean mask: needs vocabulary to be a numpy array
    for l in range(min_length, max_length + 1)
}

MAX_DIST = 1
fname = 'close_words_{}.pkl'.format(MAX_DIST)

# Try the cache first. The file name only encodes MAX_DIST, so a cache written
# for a different vocabulary (e.g. another corpus slice) must be discarded,
# otherwise later look-ups fail with KeyError.
close_words = None
if os.path.exists(fname):
    # NOTE(security): pickle can execute arbitrary code while loading; only
    # load a cache file that this notebook wrote itself.
    with open(fname, 'rb') as f:
        close_words = pickle.load(f)
    if set(close_words) != set(vocabulary):
        close_words = None

if close_words is None:
    # close_words[word] lists every vocabulary word at distance <= MAX_DIST
    # of word; that includes word itself (distance 0).
    close_words = {}
    for word in tqdm(vocabulary):
        length = len(word)
        d1 = max(min_length, length - MAX_DIST)
        d2 = min(max_length, length + MAX_DIST)
        candidate_words = []
        for d in range(d1, d2 + 1):
            candidate_words.extend(dict_lengths[d])
        close_words[word] = [w for w in candidate_words
                             if levenshtein(word, w) <= MAX_DIST]

    with open(fname, 'wb') as f:
        pickle.dump(close_words, f)

100%|██████████| 1711/1711 [01:37<00:00, 17.52it/s]


Given one sentence $X$, which is a list of words, build the set $C(X)$ of candidate corrected sentences, assuming at most one word is misspelled.

In [8]:
# Written sentence X. Another example to try: 'Only two of the apples'
sentence = 'I wish you where here'
X = sentence.split(' ')
for x in X:
    assert x in vocabulary, ('All the words in the sentence must belong to the '
                             + 'vocabulary, {} doesn\'t'.format(x))

# C(X): the written sentence itself (no errors) followed by every sentence
# obtained by replacing exactly one word with one of its close words.
# The assert above guarantees that every word has an entry in close_words,
# so no per-word membership test is needed here.
CX = [X]
for i, written in enumerate(X):  # one misspelled word at a time
    for cw in close_words[written]:
        if cw != written:  # close_words[written] also contains written itself
            CX.append(X[:i] + [cw] + X[i + 1:])

for W in CX:
    print('\t'.join(W))

I	wish	you	where	here
A	wish	you	where	here
a	wish	you	where	here
s	wish	you	where	here
IV	wish	you	where	here
If	wish	you	where	here
In	wish	you	where	here
It	wish	you	where	here
I	fish	you	where	here
I	with	you	where	here
I	wish	You	where	here
I	wish	your	where	here
I	wish	you	here	here
I	wish	you	were	here
I	wish	you	There	here
I	wish	you	Where	here
I	wish	you	there	here
I	wish	you	where	her
I	wish	you	where	hers
I	wish	you	where	were
I	wish	you	where	There
I	wish	you	where	Where
I	wish	you	where	there
I	wish	you	where	where


Likelihood $P(X | W) = \prod_{i=1}^n p(x_i | w_i)$ where $n$ is the number of words in $X$ (same as in $W$), and $p(x | w)$ is Eq. B.8. $X$ is the written sentence, $W$ are the candidate sentences in $C(X)$. Each $W$ differs from $X$ in at most one word (none when $W=X$), i.e. it assumes that at most one word of $X$ is misspelled; in that case the candidate word $w_i$ is at a Levenshtein distance $1, \ldots,$ ``MAX_DIST`` of the written word $x_i$.

In [9]:
# alpha is the per-word likelihood when the candidate word equals the written
# one; the remaining 1 - alpha is shared equally among the other close words.
alpha = 0.95

likelihoods = np.empty(len(CX))
for k, W in enumerate(CX):
    PXW = 1.0
    for x, w in zip(X, W):
        # close_words[x] includes x itself, hence the "- 1"
        PXW *= alpha if w == x else (1 - alpha) / (len(close_words[x]) - 1)
    likelihoods[k] = PXW

idx_most_likely = np.argmax(likelihoods)
print('Sentence with highest likelihood is the written one, X')
print('\t'.join(CX[idx_most_likely]), '\t', likelihoods[idx_most_likely])

print('Likelihoods')
num_candidates = len(CX)
for W, likelihood in zip(CX, likelihoods):
    print('\t'.join(W), '\t', likelihood)


Sentence with highest likelihood is the written one, X
I	wish	you	where	here 	 0.7737809374999999
Likelihoods
I	wish	you	where	here 	 0.7737809374999999
A	wish	you	where	here 	 0.005817901785714291
a	wish	you	where	here 	 0.005817901785714291
s	wish	you	where	here 	 0.005817901785714291
IV	wish	you	where	here 	 0.005817901785714291
If	wish	you	where	here 	 0.005817901785714291
In	wish	you	where	here 	 0.005817901785714291
It	wish	you	where	here 	 0.005817901785714291
I	fish	you	where	here 	 0.020362656250000017
I	with	you	where	here 	 0.020362656250000017
I	wish	You	where	here 	 0.020362656250000017
I	wish	your	where	here 	 0.020362656250000017
I	wish	you	here	here 	 0.008145062500000006
I	wish	you	were	here 	 0.008145062500000006
I	wish	you	There	here 	 0.008145062500000006
I	wish	you	Where	here 	 0.008145062500000006
I	wish	you	there	here 	 0.008145062500000006
I	wish	you	where	her 	 0.00581790178571429
I	wish	you	where	hers 	 0.00581790178571429
I	wish	you	where	were 	 0.00581790178

Priors $P(W)$ for all $W \in C(X)$ are computed by a language model (LM), for instance a tri-gram model trained on this same corpus. We've chosen the stupid backoff version.

In [10]:
from nltk.lm.models import StupidBackoff
from nltk.lm.preprocessing import padded_everygram_pipeline

# presumably required by gutenberg.sents() for sentence splitting
nltk.download('punkt')

# Training text: every sentence of the book as a list of tokens, without its
# last token (the ending point).
sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
text = [s[:-1] for s in tqdm(sents)]

# Fit a stupid-backoff language model of order n on the padded n-grams.
n = 3
train, vocab = padded_everygram_pipeline(n, text)
lm = StupidBackoff(alpha=0.4, order=n)
lm.fit(train, vocab)
print(lm.counts)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
100%|██████████| 7717/7717 [00:00<00:00, 10852.98it/s]


<NgramCounter with 3 ngram orders and 623754 ngrams>


In [11]:
def P(W, verbose=False):
    """Prior score of the sentence ``W`` under the language model ``lm``.

    The sentence is left-padded with ``lm.order - 1`` ``'<s>'`` symbols and
    each word is scored given the ``lm.order - 1`` preceding symbols; the
    result is the product of those scores. Taking the context size from the
    model (instead of hard-coding two words) keeps the scores correct if the
    model is retrained with another order; for a trigram model the behaviour
    and the verbose output are unchanged.

    Parameters
    ----------
    W : sequence of str
        Words of the sentence, without padding symbols.
    verbose : bool
        If true, print the padded sentence and every per-word score.

    Returns
    -------
    float
        Product of the per-word scores. The closing ``'</s>'`` is not scored
        because the likelihoods don't have it either.
    """
    num_context = lm.order - 1
    S = ['<s>'] * num_context + list(W) + ['</s>']
    if verbose:
        print(S)
    PW = 1.0
    for i in range(num_context, len(S) - 1):  # stop before </s>
        context = S[i - num_context:i]
        score = lm.score(S[i], context)
        if verbose:
            if context:
                print('P({} | {}) = {}'.format(S[i], ', '.join(context), score))
            else:
                print('P({}) = {}'.format(S[i], score))
        PW *= score
    return PW

# Detailed scores for the written sentence ...
print('X =', X)
prior_X = P(X, verbose=True)
print('P(X) = {}'.format(prior_X))

# ... and for the first alternative candidate.
candidate = CX[1]
print('\nW = CX[1] = {}'.format(candidate))
prior_W = P(candidate, verbose=True)
print('P(W) = {}'.format(prior_W))

X = ['I', 'wish', 'you', 'where', 'here']
['<s>', '<s>', 'I', 'wish', 'you', 'where', 'here', '</s>']
P(I | <s>, <s>) = 0.08422962291045744
P(wish | <s>, I) = 0.016923076923076923
P(you | I, wish) = 0.34375
P(where | wish, you) = 0.0002385211687537269
P(here | you, where) = 9.868527836390196e-05
P(X) = 1.1533634670336262e-11

W = CX[1] = ['A', 'wish', 'you', 'where', 'here']
['<s>', '<s>', 'A', 'wish', 'you', 'where', 'here', '</s>']
P(A | <s>, <s>) = 0.012051315277957756
P(wish | <s>, A) = 9.942727293806665e-05
P(you | A, wish) = 0.04477611940298508
P(where | wish, you) = 0.0002385211687537269
P(here | you, where) = 9.868527836390196e-05
P(W) = 1.2628905903997896e-15


In [12]:
# Prior of every candidate sentence, in the same order as CX.
priors = [P(W) for W in CX]

# Unnormalised posterior = prior * likelihood, element-wise.
posteriors = np.multiply(np.array(priors), np.array(likelihoods))
idx_best_post = posteriors.argmax()

for i in range(num_candidates):
    marker = '<----' if i == idx_best_post else ''  # flags the best candidate
    stats = '\tlikelihood={}\tprior={}\tposterior={} {}'.format(
        likelihoods[i], priors[i], posteriors[i], marker)
    print('\t'.join(CX[i]), stats)

print('\nThe original sentence was')
print('\t' + ' '.join(X))
print('The right sentence is')
print('\t' + ' '.join(CX[idx_best_post]))

I	wish	you	where	here 	likelihood=0.7737809374999999	prior=1.1533634670336262e-11	posterior=8.924506647995294e-12 
A	wish	you	where	here 	likelihood=0.005817901785714291	prior=1.2628905903997896e-15	posterior=7.347373421048711e-18 
a	wish	you	where	here 	likelihood=0.005817901785714291	prior=1.8186005327090402e-16	posterior=1.0580439286748885e-18 
s	wish	you	where	here 	likelihood=0.005817901785714291	prior=3.1281420915150617e-16	posterior=1.8199223460193514e-18 
IV	wish	you	where	here 	likelihood=0.005817901785714291	prior=0.0	posterior=0.0 
If	wish	you	where	here 	likelihood=0.005817901785714291	prior=1.004880684834241e-15	posterior=5.84629713072693e-18 
In	wish	you	where	here 	likelihood=0.005817901785714291	prior=7.332913105547164e-16	posterior=4.266216825125057e-18 
It	wish	you	where	here 	likelihood=0.005817901785714291	prior=4.005943270622988e-15	posterior=2.3306184507627628e-17 
I	fish	you	where	here 	likelihood=0.020362656250000017	prior=3.661075396645823e-18	posterior=7.45492

In [13]:
# Log-ratio of every candidate's posterior to that of the written sentence
# (index 0). Candidates with a zero posterior give -inf, which is the intended
# value, so NumPy's divide-by-zero RuntimeWarning from log(0) is silenced.
with np.errstate(divide='ignore'):
    print(np.log(posteriors) - np.log(posteriors[0]))

[  0.         -14.00996871 -15.94788975 -15.40551777         -inf
 -14.23850311 -14.55358416 -12.85559285 -18.60060731 -10.1196857
 -12.48012817  -7.54567297  -5.43639918   5.36284507  -5.4669359
  -7.55415958  -3.4552646   -2.00542241  -6.78496598  -3.39888224
  -4.92088585  -7.00810953  -0.15259778  -5.51801838]


  print(np.log(posteriors) - np.log(posteriors[0]))
