In [1]:
import re
from collections import Counter

In [2]:
# Tokeniser used to build the corpus vocabulary
def words(document):
    "Lower-case `document` and return its runs of word characters as a list of tokens."
    token_pattern = re.compile(r'\w+')
    lowered = document.lower()
    # \w+ keeps letters, digits and underscores together; everything else separates tokens
    return token_pattern.findall(lowered)

In [28]:
import requests
# NOTE(review): `url` is not defined in any cell shown here; it must be set before this runs.
# Fail fast on HTTP errors and never wait forever on a stalled connection, so that an
# error page is not silently tokenised as if it were the corpus text.
http_reply = requests.get(url, timeout=30)
http_reply.raise_for_status()
response = http_reply.text

In [30]:
# Vocabulary: maps each token of the downloaded text to its number of occurrences
all_words = Counter()
all_words.update(words(response))

In [32]:
# Inspect the raw word -> count table (the whole Counter is echoed, so the output is long)
all_words

Counter({'the': 79809,
         'project': 288,
         'gutenberg': 263,
         'ebook': 87,
         'of': 40024,
         'adventures': 17,
         'sherlock': 101,
         'holmes': 467,
         'by': 6735,
         'sir': 177,
         'arthur': 34,
         'conan': 4,
         'doyle': 5,
         '15': 47,
         'in': 22023,
         'our': 1065,
         'series': 128,
         'copyright': 51,
         'laws': 233,
         'are': 3630,
         'changing': 43,
         'all': 4143,
         'over': 1282,
         'world': 362,
         'be': 6155,
         'sure': 123,
         'to': 28765,
         'check': 38,
         'for': 6941,
         'your': 1279,
         'country': 423,
         'before': 1362,
         'downloading': 5,
         'or': 5352,
         'redistributing': 7,
         'this': 4063,
         'any': 1203,
         'other': 1501,
         'header': 7,
         'should': 1297,
         'first': 1174,
         'thing': 303,
         'seen': 444,
  

In [31]:
# Look up the frequency of an arbitrary word, e.g. 'chair'
# (all_words is a Counter, so a word absent from the corpus would give 0)
all_words['chair']

135

In [33]:
# The ten most frequent tokens as (word, count) pairs, highest count first
all_words.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [34]:
def edits_one(word):
    "Return the set of strings reachable from `word` by one delete, insert, replace or transpose."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    candidates = set()
    # Walk every cut point of the word, including the one after the last character
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # insert: a letter can be added at every cut point
        for ch in letters:
            candidates.add(head + ch + tail)
        if not tail:
            # nothing to the right of the cut, so nothing to delete, replace or swap
            continue
        # delete: drop the character right after the cut
        candidates.add(head + tail[1:])
        # replace: substitute that character (the original letter is included,
        # so `word` itself ends up in the result)
        for ch in letters:
            candidates.add(head + ch + tail[1:])
        # transpose: swap the two characters right after the cut
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])
    return candidates

In [35]:
def edits_two(word):
    "Lazily yield every string obtained by applying `edits_one` twice to `word` (duplicates included)."
    first_pass = edits_one(word)
    # A generator is returned rather than a set, so callers de-duplicate if they need to
    return (second for first in first_pass for second in edits_one(first))

In [37]:
def known(words):
    "Filter `words` down to the set of entries that occur in the `all_words` vocabulary."
    return {candidate for candidate in words if candidate in all_words}

In [38]:
def possible_corrections(word):
    "Return candidate corrections for `word`: the known words at the smallest edit distance, else `[word]`."
    # Try the candidate pools in order of increasing edit distance; the first non-empty one wins
    exact = known([word])
    if exact:
        return exact
    one_away = known(edits_one(word))
    if one_away:
        return one_away
    two_away = known(edits_two(word))
    if two_away:
        return two_away
    # Nothing known within two edits: fall back to the input itself (as a list)
    return [word]

In [39]:
def prob(word, N=None):
    "Probability of `word`: number of appearances of `word` / total number of tokens (`N`, default: size of `all_words`)."
    # The total is computed at call time. A default written in the signature is evaluated
    # only once, when the `def` runs, so rebuilding `all_words` in another cell would
    # leave a stale denominator behind.
    if N is None:
        N = sum(all_words.values())
    # An empty corpus has no tokens; report zero probability instead of dividing by zero
    if N == 0:
        return 0.0
    return all_words[word] / N

In [40]:
# Size and contents of the one-edit neighbourhood of "monney"
# (edits_one already returns a set, so no extra set() call is needed)
print(len(edits_one("monney")), edits_one("monney"), sep="\n")

336
{'maonney', 'monnee', 'monnezy', 'moyney', 'mbnney', 'msnney', 'mocney', 'monnehy', 'monnmy', 'monvey', 'monsey', 'sonney', 'monneey', 'moncney', 'monnety', 'mlonney', 'mjnney', 'mjonney', 'oonney', 'monzney', 'onney', 'ponney', 'monneyn', 'monnfey', 'mohney', 'mosney', 'monnef', 'moaney', 'tmonney', 'monneqy', 'tonney', 'qonney', 'mfnney', 'motney', 'monneo', 'monneyl', 'monneuy', 'mocnney', 'smonney', 'monneyh', 'monnqy', 'monnvy', 'movney', 'monneny', 'monney', 'xonney', 'monnesy', 'mynney', 'mfonney', 'monndy', 'monnpey', 'monneye', 'moqnney', 'moneey', 'monneya', 'monnpy', 'monnery', 'vonney', 'monnmey', 'monnuey', 'mononey', 'mxonney', 'mvonney', 'mornney', 'monnez', 'mwonney', 'monnley', 'mognney', 'monzey', 'monneyy', 'monhney', 'monnbey', 'konney', 'monnby', 'monniy', 'moneny', 'mmnney', 'monnvey', 'monneyd', 'mojney', 'aonney', 'meonney', 'monqey', 'wonney', 'pmonney', 'monndey', 'monfey', 'monnem', 'monfney', 'monnqey', 'monuey', 'modnney', 'minney', 'monnaey', 'monneyx'

In [41]:
# Of the one-edit candidates for "monney", keep only those that occur in the corpus
print(known(edits_one("monney")))

{'money', 'monkey'}


In [42]:
# Count the distinct strings that are two edits away from "monney"
print(len(set(edits_two("monney"))))
# NOTE(review): this line prints the known words ONE edit away (edits_one), repeating the
# previous cell; presumably known(edits_two("monney")) was intended. Confirm and re-run.
print(known(edits_one("monney")))

51013
{'money', 'monkey'}


In [43]:
# Candidate corrections for "monney": the known words found at the smallest edit distance
print(possible_corrections("monney"))

{'money', 'monkey'}


In [44]:
# Compare the corpus probabilities of the two candidate corrections, one per line
print(prob("money"), prob("monkey"), sep="\n")

0.0002922233626303688
5.378344097491451e-06


In [45]:
def spell_check(word):
    "Return a message naming the most probable spelling correction for `word` out of all the `possible_corrections`."
    # Rank the candidates by corpus frequency and keep the most likely one
    correct_word = max(possible_corrections(word), key=prob)
    if correct_word != word:
        return "Did you mean " + correct_word + "?"
    # The best candidate equals the input in two cases: the word is in the corpus, or
    # nothing was found within two edits and possible_corrections fell back to [word].
    # Only the first case means the spelling is actually correct.
    if known([word]):
        return "Correct spelling."
    return "No suggestions found."

In [46]:
# Sanity-check spell_check on a misspelling; the suggestion shown below is 'money'
print(spell_check("monney"))

Did you mean money?


In [47]:
# Number of distinct strings one edit away from 'emfasize' (edits_one already returns a set)
print(len(edits_one('emfasize')))

442


In [48]:
# Number of distinct strings two edits away from 'emfasize'
# (edits_two returns a generator with repeats, hence the set())
print(len(set(edits_two('emfasize'))))

90902


In [49]:
# 'emphasize' is two edits from 'emfasize' (replace f with p, insert h), so this result
# comes from the two-edit pool, which is only consulted when no known word is one edit away
possible_corrections('emfasize')

{'emphasize'}