In [2]:
import re
import numpy as np
import pandas


In [5]:
# Load the raw corpus text from disk.
# NOTE(review): open() is called without encoding=, so the platform default
# is used; confirm the file's encoding and pass it explicitly if needed.
with open("shakespeare.txt") as f:
    txt = f.read()

# Normalise case so that "Thee" and "thee" count as the same word.
txt = txt.lower()
# Raw string literal: '\w' inside a normal string is an invalid escape
# sequence (a warning today, slated to become an error), while r'\w+'
# yields the identical regex without the warning.
word_l = re.findall(r'\w+', txt)
# The vocabulary is the set of distinct tokens found in the corpus.
vocab = set(word_l)
print(f"There are {len(vocab)} unique words in the vocabulary.")

There are 6116 unique words in the vocabulary.


In [6]:
def get_count(word_l):
    """Tally how many times each word occurs.

    Args:
        word_l: iterable of words (hashable items).

    Returns:
        dict mapping each distinct word to its number of occurrences,
        with keys in order of first appearance.
    """
    tally = {}
    for token in word_l:
        # Missing keys start from zero, so a single expression covers both
        # the first sighting and every later one.
        tally[token] = tally.get(token, 0) + 1
    return tally

# Build the word -> occurrence-count table from the token list, then
# spot-check its size and the count of one known word.
word_count_dict = get_count(word_l)
print(f"There are {len(word_count_dict)} key values pairs")
print(f"The count for the word 'thee' is {word_count_dict['thee']}")

There are 6116 key values pairs
The count for the word 'thee' is 240


In [7]:
def get_probs(word_count_dict):
    """Convert raw word counts into relative frequencies.

    Args:
        word_count_dict: dict mapping word -> occurrence count.

    Returns:
        dict mapping each word to count / (sum of all counts). An empty
        input yields an empty dict.
    """
    total = sum(word_count_dict.values())
    # With no entries the comprehension body never runs, so a zero total
    # cannot cause a division error.
    return {word: count / total for word, count in word_count_dict.items()}
# Turn the corpus counts into per-word probabilities and spot-check the
# table size and the probability of one known word.
probs = get_probs(word_count_dict)
print(f"Length of probs is {len(probs)}")
print(f"P('thee') is {probs['thee']:.4f}")

Length of probs is 6116
P('thee') is 0.0045


### String Manipulation

In [15]:
def delete_letter(word, verbose = False):
    """Return every string obtained by removing one character from `word`.

    Args:
        word: the input string.
        verbose: when True, print the intermediate splits and the result.

    Returns:
        list of strings, one per character position, in left-to-right
        order of the removed character. Empty input gives an empty list.
    """
    # Each split is (prefix, suffix) with the suffix starting at the
    # character that will be dropped.
    split_l = [(word[:cut], word[cut:]) for cut in range(len(word))]

    delete_l = []
    for head, tail in split_l:
        if not tail:
            continue
        delete_l.append(head + tail[1:])

    if verbose:
        print(f"input word {word}, \nsplit_l = {split_l}, \ndelete_l = {delete_l}")

    return delete_l

In [14]:
# Demo: list every single-character deletion of "cans", printing the
# intermediate splits.
delete_word_l = delete_letter(word="cans",verbose=True)

input word cans, 
split_l = [('', 'cans'), ('c', 'ans'), ('ca', 'ns'), ('can', 's')], 
delete_l = ['ans', 'cns', 'cas', 'can']


In [16]:
def switch_letter(word, verbose = False):
    """Return every string obtained by swapping two adjacent characters.

    Args:
        word: the input string.
        verbose: when True, print the intermediate splits and the result.

    Returns:
        list of strings, one per adjacent pair, ordered by the position
        of the pair. Words shorter than two characters give an empty list.
    """
    split_l = [(word[:cut], word[cut:]) for cut in range(len(word))]

    switch_l = []
    for head, tail in split_l:
        # A swap needs at least two characters remaining after the cut.
        if len(tail) < 2:
            continue
        switch_l.append(head + tail[1] + tail[0] + tail[2:])

    if verbose:
        print(f"input word {word}, \nsplit_l = {split_l}, \nswitch_l = {switch_l}")

    return switch_l

In [17]:
# Demo: list every adjacent-character swap of "eta", printing the
# intermediate splits.
switch_word_l = switch_letter(word="eta",
                         verbose=True)

input word eta, 
split_l = [('', 'eta'), ('e', 'ta'), ('et', 'a')], 
switch_l = ['tea', 'eat']


In [20]:
def replace_letter(word, verbose = False):
    replace_l = []
    split_l = []
    letters = "abcdefghijklmnopqrstuvwxyz"
    
    for i in range(len(word)):
        split_l.append((word[:i], word[i:]))
    
    for letter in letters:
        for l, r in split_l:
            replace_l.append(l + letter + r[1:])
     
    replace_set = set(replace_l)
    
    if word in replace_set:
        replace_set.remove(word)
    
    replace_l = sorted(list(replace_set))
        
    if verbose: print(f"input word {word}, \nsplit_l = {split_l}, \nreplace_l = {replace_l}")

    return  replace_l

In [21]:
# Demo: list every single-character replacement of "can", printing the
# intermediate splits.
replace_l = replace_letter(word='can',
                              verbose=True)

input word can, 
split_l = [('', 'can'), ('c', 'an'), ('ca', 'n')], 
replace_l = ['aan', 'ban', 'caa', 'cab', 'cac', 'cad', 'cae', 'caf', 'cag', 'cah', 'cai', 'caj', 'cak', 'cal', 'cam', 'cao', 'cap', 'caq', 'car', 'cas', 'cat', 'cau', 'cav', 'caw', 'cax', 'cay', 'caz', 'cbn', 'ccn', 'cdn', 'cen', 'cfn', 'cgn', 'chn', 'cin', 'cjn', 'ckn', 'cln', 'cmn', 'cnn', 'con', 'cpn', 'cqn', 'crn', 'csn', 'ctn', 'cun', 'cvn', 'cwn', 'cxn', 'cyn', 'czn', 'dan', 'ean', 'fan', 'gan', 'han', 'ian', 'jan', 'kan', 'lan', 'man', 'nan', 'oan', 'pan', 'qan', 'ran', 'san', 'tan', 'uan', 'van', 'wan', 'xan', 'yan', 'zan']


In [24]:
def insert_letter(word, verbose = False, letters = "abcdefghijklmnopqrstuvwxyz"):
    """Return every string obtained by inserting one character into `word`.

    Args:
        word: the input string.
        verbose: when True, print the intermediate splits and the result.
        letters: iterable of characters to insert. Defaults to the
            lowercase ASCII alphabet; pass a different alphabet to cover
            digits, accented letters, etc.

    Returns:
        list of (len(word) + 1) * len(letters) strings, grouped by
        insertion position (left to right) and, within a position, in the
        order of `letters`. Duplicates are kept: inserting a character
        next to an identical one produces the same string twice.
    """
    # range(len(word) + 1) includes the split after the last character so
    # that a letter can also be appended at the end.
    split_l = [(word[:i], word[i:]) for i in range(len(word) + 1)]

    insert_l = [l + letter + r for l, r in split_l for letter in letters]

    if verbose: print(f"input word {word}, \nsplit_l = {split_l}, \ninsert_l = {insert_l}")

    return insert_l

In [25]:
# Demo: list every single-character insertion into "at" (verbose), then
# report how many candidate strings were produced.
insert_l = insert_letter('at', True)
print(f"Number of strings output by insert_letter('at') is {len(insert_l)}")

input word at, 
split_l = [('', 'at'), ('a', 't'), ('at', '')], 
insert_l = ['aat', 'bat', 'cat', 'dat', 'eat', 'fat', 'gat', 'hat', 'iat', 'jat', 'kat', 'lat', 'mat', 'nat', 'oat', 'pat', 'qat', 'rat', 'sat', 'tat', 'uat', 'vat', 'wat', 'xat', 'yat', 'zat', 'aat', 'abt', 'act', 'adt', 'aet', 'aft', 'agt', 'aht', 'ait', 'ajt', 'akt', 'alt', 'amt', 'ant', 'aot', 'apt', 'aqt', 'art', 'ast', 'att', 'aut', 'avt', 'awt', 'axt', 'ayt', 'azt', 'ata', 'atb', 'atc', 'atd', 'ate', 'atf', 'atg', 'ath', 'ati', 'atj', 'atk', 'atl', 'atm', 'atn', 'ato', 'atp', 'atq', 'atr', 'ats', 'att', 'atu', 'atv', 'atw', 'atx', 'aty', 'atz']
Number of strings output by insert_letter('at') is 78
