# Probabilities Calculation Functions Using Bayes Theorem
Scarlett Hwang | April 14th, 2020

In [35]:
# Load libraries
import json
import pandas as pd
import numpy as np
import argparse
import os
import re

In [2]:
# Load corpus data as a series
def load_corpus():
    """
    load word frequency corpus
    """
    print("Loading corpus")
    a = pd.read_csv("corpus/frequency-corpus.txt.bz2", sep="\t",
                    keep_default_na=False, na_values=[""])
    c = pd.Series(a["count"].values, index=a.word)
    c = np.log(c) - np.log(sum(c))
    return c

In [8]:
def load_table():
    """
    load the count dict from file.
    eventually does smoothing and other things
    """
    table = json.load(open("table.json"))
    return table

In [41]:
# Pr(c1|c2)
# c2: Original letter
# c1: Converted letter

def char_based_pr(conv, orig):
    """
    return log Pr(converted|original) for individual letters
    the corresponding counts are based on dict 'table'
    """
    if orig in table:
        if conv in table[orig]:
            p = table[orig][conv] / sum(table[orig].values())
            return np.log(p)
        else:
            return 1e-8
    else:
        raise ValueError()

In [51]:
# Pr(w1|w2)
# w2: Original word
# w1: Converted word

def word_based_pr(conv, orig):
    """
    return log Pr(converted|original) for individual words/tokens/other
    multiletter structures
    the corresponding counts are based on dict 'table', used by 'char_based_pr'
    """
    try:
        word_p = 0
        for i, j in zip(orig, conv):
            word_p = word_p + char_based_pr(j, i)
        return(word_p)
    except ValueError as ve:
        #print(ve)
        return 0

In [5]:
def loglik():
    pass

+ create dictionary from orig text (word & count)

+ reads in the error table << command line arg DONE

+ consistant way to read in text (give it the file name)  << command line arg DONE

+ dictionary of word frequency << downloaded DONE


In [11]:
table = load_table()
corpus = load_corpus()
print(table)

Loading corpus
{'T': {'T': 8}, 'h': {'h': 75}, 'e': {'e': 167}, ' ': {' ': 247}, 'p': {'p': 23}, 'l': {'l': 54, 'f': 1, 't': 1}, 'a': {'a': 114}, 'n': {'n': 70}, 't': {'t': 122}, 's': {'s': 80}, 'r': {'r': 74, 'F': 1}, 'o': {'o': 91}, 'f': {'f': 24}, 'c': {'c': 40}, 'b': {'b': 18}, '-': {'-': 4}, 'm': {'m': 38}, 'i': {'i': 85, 'I': 1}, ',': {',': 18}, 'y': {'y': 12}, 'k': {'k': 7}, 'v': {'v': 10}, '.': {'.': 11}, 'd': {'d': 45}, 'u': {'u': 35}, 'q': {'q': 6}, 'g': {'g': 15}, 'w': {'w': 18}, 'z': {'z': 5}, 'A': {'A': 1}, 'O': {'O': 4, '0': 1}, 'W': {'W': 3}, 'C': {'C': 3}, '_': {'_': 4}, '{': {'{': 4}, '2': {'2': 4}, '}': {')': 1, '}': 3}, 'H': {'H': 1}, 'S': {'S': 1}, '(': {'(': 1}, 'x': {'x': 1}, ')': {')': 1}, 'F': {'F': 1}}


In [39]:
test = """
ćontragravity lorries were driffing back and forth, scattering
fertilizer, mainly nitrates from Mimir or Yggarasill. There were stit
a good number of animal-drawn plows ahd harrows in use, however.

As planots went, Uiler was no bargain, he thought soury.Attimes, he
wished he had never followed the lure of rapid promotion and
fantastically high pay and left the Federation regulars for the army

at the Uiler Company, the hadn't e'd probably be a colonel, zt

five thousand sols a year, but maybe it would be better to be a
middle-aged colonel cn a decent planet-Odin, with its two moons,
"""

In [55]:
# strip punctuations
tokens = test.replace('-', ' ').split()

# Strip remaining punctuations
table = str.maketrans('', '', string.punctuation)
tokens = [w.translate(table) for w in tokens]

for token_conv in tokens:
    best_ll = -np.Inf
    o = token_conv
    print(token_conv, "->", end=" ")
    
    if not re.match('^[a-zA-Z_]+$', token_conv):   # if special character is found
        print(token_conv, " (skipped - contains accented character)")
    else:
        for token_orig in corpus.index:
            l = word_based_pr(token_conv, token_orig) + corpus[token_orig]
            if l > best_ll:
                best_ll = l
                o = token_orig
        print(o)

ćontragravity -> ćontragravity  (skipped - contains accented character)
lorries -> the
were -> the
driffing -> the
back -> the
and -> 

KeyboardInterrupt: 

# SMALL TEST

In [18]:
examples = [('applc', 'apple'), ('Utter', 'Uller'), ("4.", "1."),
            ("tie", "the"), ("tbe", "the")]
print("A few examples")
for conv, orig in examples:
    print("Pr(", conv, "|", orig, ") =", word_based_pr(conv, orig))
    best_ll = -np.Inf
    o = token_conv
    print(token_conv, "->", end=" ")
    for token_orig in corpus.index:
        l = word_based_pr(token_conv, token_orig) + corpus[token_orig]
        if l > best_ll:
            best_ll = l
            o = token_orig
    print(o)

A few examples
Pr( applc | apple ) = -0.03636763417087483
forth, -> the
Pr( Utter | Uller ) = 0
forth, -> the
Pr( 4. | 1. ) = 0
forth, -> the
Pr( tie | the ) = 1e-08
forth, -> the
Pr( tbe | the ) = 1e-08
forth, -> the


In [20]:
for conv, orig in examples:
    print(conv, orig)

applc apple
Utter Uller
4. 1.
tie the
tbe the


# ----------------------------TRASH FROM HERE---------------------------------

In [163]:
# '1.' and '4.' returns 0 which ends up being 0 at the end
o = open("moby_dick-10/moby_dick-orig-chunk-aa.txt", "r").read()
c = open("moby_dick-10/moby_dick-orig-chunk-aa-converted.txt", "r").read()

for i, j in zip(o.split(), c.split()) :
    print(i, j)
    print(word_based_pr(i, j))


CHAPTER CHAPTER
0.4213031359157587
1. 4.
8.694203071062969e-07
Loomings. Loamings.
7.413027921142096e-05
Call Call
0.7906068874888901
me me
0.9763282355731604
Ishmael. Ishmael.
0.11695234327730351
Some Some
0.9019056175700088
years years
0.9364455991125061
ago—never ago—never
Crap, I have never seen this character in the table—
0
mind mind
0.9415337625658535
how how
0.9635306407847003
long long
0.8938034410244767
precisely—having precisely—having
Crap, I have never seen this character in the table—
0
little {ile
Crap, I have never seen this character in the table{
0
or or
0.9705515882211881
no no
0.972122605015246
money money
0.9352641894307998
in in
0.9693574088628795
my my
0.9727729806769553
purse, purse,
0.9132229087262467
and and
0.9601131753265466
nothing nothing
0.8997357486645862
particular particular
0.8076030962000971
to to
0.9649378577764336
interest interest
0.8810408518850329
me me
0.9763282355731604
on on
0.972122605015246
shore, shore,
0.9213191951015779
I thought
1e-06
t

# ERROR TABLES

In [67]:
# SMALL TABLE 1: Sample error table for debugging
# Table will be passed in as a command line argument from error_table.py eventually
# table = {' ': {' ': 247},
#  '(': {'(': 1},
#  ')': {')': 1},
#  ',': {',': 20},
#  '-': {'-': 4},
#  '.': {'.': 15},
#  '2': {'2': 4},
#  'A': {'A': 1},
#  'C': {'C': 3},
#  'F': {'F': 1},
#  'H': {'H': 1},
#  'O': {'0': 1, 'O': 4},
#  'S': {'S': 1},
#  'T': {'T': 8},
#  'W': {'W': 3},
#  '_': {'_': 4},
#  'a': {'a': 114},
#  'b': {'b': 18},
#  'c': {'c': 40},
#  'd': {'d': 45},
#  'e': {'e': 171},
#  'f': {'f': 25},
#  'g': {'g': 16},
#  'h': {'h': 76},
#  'i': {'I': 1, 'i': 85},
#  'k': {'k': 7},
#  'l': {'f': 1, 'i': 1, 'l': 55, 't': 1},
#  'm': {'m': 38},
#  'n': {'n': 72},
#  'o': {'o': 91},
#  'p': {'p': 23},
#  'q': {'q': 6},
#  'r': {'F': 1, 'r': 76},
#  's': {'s': 84},
#  't': {'t': 122},
#  'u': {'u': 35},
#  'v': {'v': 10},
#  'w': {'w': 18},
#  'x': {'x': 1},
#  'y': {'y': 13},
#  'z': {'z': 5},
#  '{': {'{': 4},
#  '}': {')': 1, '}': 3}}

In [70]:
# SUPER LARGE TABLE 1 : Don quixote
table = {' ': {' ': 388089,
       '!': 41,
       '"': 2,
       '#': 2,
       "'": 64,
       '(': 1,
       ',': 8,
       '-': 2,
       '.': 15,
       '0': 1,
       ':': 1,
       ';': 1,
       '?': 6,
       'A': 1,
       'C': 3,
       'D': 1,
       'Del': 5018,
       'E': 4,
       'F': 5,
       'G': 3,
       'H': 1,
       'L': 2,
       'M': 1,
       'N': 1,
       'O': 1,
       'P': 1,
       'S': 4,
       'T': 1,
       'W': 2,
       'X': 1,
       'Y': 1,
       ']': 1,
       '_': 3,
       'c': 5,
       'd': 13,
       'e': 8,
       'f': 12,
       'g': 8,
       'h': 3,
       'i': 16,
       'j': 1,
       'k': 3,
       'l': 8,
       'm': 5,
       'n': 8,
       'o': 3,
       'p': 4,
       'r': 10,
       's': 3,
       't': 13,
       'u': 4,
       'v': 5,
       'y': 7,
       '{': 2,
       '|': 1,
       '}': 54,
       '\x99': 1,
       '\x9d': 4,
       '\xe2': 25},
 '!': {'!': 623,
       '"': 5,
       "'": 1,
       ',': 1,
       '?': 4,
       'Del': 13,
       'a': 1,
       'b': 1,
       'f': 3,
       'i': 1,
       'l': 10,
       't': 2},
 '#': {'#': 1},
 '$': {'$': 2},
 '%': {'%': 1},
 '(': {'(': 538, 'Del': 7, 'e': 18, 'o': 1, '{': 13},
 ')': {')': 407, 'Del': 24, '}': 146},
 '*': {'*': 16, '+': 1, 'Del': 9, '\xe2': 2},
 ',': {'!': 3,
       '"': 2,
       '(': 1,
       ')': 3,
       ',': 36310,
       '.': 43,
       ':': 2,
       ';': 4,
       '?': 1,
       'Del': 468,
       'E': 1,
       'a': 5,
       'b': 1,
       'c': 5,
       'd': 2,
       'e': 5,
       'f': 1,
       'g': 1,
       'h': 2,
       'i': 2,
       'l': 2,
       'm': 2,
       'o': 1,
       'p': 1,
       'r': 3,
       's': 6,
       't': 2,
       'u': 2,
       'v': 2,
       'y': 2,
       '|': 1,
       '}': 1,
       '\x9d': 1,
       '\xb0': 1,
       '\xe2': 1},
 '-': {' ': 14,
       "'": 1,
       '-': 1813,
       '.': 1,
       ':': 1,
       'D': 1,
       'Del': 34,
       'e': 1,
       'k': 1,
       'o': 1,
       'p': 1,
       's': 1,
       't': 1},
 '.': {' ': 83,
       '!': 3,
       '"': 3,
       ',': 806,
       '.': 7304,
       '1': 2,
       '8': 1,
       ':': 1,
       'A': 1,
       'Del': 175,
       'N': 1,
       'S': 1,
       'W': 1,
       'b': 1,
       'c': 1,
       'd': 1,
       'e': 3,
       'g': 1,
       'h': 1,
       'n': 2,
       'o': 1,
       'r': 2,
       's': 1,
       't': 3,
       'w': 1,
       '\xc2': 1},
 '/': {'/': 27, 'i': 1},
 '0': {'$': 1, '0': 175, 'Del': 12, 'O': 3, 'l': 1},
 '1': {'$': 1,
       '1': 339,
       '8': 2,
       ':': 1,
       'A': 3,
       'Del': 45,
       'O': 2,
       'S': 2,
       'T': 1,
       'd': 1,
       'e': 1,
       'i': 7,
       'l': 5,
       't': 9},
 '2': {'2': 243, 'Del': 15, 'e': 1},
 '3': {'0': 3,
       '2': 5,
       '3': 199,
       '6': 2,
       '8': 32,
       '9': 2,
       'A': 1,
       'Del': 24,
       'O': 1},
 '4': {',': 1,
       '4': 160,
       'A': 7,
       'Del': 30,
       'O': 1,
       'a': 1,
       'd': 10,
       'l': 1,
       '\xc3': 3},
 '5': {'5': 136,
       '6': 3,
       '8': 15,
       'Del': 19,
       'S': 9,
       'l': 1,
       '{': 1,
       '\xc3': 1},
 '6': {'6': 148, '8': 8, 'B': 1, 'Del': 8},
 '7': {'7': 128, 'Del': 5},
 '8': {'6': 1, '8': 119, 'A': 1, 'Del': 7},
 '9': {'2': 2, '8': 12, '9': 101, 'Del': 14},
 ':': {'"': 1,
       '.': 2,
       '3': 1,
       ':': 367,
       '?': 1,
       'Del': 3,
       'h': 1,
       'o': 1,
       's': 1,
       'y': 1,
       '\xc2': 1},
 ';': {' ': 1,
       '!': 3,
       '"': 3,
       ',': 78,
       '.': 10,
       ':': 399,
       ';': 5407,
       '?': 5,
       'Del': 120,
       'a': 1,
       'e': 1,
       'f': 1,
       'i': 1,
       'l': 2,
       'o': 3,
       'r': 2,
       's': 5,
       't': 2,
       'v': 1,
       'y': 6,
       '}': 1,
       '\xe2': 2},
 '>': {'>': 2},
 '?': {'?': 944, 'Del': 22, 'y': 1},
 '@': {'@': 2},
 'A': {' ': 5,
       ',': 1,
       '.': 2,
       'A': 3598,
       'B': 1,
       'C': 1,
       'D': 1,
       'Del': 317,
       'E': 2,
       'F': 1,
       'G': 2,
       'H': 2,
       'I': 1,
       'K': 1,
       'N': 2,
       'S': 3,
       'T': 1,
       'U': 1,
       'X': 1,
       'f': 2},
 'B': {'A': 1, 'B': 1159, 'C': 1, 'Del': 58, 'E': 1, 'H': 1, 'I': 1, 'S': 2},
 'C': {' ': 1,
       '1': 1,
       'A': 2,
       'B': 2,
       'C': 2159,
       'D': 4,
       'Del': 135,
       'G': 12,
       'H': 1,
       'I': 2,
       'K': 2,
       'L': 1,
       'M': 1,
       'O': 2,
       'Q': 1,
       'S': 1,
       'T': 5,
       'V': 2,
       'W': 2,
       'X': 4,
       'b': 1,
       'g': 1,
       'n': 1,
       't': 2,
       '\xe2': 1},
 'D': {' ': 4,
       '8': 1,
       'B': 25,
       'C': 5,
       'D': 4360,
       'Del': 253,
       'F': 2,
       'G': 8,
       'H': 1,
       'I': 2,
       'J': 1,
       'K': 1,
       'L': 1,
       'M': 1,
       'N': 1,
       'O': 1,
       'Q': 1,
       'R': 2,
       'S': 3,
       'T': 1,
       'V': 1,
       'X': 1,
       'Y': 1,
       'Z': 1,
       'a': 2,
       'b': 1,
       'f': 1,
       'h': 2,
       'i': 1,
       'j': 2,
       'p': 1,
       't': 2,
       'w': 1,
       '\x98': 1},
 'Del': {' ': 4109,
         '!': 66,
         '"': 21,
         '#': 16,
         '$': 1,
         '&': 1,
         "'": 34,
         '(': 23,
         ')': 22,
         '*': 11,
         '+': 36,
         ',': 483,
         '-': 61,
         '.': 195,
         '/': 1,
         '0': 118,
         '1': 36,
         '2': 16,
         '3': 9,
         '4': 53,
         '5': 16,
         '6': 12,
         '7': 6,
         '8': 23,
         '9': 14,
         ':': 40,
         ';': 66,
         '?': 21,
         'A': 357,
         'B': 66,
         'C': 144,
         'D': 259,
         'E': 607,
         'F': 155,
         'G': 94,
         'H': 402,
         'I': 301,
         'J': 4,
         'K': 40,
         'L': 168,
         'M': 86,
         'N': 330,
         'O': 378,
         'P': 125,
         'Q': 73,
         'R': 295,
         'S': 301,
         'T': 489,
         'U': 144,
         'V': 85,
         'W': 142,
         'X': 91,
         'Y': 39,
         'Z': 9,
         '[': 2,
         '_': 3,
         'a': 1253,
         'b': 229,
         'c': 465,
         'd': 667,
         'e': 1883,
         'f': 578,
         'g': 361,
         'h': 953,
         'i': 1714,
         'j': 38,
         'k': 146,
         'l': 701,
         'm': 458,
         'n': 1113,
         'o': 1388,
         'p': 310,
         'q': 15,
         'r': 933,
         's': 980,
         't': 1780,
         'u': 540,
         'v': 180,
         'w': 361,
         'x': 67,
         'y': 318,
         'z': 24,
         '{': 13,
         '|': 17,
         '}': 20,
         '~': 4,
         '\x80': 483,
         '\x94': 8,
         '\x98': 306,
         '\x99': 74,
         '\x9c': 62,
         '\x9d': 20,
         '\xa2': 29,
         '\xa3': 13,
         '\xa5': 3,
         '\xa7': 27,
         '\xa9': 79,
         '\xab': 1,
         '\xb0': 1,
         '\xc2': 17,
         '\xc3': 5,
         '\xe2': 400},
 'E': {' ': 7,
       '"': 1,
       '1': 1,
       '2': 1,
       '5': 1,
       '6': 1,
       '8': 1,
       'A': 5,
       'B': 1,
       'D': 1,
       'Del': 531,
       'E': 2893,
       'H': 2,
       'I': 5,
       'L': 1,
       'M': 1,
       'N': 2,
       'O': 5,
       'R': 4,
       'S': 2,
       'T': 2,
       'Y': 1,
       'e': 1,
       'f': 1,
       'l': 1,
       'p': 1,
       'y': 1,
       '\xc2': 6},
 'F': {'B': 1,
       'C': 3,
       'D': 6,
       'Del': 110,
       'E': 1,
       'F': 1373,
       'G': 2,
       'H': 1,
       'K': 1,
       'M': 2,
       'N': 1,
       'P': 1,
       'R': 1,
       'S': 1,
       'T': 3,
       'U': 1,
       'W': 6,
       'X': 1,
       'c': 1,
       'e': 2,
       'f': 1,
       'm': 1,
       's': 1,
       't': 1,
       'w': 2,
       '\xc2': 1,
       '\xe2': 3},
 'G': {' ': 1,
       'C': 3,
       'D': 2,
       'Del': 93,
       'G': 1370,
       'H': 1,
       'M': 1,
       'P': 1,
       'T': 2,
       'W': 1,
       'X': 1,
       'e': 1,
       'h': 1},
 'H': {' ': 1,
       'A': 5,
       'B': 2,
       'Del': 358,
       'E': 3,
       'F': 1,
       'H': 2649,
       'I': 1,
       'N': 1,
       'O': 2,
       'P': 1,
       'Q': 1,
       'R': 1,
       'U': 3,
       'V': 1,
       'X': 1,
       'k': 1,
       'm': 1,
       'p': 1,
       'r': 1},
 'I': {' ': 1,
       '!': 497,
       '"': 3,
       '#': 159,
       '&': 1,
       "'": 91,
       '(': 1,
       '+': 3,
       ',': 2,
       '-': 1,
       '/': 13,
       '1': 84,
       '3': 4,
       '4': 43,
       '8': 3,
       '?': 32,
       'A': 3,
       'C': 7,
       'Del': 1196,
       'E': 25,
       'F': 45,
       'G': 2,
       'H': 65,
       'I': 1463,
       'J': 9,
       'L': 23,
       'N': 3,
       'O': 1,
       'P': 13,
       'R': 1,
       'S': 2,
       'T': 34,
       'U': 3,
       'V': 6,
       'W': 1,
       'X': 1,
       'Y': 4,
       '[': 11,
       'a': 1,
       'e': 2,
       'f': 384,
       'g': 2,
       'h': 2,
       'i': 915,
       'j': 15,
       'l': 13,
       'm': 1,
       'n': 1,
       'o': 3,
       'p': 1,
       's': 1,
       't': 352,
       'u': 2,
       'v': 1,
       'w': 4,
       'y': 3,
       '{': 968,
       '|': 2271,
       '}': 706,
       '\x98': 10,
       '\x9c': 1,
       '\xa5': 2,
       '\xc2': 30,
       '\xe2': 2},
 'J': {'Del': 3, 'J': 171, 'd': 11, 'p': 1},
 'K': {'8': 1, 'C': 4, 'Del': 45, 'H': 1, 'K': 570, 'R': 1, 'k': 5},
 'L': {'C': 1,
       'D': 2,
       'Del': 166,
       'E': 1,
       'F': 2,
       'H': 1,
       'L': 1630,
       'N': 2,
       'P': 1,
       'R': 1,
       'S': 1,
       'U': 1,
       'V': 1,
       'W': 1,
       'Y': 1,
       'j': 1,
       'q': 1,
       'r': 1,
       's': 2},
 'M': {' ': 1,
       'A': 1,
       'B': 1,
       'C': 1,
       'D': 1,
       'Del': 89,
       'G': 1,
       'M': 1476,
       'N': 1,
       'V': 1,
       'W': 1,
       'X': 1,
       'l': 1,
       'm': 1,
       'o': 1,
       't': 1},
 'N': {')': 1,
       '3': 1,
       'B': 1,
       'C': 1,
       'D': 2,
       'Del': 315,
       'F': 2,
       'G': 1,
       'H': 1,
       'M': 1,
       'N': 1793,
       'O': 2,
       'P': 2,
       'R': 1,
       'S': 1,
       'T': 1,
       'V': 2,
       'W': 1,
       'X': 1,
       'p': 1},
 'O': {' ': 2,
       '(': 1,
       '0': 1,
       'A': 2,
       'B': 1,
       'C': 8,
       'D': 1,
       'Del': 346,
       'G': 15,
       'H': 1,
       'I': 2,
       'N': 3,
       'O': 2010,
       'Q': 4,
       'R': 3,
       'S': 1,
       'U': 1,
       'V': 1,
       'W': 1,
       'X': 2,
       'e': 1,
       'o': 1,
       't': 1,
       '\xc2': 27},
 'P': {' ': 4,
       'A': 1,
       'D': 6,
       'Del': 102,
       'E': 1,
       'F': 1,
       'I': 1,
       'L': 2,
       'N': 4,
       'O': 1,
       'P': 1263,
       'Q': 2,
       'R': 3,
       'S': 1,
       'T': 6,
       'e': 2,
       'i': 1,
       's': 1,
       '\xe2': 2},
 'Q': {'C': 1,
       'Del': 70,
       'G': 23,
       'O': 1,
       'P': 3,
       'Q': 2348,
       'd': 1,
       'e': 2,
       'h': 2,
       'i': 1,
       'p': 1,
       's': 1,
       't': 1},
 'R': {' ': 6,
       '-': 1,
       '9': 1,
       'A': 9,
       'D': 6,
       'Del': 284,
       'F': 11,
       'H': 17,
       'N': 2,
       'R': 1905,
       'S': 3,
       'T': 1,
       'W': 1,
       'd': 1,
       'j': 1,
       'n': 2,
       'p': 1,
       'r': 1,
       's': 1},
 'S': {' ': 2,
       '1': 1,
       '4': 1,
       '6': 1,
       'D': 2,
       'Del': 259,
       'E': 1,
       'F': 1,
       'G': 4,
       'H': 1,
       'L': 1,
       'M': 1,
       'O': 1,
       'P': 3,
       'R': 1,
       'S': 4690,
       'T': 1,
       'U': 1,
       'V': 2,
       'W': 1,
       'X': 1,
       'a': 1,
       'b': 1,
       'd': 2,
       'f': 1,
       'g': 1,
       'h': 3,
       'i': 1,
       'l': 2,
       'm': 2,
       'o': 2,
       'p': 3,
       'r': 1,
       's': 3,
       't': 5,
       'u': 1,
       '|': 1,
       '\xe2': 1},
 'T': {' ': 2,
       'A': 2,
       'B': 3,
       'C': 4,
       'D': 3,
       'Del': 442,
       'E': 3,
       'F': 2,
       'H': 2,
       'L': 2,
       'N': 2,
       'O': 1,
       'P': 3,
       'R': 1,
       'S': 3,
       'T': 5109,
       'W': 3,
       'X': 2,
       'Y': 1,
       'Z': 1,
       'a': 1,
       'b': 1,
       'c': 2,
       'e': 4,
       'f': 4,
       'g': 2,
       'o': 2,
       'p': 1,
       'r': 1,
       't': 4,
       'w': 1,
       '\xe2': 1},
 'U': {' ': 2,
       ',': 1,
       'C': 1,
       'Del': 137,
       'F': 1,
       'H': 2,
       'O': 1,
       'R': 1,
       'S': 1,
       'U': 575},
 'V': {' ': 1,
       'A': 3,
       'B': 1,
       'C': 1,
       'D': 2,
       'Del': 68,
       'F': 2,
       'G': 2,
       'I': 2,
       'K': 1,
       'M': 1,
       'P': 3,
       'S': 2,
       'T': 2,
       'U': 1,
       'V': 434,
       'W': 1},
 'W': {' ': 1,
       'A': 1,
       'C': 2,
       'D': 1,
       'Del': 132,
       'F': 1,
       'G': 1,
       'H': 1,
       'L': 4,
       'N': 1,
       'O': 1,
       'R': 2,
       'T': 4,
       'W': 1328,
       'X': 1,
       'e': 2,
       'f': 2,
       'i': 2,
       'r': 1,
       's': 1,
       't': 2,
       'y': 1,
       '{': 1},
 'X': {'.': 1,
       'A': 4,
       'Del': 88,
       'G': 1,
       'H': 1,
       'L': 1,
       'M': 1,
       'N': 1,
       'O': 3,
       'R': 1,
       'T': 1,
       'V': 1,
       'W': 1,
       'X': 388,
       'x': 1},
 'Y': {'A': 1, 'Del': 32, 'E': 3, 'L': 1, 'P': 1, 'V': 2, 'Y': 349},
 'Z': {'Del': 12, 'L': 1, 'Z': 149},
 '[': {'[': 1},
 ']': {']': 1},
 '_': {' ': 3, '"': 1, '(': 1, '.': 1, 'Del': 41, '_': 146, 'y': 1, '\xe2': 2},
 'a': {' ': 7,
       '!': 1,
       '&': 6,
       "'": 1,
       ',': 1,
       '.': 1,
       '2': 23,
       '3': 1,
       '4': 2,
       '8': 1,
       '9': 1,
       '@': 21,
       'A': 1,
       'B': 1,
       'D': 2,
       'Del': 1246,
       '_': 1,
       'a': 143526,
       'b': 6,
       'c': 4,
       'd': 5,
       'e': 31,
       'f': 5,
       'h': 11,
       'i': 9,
       'j': 1,
       'k': 1,
       'l': 4,
       'm': 4,
       'n': 7,
       'o': 15,
       'p': 4,
       'r': 1,
       's': 11,
       't': 6,
       'u': 2,
       'v': 1,
       'w': 3,
       'y': 5,
       'z': 1,
       '\x98': 1,
       '\x9c': 1,
       '\xe2': 1},
 'b': {' ': 2,
       ',': 1,
       '0': 1,
       '2': 1,
       '3': 1,
       'B': 46,
       'D': 4,
       'Del': 283,
       'L': 1,
       'R': 1,
       'S': 1,
       'a': 3,
       'b': 23929,
       'c': 2,
       'd': 2,
       'e': 4,
       'f': 6,
       'h': 4,
       'i': 4,
       'l': 3,
       'm': 3,
       'n': 1,
       'o': 7,
       'p': 6,
       'r': 1,
       's': 5,
       't': 6,
       'u': 2,
       'v': 2,
       'w': 2,
       'y': 3,
       'z': 1,
       '\xe2': 2},
 'c': {' ': 2,
       '0': 10,
       '2': 1,
       '6': 16,
       '8': 2,
       'C': 34,
       'Del': 489,
       'G': 13,
       'P': 1,
       'S': 6,
       'V': 1,
       'Z': 1,
       'a': 3,
       'b': 1,
       'c': 38314,
       'd': 6,
       'e': 17,
       'f': 1,
       'g': 14,
       'h': 4,
       'i': 2,
       'k': 1,
       'l': 8,
       'm': 3,
       'n': 4,
       'o': 3,
       'p': 5,
       'r': 2,
       's': 14,
       't': 9,
       'w': 3,
       'y': 2,
       '\xc2': 16,
       '\xe2': 1},
 'd': {' ': 5,
       '!': 1,
       "'": 1,
       ',': 4,
       '-': 1,
       ':': 1,
       'Del': 834,
       'G': 12,
       'H': 1,
       'I': 1,
       'a': 15,
       'b': 2,
       'c': 123,
       'd': 75609,
       'e': 22,
       'f': 5,
       'g': 67,
       'h': 5,
       'i': 6,
       'j': 3,
       'k': 5,
       'l': 14,
       'm': 7,
       'n': 8,
       'o': 10,
       'p': 5,
       'r': 16,
       's': 23,
       't': 18,
       'u': 2,
       'v': 1,
       'w': 8,
       'y': 5,
       '}': 1,
       '\x98': 1,
       '\x9d': 1,
       '\xc2': 1,
       '\xe2': 1},
 'e': {' ': 4,
       '"': 1,
       ',': 10,
       '.': 2,
       '2': 4,
       '4': 1,
       '8': 5,
       'B': 1,
       'C': 1,
       'Del': 1867,
       'F': 1,
       'G': 2,
       'K': 1,
       'S': 1,
       'a': 79,
       'b': 1,
       'c': 16,
       'd': 7,
       'e': 213779,
       'f': 2,
       'g': 11,
       'h': 5,
       'i': 41,
       'l': 5,
       'n': 5,
       'o': 161,
       'p': 4,
       'r': 4,
       's': 63,
       't': 11,
       'u': 2,
       'v': 1,
       'w': 4,
       'y': 10,
       'z': 1,
       '{': 1,
       '\xc2': 4,
       '\xc3': 36},
 'f': {' ': 4,
       '!': 6,
       "'": 2,
       '.': 1,
       '4': 1,
       'A': 1,
       'D': 1,
       'Del': 624,
       'F': 1,
       'G': 1,
       'H': 4,
       'M': 1,
       'S': 1,
       'a': 9,
       'b': 5,
       'c': 6,
       'd': 7,
       'e': 1,
       'f': 39309,
       'g': 1,
       'h': 3,
       'i': 89,
       'j': 4,
       'k': 1,
       'l': 118,
       'm': 5,
       'n': 5,
       'o': 3,
       'p': 1,
       'r': 3,
       's': 6,
       't': 200,
       'u': 4,
       'v': 3,
       'w': 1,
       'y': 1,
       '{': 15,
       '|': 1,
       '\xe2': 8},
 'g': {' ': 4,
       ',': 2,
       '0': 1,
       '9': 1,
       'Del': 351,
       'G': 2,
       'S': 1,
       'T': 1,
       'a': 10,
       'c': 6,
       'd': 5,
       'e': 7,
       'f': 2,
       'g': 32680,
       'i': 1,
       'k': 3,
       'l': 1,
       'm': 1,
       'n': 2,
       'o': 3,
       'p': 1,
       'q': 5,
       's': 8,
       't': 4,
       'v': 1,
       'w': 1,
       'y': 4,
       'z': 1,
       '\xc2': 1,
       '\xe2': 1},
 'h': {' ': 4,
       '!': 3,
       ',': 2,
       '.': 2,
       '1': 1,
       '3': 1,
       ';': 2,
       'D': 1,
       'Del': 1070,
       'F': 17,
       'H': 23,
       'K': 6,
       'N': 4,
       'P': 3,
       'R': 18,
       'V': 1,
       'a': 7,
       'b': 44,
       'c': 1,
       'd': 5,
       'e': 8,
       'f': 35,
       'h': 120813,
       'i': 8,
       'j': 2,
       'k': 6,
       'l': 4,
       'm': 2,
       'n': 5,
       'o': 20,
       'p': 1,
       'r': 13,
       's': 11,
       't': 26,
       'u': 2,
       'v': 4,
       'w': 7,
       'y': 5,
       '}': 1,
       '\x94': 1,
       '\xe2': 3},
 'i': {' ': 7,
       '!': 3,
       '"': 1,
       '#': 39,
       "'": 1,
       '(': 1,
       ',': 4,
       '.': 1,
       '8': 1,
       'B': 1,
       'Del': 1413,
       'H': 5,
       'I': 11,
       'M': 1,
       'R': 1,
       'S': 2,
       'a': 13,
       'c': 8,
       'd': 1,
       'e': 21,
       'f': 69,
       'g': 2,
       'h': 4,
       'i': 110644,
       'j': 3,
       'k': 1,
       'l': 156,
       'm': 5,
       'n': 5,
       'o': 13,
       'p': 3,
       'r': 7,
       's': 7,
       't': 38,
       'u': 6,
       'v': 1,
       'w': 1,
       'y': 5,
       '{': 1,
       '|': 1,
       '}': 1,
       '\x98': 1,
       '\x99': 1,
       '\xc3': 1},
 'j': {' ': 1, 'Del': 48, 'J': 7, '[': 1, 'c': 2, 'i': 4, 'j': 1775, 't': 1},
 'k': {' ': 1,
       '.': 1,
       'Del': 147,
       'K': 34,
       'a': 2,
       'c': 1,
       'd': 6,
       'e': 2,
       'g': 2,
       'i': 3,
       'k': 12035,
       'm': 1,
       'o': 1,
       'r': 2,
       's': 1,
       't': 5,
       'v': 1,
       'w': 2,
       'y': 2},
 'l': {' ': 5,
       '!': 131,
       "'": 1,
       '(': 1,
       ',': 1,
       '.': 3,
       '0': 1,
       '1': 2,
       ':': 1,
       ';': 2,
       'Del': 1616,
       'H': 7,
       'I': 6,
       'J': 7,
       'N': 1,
       'T': 3,
       '[': 1,
       'a': 10,
       'b': 3,
       'c': 6,
       'd': 2,
       'e': 3,
       'f': 878,
       'g': 2,
       'h': 7,
       'i': 776,
       'j': 21,
       'k': 16,
       'l': 60966,
       'm': 5,
       'n': 7,
       'o': 13,
       'p': 3,
       'r': 5,
       's': 9,
       't': 898,
       'v': 3,
       'w': 2,
       'y': 5,
       '{': 4,
       '}': 5,
       '\x98': 2,
       '\xc3': 1,
       '\xe2': 4},
 'm': {' ': 8,
       ',': 2,
       '.': 1,
       'Del': 437,
       'I': 3,
       'Q': 1,
       'S': 3,
       'a': 6,
       'b': 2,
       'c': 3,
       'd': 3,
       'e': 7,
       'f': 3,
       'g': 1,
       'h': 4,
       'i': 14,
       'k': 1,
       'l': 1,
       'm': 43720,
       'n': 17,
       'o': 1,
       'p': 1,
       'r': 12,
       's': 7,
       't': 20,
       'u': 1,
       'v': 1,
       'w': 3,
       'y': 1,
       'z': 1,
       '|': 2,
       '\xe2': 1},
 'n': {' ': 7,
       '!': 2,
       "'": 9,
       ',': 2,
       '.': 1,
       ':': 1,
       'Del': 1255,
       'F': 1,
       'T': 1,
       'W': 1,
       'a': 8,
       'b': 2,
       'c': 3,
       'd': 6,
       'e': 7,
       'f': 26,
       'g': 2,
       'h': 19,
       'i': 7,
       'k': 1,
       'l': 10,
       'm': 14,
       'n': 120129,
       'o': 5,
       'p': 2,
       'r': 299,
       's': 13,
       't': 16,
       'u': 5,
       'v': 9,
       'w': 4,
       'y': 1,
       '|': 1,
       '\xe2': 1},
 'o': {' ': 6,
       '!': 1,
       '$': 23,
       "'": 3,
       ',': 2,
       '.': 1,
       '0': 14,
       '1': 10,
       '3': 6,
       '5': 32,
       '6': 2,
       '8': 23,
       '9': 5,
       '<': 1,
       'Del': 1474,
       'F': 1,
       'G': 1,
       'J': 1,
       'O': 13,
       'S': 4,
       'a': 135,
       'b': 2,
       'c': 30,
       'd': 12,
       'e': 85,
       'f': 6,
       'g': 6,
       'h': 12,
       'i': 16,
       'j': 1,
       'k': 2,
       'l': 4,
       'm': 4,
       'n': 2,
       'o': 142403,
       'p': 3,
       'q': 1,
       'r': 4,
       's': 6,
       't': 6,
       'u': 5,
       'w': 6,
       'y': 3,
       '{': 4,
       '|': 1,
       '\x9d': 2,
       '\xc2': 4,
       '\xc3': 1,
       '\xe2': 5},
 'p': {' ': 2,
       '1': 1,
       '8': 2,
       'B': 2,
       'D': 1,
       'Del': 364,
       'F': 1,
       'P': 5,
       'a': 5,
       'b': 1,
       'c': 3,
       'd': 1,
       'e': 6,
       'f': 7,
       'g': 3,
       'h': 5,
       'i': 2,
       'j': 2,
       'l': 15,
       'm': 4,
       'n': 9,
       'o': 32,
       'p': 24971,
       'q': 1,
       'r': 4,
       's': 1,
       't': 10,
       'u': 1},
 'q': {'Del': 21,
       'G': 1,
       'Q': 3,
       'S': 1,
       'a': 4,
       'b': 1,
       'c': 4,
       'e': 1,
       'g': 95,
       'o': 2,
       'p': 1,
       'q': 1615,
       'r': 1,
       'w': 3},
 'r': {' ': 6,
       "'": 2,
       ',': 5,
       '.': 4,
       '9': 1,
       ';': 1,
       'Del': 1062,
       'F': 14,
       'O': 1,
       'T': 6,
       'W': 2,
       'a': 9,
       'b': 3,
       'c': 7,
       'd': 5,
       'e': 6,
       'f': 49,
       'g': 1,
       'h': 5,
       'i': 8,
       'k': 1,
       'l': 4,
       'm': 135,
       'n': 23,
       'o': 5,
       'p': 2,
       'r': 96251,
       's': 9,
       't': 158,
       'u': 4,
       'v': 3,
       'w': 4,
       'x': 1,
       'y': 5,
       '\x9d': 1,
       '\xe2': 1},
 's': {' ': 1,
       '"': 1,
       '$': 4,
       ',': 4,
       '.': 1,
       '0': 14,
       '1': 1,
       '2': 9,
       '3': 2,
       '5': 12,
       '8': 10,
       '9': 1,
       ';': 1,
       'Del': 1081,
       'E': 1,
       'I': 1,
       'M': 1,
       'R': 1,
       'S': 177,
       'a': 13,
       'b': 1,
       'c': 11,
       'd': 8,
       'e': 28,
       'f': 10,
       'g': 32,
       'h': 5,
       'i': 3,
       'j': 1,
       'k': 1,
       'l': 10,
       'm': 7,
       'n': 15,
       'o': 7,
       'p': 1,
       'r': 10,
       's': 108423,
       't': 10,
       'u': 1,
       'v': 1,
       'w': 4,
       'y': 3,
       '}': 2,
       '\xe2': 1},
 't': {' ': 3,
       '!': 11,
       "'": 6,
       ',': 3,
       '.': 1,
       '0': 2,
       '1': 2,
       '9': 1,
       'C': 1,
       'Del': 1614,
       'H': 3,
       'L': 1,
       'S': 2,
       'T': 1,
       'U': 1,
       'a': 9,
       'b': 4,
       'c': 6,
       'd': 15,
       'e': 9,
       'f': 918,
       'g': 4,
       'h': 9,
       'i': 570,
       'j': 1,
       'k': 2,
       'l': 240,
       'm': 5,
       'n': 12,
       'o': 5,
       'p': 5,
       'r': 8,
       's': 14,
       't': 160207,
       'u': 3,
       'v': 6,
       'w': 8,
       'x': 1,
       'y': 8,
       '{': 12,
       '\x99': 1,
       '\xc3': 1,
       '\xe2': 4},
 'u': {' ': 7,
       'Del': 557,
       'G': 1,
       'S': 1,
       'T': 1,
       'U': 7,
       'a': 29,
       'c': 2,
       'd': 5,
       'e': 5,
       'f': 8,
       'g': 3,
       'h': 3,
       'i': 21,
       'j': 1,
       'k': 2,
       'l': 2,
       'm': 2,
       'n': 1,
       'o': 11,
       'p': 1,
       'r': 4,
       's': 3,
       't': 16,
       'u': 48929,
       'v': 6,
       'w': 14,
       'x': 1,
       'y': 4,
       '|': 1,
       '\x9d': 1,
       '\xe2': 1},
 'v': {' ': 3,
       ':': 1,
       'Del': 159,
       'R': 1,
       'V': 1,
       'a': 1,
       'd': 2,
       'f': 3,
       'g': 1,
       'h': 1,
       'i': 2,
       'l': 1,
       'm': 1,
       'n': 4,
       'o': 2,
       'p': 1,
       'r': 4,
       's': 2,
       'v': 17953,
       'y': 9,
       '|': 1},
 'w': {' ': 2,
       ',': 2,
       'A': 2,
       'Del': 364,
       'F': 1,
       'O': 1,
       'R': 1,
       'S': 1,
       'W': 2,
       'a': 6,
       'b': 5,
       'c': 3,
       'd': 4,
       'e': 2,
       'f': 3,
       'g': 3,
       'h': 3,
       'i': 7,
       'l': 1,
       'm': 2,
       'n': 1,
       'o': 1,
       'p': 1,
       'r': 7,
       's': 8,
       't': 6,
       'u': 2,
       'v': 12,
       'w': 38622,
       'y': 7,
       '|': 2,
       '\xe2': 1},
 'x': {'+': 1,
       'Del': 62,
       'a': 2,
       'd': 1,
       'g': 1,
       'h': 1,
       'n': 1,
       'p': 1,
       'r': 2,
       't': 1,
       'u': 1,
       'x': 3983},
 'y': {'#': 1,
       '5': 1,
       '?': 1,
       'D': 1,
       'Del': 406,
       'J': 1,
       'L': 1,
       'S': 1,
       'Y': 5,
       'a': 1,
       'c': 2,
       'd': 2,
       'e': 4,
       'g': 1,
       'h': 3,
       'i': 7,
       'j': 1,
       'k': 1,
       'l': 1,
       'm': 4,
       'n': 4,
       'o': 2,
       'p': 3,
       'r': 5,
       's': 7,
       't': 5,
       'y': 32354,
       '{': 2,
       '}': 2,
       '\xe2': 4},
 'z': {'Del': 9,
       'd': 1,
       'f': 1,
       'g': 2,
       'n': 4,
       'o': 1,
       'r': 2,
       's': 3,
       'x': 2,
       'z': 1179},
 '\x80': {'Del': 3029, '\x80': 11393},
 '\x81': {'Del': 21},
 '\x86': {'A': 4, 'Del': 5, '\xe2': 1},
 '\x91': {'Del': 8},
 '\x92': {'C': 1},
 '\x93': {'a': 1, 'e': 1, 'o': 9},
 '\x94': {'Del': 18, '\x94': 398, '\x9c': 1},
 '\x98': {'Del': 53, 'i': 1, '\x98': 386, '\x9c': 16},
 '\x99': {' ': 1,
          "'": 1,
          ',': 1,
          'Del': 1700,
          '\x98': 8,
          '\x99': 1263,
          '\x9c': 6,
          '\x9d': 25},
 '\x9a': {'Del': 6},
 '\x9c': {"'": 2,
          ',': 1,
          'Del': 486,
          'O': 1,
          'i': 1,
          '\x98': 283,
          '\x9c': 4542,
          '\x9d': 1},
 '\x9d': {' ': 1,
          ',': 1,
          'Del': 769,
          'e': 2,
          'i': 1,
          'j': 2,
          'l': 1,
          'm': 1,
          'p': 2,
          'r': 1,
          '\x84': 1,
          '\x94': 1,
          '\x99': 19,
          '\x9d': 4425},
 '\xa0': {'Del': 1},
 '\xa1': {'Del': 11, '\xa9': 1},
 '\xa6': {'Del': 2, 'a': 11, 'e': 4},
 '\xa8': {'Del': 1},
 '\xa9': {'Del': 2, '\xa9': 14},
 '\xab': {'\xa9': 1},
 '\xad': {'Del': 15},
 '\xb1': {'Del': 91,
          'a': 1,
          'f': 596,
          'i': 3,
          'j': 3,
          'n': 5,
          'r': 31,
          't': 1},
 '\xb3': {'Del': 4, '\xa9': 1},
 '\xb9': {'Del': 2},
 '\xc3': {'&': 1,
          'A': 21,
          'Del': 676,
          'E': 5,
          'F': 1,
          'N': 6,
          'U': 6,
          'a': 15,
          'd': 3,
          'e': 1,
          'f': 44,
          'g': 2,
          'h': 6,
          'i': 15,
          'l': 1,
          'n': 13,
          'o': 3,
          't': 10,
          '\xc3': 17},
 '\xc5': {'Del': 12},
 '\xe2': {' ': 4,
          '!': 4,
          '"': 847,
          '#': 1,
          "'": 1160,
          '*': 24,
          '+': 12,
          ',': 2,
          '-': 5,
          '1': 22,
          '4': 93,
          ':': 1,
          '<': 2,
          '=': 1,
          '?': 3,
          'Del': 754,
          'L': 1,
          'P': 1,
          'S': 1,
          'Z': 1,
          'a': 3,
          'b': 1,
          'd': 1,
          'f': 12,
          'h': 2,
          'i': 10,
          'j': 3,
          'l': 3,
          'm': 1,
          'n': 1,
          'o': 1,
          'r': 1,
          's': 1,
          't': 10,
          'u': 2,
          'w': 2,
          '{': 21,
          '}': 9,
          '~': 2,
          '\xab': 1,
          '\xb0': 2,
          '\xe2': 11394}}
#('final total letter count:', 2330901)