# Approximations to English

See section 3 of [A Mathematical Theory of Communication by Claude E. Shannon](https://people.math.harvard.edu/~ctm/home/text/others/shannon/entropy/entropy.pdf).


In [1]:
# Imports.

# Selecting random items from lists.
import random

# Efficient data structures.
import collections

## Zero Order Letter Approximation

Create strings by selecting random characters from a list of characters.


### random
https://docs.python.org/3/library/random.html
> This module implements pseudo-random number generators for various distributions.


### random.choice

https://docs.python.org/3/library/random.html#random.choice

Select a random element from a list.  
By random, we mean all elements have equal chance of selection.

In [2]:
# Example: select a random element of a list.
random.choice([1, 2, 3, 4, 5])

3

In [3]:
# Select a random character.
random.choice('abcdefghijklmnopqrstuvwxyz .')

'y'

### random.choices

https://docs.python.org/3/library/random.html#random.choices

> Return a k sized list of elements chosen from the population with replacement.  
> If a weights sequence is specified, selections are made according to the relative weights.

In [4]:
# Select a sequence of characters using equal weights.
''.join(random.choices('abcdefghijklmnopqrstuvwxyz .', k=100))

'hev daezqpx n.cxstfdgnzius hg qiv g..czmfrev vueuibo birdxlfhjshw.qjybsvvyqgoqehstnopuhhplwhtlbndeef'

## First Order Letter Approximation

### Reading Text Files

The following was adapted from a response from ChatGPT.  
https://chatgpt.com/share/66ffdf0f-4094-800d-9ae9-63ffb9b20043

### Open File for Reading

We'll use [Frankenstein](https://www.gutenberg.org/ebooks/84) from [Project Gutenberg](https://www.gutenberg.org/).

In [5]:
# Using a with https://peps.python.org/pep-0343/
with open('data/frankenstein.txt', 'r') as file:
  # Read the whole file into a string.
  english = file.read()

In [6]:
# Change everything to lower case.
english = english.lower()

In [7]:
# The characters to keep.
keep = 'abcdefghijklmnopqrstuvwxyz .'

# Remove unwanted characters.
cleaned = ''.join(c for c in english if c in keep)

https://docs.python.org/3/library/collections.html#collections.Counter

In [8]:
# Count the frequency of each character.
# https://docs.python.org/3/library/collections.html#collections.Counter
counts = collections.Counter(cleaned)

In [9]:
# Show.
counts

Counter({' ': 71747,
         'e': 46094,
         't': 30379,
         'a': 26743,
         'o': 25254,
         'i': 24577,
         'n': 24359,
         's': 21173,
         'r': 20876,
         'h': 19763,
         'd': 16858,
         'l': 12722,
         'm': 10545,
         'u': 10412,
         'c': 9275,
         'f': 8722,
         'y': 7923,
         'w': 7653,
         'p': 6134,
         'g': 5980,
         'b': 5021,
         'v': 3829,
         '.': 3145,
         'k': 1760,
         'x': 677,
         'j': 502,
         'q': 324,
         'z': 213})

In [10]:
# Print the results
for char, count in counts.items():
  print(f"'{char}': {count}")


't': 30379
'h': 19763
'e': 46094
' ': 71747
'p': 6134
'r': 20876
'o': 25254
'j': 502
'c': 9275
'g': 5980
'u': 10412
'n': 24359
'b': 5021
'k': 1760
'f': 8722
'a': 26743
's': 21173
'i': 24577
'm': 10545
'd': 16858
'y': 7923
'w': 7653
'l': 12722
'v': 3829
'.': 3145
'z': 213
'x': 677
'q': 324


In [11]:
L = [[l, c] for l, c in counts.items()]
L.sort(key=lambda x: x[1], reverse=True)
for l, c in L:
  print(f"'{l}': {c}")

' ': 71747
'e': 46094
't': 30379
'a': 26743
'o': 25254
'i': 24577
'n': 24359
's': 21173
'r': 20876
'h': 19763
'd': 16858
'l': 12722
'm': 10545
'u': 10412
'c': 9275
'f': 8722
'y': 7923
'w': 7653
'p': 6134
'g': 5980
'b': 5021
'v': 3829
'.': 3145
'k': 1760
'x': 677
'j': 502
'q': 324
'z': 213


## Dictionaries

In [12]:
counts

Counter({' ': 71747,
         'e': 46094,
         't': 30379,
         'a': 26743,
         'o': 25254,
         'i': 24577,
         'n': 24359,
         's': 21173,
         'r': 20876,
         'h': 19763,
         'd': 16858,
         'l': 12722,
         'm': 10545,
         'u': 10412,
         'c': 9275,
         'f': 8722,
         'y': 7923,
         'w': 7653,
         'p': 6134,
         'g': 5980,
         'b': 5021,
         'v': 3829,
         '.': 3145,
         'k': 1760,
         'x': 677,
         'j': 502,
         'q': 324,
         'z': 213})

In [13]:
counts.keys()

dict_keys(['t', 'h', 'e', ' ', 'p', 'r', 'o', 'j', 'c', 'g', 'u', 'n', 'b', 'k', 'f', 'a', 's', 'i', 'm', 'd', 'y', 'w', 'l', 'v', '.', 'z', 'x', 'q'])

In [14]:
counts.values()

dict_values([30379, 19763, 46094, 71747, 6134, 20876, 25254, 502, 9275, 5980, 10412, 24359, 5021, 1760, 8722, 26743, 21173, 24577, 10545, 16858, 7923, 7653, 12722, 3829, 3145, 213, 677, 324])

https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects

In [15]:
chars, weights = counts.keys(), counts.values()

In [16]:
list(chars)

['t',
 'h',
 'e',
 ' ',
 'p',
 'r',
 'o',
 'j',
 'c',
 'g',
 'u',
 'n',
 'b',
 'k',
 'f',
 'a',
 's',
 'i',
 'm',
 'd',
 'y',
 'w',
 'l',
 'v',
 '.',
 'z',
 'x',
 'q']

In [17]:
weights

dict_values([30379, 19763, 46094, 71747, 6134, 20876, 25254, 502, 9275, 5980, 10412, 24359, 5021, 1760, 8722, 26743, 21173, 24577, 10545, 16858, 7923, 7653, 12722, 3829, 3145, 213, 677, 324])

In [18]:
s = ''.join(random.choices(list(chars), weights=weights, k=1000))

s

'ad u oe i ilioehutegro pocseaeicordr o bh   ml rt itipweedtnh yhroie erhis tsngssaimsr.nfubr has tegr dlctfrlpr r itas  enetlabt  e mainyeicade elurt ea gmi nnsfueesetercyntdplcer igtui r dorreonfhi  mieesetl  setnneinene i wgttawiubtitc h b tdmer ncoeraedfaoopwnhb iiaum t oumhr d.exneezehoemi   ttksccrnboiltgtarqcatidovhmyd jlam ednnlgaeiaruf atlssu iewt rd agu aao vbw vea    biioehteledtufldn igcme  oty   beyn dtgcoyktesd n g widiydaslaticehe oneceeiw rfc  orrg st     lttlfrayivrdovihpohnhdt   c etteiisusdmooib er oaa it.epipdbkmld jeanspj ltleohhetabtr  rror mnlcumwfiad d uwhierrhdahpshguhfei  d e.afltyigosoth hsts  jd eefoo sporaph osn .  sln  ed slrs ribpt ssaha dehteeia iq aoo  ev hogolrivceos dtetoeoiuipobaecaattaeyfs e  eo i ai s hihka o itelvldie o mditdy.muaditst rie o hklueah g msee no rhn fgt cdf  ottsti eetauycrln  ft ee.tlatynai   c  ltvdynbtilhlktiti  v nhw tdtnisifihreaguiaol  erhidnehpgntohaoearstneu g .vhohor u  fosbl oar  iedmpe on nt  nmi mne ers b  nt sy tdhftelsa

In [19]:
collections.Counter(s)

Counter({' ': 176,
         'e': 97,
         't': 79,
         'i': 72,
         'o': 57,
         'a': 55,
         'r': 50,
         'h': 44,
         's': 44,
         'd': 43,
         'n': 43,
         'l': 36,
         'u': 24,
         'c': 23,
         'm': 22,
         'g': 21,
         'f': 20,
         'b': 18,
         'p': 16,
         'y': 16,
         'w': 11,
         'v': 11,
         '.': 8,
         'k': 6,
         'j': 4,
         'q': 2,
         'x': 1,
         'z': 1})

## Second Order Letter Approximation

In [20]:
# Create empty dcitionary.
model2 = {}

model2

{}

In [21]:
list(set(cleaned))

['q',
 'o',
 'd',
 'k',
 'l',
 't',
 'v',
 'j',
 'e',
 'y',
 'a',
 'm',
 '.',
 'g',
 'r',
 'i',
 's',
 ' ',
 'u',
 'n',
 'x',
 'h',
 'f',
 'p',
 'b',
 'w',
 'z',
 'c']

In [22]:
for i in range(1, len(cleaned)):
    digram = cleaned[i-1:i+1]
    model2[digram] = model2.get(digram, 0) + 1

In [23]:
model2

{'th': 9681,
 'he': 8466,
 'e ': 13166,
 ' p': 2072,
 'pr': 1021,
 'ro': 1747,
 'oj': 94,
 'je': 180,
 'ec': 1238,
 'ct': 919,
 't ': 6534,
 ' g': 914,
 'gu': 355,
 'ut': 1377,
 'te': 3154,
 'en': 4368,
 'nb': 124,
 'be': 1829,
 'er': 5612,
 'rg': 226,
 'g ': 1604,
 ' e': 1882,
 'eb': 92,
 'bo': 360,
 'oo': 678,
 'ok': 224,
 'k ': 513,
 ' o': 4663,
 'of': 2923,
 'f ': 3033,
 ' f': 3011,
 'fr': 722,
 'ra': 1384,
 'an': 5870,
 'nk': 194,
 'ke': 560,
 'ns': 1133,
 'st': 2659,
 'ei': 628,
 'in': 5714,
 'n ': 5051,
 'or': 2950,
 'r ': 3767,
 ' t': 10173,
 ' m': 4910,
 'mo': 990,
 'od': 368,
 'de': 2467,
 'rn': 505,
 'om': 1493,
 'me': 2702,
 'et': 1209,
 'eu': 39,
 'us': 1090,
 's ': 6929,
 '  ': 335,
 'hi': 2569,
 'is': 2776,
 ' i': 6085,
 'fo': 1375,
 ' u': 778,
 'se': 2793,
 ' a': 8274,
 'ny': 298,
 'yo': 1062,
 'on': 3910,
 'ne': 2161,
 'yw': 26,
 'wh': 1658,
 're': 5621,
 'un': 1371,
 'ni': 735,
 'it': 2787,
 'ed': 5027,
 'd ': 9810,
 ' s': 4334,
 'ta': 1062,
 'at': 3931,
 'es': 3378,


In [24]:
model2b = collections.Counter([cleaned[i-1:i+1] for i in range(1, len(cleaned))])

In [25]:
model2b

Counter({'e ': 13166,
         ' t': 10173,
         'd ': 9810,
         'th': 9681,
         'he': 8466,
         ' a': 8274,
         's ': 6929,
         't ': 6534,
         ' i': 6085,
         'an': 5870,
         'in': 5714,
         're': 5621,
         'er': 5612,
         'y ': 5171,
         ' w': 5138,
         'n ': 5051,
         'ed': 5027,
         ' m': 4910,
         'nd': 4891,
         ' o': 4663,
         'en': 4368,
         ' s': 4334,
         ' h': 4152,
         'at': 3931,
         'on': 3910,
         'r ': 3767,
         'ou': 3631,
         'ha': 3402,
         'es': 3378,
         'te': 3154,
         ' b': 3102,
         'f ': 3033,
         'to': 3030,
         ' f': 3011,
         'o ': 2951,
         'or': 2950,
         'of': 2923,
         'se': 2793,
         'it': 2787,
         'is': 2776,
         'me': 2702,
         ' c': 2697,
         'ea': 2673,
         'st': 2659,
         've': 2659,
         'ar': 2657,
         'as': 2636,
         'i

In [26]:
# Start the string.
gen2 = ''.join(random.choices(list(chars), weights=weights, k=1))

# Show.
gen2

'e'

In [27]:
i = 0
gen2[i]

'e'

In [28]:
model2b.keys()

dict_keys(['th', 'he', 'e ', ' p', 'pr', 'ro', 'oj', 'je', 'ec', 'ct', 't ', ' g', 'gu', 'ut', 'te', 'en', 'nb', 'be', 'er', 'rg', 'g ', ' e', 'eb', 'bo', 'oo', 'ok', 'k ', ' o', 'of', 'f ', ' f', 'fr', 'ra', 'an', 'nk', 'ke', 'ns', 'st', 'ei', 'in', 'n ', 'or', 'r ', ' t', ' m', 'mo', 'od', 'de', 'rn', 'om', 'me', 'et', 'eu', 'us', 's ', '  ', 'hi', 'is', ' i', 'fo', ' u', 'se', ' a', 'ny', 'yo', 'on', 'ne', 'yw', 'wh', 're', 'un', 'ni', 'it', 'ed', 'd ', ' s', 'ta', 'at', 'es', 'nd', 'dm', 'os', 'ot', 'pa', 'ar', 'rt', 'ts', ' w', 'wo', 'rl', 'ld', ' n', 'no', 'o ', ' c', 'co', 'wi', 'h ', 'al', 'lm', ' r', 'tr', 'ri', 'ic', 'ti', 'io', 'sw', 'ha', 'so', 'oe', 'ev', 've', 'r.', '. ', ' y', 'ou', 'u ', 'ma', 'ay', 'y ', 'op', 'py', 'gi', 'iv', 'aw', 'wa', 'rm', 'ms', ' l', 'li', 'ce', 'nc', 'cl', 'lu', 'ud', 'nl', 'ea', 'ww', 'w.', '.g', 'g.', '.o', 'if', 'lo', 'oc', 'ca', 'sy', 'il', 'll', 'l ', ' h', 'av', 'to', 'ch', 'ck', 'la', 'ws', 'nt', 'ry', 'db', 'ef', 'si', 'ng', 'k.', '.t',

In [29]:
[(x[1], model2b[x]) for x in model2b.keys() if x[0] == gen2[i]]

[(' ', 13166),
 ('c', 1238),
 ('n', 4368),
 ('r', 5612),
 ('b', 92),
 ('i', 628),
 ('t', 1209),
 ('u', 39),
 ('d', 5027),
 ('s', 3378),
 ('v', 889),
 ('a', 2673),
 ('f', 582),
 ('l', 1778),
 ('y', 477),
 ('m', 999),
 ('e', 1228),
 ('j', 22),
 ('g', 218),
 ('z', 14),
 ('k', 67),
 ('.', 660),
 ('x', 530),
 ('p', 520),
 ('q', 75),
 ('w', 347),
 ('o', 115),
 ('h', 143)]

In [30]:
letters, weights = list(zip(*[(x[1], model2b[x]) for x in model2b.keys() if x[0] == gen2[i]]))
print(letters)
print(weights)

(' ', 'c', 'n', 'r', 'b', 'i', 't', 'u', 'd', 's', 'v', 'a', 'f', 'l', 'y', 'm', 'e', 'j', 'g', 'z', 'k', '.', 'x', 'p', 'q', 'w', 'o', 'h')
(13166, 1238, 4368, 5612, 92, 628, 1209, 39, 5027, 3378, 889, 2673, 582, 1778, 477, 999, 1228, 22, 218, 14, 67, 660, 530, 520, 75, 347, 115, 143)


## First Order Word Approximation

## Second Order Word Approximation

## End