# Chapter 4 : cryptocipher

4.21: given a message, create a dictionary mapping the letters in the message to the alphabet

In [8]:
import os
import random
import re
import string

import numpy as np
import requests

In [9]:
def map_letters(message : str):
    """Build a one-to-one mapping from the 26 lowercase letters onto the alphabet.

    Letters are ordered by first appearance in ``message`` (compared
    case-insensitively), followed by the letters the message never uses in
    alphabetical order. The i-th letter in that order is mapped to the i-th
    letter of the alphabet.

    Characters that are not ASCII letters (spaces, punctuation, digits, ...)
    are ignored, so the result always has exactly 26 keys.

    Parameters
    ----------
    message : str
        Text whose letters determine the ordering of the keys.

    Returns
    -------
    dict
        Mapping ``letter -> alphabet letter`` covering ``a``-``z`` exactly once
        on both sides.
    """
    alphabet = string.ascii_lowercase
    # dict.fromkeys de-duplicates while preserving insertion order; a set would
    # iterate in a hash-dependent order that changes between interpreter runs.
    ordered_letters = dict.fromkeys(
        ch for ch in message.lower() + alphabet if ch in alphabet
    )
    return dict(zip(ordered_letters, alphabet))

# Create random cipher

In [10]:
# Substitution cipher: every plaintext letter is paired with the letter that
# sits at the same position in a shuffled copy of the alphabet.
# NOTE: the shuffle is unseeded, so the cipher differs on every run.
alphabet = list(string.ascii_lowercase)
shuffled_alphabet = alphabet.copy()
random.shuffle(shuffled_alphabet)
cipher = dict(zip(alphabet, shuffled_alphabet))

In [11]:
cipher

{'a': 'a',
 'b': 'y',
 'c': 'c',
 'd': 'k',
 'e': 'b',
 'f': 'h',
 'g': 'f',
 'h': 't',
 'i': 'v',
 'j': 'r',
 'k': 'q',
 'l': 'x',
 'm': 'w',
 'n': 'l',
 'o': 'n',
 'p': 'o',
 'q': 'j',
 'r': 'i',
 's': 'g',
 't': 'z',
 'u': 'u',
 'v': 'e',
 'w': 'p',
 'x': 'd',
 'y': 'm',
 'z': 's'}

# Train language model

Create a character-level Markov model based on an English dataset (an edit of https://www.gutenberg.org/ebooks/2701 ). Any book could be used instead of, or together with, this one. We assume that $p(a_k|a_1, ..., a_{k-1})=p(a_k|a_{k-1})$; in other words, each character depends only on the previous character. We will count the occurrences of character pairs in the text and divide each count by the total number of pairs that start with the same character.

For a given word : 
$\operatorname{logprob}(\text{word}) = \log \left( p(x_1) \prod_{i=2}^{n} p(x_i \mid x_{i-1}) \right)$

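The Markov matrix will hold the counts of pairs $a_i \rightarrow a_j$ (every entry starts at 1, so no transition ends up with zero probability).
The weights vector will hold, for each letter, the number of words that start with it.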

In [12]:
# Transition counts between consecutive letters (row = previous letter,
# column = next letter). Starting every cell at 1 keeps unseen pairs from
# producing log(0) later on.
markov_matrix = np.ones(shape=(26, 26), dtype=np.float64)
# Per-letter counts, starting from zero.
weights = np.zeros(shape=26, dtype=np.float64)

In [13]:
def markov_update(a, b):
    """Record one observed transition from letter ``a`` to letter ``b``.

    Increments the cell of the module-level ``markov_matrix`` whose row is the
    position of ``a`` in ``alphabet`` and whose column is the position of ``b``.
    Raises ``ValueError`` if either character is not in ``alphabet``.
    """
    row = alphabet.index(a)
    col = alphabet.index(b)
    markov_matrix[row, col] += 1

def weight_update (a):
    """Add one to the entry of the module-level ``weights`` vector for letter ``a``.

    Raises ``ValueError`` if ``a`` is not in ``alphabet``.
    """
    position = alphabet.index(a)
    weights[position] += 1

### Get the log-probability of a word

In [14]:

def get_word_prob(word : str):
    """Return the log-probability of ``word`` under the character-level Markov model.

    The result is ``log(weights[first letter])`` plus, for each following
    letter, ``log(markov_matrix[previous letter, letter])``. The value is a
    true log-probability only once ``weights`` and ``markov_matrix`` have been
    normalised; before that it is a sum of log-counts.

    Parameters
    ----------
    word : str
        A word made only of characters present in ``alphabet``.

    Returns
    -------
    float
        The log-probability of the word; ``0.0`` for an empty word (the log of
        an empty product).

    Raises
    ------
    ValueError
        If a character of ``word`` is not in ``alphabet``.
    """
    # An empty word has no first letter to look up; treat it as the neutral
    # element of the sum instead of failing on word[0].
    if not word:
        return 0.0

    i = alphabet.index(word[0])
    logp = np.log(weights[i])

    # Walk the word pair by pair: i is the previous letter, j the current one.
    for ch in word[1:]:
        j = alphabet.index(ch)
        logp += np.log(markov_matrix[i, j])
        i = j
    return logp

### Get the probability of a sentence

The sentence is stripped of any punctuation and converted to lower case before its log-probability is calculated.


In [15]:
def get_sequence_prob(words:str):
    """Return the log-probability of a sentence as the sum over its words.

    The text is stripped of ASCII punctuation and lower-cased. Any character
    that is still not ``a``-``z`` afterwards (digits, accented letters, ...)
    is treated as a word separator, so such input no longer makes the
    per-letter lookup in ``get_word_prob`` raise ``ValueError``.

    Parameters
    ----------
    words : str
        The sentence to score.

    Returns
    -------
    float
        Sum of ``get_word_prob`` over the words of the sentence; ``0`` when
        the sentence contains no letters.
    """
    cleaned = words.translate(str.maketrans('', '', string.punctuation)).lower()
    # Same rule as the training tokeniser: everything that is not a letter
    # separates words.
    tokens = re.sub('[^a-z]', ' ', cleaned).split()
    return sum(get_word_prob(token) for token in tokens)

### Get a reference file for language model training.

We will use "Moby Dick" by Herman Melville.
Source : https://www.gutenberg.org/ebooks/2701

In [16]:
# Fetch the training corpus once; later runs reuse the local copy.
if not os.path.exists('moby_dick.txt'):
    print("Downloading moby dick...")
    response = requests.get('https://www.gutenberg.org/files/2701/2701-0.txt', timeout=60)
    # Fail loudly instead of caching an error page as the corpus.
    response.raise_for_status()

    # Normalise line endings so that line numbers do not depend on the platform.
    book_lines = (
        response.content.decode('utf-8')
        .replace('\r\n', '\n')
        .replace('\r', '\n')
        .split('\n')
    )
    # Keep only this line range of the download.
    # NOTE(review): presumably this drops the Project Gutenberg header, front
    # matter and licence - confirm the indices against the current file.
    body_lines = book_lines[848:21965]

    # Write the trimmed text in one go, with an explicit encoding, so the file
    # is never left holding the untrimmed download.
    with open('moby_dick.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(body_lines) + '\n')

### Train the model

In [17]:
regex = re.compile('[^a-zA-Z]')

# Accumulate the counts used by the language model. The context manager makes
# sure the corpus file is closed once the loop is done.
# NOTE: re-running this cell adds to the existing counts; re-create
# markov_matrix and weights first to retrain from scratch.
with open('moby_dick.txt', encoding='utf-8') as corpus:
    for line in corpus:
        # Every non-letter becomes a separator, so "hadn't" trains as "hadn"
        # and "t". Blank lines simply yield no tokens.
        tokens = regex.sub(' ', line).lower().split()

        for token in tokens:
            # First letter of the word -> weights vector.
            weight_update(token[0])

            # Each consecutive pair of letters -> transition matrix.
            for previous_char, next_char in zip(token, token[1:]):
                markov_update(previous_char, next_char)

### Normalize the probabilities

In [18]:
# Turn the raw counts into probabilities, in place.
# The weights vector is scaled so that it sums to 1.
weights[:] = weights / weights.sum()
# Each row of the transition matrix is scaled on its own so that it sums to 1.
markov_matrix /= markov_matrix.sum(axis=1)[:, np.newaxis]

In [21]:
regex = re.compile('[^a-zA-Z]')

with open('encode_message.txt', 'r') as f:
    message = f.read()

# Lower-case the text, turn every non-letter (newlines included) into a space,
# then collapse runs of whitespace into single spaces.
message = ' '.join(regex.sub(' ', message.replace('\n', ' ').lower()).split())
message

'i then lounged down the street and found as i expected that there was a mews in a lane which runs down by one wall of the garden i lent the ostlers a hand in rubbing down their horses and received in exchange twopence a glass of half and half two fills of shag tobacco and as much information as i could desire about miss adler to say nothing of half a dozen other people in the neighbourhood in whom i was not in the least interested but whose biographies i was compelled to listen to away they went and i was just wondering whether i should not do well to follow them when up the lane came a neat little landau the coachman with his coat only half buttoned and his tie under his ear while all the tags of his harness were sticking out of the buckles it hadn t pulled up before she shot out of the hall door and into it i only caught a glimpse of her at the moment but she was a lovely woman with a face that a man might die for my cabby drove fast i don t think i ever drove faster but the others 