# Chapter 4 : cryptocipher

4.21 : given a message, create a dictionary mapping the letters in the message to the alphabet

In [1]:
import os
import random
import re
import string

import numpy as np
import requests

In [14]:
def map_letters(message : str):
    """Map the letters of ``message`` onto the alphabet in a reproducible order.

    Letters are taken in order of first appearance in the lower-cased message,
    followed by the alphabet letters the message does not use, and are paired
    with 'a', 'b', 'c', ... in turn.

    Parameters
    ----------
    message : str
        Text to analyse. Case is ignored and every character that is not one
        of the 26 ASCII letters (spaces, digits, punctuation, ...) is skipped.

    Returns
    -------
    dict
        26 entries: each lowercase ASCII letter is a key and is mapped to a
        distinct lowercase ASCII letter.
    """
    alphabet = string.ascii_lowercase
    # dict.fromkeys de-duplicates while keeping insertion order; iterating a
    # set instead would give an arbitrary order that changes between runs.
    ordered = dict.fromkeys(ch for ch in message.lower() if ch in alphabet)
    # Append the letters the message never used; keys already present keep
    # their original position.
    ordered.update(dict.fromkeys(alphabet))
    return dict(zip(ordered, alphabet))

# Create random cipher

In [27]:
# Random substitution cipher: every plaintext letter is paired with the letter
# found at the same position in a shuffled copy of the alphabet.
alphabet = [*string.ascii_lowercase]
shuffled_alphabet = alphabet.copy()
random.shuffle(shuffled_alphabet)  # in-place permutation of the copy only
cipher = dict(zip(alphabet, shuffled_alphabet))

In [28]:
cipher

{'a': 'e',
 'b': 'm',
 'c': 'd',
 'd': 'j',
 'e': 'k',
 'f': 'c',
 'g': 'l',
 'h': 'i',
 'i': 's',
 'j': 'u',
 'k': 't',
 'l': 'y',
 'm': 'h',
 'n': 'p',
 'o': 'q',
 'p': 'v',
 'q': 'g',
 'r': 'n',
 's': 'o',
 't': 'z',
 'u': 'f',
 'v': 'a',
 'w': 'b',
 'x': 'r',
 'y': 'w',
 'z': 'x'}

# Train language model

Create a character-level Markov model based on an English dataset (an edit of https://www.gutenberg.org/ebooks/2701 ). Any book could be used instead of, or together with, this one. We assume that $p(a_k|a_1, ..., a_{k-1})=p(a_k|a_{k-1})$; in other words, each character depends only on the previous character. We will count the occurrences of character pairs in the text and divide them by the count of character occurrences in the text. 

For a given word : 
$\mathrm{logprob}(word) = \log \left( p(x_1) \prod_{t=2}^{n} p(x_t \mid x_{t-1}) \right)$

The Markov matrix will hold the counts of pairs $a_i \rightarrow a_j$.
The weights vector will hold the counts of each letter.

In [64]:
# 26 x 26 table of letter-pair counts (row = first letter, column = next one).
# Every cell starts at 1, so each count stays strictly positive and its
# logarithm is finite.
markov_matrix = np.full((26, 26), 1.0)
# One counter per letter, starting from zero.
weights = np.zeros((26,))

In [56]:
def markov_update(a, b):
    """Record one occurrence of the letter pair ``a`` followed by ``b``.

    Adds 1 to the cell of the module-level ``markov_matrix`` whose row is the
    position of ``a`` in ``alphabet`` and whose column is the position of
    ``b``, then returns that same (mutated) matrix. A character missing from
    ``alphabet`` makes ``alphabet.index`` raise.
    """
    row = alphabet.index(a)
    col = alphabet.index(b)
    markov_matrix[row, col] = markov_matrix[row, col] + 1
    return markov_matrix

def weight_update (a):
    """Record one occurrence of the letter ``a``.

    Adds 1 to the entry of the module-level ``weights`` vector located at the
    position of ``a`` in ``alphabet`` and returns that same (mutated) vector.
    A character missing from ``alphabet`` makes ``alphabet.index`` raise.
    """
    position = alphabet.index(a)
    weights[position] = weights[position] + 1
    return weights

### Get the log-probability of a word

In [66]:

def get_word_prob(word : str):
    """Score ``word`` with the character-level Markov model.

    The score is the log of the ``weights`` entry for the first letter plus,
    for every consecutive letter pair, the log of the matching
    ``markov_matrix`` cell. Both arrays and ``alphabet`` are read from module
    scope. An empty ``word`` or a character missing from ``alphabet`` raises.
    """
    # Contribution of the first letter.
    total = np.log(weights[alphabet.index(word[0])])

    # One term per transition: previous letter -> next letter.
    for prev_ch, next_ch in zip(word, word[1:]):
        total += np.log(markov_matrix[alphabet.index(prev_ch), alphabet.index(next_ch)])
    return total

### Get the probability of a sentence

The sentence is stripped of any punctuation and converted to lower case before its probability is computed.


In [68]:
def get_sequence_prob(words:str):
    """Score a whole sentence as the sum of its word scores.

    Characters listed in ``string.punctuation`` are deleted, the text is
    lower-cased and split on whitespace, and ``get_word_prob`` is summed over
    the resulting words. A sentence that yields no words scores 0.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    tokens = words.translate(drop_punctuation).lower().split()
    total = 0
    for token in tokens:
        total += get_word_prob(token)
    return total

### Get a reference file for language model training.

We will use "Moby Dick" by Herman Melville.
Source : https://www.gutenberg.org/ebooks/2701

In [114]:
# Fetch the training text once and keep only a fixed slice of its lines.
# The existence check uses the same file name that is written below, so the
# download is skipped when the file is already on disk.
if not os.path.exists('moby_dicky.txt'):
    print("Downloading moby dick...")
    r = requests.get('https://www.gutenberg.org/files/2701/2701-0.txt', timeout=30)
    # Fail loudly on an HTTP error instead of saving an error page as the book.
    r.raise_for_status()
    # The payload is decoded as UTF-8 (bytes.decode default), so every file
    # access uses UTF-8 explicitly rather than the platform default encoding.
    with open('moby_dicky.txt', 'w', encoding='utf-8') as f:
        f.write(r.content.decode())
    with open('moby_dicky.txt', 'r', encoding='utf-8') as f:
        # Keep a fixed line range of the downloaded file.
        # NOTE(review): presumably this drops the Project Gutenberg front and
        # back matter - confirm the bounds against the current upstream file.
        r = f.readlines()[848:21965]
    with open('moby_dicky.txt', 'w', encoding='utf-8') as f:
        # readlines() keeps each line's trailing newline, so write the lines
        # back unchanged; joining them with '\n' would double every line break.
        f.writelines(r)

Downloading moby dick...
