# Chapter 4 : cryptocipher

4.21: Given a message, create a dictionary mapping the letters in the message to the alphabet.

In [282]:
import os
import random
import re
import string
from operator import itemgetter

import numpy as np
import requests

# Create random cipher

In [283]:
# Build a random substitution cipher: each plaintext letter is paired with the
# letter at the same position in a shuffled copy of the alphabet.
alphabet = list(string.ascii_lowercase)
shuffled_alphabet = alphabet.copy()
random.shuffle(shuffled_alphabet)
cipher = dict(zip(alphabet, shuffled_alphabet))

In [284]:
cipher

{'a': 'w',
 'b': 'k',
 'c': 'l',
 'd': 'j',
 'e': 'm',
 'f': 'h',
 'g': 'd',
 'h': 's',
 'i': 'p',
 'j': 'v',
 'k': 'y',
 'l': 'q',
 'm': 'x',
 'n': 'u',
 'o': 'n',
 'p': 'i',
 'q': 'a',
 'r': 'c',
 's': 'g',
 't': 'f',
 'u': 't',
 'v': 'r',
 'w': 'z',
 'x': 'o',
 'y': 'b',
 'z': 'e'}

# Train language model

Create a character-level Markov model based on an English dataset (an edit of https://www.gutenberg.org/ebooks/2701 ). Any book could be used instead of, or together with, this one. We suppose that $p(a_k \mid a_1, ..., a_{k-1}) = p(a_k \mid a_{k-1})$; in other words, the probability of a character depends only on the previous character. We will count the occurrences of character pairs in the text and divide them by the count of character occurrences in the text.

For a given word : 
$\log p(\text{word}) = \log \left( p(x_1) \prod_{t=2}^{n} p(x_t \mid x_{t-1}) \right)$

The Markov matrix will hold the counts of pairs $a_i \rightarrow a_j$.
The weights vector will hold the counts of each letter appearing as the first letter of a word.

In [285]:
# Letter-pair counts start at one rather than zero, so a pair that never occurs
# in the training text still ends up with a non-zero entry after normalisation.
markov_matrix = np.ones(shape=(26, 26))
# First-letter counts start at zero.
weights = np.zeros(shape=26)

In [286]:
def markov_update(a, b):
    """Count one more occurrence of the letter transition ``a`` -> ``b``.

    Increments the entry of the global ``markov_matrix`` at the row of ``a``
    and the column of ``b``. Positions are looked up with ``alphabet.index``,
    which raises ``ValueError`` for a character that is not in ``alphabet``.
    """
    row = alphabet.index(a)
    col = alphabet.index(b)
    markov_matrix[row, col] += 1

def weight_update(a):
    """Count one more occurrence of the letter ``a`` in the global ``weights``.

    The position is looked up with ``alphabet.index``, which raises
    ``ValueError`` for a character that is not in ``alphabet``.
    """
    position = alphabet.index(a)
    weights[position] += 1

### Get the log-probability of a word

In [287]:

def get_word_prob(word: str):
    """Return the log-probability of ``word`` under the character-level Markov model.

    The score is the log of the first letter's entry in the global ``weights``
    plus the log of each successive letter-to-letter entry in the global
    ``markov_matrix``.

    Parameters
    ----------
    word : str
        Word whose characters are all present in ``alphabet``.

    Returns
    -------
    float
        Sum of the logs; ``0.0`` for an empty word (log of an empty product).

    Raises
    ------
    ValueError
        If a character of ``word`` is not in ``alphabet``.
    """
    # An empty word has no letter to score; without this guard `word[0]`
    # would raise IndexError.
    if not word:
        return 0.0

    i = alphabet.index(word[0])
    logp = np.log(weights[i])

    for ch in word[1:]:
        j = alphabet.index(ch)
        logp += np.log(markov_matrix[i, j])
        # The current letter becomes the "previous" letter of the next step.
        i = j
    return logp

### Get the probability of a sentence

The sentence is expected to be already stripped of punctuation and converted to lower case. It is split on whitespace and the log-probabilities of its words are summed.


In [288]:
def get_sequence_prob(words:str):
    """Return the total log-probability of a whitespace-separated word sequence.

    ``words`` is split on whitespace and the scores returned by
    ``get_word_prob`` for each piece are added together, starting from 0.
    """
    total = 0
    for token in words.split():
        total += get_word_prob(token)
    return total

### Get a reference file for language model training.

We will use "Moby Dick" by Herman Melville.
Source : https://www.gutenberg.org/ebooks/2701

In [289]:
# Download and trim the training corpus once; later runs reuse the local file.
if not os.path.exists('moby_dick.txt'):
    print("Downloading moby dick...")
    response = requests.get('https://www.gutenberg.org/files/2701/2701-0.txt', timeout=60)
    # Fail loudly instead of caching an HTTP error page as the corpus.
    response.raise_for_status()
    with open('moby_dick.txt', 'w') as f:
        f.write(response.content.decode())
    # Keep only lines 848..21964 of the downloaded file (slice chosen by the
    # author; presumably it drops the front and back matter — verify).
    with open('moby_dick.txt', 'r') as f:
        kept_lines = f.readlines()[848:21965]
    with open('moby_dick.txt', 'w') as f:
        # Lines from readlines() still end with '\n', so they are written back
        # as-is; joining them with another '\n' doubled every line break.
        f.writelines(kept_lines)

### Train the model

In [290]:
# Matches every character that is not an ASCII letter.
regex = re.compile('[^a-zA-Z]')

# Accumulate first-letter and letter-pair counts from the corpus.
# NOTE: this adds to the global count arrays, so re-running this cell without
# re-initialising them first counts the corpus again.
# The `with` block closes the file once the loop is done.
with open('moby_dick.txt') as corpus:
    for line in corpus:
        line = line.strip()

        if line != '':
            # Non-letters become spaces, so punctuation also separates tokens.
            line = regex.sub(' ', line)
            tokens = line.lower().split()

            for token in tokens:
                t0 = token[0]
                weight_update(t0)

                for t1 in token[1:]:
                    markov_update(t0, t1)
                    t0 = t1

### Normalize the probabilities

In [291]:
# Turn counts into probabilities: the first-letter counts are divided by their
# total, and each row of the pair counts is divided by that row's total.
weights /= np.sum(weights)
markov_matrix /= np.sum(markov_matrix, axis=1, keepdims=True)

### Read and encode the message

In [292]:
# Load the plaintext and normalise it: newlines become spaces, text is
# lower-cased, every non-letter is replaced by a space, and runs of whitespace
# are collapsed to single spaces.
with open('encode_message.txt', 'r') as f:
    message = f.read().replace('\n',' ').lower()
regex = re.compile('[^a-zA-Z]')
letters_only = regex.sub(' ', message)
message_to_encode = ' '.join(letters_only.split())
message_to_encode

'i then lounged down the street and found as i expected that there was a mews in a lane which runs down by one wall of the garden i lent the ostlers a hand in rubbing down their horses and received in exchange twopence a glass of half and half two fills of shag tobacco and as much information as i could desire about miss adler to say nothing of half a dozen other people in the neighbourhood in whom i was not in the least interested but whose biographies i was compelled to listen to away they went and i was just wondering whether i should not do well to follow them when up the lane came a neat little landau the coachman with his coat only half buttoned and his tie under his ear while all the tags of his harness were sticking out of the buckles it hadn t pulled up before she shot out of the hall door and into it i only caught a glimpse of her at the moment but she was a lovely woman with a face that a man might die for my cabby drove fast i don t think i ever drove faster but the others 

In [293]:
def encode(message : str, cipher : dict):
    """Substitute every non-space character of ``message`` using ``cipher``.

    Spaces are copied through unchanged; any other character is replaced by
    ``cipher[character]``, so a character missing from ``cipher`` raises
    ``KeyError``.
    """
    return ''.join(c if c == ' ' else cipher[c] for c in message)

def decode(message : str, cipher : dict):
    """Invert ``encode``: map every non-space character of ``message`` back to plaintext.

    Parameters
    ----------
    message : str
        Text produced with ``cipher``; spaces are copied through unchanged.
    cipher : dict
        Plaintext-letter -> cipher-letter mapping (the one given to ``encode``).

    Returns
    -------
    str
        The decoded text.

    Raises
    ------
    KeyError
        If a non-space character of ``message`` is not a value of ``cipher``.
    """
    # Reverse the mapping: cipher letter -> plaintext letter.
    anticipher = {y: x for x, y in cipher.items()}
    return ''.join(c if c == ' ' else anticipher[c] for c in message)

In [294]:
# Encrypt the cleaned message with the random cipher; `code` is the ciphertext
# that `keep_best` scores candidate keys against.
code = encode(message_to_encode, cipher)
code

'p fsmu qntudmj jnzu fsm gfcmmf wuj hntuj wg p moimlfmj fswf fsmcm zwg w xmzg pu w qwum zspls ctug jnzu kb num zwqq nh fsm dwcjmu p qmuf fsm ngfqmcg w swuj pu ctkkpud jnzu fsmpc sncgmg wuj cmlmprmj pu molswudm fznimulm w dqwgg nh swqh wuj swqh fzn hpqqg nh gswd fnkwlln wuj wg xtls puhncxwfpnu wg p lntqj jmgpcm wkntf xpgg wjqmc fn gwb unfspud nh swqh w jnemu nfsmc imniqm pu fsm umpdskntcsnnj pu zsnx p zwg unf pu fsm qmwgf pufmcmgfmj ktf zsngm kpndcwispmg p zwg lnximqqmj fn qpgfmu fn wzwb fsmb zmuf wuj p zwg vtgf znujmcpud zsmfsmc p gsntqj unf jn zmqq fn hnqqnz fsmx zsmu ti fsm qwum lwxm w umwf qpffqm qwujwt fsm lnwlsxwu zpfs spg lnwf nuqb swqh ktffnumj wuj spg fpm tujmc spg mwc zspqm wqq fsm fwdg nh spg swcumgg zmcm gfplypud ntf nh fsm ktlyqmg pf swju f itqqmj ti kmhncm gsm gsnf ntf nh fsm swqq jnnc wuj pufn pf p nuqb lwtdsf w dqpxigm nh smc wf fsm xnxmuf ktf gsm zwg w qnrmqb znxwu zpfs w hwlm fswf w xwu xpdsf jpm hnc xb lwkkb jcnrm hwgf p jnu f fspuy p mrmc jcnrm hwgfmc ktf fsm nfsmcg 

# Use a genetic algorithm to decrypt the message

### parameters

In [295]:
# Number of best-scoring DNA strings kept each epoch (default `n` of keep_best).
keep = 8
# Number of mutated copies made of each kept DNA (default `nc` of mutations).
children = 3
# Pool size: the kept DNA strings plus all of their children.
N = keep * (children + 1)
# Number of iterations of the select-and-mutate loop.
epocs = 300

In [296]:
def generate_dna():
    """Return a random permutation of the 26 lowercase letters as one string."""
    letters = [*string.ascii_lowercase]
    random.shuffle(letters)
    return ''.join(letters)

In [297]:
# Initial population: N independent random candidate keys.
dna_pool = [generate_dna() for _ in range(N)]

In [298]:
def evaluate_dna(dna, message):
    """Score ``dna`` as a candidate substitution key for ``message``.

    ``dna`` is read as a substitution table (the i-th lowercase letter maps to
    the i-th character of ``dna``) and applied to ``message`` with ``encode``.
    The resulting text is scored with ``get_sequence_prob`` and that score is
    returned.
    """
    mapping = dict(zip(string.ascii_lowercase, dna))
    candidate_text = encode(message, mapping)
    return get_sequence_prob(candidate_text)


In [299]:
def keep_best(pool, n=keep):
    """Return the ``n`` highest-scoring DNA strings of ``pool`` and the top score.

    Every DNA string is scored against the global ciphertext ``code`` with
    ``evaluate_dna``. The returned list is ordered from best to worst score.
    """
    ranked = [(dna, evaluate_dna(dna, code)) for dna in pool]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    survivors = ranked[:n]
    best_dna = [dna for dna, _ in survivors]
    best_score = survivors[0][1]
    return best_dna, best_score

In [300]:
def mutations(pool, nc=children):
    """Return ``pool`` followed by ``nc`` mutated copies of each of its members.

    A mutation swaps the characters at two positions, each drawn independently
    from 0..25; the two draws may coincide, in which case the copy is unchanged.
    """
    offspring = []
    for parent in pool:
        for _ in range(nc):
            genes = list(parent)
            first = random.randint(0, 25)
            second = random.randint(0, 25)
            genes[first], genes[second] = genes[second], genes[first]
            offspring.append(''.join(genes))
    return pool + offspring


In [301]:
len(dna_pool[0])

26

In [302]:
# Each epoch keeps the best-scoring DNA strings, then refills the pool with
# those survivors followed by mutated copies of them.
for i in range(epocs):
    dna_pool, best_score = keep_best(dna_pool)
    dna_pool = mutations(dna_pool)
    # Report progress every 10th epoch.
    if i % 10 == 0:
        print(i, 'iterations, best score : ', best_score)





0 iterations, best score :  -5493.94712238369
10 iterations, best score :  -4768.545893175398
20 iterations, best score :  -4164.083373932705
30 iterations, best score :  -3866.95079142996
40 iterations, best score :  -3845.705004428158
50 iterations, best score :  -3739.4929455116
60 iterations, best score :  -3632.0231280792364
70 iterations, best score :  -3573.263355183419
80 iterations, best score :  -3554.546998097308
90 iterations, best score :  -3491.195220462158
100 iterations, best score :  -3463.896306291291
110 iterations, best score :  -3447.6041074236846
120 iterations, best score :  -3399.0021189974273
130 iterations, best score :  -3216.825586365621
140 iterations, best score :  -3147.8849586348592
150 iterations, best score :  -3054.541414801252
160 iterations, best score :  -2986.3192659503325
170 iterations, best score :  -2986.3192659503325
180 iterations, best score :  -2963.248763764731
190 iterations, best score :  -2949.1649454450476
200 iterations, best score :

In [303]:
 # The pool starts with the survivors of the last selection, best first, so
 # its first entry is the best key found; apply it to the ciphertext.
 best_dna = dna_pool[0]
 alpha = [*string.ascii_lowercase]
 encoder = dict(zip(alpha, best_dna))
 encode(code, encoder)

'i then lounged down the street and found as i expected that there was a mews in a lane which runs down by one wall of the garden i lent the ostlers a hand in rubbing down their horses and received in exchange twopence a glass of half and half two fills of shag tobacco and as much information as i could desire about miss adler to say nothing of half a dozen other people in the neighbourhood in whom i was not in the least interested but whose biographies i was compelled to listen to away they went and i was qust wondering whether i should not do well to follow them when up the lane came a neat little landau the coachman with his coat only half buttoned and his tie under his ear while all the tags of his harness were sticking out of the buckles it hadn t pulled up before she shot out of the hall door and into it i only caught a glimpse of her at the moment but she was a lovely woman with a face that a man might die for my cabby drove fast i don t think i ever drove faster but the others 