# Decrypting Messages with MCMC

In [1]:
import warnings
from collections import Counter
import numpy as np
import pandas as pd
import networkx as nx
from IPython.display import display, Markdown

warnings.filterwarnings('ignore')

In [2]:
# Alphabet of admissible symbols, read from a UTF-16 file and split on newlines.
with open('symbols.txt', 'r', encoding='utf-16') as symbols_file:
    symbols = symbols_file.read().split('\n')

# Reference corpus: lower-cased, with every run of whitespace (newlines included)
# collapsed into a single space.
with open('war-and-peace.txt', 'r') as corpus_file:
    war_and_peace = ' '.join(corpus_file.read().lower().split())

In [3]:
# Characters that occur in the corpus but are not part of the symbol alphabet.
unrecognized_symbols = set(war_and_peace).difference(symbols)

# Turn every unrecognized character into a space in one pass, then collapse the
# resulting runs of spaces so words stay separated by exactly one space.
war_and_peace = ' '.join(
    war_and_peace.translate(str.maketrans(dict.fromkeys(unrecognized_symbols, ' '))).split()
)

## 5 (a)

In [4]:
class MarkovChain:
    """First-order Markov chain over a fixed alphabet, estimated from bigram counts.

    Attributes:
        symbols: the alphabet; its order defines the row/column indices of ``tm``.
        symbols_to_idx: maps each symbol to its index in ``symbols``.
        tm: transition matrix with rows normalised to sum to one; ``tm[i, j]`` is
            the estimated probability that symbol ``j`` directly follows symbol ``i``.
        p_inv: invariant distribution of ``tm`` (real vector normalised to sum to one).
    """

    def __init__(self, symbols, text, smoothing=0.0):
        """Estimate the transition matrix and its invariant distribution from ``text``.

        Args:
            symbols: sequence of symbols forming the alphabet.
            text: string whose characters must all be members of ``symbols``.
            smoothing: pseudo-count added to every (previous, next) cell before the
                rows are normalised. The default of 0 keeps the raw relative
                frequencies, so unseen transitions have probability exactly 0.
                A positive value (1 gives Laplace smoothing) makes every
                transition probability strictly positive.

        Raises:
            KeyError: if ``text`` contains a character that is not in ``symbols``.
            ValueError: if ``smoothing`` is negative, or if some symbol ends up with
                a zero row total (it is never followed by anything in ``text`` and
                ``smoothing`` is 0), in which case its row cannot be normalised.
        """
        if smoothing < 0:
            raise ValueError(f'smoothing must be non-negative, got {smoothing!r}')

        self.symbols = symbols
        self.symbols_to_idx = {s: i for i, s in enumerate(symbols)}

        n_symbols = len(symbols)
        transition_counts = np.full((n_symbols, n_symbols), float(smoothing))

        # Encode the text as indices once, then accumulate all bigram counts in a
        # single call. np.add.at is required (rather than fancy-index +=) because
        # the same (row, column) pair occurs many times.
        text_idx = np.fromiter(
            (self.symbols_to_idx[c] for c in text), dtype=np.intp, count=len(text)
        )
        np.add.at(transition_counts, (text_idx[:-1], text_idx[1:]), 1)

        row_totals = transition_counts.sum(axis=1, keepdims=True)
        empty_rows = np.flatnonzero(row_totals[:, 0] == 0)
        if empty_rows.size:
            # Dividing by a zero total would silently fill the row with NaN.
            unseen = [symbols[i] for i in empty_rows]
            raise ValueError(
                f'no outgoing transitions observed for symbols {unseen!r}; '
                'pass smoothing > 0 to assign them a distribution'
            )
        self.tm = transition_counts / row_totals

        # The invariant distribution is the left eigenvector of tm for eigenvalue 1,
        # i.e. a right eigenvector of tm.T.
        eig_vals, eig_vecs = np.linalg.eig(self.tm.T)
        p_inv = eig_vecs[:, np.where(np.isclose(eig_vals, 1))[0][0]]
        # Normalise first: dividing by the (possibly complex) sum removes any common
        # phase/sign factor, so only then is it safe to drop the imaginary part.
        p_inv = p_inv / np.sum(p_inv)
        self.p_inv = np.real(p_inv).astype(float)

    def transition_prob(self, beta, alpha):
        """Return the probability that symbol ``alpha`` directly follows symbol ``beta``."""
        return self.tm[self.symbols_to_idx[beta], self.symbols_to_idx[alpha]]

    def limiting_prob(self, gamma):
        """Return the invariant-distribution probability of symbol ``gamma``."""
        return self.p_inv[self.symbols_to_idx[gamma]]

In [5]:
# Fit the chain on the cleaned corpus.
mc = MarkovChain(symbols, war_and_peace)

# Estimated probability that 'b' directly follows 'a', rendered as a LaTeX formula.
prob_a_then_b = mc.transition_prob('a', 'b')
display(Markdown(
    fr"$p\left(s_i=b|s_{{i-1}}=a\right) = \psi\left(b, a\right) = {prob_a_then_b:.6f}$"
))

$p\left(s_i=b|s_{i-1}=a\right) = \psi\left(b, a\right) = 0.016810$

In [6]:
# Sanity checks: p_inv must be invariant under the transition matrix and sum to one.
assert np.allclose(mc.tm.T @ mc.p_inv, mc.p_inv)
assert np.isclose(np.sum(mc.p_inv), 1)

# Three-row markdown table: symbols, column alignment, limiting probabilities.
header_row = r'| $\gamma$ |' + ''.join(f" {symbol} |" for symbol in symbols)
alignment_row = '| :---: |' + ' :---: |' * len(symbols)
values_row = r'| $\psi\left(\gamma\right)$ |' + ''.join(f' {p:.6f} |' for p in mc.p_inv)

display(Markdown('\n'.join([header_row, alignment_row, values_row])))

| $\gamma$ | = |   | - | , | ; | : | ! | ? | / | . | ' | " | ( | ) | [ | ] | * | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f | g | h | i | j | k | l | m | n | o | p | q | r | s | t | u | v | w | x | y | z |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| $\psi\left(\gamma\right)$ | 0.000001 | 0.176893 | 0.001234 | 0.012458 | 0.000357 | 0.000314 | 0.001226 | 0.000979 | 0.000003 | 0.009643 | 0.002351 | 0.002805 | 0.000208 | 0.000208 | 0.000001 | 0.000001 | 0.000090 | 0.000053 | 0.000122 | 0.000045 | 0.000018 | 0.000007 | 0.000016 | 0.000017 | 0.000012 | 0.000060 | 0.000010 | 0.064281 | 0.010823 | 0.019250 | 0.036944 | 0.098464 | 0.017144 | 0.016031 | 0.052288 | 0.054422 | 0.000804 | 0.006383 | 0.030145 | 0.019251 | 0.057524 | 0.060248 | 0.014217 | 0.000728 | 0.046361 | 0.050874 | 0.070722 | 0.020437 | 0.008458 | 0.018499 | 0.001369 | 0.014454 | 0.000745 |

## 5 (d)

In [7]:
# Encrypted message to decode, read in full from a UTF-16 file.
with open('message.txt', 'r', encoding='utf-16') as message_file:
    message = message_file.read()

In [8]:
def log_likelihood(sigma, ciphertext=None, chain=None):
    """Log-likelihood of the ciphertext when decrypted with the permutation ``sigma``.

    The decrypted text is scored as a path of the Markov chain: the log limiting
    probability of its first symbol plus the sum of the log transition
    probabilities of every consecutive pair of symbols.

    Args:
        sigma: dict mapping each plaintext symbol to its ciphertext symbol.
        ciphertext: encrypted text to score; defaults to the notebook-level ``message``.
        chain: object exposing ``symbols_to_idx``, ``tm`` and ``limiting_prob``
            (e.g. a ``MarkovChain``); defaults to the notebook-level ``mc``.

    Returns:
        The log-likelihood as a float; 0.0 for an empty ciphertext.

    Raises:
        KeyError: if the ciphertext contains a symbol that is not a value of
            ``sigma``, or decrypts to a symbol unknown to ``chain``.
    """
    ciphertext = message if ciphertext is None else ciphertext
    chain = mc if chain is None else chain

    if len(ciphertext) == 0:
        # An empty sequence has probability 1 under any model.
        return 0.0

    # Invert the encryption map: ciphertext symbol -> plaintext symbol.
    sigma_inv = {e: d for d, e in sigma.items()}

    # Decrypt and encode as chain state indices in a single pass, so that all
    # transition probabilities can be gathered with one vectorised lookup
    # instead of a Python-level loop over every character pair.
    state_idx = np.fromiter(
        (chain.symbols_to_idx[sigma_inv[e]] for e in ciphertext),
        dtype=np.intp,
        count=len(ciphertext),
    )

    log_lik = np.log(chain.limiting_prob(sigma_inv[ciphertext[0]]))
    step_probs = chain.tm[state_idx[:-1], state_idx[1:]]
    # The 1e-20 floor keeps transitions of probability zero at a finite (very
    # negative) log value instead of -inf.
    log_lik += np.sum(np.log(step_probs + 1e-20))

    return log_lik

In [9]:
# Occurrences of each character in the reference corpus.
war_and_peace_cntr = dict(Counter(war_and_peace))

# Occurrences in the message, keyed by (and ordered like) the corpus characters;
# characters that never occur in the message get a count of 0.
message_symbol_counts = Counter(message)
message_cntr = {symbol: message_symbol_counts[symbol] for symbol in war_and_peace_cntr}

In [31]:
N_ITERATIONS = 50_000   # number of Metropolis proposals
REPORT_EVERY = 100      # print progress after every this many proposals
PREVIEW_LENGTH = 60     # leading message characters shown in each progress line
SEED = 0                # fixed seed so that a re-run reproduces the same trajectory

rng = np.random.default_rng(SEED)

# Initial guess by frequency-rank matching: the i-th rarest corpus symbol is
# mapped to the i-th rarest message symbol (plaintext symbol -> ciphertext symbol).
plain_by_frequency = sorted(war_and_peace_cntr, key=war_and_peace_cntr.get)
cipher_by_frequency = sorted(message_cntr, key=message_cntr.get)
permutation = dict(zip(plain_by_frequency, cipher_by_frequency))

# The current state's log-likelihood only changes when a proposal is accepted,
# so it is cached instead of being recomputed on every iteration.
current_log_likelihood = log_likelihood(permutation)
updates = 0

for i in range(1, N_ITERATIONS + 1):

    # Symmetric proposal: swap the ciphertext images of two distinct symbols.
    s, s_ = rng.choice(symbols, 2, replace=False)
    proposal = permutation.copy()
    proposal[s], proposal[s_] = proposal[s_], proposal[s]
    proposal_log_likelihood = log_likelihood(proposal)
    log_likelihood_diff = proposal_log_likelihood - current_log_likelihood

    # Metropolis acceptance: always accept an improvement, otherwise accept with
    # probability equal to the likelihood ratio.
    if rng.uniform() <= np.exp(log_likelihood_diff):
        permutation = proposal
        current_log_likelihood = proposal_log_likelihood
        updates += 1

    if i % REPORT_EVERY == 0:
        permutation_inv = {e: d for d, e in permutation.items()}
        preview = ''.join([permutation_inv[e] for e in message[:PREVIEW_LENGTH]])
        print(f"Number of updates = {str(updates).rjust(4, ' ')}. Decrypted text: {preview}")

Number of updates =   27. Decrypted text: on wy yhdnues inm whse kdrnesipre yeist wy giales uike we th
Number of updates =   55. Decrypted text: on wy yhdnuer inm whre kdsneripse yeirt wy gialer uike we th
Number of updates =   72. Decrypted text: oa wy yhdauer iam whre kdsaeripse yeirt wy ginler uike we th
Number of updates =   98. Decrypted text: ia wy yhdager oam whre kdlaerople yeors wy uonter goke we sh
Number of updates =  121. Decrypted text: ia wy yhtacer oam whre ktlaeroble yeors wy uonder coke we sh
Number of updates =  138. Decrypted text: ia wy yhtacer oam whre ktlaeroble yeors wy uonder coke we sh
Number of updates =  154. Decrypted text: ia wy yhtacer oam whre ktlaeroble yeors wy uonder coke we sh
Number of updates =  171. Decrypted text: ia wy yutacer oam wure ktlaeroble yeors wy honder coke we su


KeyboardInterrupt: 

## 5 (e)

In [None]:
# One directed edge beta -> alpha for every transition with positive probability.
edges = [
    (beta, alpha)
    for beta in symbols
    for alpha in symbols
    if mc.transition_prob(beta, alpha) > 0
]

G = nx.DiGraph()
G.add_edges_from(edges)

# Count the strongly connected components of the transition graph.
n_components = sum(1 for component in nx.strongly_connected_components(G))
print(f'Number of strongly connected components = {n_components}')

Number of strongly connected components = 1


In [28]:
# Diagonal of the transition matrix: probability of each symbol following itself.
self_transition_probs = np.diag(mc.tm)

# Keep, in alphabet order, the symbols whose self-transition probability is positive.
chars = [symbol for symbol, prob in zip(symbols, self_transition_probs) if prob > 0]

print('Characters that can transition to themselves are ' + ', '.join(chars))

Characters that can transition to themselves are -, ., ', *, 0, 1, 2, 6, 8, a, b, c, d, e, f, g, h, i, l, m, n, o, p, r, s, t, v, w, x, z
