**Hidden Markov models for cracking codes**

In this exercise you have to make a partially built HMM work and use it to solve some simple substitution ciphers. Plaintext data is provided in 'plaintext' directory. Encrypted data is in 'encrypted'. Some of the texts were originally English some of them were Russian; the sequences are also of different lengths. 

This homework is worth **15 points** and is due by the next class (**24th Oct.**), please submit the results of the **TASK 5** (a list of files and names of the author/work) to Anytask in the following format: 'filename author' where 'filename' is a file from "encrypted/\*_encrypted.txt" and 'author' is a file from "plaintext/\*.txt" (not including 'english.txt', 'russian.txt' or 'all.txt') which best matches the decrypted text.




In [1]:
# Utilities for loading data from file and converting characters to integers and back.
import numpy as np
    
def get_char_to_int_mapping(path):
    # Load data from path and get mapping from characters to integers and back.
    characters = set()
    for line in open(path):
        characters.update(set([c for c in line.strip()]))
    char_to_int_mapping = dict([(char, i) for i, char in enumerate(sorted(list(characters)))])
    int_to_char_mapping = [char for char, i in char_to_int_mapping.items()]
    return char_to_int_mapping, int_to_char_mapping

def load_sequences(path, char_to_int_mapping):
    # Load data from path and map to integers using mapping.
    return [[char_to_int_mapping[c] for c in line.strip()] for line in open(path)]

def estimate_markov_model_from_sequences(sequences, num_states):
    # Estimate a Markov model based on the sequences (integers) provided.
    # pi[i] = Pr(s_0 = i)
    pi_counts = np.zeros(num_states)
    # A[i, j] = Pr(s_t = j | s_{t-1} = i)
    A_counts = np.zeros((num_states, num_states))        
    for n, sequence in enumerate(sequences):
        if len(sequence) < 1:
            continue
        pi_counts[sequence[0]] += 1
        for i in range(1, len(sequence)):
            prev = sequence[i-1]
            cur = sequence[i]
            A_counts[prev, cur] += 1
    pi = pi_counts / sum(pi_counts)
    A = A_counts / sum(A_counts)

#         assert False, "Collect counts for pi and A and return parameter estimates."
    
    return pi, A

**TASK 1**: Make the following block run by completing the method 'estimate_markov_model_from_sequences' above.

In [2]:
# Some data to use.
plaintext = 'plaintext/english.txt'
# plaintext = 'plaintext/shakespeare.txt'
# plaintext = 'plaintext/russian.txt'

ciphertext = 'encrypted/1_encrypted.txt' # short sequences in english
# ciphertext = 'encrypted/99_encrypted.txt' # longer sequences in russian

# load a character to integer mapping and reverse                                                                                                         
char_to_int_mapping, int_to_char_mapping = get_char_to_int_mapping(plaintext)

# load sequences as ints                                                                                                                                  
plaintext_sequences = load_sequences(plaintext, char_to_int_mapping)
encrypted_sequences = load_sequences(ciphertext, char_to_int_mapping)

# estimate a markov model over characters                                                                                                                 
pi, A = estimate_markov_model_from_sequences(plaintext_sequences, len(char_to_int_mapping))

Below is a mostly implemented HMM.

#### Обоснование возможности нормализации:

$$
norm(\alpha_{:t}) = \sum_{j=0}^{n}\alpha_{jt}
$$

$$
\overline{\alpha_{it}} = \frac{\alpha_{it}}{norm(\alpha_{:t})}
$$


$$
norm(\beta_{:t}) = \sum_{j=0}^{n}\beta_{jt}
$$

$$
\overline{\beta_{it}} = \frac{\beta_{it}}{norm(\beta_{:t})}
$$

$$
\gamma_{it} = \frac{\alpha_{it}  \beta_{it}}{\sum_{j=0}^n\alpha_{jt} \beta_{jt}} =\frac{norm(\alpha_{:t}) \overline{\alpha_{it}} norm(\beta_{:t}) \overline{\beta_{it}}}{\sum_{j=0}^nnorm(\alpha_{:t}) \overline{\alpha_{jt}} norm(\beta_{:t}) \overline{\beta_{jt}}} = \
\frac{norm(\alpha_{:t}) \overline{\alpha_{it}} norm(\beta_{:t}) \overline{\beta_{it}}}{norm(\alpha_{:t})norm(\beta_{:t})\sum_{j=0}^n \overline{\alpha_{jt}}  \overline{\beta_{jt}}} =\
\frac{\overline{\alpha_{it}} \overline{\beta_{it}}}{\sum_{j=0}^n \overline{\alpha_{jt}} \overline{\beta_{jt}}}
$$

$$
\xi_{ij} = \sum_{t=1}^{T-1}\frac{\alpha_{it} a_{ij} \beta_{jt+1} b_{jo_{t+1}}}{\sum_i^n\sum_j^n\alpha_{it} a_{ij} \beta_{jt+1} b_{jo_{t+1}}} = \
\sum_{t=1}^{T-1}\frac{norm(\alpha_{:t}) \overline{\alpha_{it}} a_{ij} norm(\beta_{:t+1}) \overline{\beta_{jt+1}} b_{jo_{t+1}}}{\sum_i^n\sum_j^nnorm(\alpha_{:t}) \overline{\alpha_{it}} a_{ij} norm(\beta_{:t+1}) \overline{\beta_{jt+1}} b_{jo_{t+1}}} = \
\sum_{t=1}^{T-1}\frac{\overline{\alpha_{it}} a_{ij} \overline{\beta_{jt+1}} b_{jo_{t+1}}}{\sum_i^n\sum_j^n\overline{\alpha_{it}} a_{ij} \overline{\beta_{jt+1}} b_{jo_{t+1}}}
$$

$$
log\sum_{i=0}^n \alpha_{iT} = log\sum_{i=0}^n norm(\alpha_{:T}) \overline{\alpha_{iT}} = log[norm(\alpha_{:T})\sum_{i=0}^n  \overline{\alpha_{iT}}] = log(norm(\alpha_{:T})) + log\sum_{i=0}^n \overline{\alpha_{iT}}
$$

In [3]:
class HMM():

    def __init__(self, observations_to_char_mapping={}, states_to_char_mapping={}):
        # Determine number of states and observation space. 
        self.num_states = len(states_to_char_mapping)
        self.num_outputs = len(observations_to_char_mapping)
        self.states_to_char_mapping = states_to_char_mapping
        self.observations_to_char_mapping = observations_to_char_mapping
       
        np.random.seed(0)
        # Random initialization
        self.pi = np.random.rand(self.num_states)
        self.pi /= np.sum(self.pi)
        self.A = np.random.rand(self.num_states, self.num_states)
        self.A /= np.sum(self.A, 1, keepdims=True)
        self.B = np.random.rand(self.num_states, self.num_outputs)
        self.B /= np.sum(self.B, 1, keepdims=True) 
        
    def estimate_with_em(self, sequences, parameters={}, epsilon=0.001, max_iters=100):
        # Estimates all parameters not provided in 'parameters' based on 'sequences'.
        self.fixed_pi = 'pi' in parameters
        if self.fixed_pi:
            self.pi = parameters['pi']
        self.fixed_A = 'A' in parameters
        if self.fixed_A:
            self.A = parameters['A']
        self.fixed_B = 'B' in parameters
        if self.fixed_B:
            self.B = parameters['B']
    
        previous_llh = None
        iter = 0
        while True and iter < max_iters:
            # Infer expected counts.
            pi_counts, A_counts, B_counts, log_likelihood = self.e_step(sequences)

            # Update parameters based on counts.
            self.m_step(pi_counts, A_counts, B_counts)

            # Output some sequences for debugging.
#             self.output(sequences[:10])

            # Log likelihood should be increasing
            print('iteration %d; log likelihood %.4f' % (iter, log_likelihood))
            if previous_llh:
                assert log_likelihood >= previous_llh
                if log_likelihood - previous_llh < epsilon:
                    break
            previous_llh = log_likelihood
        
            iter += 1


    def e_step(self, sequences):
        # Reset counters of statistics
        pi_counts = np.zeros_like(self.pi)
        A_counts = np.zeros_like(self.A) 
        B_counts = np.zeros_like(self.B) 
        total_log_likelihood = 0.0

        for sequence in sequences:
            # Run Forward-Backward dynamic program
            alpha, beta, gamma, xi, log_likelihood = self.forward_backward(sequence)
  
            # Accumulate statistics.
            pi_counts += gamma[:, 0]
            A_counts += xi
            for t, x in enumerate(sequence):
                B_counts[:, x] += gamma[:, t]
            
            total_log_likelihood += log_likelihood

        return pi_counts, A_counts, B_counts, total_log_likelihood

    def m_step(self, pi_counts, A_counts, B_counts):
        if not self.fixed_pi:
            self.pi = pi_counts / np.sum(pi_counts)
        if not self.fixed_A:
            self.A = A_counts / np.sum(A_counts, 1, keepdims=True)
        if not self.fixed_B:
            self.B = B_counts / np.sum(B_counts, 1, keepdims=True)
        
    def max_posterior_decode(self, sequence):
        _, _, gamma, _, log_likelihood = self.forward_backward(sequence)
        return np.argmax(gamma, 0)
        
    def forward_backward(self, sequence):
        # alpha[i][t] = p(x_1, ..., x_t, z_t = i)
        alpha, log_alpha_norm = self.forward(sequence)
        
        # beta[i][t] = p(x_t+1, ..., x_T|z_t = i)
        beta = self.backward(sequence)

        # gamma[i][t] = p(z_t = i|x_1, ..., x_T)
        gamma = (alpha * beta) / np.sum(alpha * beta, 0)

        # xi[i][j] = p(z_t = i, z_{t+1} = j|x_1, ..., x_T)
        xi = np.zeros_like(self.A)
        for t in range(1, len(sequence)-1):
            this_xi = np.zeros_like(self.A)
            for i in range(self.num_states):
                for j in range(self.num_states):
                    this_xi[i, j] += alpha[i, t] * self.A[i, j] * beta[j, t+1] * self.B[j, sequence[t+1]]        
            xi += this_xi / np.sum(this_xi)
        return alpha, beta, gamma, xi, np.log(np.sum(alpha[:, len(sequence)-1])) + log_alpha_norm[len(sequence)-1]

    def forward(self, sequence):
        # alpha[i][t] = p(x_1, ..., x_t, z_t = i)
        log_alpha_norm = np.zeros(len(sequence))
        alpha = np.zeros((len(self.pi), len(sequence)))
        alpha[:,0] = self.pi * self.B[:,sequence[0]]
        log_alpha_norm[0] = np.log(np.sum(alpha[:,0]))
        alpha[:,0] = alpha[:,0] / np.sum(alpha[:,0])
        for t in range(1,len(sequence)):
            alpha[:,t] = self.B[:,sequence[t]] * self.A.T.dot(alpha[:,t-1])
            log_alpha_norm[t] = log_alpha_norm[t - 1] + np.log(np.sum(alpha[:,t]))
            alpha[:,t] = alpha[:,t] / np.sum(alpha[:,t])
#         assert False, "Implement forward recursion"
        return alpha, log_alpha_norm
    
    def backward(self, sequence):
        # beta[i][t] = p(x_t+1, ..., x_T|z_t = i)
        beta = np.zeros((len(self.pi), len(sequence)))
        beta[:,len(sequence) - 1] = np.ones(len(self.pi))
        beta[:,len(sequence) - 1] = beta[:,len(sequence) - 1] / np.sum(beta[:,len(sequence) - 1])
        for t in range(len(sequence) - 2, -1, -1):
            beta[:, t] =  self.A.dot(beta[:, t + 1] * self.B[:,sequence[t + 1]])
            beta[:,t] = beta[:,t] / np.sum(beta[:,t])
#         assert False, "Implement backwards recursion to compute betas."
        return beta

    def output(self, sequences):
        # Output some decoded states. 
        for i, sequence in enumerate(sequences):
            observations = [self.observations_to_char_mapping[x] for x in sequence]                
            map_states = [self.states_to_char_mapping[x] for x in self.max_posterior_decode(sequence)]
            print('(states):       %s\n(observations): %s' % (''.join(map_states), ''.join(observations)))


**TASK 2**: Implement the assertions in 'forward' and 'backward' methods on the HMM class so that the following block passes.

In [4]:
# Since it's a substitution cipher we assume hidden states and observations have same alphabet.
state_to_char_mapping = int_to_char_mapping
observation_to_char_mapping = int_to_char_mapping

# Initialize a HMM with the correct state/output spaces.
hmm = HMM(observation_to_char_mapping, state_to_char_mapping)

# Estimate the parameters and decode the encrypted sequences.
hmm.estimate_with_em(encrypted_sequences[:100], parameters={'A' :A,'pi':pi})

iteration 0; log likelihood -12170.5421
iteration 1; log likelihood -10381.2029
iteration 2; log likelihood -10345.4999
iteration 3; log likelihood -10310.9132
iteration 4; log likelihood -10272.9498
iteration 5; log likelihood -10230.1673
iteration 6; log likelihood -10183.0970
iteration 7; log likelihood -10133.8113
iteration 8; log likelihood -10084.9661
iteration 9; log likelihood -10038.7531
iteration 10; log likelihood -9996.3215
iteration 11; log likelihood -9957.8073
iteration 12; log likelihood -9922.7834
iteration 13; log likelihood -9890.7336
iteration 14; log likelihood -9861.2057
iteration 15; log likelihood -9833.7868
iteration 16; log likelihood -9808.1683
iteration 17; log likelihood -9784.2283
iteration 18; log likelihood -9761.9992
iteration 19; log likelihood -9741.5561
iteration 20; log likelihood -9722.9258
iteration 21; log likelihood -9706.0642
iteration 22; log likelihood -9690.8847
iteration 23; log likelihood -9677.2941
iteration 24; log likelihood -9665.2051


**TASK 3**: Some of the encrypted sequences are quite long. Try decoding some from 'encrypted/99_encrypted.txt' (note these are in Russian).

In [5]:
# Some data to use.
# plaintext = 'plaintext/shakespeare.txt'
plaintext_russian = 'plaintext/russian.txt'

ciphertext = 'encrypted/99_encrypted.txt' # longer sequences in russian

# load a character to integer mapping and reverse                                                                                                         
char_to_int_mapping_russian, int_to_char_mapping_russian = get_char_to_int_mapping(plaintext_russian)

# load sequences as ints                                                                                                                                  
plaintext_sequences = load_sequences(plaintext_russian, char_to_int_mapping_russian)
encrypted_sequences = load_sequences(ciphertext, char_to_int_mapping_russian)

# estimate a markov model over characters                                                                                                                 
pi, A = estimate_markov_model_from_sequences(plaintext_sequences, len(char_to_int_mapping_russian))

In [6]:
# Since it's a substitution cipher we assume hidden states and observations have same alphabet.
state_to_char_mapping_russian = int_to_char_mapping_russian
observation_to_char_mapping_russian = int_to_char_mapping_russian

# Initialize a HMM with the correct state/output spaces.
hmm_russian = HMM(observation_to_char_mapping_russian, state_to_char_mapping_russian)

# Estimate the parameters and decode the encrypted sequences.
hmm_russian.estimate_with_em(encrypted_sequences[:100], parameters={'A' :A,'pi':pi}, max_iters = 100)

iteration 0; log likelihood -75448.5833
iteration 1; log likelihood -61235.3633
iteration 2; log likelihood -61133.0051
iteration 3; log likelihood -60986.8765
iteration 4; log likelihood -60752.3970
iteration 5; log likelihood -60385.4446
iteration 6; log likelihood -59870.7365
iteration 7; log likelihood -59233.6394
iteration 8; log likelihood -58475.5952
iteration 9; log likelihood -57533.5415
iteration 10; log likelihood -56398.6531
iteration 11; log likelihood -55245.0282
iteration 12; log likelihood -54286.1565
iteration 13; log likelihood -53581.7584
iteration 14; log likelihood -53067.2524
iteration 15; log likelihood -52657.1907
iteration 16; log likelihood -52296.9903
iteration 17; log likelihood -51963.4035
iteration 18; log likelihood -51657.1083
iteration 19; log likelihood -51383.1091
iteration 20; log likelihood -51144.8315
iteration 21; log likelihood -50938.5389
iteration 22; log likelihood -50758.7364
iteration 23; log likelihood -50606.8932
iteration 24; log likeliho

KeyboardInterrupt: 

**TASK 4**: Make your implementation of forward and backward more efficient by removing all but the outermost for-loop.

**TASK 5**: Try to classify the author of each text. 

In [7]:
# Some data to use.
# plaintext = 'plaintext/shakespeare.txt'
plaintext = 'plaintext/all.txt'


# load a character to integer mapping and reverse                                                                                                         
char_to_int_mapping, int_to_char_mapping = get_char_to_int_mapping(plaintext)

# load sequences as ints                                                                                                                                  
plaintext_sequences = load_sequences(plaintext, char_to_int_mapping)

# estimate a markov model over characters                                                                                                                 
pi, A = estimate_markov_model_from_sequences(plaintext_sequences, len(char_to_int_mapping))

In [10]:
all_encrypted_sequences = {}
all_decrypted_sequences = {}

state_to_char_mapping = int_to_char_mapping
observation_to_char_mapping = int_to_char_mapping
hmm = HMM(observation_to_char_mapping, state_to_char_mapping)

for i in range(143):
    ciphertext = 'encrypted/' + str(i) + '_encrypted.txt' # longer sequences in russian
    print(ciphertext)
    encrypted_sequences = load_sequences(ciphertext, char_to_int_mapping)
    all_encrypted_sequences[ciphertext] = encrypted_sequences
    hmm.estimate_with_em(encrypted_sequences[:100], parameters={'A' :A,'pi':pi}, max_iters = 100)
    all_decrypted_sequences[ciphertext] = hmm.max_posterior_decode(encrypted_sequences)

encrypted/0_encrypted.txt
iteration 0; log likelihood -19277.3554
iteration 1; log likelihood -12696.4513
iteration 2; log likelihood -12659.3748
iteration 3; log likelihood -12621.0184
iteration 4; log likelihood -12574.8023
iteration 5; log likelihood -12518.2091
iteration 6; log likelihood -12448.1019
iteration 7; log likelihood -12364.1777
iteration 8; log likelihood -12273.4970
iteration 9; log likelihood -12186.4379
iteration 10; log likelihood -12108.7312
iteration 11; log likelihood -12040.5197
iteration 12; log likelihood -11979.9522
iteration 13; log likelihood -11925.4837
iteration 14; log likelihood -11876.4580
iteration 15; log likelihood -11832.9292
iteration 16; log likelihood -11795.1249
iteration 17; log likelihood -11762.8533
iteration 18; log likelihood -11735.3892
iteration 19; log likelihood -11711.8632
iteration 20; log likelihood -11691.5979
iteration 21; log likelihood -11674.1026
iteration 22; log likelihood -11658.8772
iteration 23; log likelihood -11645.4139


ValueError: operands could not be broadcast together with shapes (69,) (69,61) 

In [None]:
_,_,gamma,_ = hmm_russian.forward_backward()