In [1]:
from collections import Counter
import sys
from nlm import *
from ngram import *
from nltk.util import ngrams
import bz2
import re
import pandas as pd
import numpy as np
import string
sys.path.append('../')

# Neural language model, loaded on CPU (cuda=False); score() passes it to
# next_chars() to fill in positions that have no mapping yet.
nlm_model = load_model(
    "data/mlstm_ns.pt",
    cuda=False,
)
# N-gram language model; score() calls its score_seq() on the completed text.
# NOTE(review): the file name says "6-gram" while n=8 is passed - confirm the intended order.
ngram_model = LM(
    "data/6-gram-wiki-char.lm.bz2",
    n=8,
    verbose=False,
)

Loading model data/mlstm_ns.pt..
Model on board!
Reading language model from data/6-gram-wiki-char.lm.bz2...
Done.


In [2]:
def read_file(filename):
    """Return the entire text content of `filename`.

    A name whose last four characters are ".bz2" is opened through bz2 in
    text mode; anything else is opened as a regular text file.
    """
    # Choose the opener up front so the read itself is written only once.
    if filename[-4:] == ".bz2":
        opener, mode = bz2.open, 'rt'
    else:
        opener, mode = open, 'r'

    # The context manager closes the handle when the block exits.
    with opener(filename, mode) as handle:
        return handle.read()

 # Beam search algorithm
def beam_search(plaintext_alph, cipher_text, ext_order, ext_limits=1, beam_size=1):
    """Beam search over (plaintext symbol, cipher symbol) mappings.

    Cipher symbols are fixed one at a time, in the order given by `ext_order`.
    Every hypothesis in the beam is extended with every plaintext symbol;
    extensions rejected by within_ext_limits() are dropped, the rest are
    scored with score() and pruned to the `beam_size` best.

    Args:
        plaintext_alph: iterable of candidate plaintext symbols.
        cipher_text: the cipher string handed to score().
        ext_order: cipher symbols in the order they should be fixed.
        ext_limits: limit forwarded to within_ext_limits().
        beam_size: number of hypotheses kept after each extension step.

    Returns:
        The mapping (list of (plaintext, cipher) tuples) of the best
        hypothesis left in the beam.
    """
    # Each beam entry is (score, mapping); start from the empty mapping.
    beam = [(0, [])]

    for cipher_sym in ext_order:
        candidates = []
        for _, mapping in beam:
            for pl_sym in plaintext_alph:
                extended = mapping + [(pl_sym, cipher_sym)]
                if not within_ext_limits(ext_limits, extended):
                    continue
                candidates.append((score(extended, cipher_text), extended))

        # When no extension passes the limits the beam is left as it was,
        # i.e. this cipher symbol stays unmapped.
        if candidates:
            beam = list(histogram_prune(candidates, beam_size))

    return winning_hypothesis(beam)

In [3]:
# Helper function for score(). Returns plaintext string for a cipher symbol given a hypothesized mapping
def g_funct(hypothesis, cipher_sym):
    """Look up the plaintext symbol that `hypothesis` assigns to `cipher_sym`.

    `hypothesis` is a sequence of (plaintext, cipher) pairs. The plaintext of
    the first pair whose cipher entry equals `cipher_sym` is returned; when no
    pair matches, the placeholder '_' is returned.
    """
    matches = (pair[0] for pair in hypothesis if pair[1] == cipher_sym)
    return next(matches, '_')

# True if the hypothesis stays within the constraint on many-to-one mapping
def within_ext_limits(limit, hyp):
    """Return True when no plaintext symbol is used more than `limit` times.

    `hyp` is a sequence of (plaintext, cipher) pairs; only the plaintext side
    (index 0) is counted.
    """
    usage = Counter(pair[0] for pair in hyp)
    return not any(count > limit for count in usage.values())

# Sorts the hypotheses best-first and keeps the top n
def histogram_prune(hypotheses, n=1):
    """Keep the `n` highest-ranked hypotheses.

    The list is sorted in place in descending order (so the caller's list is
    reordered as a side effect) and a new list holding its first `n` entries
    is returned.
    """
    hypotheses.sort(reverse=True)
    survivors = hypotheses[:n]
    return survivors

# Pick the best hypothesis
def winning_hypothesis(hypotheses):
    """Return the mapping (element 1) of the top-ranked (score, mapping) entry.

    Ranking is delegated to histogram_prune(), which also sorts `hypotheses`
    in place.
    """
    top = histogram_prune(hypotheses, 1)
    best_entry = top[0]
    return best_entry[1]

# Get an extension order based on contiguous deciphered ngrams
def get_ext_order(alphabet, cipher):
    """Build an extension order for the cipher symbols in `alphabet`.

    The first symbol of `alphabet` is taken as the starting point. After that,
    the symbol whose fixing yields the highest weighted_sum() is appended
    repeatedly, given every symbol already chosen. Chosen symbols are marked
    in the working copy of `cipher` with '#', the default marker of
    weighted_sum().

    Args:
        alphabet: sequence of cipher symbols. It is not modified.
        cipher: the cipher text as a string.

    Returns:
        A list containing every symbol of `alphabet` exactly once, in the
        order they should be fixed. An empty alphabet gives an empty list.
    """
    # Work on a copy so the caller's alphabet is not emptied as a side effect.
    remaining = list(alphabet)
    if not remaining:
        return []

    first_char = remaining.pop(0)
    ext_order = [first_char]
    cipher = cipher.replace(first_char, '#')

    while remaining:
        # max() returns the first maximal candidate, matching the strict '>'
        # tie-breaking of a manual scan. It always returns a member of
        # `remaining`, so the removal below cannot fail even when every
        # candidate scores 0.
        best_char = max(remaining, key=lambda sym: weighted_sum(sym, cipher))

        ext_order.append(best_char)
        remaining.remove(best_char)
        # Mark the chosen symbol as fixed so that later picks are scored
        # against everything selected so far, not just the first symbol.
        cipher = cipher.replace(best_char, '#')

    return ext_order

# Calculate the weighted sum for ext order candidates
def weighted_sum(ch, cipher, repl='#', max_n=7):
    """Weighted count of n-grams made up entirely of the marker `repl`.

    Every occurrence of `ch` in `cipher` is first replaced by `repl`. For each
    order n from 1 to `max_n`, the number of n-grams whose characters all
    equal `repl` is multiplied by n, and the products are summed.

    Args:
        ch: the candidate symbol to mark.
        cipher: the cipher text, possibly already containing `repl` markers.
        repl: the marker character.
        max_n: highest n-gram order taken into account (default 7).

    Returns:
        The weighted sum as an int; 0 when no character equals `repl`.
    """
    marked = cipher.replace(ch, repl)
    top_order = max(0, max_n)

    total = 0
    run = 0  # length of the marker run ending at the current position
    for symbol in marked:
        if symbol == repl:
            run += 1
            # Exactly one all-marker n-gram ends at this position for each
            # n <= min(run, max_n); together they weigh 1 + 2 + ... + k.
            k = min(run, top_order)
            total += k * (k + 1) // 2
        else:
            run = 0

    return total


In [14]:
def score(hypothesis, cipher_text, verbose=False):
    """Score a (possibly partial) hypothesis against the cipher text.

    The cipher text is deciphered with `hypothesis`. Positions whose cipher
    symbol has no mapping come back from g_funct() as '_' and are filled in
    from next_chars() called on the text built so far. The completed text is
    then scored with `ngram_model.score_seq`.

    Args:
        hypothesis: list of (plaintext symbol, cipher symbol) tuples.
        cipher_text: the cipher string. It must be non-empty, because
            position 0 of the decipherment is read directly.
        verbose: when True, print the intermediate decipherment and samples.
            Off by default because beam_search() calls this function once per
            candidate extension and the trace is printed per cipher position.

    Returns:
        The value returned by `ngram_model.score_seq` for the completed text.
    """
    decipherment = ''.join([g_funct(hypothesis, ch) for ch in cipher_text])
    if verbose:
        print('deciph: ',decipherment)
        print('mapped_symbols:',[x[0] for x in hypothesis])

    # Two buffers are built in parallel:
    #   sample_full - context handed to next_chars(); starts with an upper-cased
    #                 first character and keeps sampled spaces/punctuation.
    #   sample      - the text that is finally scored.
    # Position 0 is copied as-is, even when it is the '_' placeholder.
    sample_full = decipherment[0].upper()
    sample = decipherment[0]

    for i in range(1, len(cipher_text)):
        if decipherment[i] == '_':
            # Element [0][0] of the result is used as the next character
            # (presumably the top-1 prediction - confirm against nlm.next_chars).
            next_ch = next_chars(c=sample_full, cuda=False, model=nlm_model, k=1)[0][0]
            if verbose:
                print('next sampled char:',next_ch)
            sample_full += next_ch

            # Spaces and punctuation stay in the model context but do not
            # count as the character for this position: keep sampling.
            while next_ch == ' ' or next_ch in string.punctuation:
                next_ch = next_chars(c=sample_full, cuda=False, model=nlm_model, k=1)[0][0]
                sample_full += next_ch
            sample += next_ch.lower()
        else:
            sample_full += decipherment[i]
            sample += decipherment[i]

        if verbose:
            print('sample_full:',sample_full)
            print('sample:',sample)
            print()

    if verbose:
        print('sample: ',sample)
    return ngram_model.score_seq(sample)

In [12]:
# Load the cipher text with its newlines stripped.
cipher_text = read_file("data/cipher_simple.txt").replace('\n', '')
# NOTE(review): plaintxt is not used by any visible cell - confirm it is still needed.
plaintxt = read_file("data/default.wiki.txt.bz2")

# Cipher and plaintext alphabets
# Newlines were removed above, so every remaining character is counted.
cipher_count = Counter(cipher_text)

# Fix the most frequent cipher symbols first.
ext_order = [tup[0] for tup in cipher_count.most_common()]

# All 26 lowercase letters; the earlier hand-typed literal was missing 'w',
# so no decipherment could ever contain that letter.
eng_alphabet = list(string.ascii_lowercase)

In [None]:
# Run the search: ext_limits=6 is the limit passed to within_ext_limits(),
# beam_size=100 is the number of hypotheses kept after every step.
best_hypothesis = beam_search(
    eng_alphabet,
    cipher_text,
    ext_order,
    ext_limits=6,
    beam_size=100,
)

In [None]:
# Report the winning mapping and the text it produces.
print('\n\n\nBEST HYOOTHESIS:',best_hypothesis)

# Cipher symbols without a mapping show up as '_' in the decipherment.
decipherment = ''.join(g_funct(best_hypothesis, ch) for ch in cipher_text)

print(best_hypothesis)
print('Length of cipher alphabet: ', len(ext_order))
print('Length of best hypothesis: ', len(best_hypothesis))
print(decipherment)