In [1]:
from collections import Counter
import sys
sys.path.append('../')
from ngram import *
from nltk.util import ngrams

# ngram model for scoring hypotheses
ngram_model = LM("data/6-gram-wiki-char.lm.bz2", n=8, verbose=False)

Reading language model from data/6-gram-wiki-char.lm.bz2...
Done.


In [8]:
def read_file(filename):
    if filename[-4:] == ".bz2":
        with bz2.open(filename, 'rt') as f:
            content = f.read()
            f.close()
    else:
        with open(filename, 'r') as f:
            content = f.read()
            f.close()
    return content

def beam_search(plaintext_alph, cipher_text, ext_order, ext_limits, beam_size=1):

    scored_hypotheses = [(0, [])]
    hypothesis_extensions = []
   
    num_symbols = len(ext_order)
        
    while ext_order:
        
        cipher_sym = ext_order[0]
        ext_order.remove(cipher_sym)
        
        for hyp in scored_hypotheses:
            for pl_sym in plaintext_alph:
                new_hypothesis = hyp[1] + [(pl_sym, cipher_sym)]
                if within_ext_limits(ext_limits, new_hypothesis):
                    hypothesis_extensions.append((score(new_hypothesis, cipher_text), new_hypothesis,))

        if  hypothesis_extensions:
            hypothesis_extensions = histogram_prune(hypothesis_extensions, 80)
            scored_hypotheses = [h for h in hypothesis_extensions]
         
        decipherment = ''.join([g_funct(scored_hypotheses[0][1], ch) for ch in cipher_text])
        bitstring = ''
        for ch in decipherment:
            if ch == '_':
                bitstring += '.'
            else:
                bitstring += 'o'

        if len(ext_order) > 0:
            ext_order = update_ext_order(cipher_text, bitstring)

        hypothesis_extensions.clear()
        
    return winning_hypothesis(scored_hypotheses) 

def get_max_n_gram(cipher_map):
    max_n_gram = 0
    n_gram = 0
    for i in range(len(cipher_map)):
        if cipher_map[i] != '.':
            n_gram += 1
        else:
            max_n_gram = max(max_n_gram,n_gram)
            n_gram = 0
    return max_n_gram

def update_ext_order(cipher_text, bitstring):

    max_n_gram_allowed = 6
    # weights assuming n-gram = 6
    ngram_weights = [1.0,1.0,1.0,1.0,2.0,3.0] 
    
    max_n_gram = get_max_n_gram(bitstring)

    non_mapped_positions = [(x[1],x[2]) for x in [x for x in zip([x for x in bitstring],
                                                                 [i for i in cipher_text],
                                                                 [i for i in range(len(cipher_text))]) if x[0] == '.']]

    non_mapped_symbols = list(set([x[1] for x in [x for x in zip([x for x in bitstring],
                                                                 [x for x in cipher_text]) if x[0] == '.']]))

    main_dict = dict()
    for symbol in non_mapped_symbols:

        symbol_dict = dict()
        for n_gram in range(1, max_n_gram_allowed+1):
            symbol_dict[n_gram] = 0

        positions = [x[1] for x in non_mapped_positions if x[0]==symbol]
        for pos in positions:

            left_neighbors = len(bitstring[:pos].split('.')[-1])
            for i in range(min(left_neighbors,max_n_gram_allowed)):
                symbol_dict[i+1]+=1

            right_neighbors = len(bitstring[pos+1:].split('.')[0])
            for i in range(min(right_neighbors,max_n_gram_allowed)):
                symbol_dict[i+1]+=1
        main_dict[symbol] = symbol_dict

    symbol_scores = []

    for symbol in main_dict.keys():
        score = 0
        for ngram in range(1,min(max_n_gram,6)):
            score += ngram_weights[ngram-1] * main_dict[symbol][ngram]
        symbol_scores.append((symbol,score))

    return [x[0] for x in sorted(symbol_scores, key=lambda tup: tup[1],reverse=True)]
    
    
def score(hypothesis, text):
    decipherment = ''.join([g_funct(hypothesis, ch) for ch in text])
    bitstring = ''
    for ch in decipherment:
        if ch == '_':
            bitstring += '.'
        else:
            bitstring += 'o'
    
    #return ngram_model.score_seq(decipherment)
    return ngram_model.score_bitstring(decipherment, bitstring)


# Helper function for score(). Returns plaintext string for a cipher symbol given a hypothesized mapping
def g_funct(hypothesis, cipher_sym):
    for tup in hypothesis:
        if tup[1] == cipher_sym:
            return tup[0]
    return '_'

# Chooses the best scoring hypothesis
def histogram_prune(hypotheses, n=1):
    hypotheses.sort()
    return hypotheses[-n:]


# Pick the best hypothesis
def winning_hypothesis(hypotheses):
    return histogram_prune(hypotheses, 1)[0][1]


def within_ext_limits(ext_limits, hyp):
    plaintxt_sym_counter = Counter([tup[0] for tup in hyp])
    return not [k for k in plaintxt_sym_counter if plaintxt_sym_counter[k] > ext_limits[k]]

# Get an extension order based on contiguous deciphered ngrams
# This is used to pick the first 
def get_ext_order(alphabet, cipher):
    ext_order = [alphabet[0]]
    alphabet.pop(0)

    while alphabet:
        max_sum = weighted_sum(alphabet[0], cipher, ext_order)
        max_char = alphabet[0]

        for a in alphabet[1:]:
            curr_sum = weighted_sum(a, cipher, ext_order)

            if curr_sum > max_sum:
                max_sum = curr_sum
                max_char = a
        
        ext_order.append(max_char)
        alphabet.remove(max_char)
    
    return ext_order

# Calculate the weighted sum for ext order candidates
def weighted_sum(ch, cipher, ext_order):
    sum = 0
    for n in range(2, 7):
        grams = [g for g in ngrams(cipher, n) if all(c in ext_order + [ch] for c in g)]
        sum += len(grams)*n

    return sum    

In [9]:
cipher = read_file("data/cipher.txt").replace('\n', '')
plaintxt = read_file("data/default.wiki.txt.bz2")

# Cipher and plaintext alphabets
cipher_count = Counter([ch for ch in cipher if not ch == '\n'])
ext_order = [tup[0] for tup in cipher_count.most_common()]
ext_order = get_ext_order(ext_order, cipher)
eng_alphabet = [ch for ch in 'abcdefghijklmnopqrstuvwxyz']

In [10]:
e = 6
ext_limit = {'e': e, 't': e, 'a': e, 'o': e, 'i': e, 'n': e, 's': e, 'h': e,'r': e, 'd': e, 
              'l': e, 'c': e, 'u': e, 'm': e, 'w': e, 'f': e, 'g': e, 'y': e, 'p': e, 'b': e,
              'v': e, 'k': e, 'j': e, 'x': e, 'q': e, 'z': e}

In [11]:
best_hypothesis = beam_search(eng_alphabet, cipher, ext_order, ext_limit, beam_size=26)

In [12]:
decipherment = ''.join([g_funct(best_hypothesis, ch) for ch in cipher])

print(best_hypothesis)
print(decipherment)

[('e', '—'), ('t', '§'), ('e', 'Z'), ('e', 'I'), ('e', 'u'), ('r', 'W'), ('e', 'V'), ('r', '∞'), ('s', 'E'), ('t', 'X'), ('a', '≈'), ('t', 'H'), ('r', 'M'), ('a', 'º'), ('s', '“'), ('a', '£'), ('n', '–'), ('s', 'K'), ('h', '•'), ('s', 'B'), ('s', '∑'), ('e', 'P'), ('h', 'A'), ('t', 'À'), ('s', 'D'), ('o', 'R'), ('i', 'G'), ('r', '+'), ('i', 'π'), ('t', 'y'), ('o', 'F'), ('t', 'Ç'), ('n', '∏'), ('h', 'æ'), ('a', '√'), ('r', 'µ'), ('d', 'O'), ('n', '∫'), ('a', 'T'), ('a', 'Q'), ('i', 'J'), ('n', '‘'), ('o', 'L'), ('m', '^'), ('i', 'S'), ('n', 'ƒ'), ('l', '/'), ('o', '\\'), ('d', 'N'), ('r', 'Ã'), ('d', '¢'), ('l', 'Ω'), ('d', '∆'), ('c', 'j')]
anelelesntdohnthsreraitorateistetaritmeetleandatsraidlassedohendorestmoreinsonoterahtireisasearethrtidodnetsrdheisaaissonlessitestremootaesenearethrorsassesorchroddasettoeandererrnoarmorteasonatsonalitiaherrhoennoadessthisetitrahorrdhaedetenthasseroreandemhisrdearlstinnerdtaiaslannedhessenaterenoshariahasndaroearnntenmiereeaitssnathansonnttostatths