In [1]:
import os
from os.path import join, pardir
from collections import Counter
from copy import deepcopy
import numpy as np
from deap import base, creator, algorithms, tools
from dssg_challenge import compute_cost, check_keyboard

# Seed for the notebook-level random generator created below.
RNG_SEED = 0
# Relative path to the raw challenge data: <parent dir>/data/raw.
DATA_DSSG = join(pardir, 'data', 'raw')

# Dedicated NumPy RandomState seeded with RNG_SEED (used to shuffle layouts).
rng = np.random.RandomState(RNG_SEED)

In [2]:
# List the entries of the raw data directory.
os.listdir(DATA_DSSG)

['pt-corpus.txt', '.gitkeep', 'pt-keys.txt', 'en-keys.txt', 'en-corpus.txt']

In [3]:
# Read the valid keys, dropping the newlines that separate them in the file.
with open(join(DATA_DSSG, 'pt-keys.txt'), 'r') as file:
    keys = file.read().replace('\n', '')

# Read the example corpus as raw text.
with open(join(DATA_DSSG, 'pt-corpus.txt'), 'r') as file:
    corpus = file.read()

# Remove every occurrence of the full key string from the corpus, then keep
# only what precedes the first newline.
corpus = ''.join(corpus.split(keys)).partition('\n')[0]

Some special keys are encoded as single characters. Namely,

- The ENTER key is represented as 0.
- The shift key for capitalization is represented as ^.
- The backspace key is represented as <.
- All the remaining characters not found in the valid keys are encoded as #.
- Empty keys will contain the character _.


In [4]:
# Show how many keys were loaded, alongside the key string itself.
len(keys), keys

(36, "ABCDEFGHIJKLMNOPQ RSTUVWXYZ0.#,^?<'~")

In [11]:
# Display the corpus text that the layouts are costed against.
corpus

"0HER'OIS DO MAR, NOBRE PO<VO,0NA#C~AO VALENTE, IMORTAL,0LEVANTAI HOJE DE NOVO0O ESPLENDOR DE PORTUGAL#0ENTRE AS BRUMAS DA MEM'ORIA,0'O P'ATRIA SENTE#SE A VOZ0DOS TEUS EGR'EGIOS AV'OS,0QUE H'A#DE GUIAR#TE #A VIT'ORIA#00#AS ARMAS, #AS ARMAS#0SOBRE A TERRA, SOBRE O MAR,0#AS ARMAS, #AS ARMAS#0PELA P'ATRIA LUTAR0CONTRA OS <CANH~OES MARCHAR, MARCHAR#00DESFRALDA A INVICTA BANDEIRA,0#A LUZ VIVA DO TEU C'EU#0BRADE A EUROPA #A TERRA INTEIRA#0PORTUGAL N~AO PERECEU0BEIJA O SOLO TEU JUCUNDO0O OCEANO, A RUGIR D#AMOR,0E TEU BRA#CO VENCEDOR0DEU MUNDOS< NOVOS <AO MUNDO#00#AS ARMAS, #AS ARMAS#0SOBRE A <TERRA, SOBRE O MAR,0#AS ARMAS, #AS ARMAS#0PELA P'ATRIA LUTAR0CONTRA OS CANH~OES MARCHAR, MARCHAR#00SAUDAI O SOL QUE DESPONTA0SOBRE UM RIDENTE PORVIR#0SEJA O ECO DE UMA AFRONTA0O SINAL DO RESSURGIR.0RAIOS DESSA AURORA FORTE0S~AO COMO< BEIJOS DE M~AE,0QUE NOS <GUARDAM, NOS SUST#EM,0CONTRA AS <INJ'URIAS DA SORTE.00#AS ARMAS, #AS ARMAS#0SOBRE A <TERRA, SOBRE O MAR,0#AS ARMAS, #AS ARMAS#0PELA P'ATRIA LUTAR0CO

## The most basic approaches

In [5]:
# Ten most frequent characters in the corpus, most frequent first.
Counter(corpus).most_common(10)

[(' ', 138),
 ('A', 137),
 ('R', 91),
 ('O', 78),
 ('E', 73),
 ('S', 70),
 ('0', 44),
 ('#', 37),
 ('T', 36),
 ('M', 35)]

In [6]:
# Corpus characters ordered from most to least frequent.
by_frequency = ''.join(char for char, _ in Counter(corpus).most_common())
# Valid keys that never occur in the corpus, in their original order.
unused_keys = ''.join(key for key in keys if key not in by_frequency)
# Frequency-ordered layout, completed with the unused keys and a trailing space.
baseline = by_frequency + unused_keys + ' '
baseline

" AROES0#TMNUID,CLPB'VH<G~JQFZ.KWXY^? "

In [7]:
# Random reference layout: the baseline characters in shuffled order.
shuffled = list(baseline)
rng.shuffle(shuffled)

# The raw key string padded with a trailing space.
original = keys+' '

# Validate all three layouts before costing any of them.
for layout in (baseline, original, shuffled):
    check_keyboard(layout, keys)

# Report the cost of each layout on the corpus.
for label, layout in (
    ('Shuffled cost:\t\t', ''.join(shuffled)),
    ('Original keys cost:\t', original),
    ('Baseline cost:\t\t', baseline),
):
    print(label, compute_cost(layout, corpus))

Shuffled cost:		 5088.806814781539
Original keys cost:	 4541.244466418746
Baseline cost:		 3189.713309637487


## First attempt with a genetic algorithm (GA)

In [8]:
# The valid keys as a list of single characters (sampled from in mutFlip).
keys_list = list(keys)

def evaluate(individual):
    """
    Fitness function: cost of typing the corpus on a candidate layout.

    The individual is first validated with ``check_keyboard`` against the
    module-level ``keys``; it is then joined into a string and scored with
    ``compute_cost`` on the module-level ``corpus``.

    Returns a one-element list holding the cost, or ``np.inf`` when an
    ``AssertionError`` is raised while validating or scoring the layout.
    """
    try:
        check_keyboard(individual, keys)
        layout = ''.join(individual)
        cost = compute_cost(layout, corpus)
    except AssertionError:
        # Treat a rejected layout as infinitely expensive.
        cost = np.inf
    return [cost]

def mutFlip(ind1, ind2):
    """Return a randomly mutated copy of ``ind1`` together with ``ind2``.

    Each position of a copy of ``ind1`` is, with probability 0.05,
    overwritten by a key drawn with ``np.random.choice`` from the
    module-level ``keys_list``. The candidate is then validated with
    ``check_keyboard``; if that raises ``AssertionError`` the candidate is
    discarded and a fresh one is generated from ``ind1``.

    Parameters
    ----------
    ind1 : array-like with a ``copy`` method (e.g. ``numpy.ndarray``)
        Individual to mutate. It is never modified in place.
    ind2 : object
        Returned unchanged as the second element of the result.

    Returns
    -------
    tuple
        ``(mutated_copy_of_ind1, ind2)``.

    Notes
    -----
    There is no bound on the number of retries: the loop only ends once a
    candidate passes ``check_keyboard``.
    """
    while True:
        # Always restart from a fresh copy so rejected candidates do not
        # accumulate changes and ``ind1`` stays untouched.
        ind = ind1.copy()
        for x, _ in np.ndenumerate(ind):
            if np.random.random() < .05:
                ind[x] = np.random.choice(keys_list)
        try:
            check_keyboard(ind, keys)
        except AssertionError:
            # Invalid layout: draw another candidate.
            continue
        return ind, ind2


In [13]:
# Fitness with a single negatively weighted objective (cost minimisation).
creator.create('FitnessMin', base.Fitness, weights=(-1.0,))
# Individuals are NumPy arrays that carry a FitnessMin fitness attribute.
creator.create('Individual', np.ndarray, fitness=creator.FitnessMin)

toolbox = base.Toolbox()

# Random initialisation: each individual is a random permutation of the
# characters of the baseline layout.
toolbox.register('attribute', np.random.permutation, np.array(list(baseline)))
toolbox.register('individual', tools.initIterate, creator.Individual, toolbox.attribute)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)

# Genetic operators used by the evolutionary loop.
# NOTE(review): cxOnePoint swaps slices between parents; with ndarray-based
# individuals and permutation layouts this may need a copy-aware,
# permutation-preserving crossover -- confirm before enabling crossover.
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxOnePoint)
toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

def main(seed=64, pop_size=2, ngen=1000, cxpb=0, mutpb=0.6):
    """Run the simple evolutionary algorithm on keyboard layouts.

    Parameters
    ----------
    seed : int
        Seed applied to both NumPy's global generator and the standard
        library ``random`` module.
    pop_size : int
        Number of individuals in the initial population.
    ngen : int
        Number of generations passed to ``algorithms.eaSimple``.
    cxpb : float
        Crossover probability passed to ``algorithms.eaSimple``.
    mutpb : float
        Mutation probability passed to ``algorithms.eaSimple``.

    Returns
    -------
    tuple
        ``(pop, stats, hof)``: the population list handed to ``eaSimple``,
        the ``tools.Statistics`` object and the ``tools.HallOfFame``.
    """
    # Local import: the notebook's import cell does not bring in ``random``.
    import random

    # Population initialisation draws from np.random, whereas DEAP's
    # selection, mutation and variation operators draw from the stdlib
    # ``random`` module; seed both so that a run is reproducible.
    np.random.seed(seed)
    random.seed(seed)

    pop = toolbox.population(n=pop_size)

    # Comparing two arrays with operator.eq yields an element-wise result,
    # which raises an exception in the hall of fame's similarity check.
    # A whole-array comparison such as numpy.array_equal avoids this.
    hof = tools.HallOfFame(1, similar=np.array_equal)

    # Per-generation summary of the population's fitness values.
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    for stat_name, stat_func in (("avg", np.mean), ("std", np.std),
                                 ("min", np.min), ("max", np.max)):
        stats.register(stat_name, stat_func)

    algorithms.eaSimple(pop, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=ngen,
                        stats=stats, halloffame=hof)

    return pop, stats, hof


pop, stats, hof = main()


gen	nevals	avg    	std    	min    	max    
0  	2     	5081.14	156.206	4924.93	5237.34
1  	2     	4990.37	5.43034	4984.94	4995.8 
2  	2     	5468.93	228.257	5240.67	5697.19
3  	1     	5228.14	12.536 	5215.6 	5240.67
4  	2     	5107.66	107.942	4999.72	5215.6 
5  	2     	4922.76	76.9588	4845.8 	4999.72
6  	2     	4875.98	9.67329	4866.31	4885.65
7  	1     	4697.91	168.393	4529.52	4866.31
8  	2     	4541.9 	237.534	4304.36	4779.43
9  	2     	4653.47	186.525	4466.95	4840   
10 	1     	4434.81	32.1355	4402.68	4466.95
11 	2     	4798.12	15.774 	4782.35	4813.9 
12 	1     	5051.86	269.506	4782.35	5321.36
13 	1     	4792.02	9.66821	4782.35	4801.69
14 	0     	4792.02	9.66821	4782.35	4801.69
15 	1     	4792.44	10.0883	4782.35	4802.53


KeyboardInterrupt: 