In [1]:
import os
from os.path import join, pardir
from collections import Counter
from copy import deepcopy
import numpy as np
from deap import base, creator, algorithms, tools
from dssg_challenge import compute_cost, check_keyboard

# Seed for the notebook-level random generator `rng` defined below
RNG_SEED = 0

# Directory holding the processed key/corpus files, relative to the parent folder
DATA_DSSG = join(pardir, 'data', 'processed')

# Dedicated generator so that shuffles done through `rng` are repeatable
rng = np.random.RandomState(seed=RNG_SEED)

In [2]:
# Show which files are available in the processed data directory
os.listdir(DATA_DSSG)

['pt-corpus.txt', '.gitkeep', 'pt-keys.txt', 'en-keys.txt', 'en-corpus.txt']

In [3]:
# Locations of the Portuguese key set and corpus example
keys_path = join(DATA_DSSG, 'pt-keys.txt')
corpus_path = join(DATA_DSSG, 'pt-corpus.txt')

# Valid keys: the file content with every line break removed
with open(keys_path, 'r') as keys_file:
    keys = keys_file.read().replace('\n', '')

# Raw corpus text, exactly as stored on disk
with open(corpus_path, 'r') as corpus_file:
    corpus_raw = corpus_file.read()

# Drop any verbatim occurrence of the full key string from the corpus,
# then keep only the text that precedes the first line break
corpus = ''.join(corpus_raw.split(keys)).partition('\n')[0]

Some special keys and characters are represented by placeholder symbols. Namely,

- The ENTER key is represented as 0.
- The shift key for capitalization is represented as ^.
- The backspace key is represented as <.
- All the remaining characters not found in the valid keys are encoded as #.
- Empty keys will contain the character _.


In [4]:
# Number of valid keys, followed by the key string itself
len(keys), keys

(36, "ABCDEFGHIJKLMNOPQ RSTUVWXYZ0.#,^?<'~")

In [5]:
# Display the corpus text kept by the loading cell (first line of the file)
corpus

" ^BOUCHEHR , NO SUL DO ^IR~AO . ^A ^R'USSIA , INSENS'IVEL AOS PROTESTOS DOS EUA E DE ^ISRAEL , J'A SE TINHA COMPROMETIDO , NO IN'ICIO DO ANO , A INSTALAR NAQUELE MESMO LOCAL UM REACTOR DE #### MEGAWATTS , NUM NEG'OCIO AVALIADO EM MIL MILH~OES DE D'OLARES . ^EM RELA#C~AO AO ^IRAQUE , ^VALERI ^PROGREBENKOV , PORTA#VOZ DA SOCIEDADE DE ^ESTADO ^ROSVOOROUJENIE , RESPONS'AVEL PELAS EXPORTA#C~OES MILITARES , DESMENTIU A EXIST#ENCIA DE UMA ENCOMENDA DE #### CARROS DE COMBATE RUSSOS , COMO AFIRMARA O GENRO DE ^SADDAM ^HUSSEIN QUE DESERTOU PARA A ^JORD#ANIA . ^SEGUNDO ESTE , OS BLINDADOS SERIAM ENTREGUES AO LONGO DE V'ARIOS ANOS E PAGOS EM PETR'OLEO , DEPOIS DO LEVANTAMENTO DAS SAN#C~OES IMPOSTAS AO ^IRAQUE . # PLAY#OFF # DA NBA ^BULLS MAIS LONGE DO T'ITULO 0 ^OS ^NEW ^YORK ^KNICKS VENCERAM TER#CA#FEIRA NO SEU REDUTO OS ^CHICAGO ^BULLS , POR ##### , PASSANDO A LIDERAR , POR ### , A FINAL DA ^CONFER#ENCIA ^LESTE DA ^LIGA ^NORTE#AMERICANA DE ^BASQUETEBOL ^PROFISSIONAL # NBA # , QUE SE DISPUTA 'A 

## The most basic approaches

In [6]:
# The ten most frequent characters in the corpus, with their counts
Counter(corpus).most_common(10)

[(' ', 8955),
 ('A', 5028),
 ('E', 4453),
 ('O', 3982),
 ('S', 2875),
 ('R', 2564),
 ('I', 2400),
 ('N', 2013),
 ('D', 1913),
 ('T', 1800)]

In [7]:
# Characters of the corpus ordered from most to least frequent
ranked_chars = [char for char, _count in Counter(corpus).most_common()]
baseline = ''.join(ranked_chars)

# Append the valid keys that never occur in the corpus, plus one trailing space
# (presumably to fill the remaining keyboard slot -- TODO confirm against check_keyboard)
unused_keys = [key for key in keys if key not in baseline]
baseline = baseline + ''.join(unused_keys) + ' '
baseline

" AEOSRINDTMCU#^PL,'GVBFH.Q~ZJ0XKYW?< "

In [8]:
# Random layout: the baseline characters shuffled in place with the seeded `rng`
shuffled = list(baseline)
rng.shuffle(shuffled)

# The key string in its original order, padded with one extra space
original_layout = keys + ' '

# Validate every layout before costing it (same order as they were built)
for layout in (baseline, original_layout, shuffled):
    check_keyboard(layout, keys)

# Report the cost of each layout on the corpus
for label, layout in (
    ('Shuffled cost:\t\t', ''.join(shuffled)),
    ('Original keys cost:\t', original_layout),
    ('Baseline cost:\t\t', baseline),
):
    print(label, compute_cost(layout, corpus))

Shuffled cost:		 238065.9804694326
Original keys cost:	 217456.47886335937
Baseline cost:		 159962.8399924338


## First attempt with GA

In [9]:
# Valid keys as a list of single characters; mutFlip samples replacement keys from it
keys_list = list(keys)

def evaluate(individual):
    """
    Fitness of a keyboard layout, as a one-element list.

    The layout is first validated with ``check_keyboard`` against ``keys``;
    its characters are then joined into a string and scored on ``corpus``
    with ``compute_cost``. If an ``AssertionError`` is raised along the way
    the layout is given an infinite cost instead.
    """
    try:
        check_keyboard(individual, keys)
        layout = ''.join(individual)
        cost = compute_cost(layout, corpus)
    except AssertionError:
        return [np.inf]
    return [cost]

def mutFlip(ind1, ind2):
    """Randomly reassign keys in a copy of ``ind1`` and return it with ``ind2``.

    Every position of a copy of ``ind1`` is overwritten, with probability
    0.05, by a key drawn uniformly from ``keys_list``. A candidate rejected
    by ``check_keyboard`` (``AssertionError``) is discarded and a fresh
    candidate is generated from the unmodified ``ind1``.

    Parameters
    ----------
    ind1 : numpy.ndarray
        Layout to mutate; it is copied, never modified in place.
    ind2 : object
        Returned untouched as the second element of the result.

    Returns
    -------
    tuple
        ``(mutated_copy_of_ind1, ind2)``.
        NOTE(review): DEAP mutation operators are expected to take a single
        individual and return a one-element tuple, so this function cannot
        be registered as ``mutate`` as-is -- confirm the intended use.

    Raises
    ------
    RuntimeError
        If no valid layout is produced within the attempt budget (e.g. when
        ``ind1`` itself is not a valid keyboard).
    """
    # Bounded retry loop instead of recursion: the previous version recursed
    # with an undefined name and could otherwise exhaust the call stack.
    max_attempts = 1000
    for _ in range(max_attempts):
        candidate = ind1.copy()
        for position, _value in np.ndenumerate(candidate):
            if np.random.random() < .05:
                candidate[position] = np.random.choice(keys_list)
        try:
            check_keyboard(candidate, keys)
        except AssertionError:
            # Invalid layout: throw it away and start again from ind1
            continue
        return candidate, ind2

    raise RuntimeError(
        'mutFlip could not produce a valid keyboard in %d attempts' % max_attempts
    )


In [14]:
# Single-objective fitness with a negative weight (the cost is minimised);
# individuals are numpy arrays carrying that fitness.
creator.create('FitnessMin', base.Fitness, weights=(-1.0,))
creator.create('Individual', np.ndarray, fitness=creator.FitnessMin)

toolbox = base.Toolbox()

# Initialisation: an individual is a random permutation of the baseline
# layout's characters; a population is a list of such individuals.
baseline_chars = np.array(list(baseline))
toolbox.register('attribute', np.random.permutation, baseline_chars)
toolbox.register('individual', tools.initIterate, creator.Individual, toolbox.attribute)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)

# Evaluation and genetic operators used by the evolutionary loop
toolbox.register('evaluate', evaluate)
toolbox.register('mate', tools.cxOnePoint)
toolbox.register('mutate', tools.mutShuffleIndexes, indpb=0.05)
toolbox.register('select', tools.selTournament, tournsize=3)

def main(seed=64, pop_size=20, ngen=1000, cxpb=0, mutpb=0.6):
    """Run the genetic algorithm and return its final state.

    Parameters
    ----------
    seed : int
        Seed applied to both ``numpy.random`` (used to initialise the
        individuals) and the standard ``random`` module (used by the DEAP
        operators registered in ``toolbox`` and by ``eaSimple``'s variation).
    pop_size : int
        Number of individuals in the population.
    ngen : int
        Number of generations to evolve.
    cxpb : float
        Probability of mating two individuals (0 disables crossover).
    mutpb : float
        Probability of mutating an individual.

    Returns
    -------
    tuple
        ``(pop, stats, hof)``: the population (updated in place by
        ``eaSimple``), the statistics object, and the hall of fame holding
        the best individual seen.
    """
    import random  # local import: only needed here, to seed DEAP's operators

    # Seed both generators; seeding numpy alone leaves selection and
    # mutation non-deterministic, so runs would not be reproducible.
    np.random.seed(seed)
    random.seed(seed)

    pop = toolbox.population(n=pop_size)

    # Numpy equality function (operators.eq) between two arrays returns the
    # equality element wise, which raises an exception in the if similar()
    # check of the hall of fame. Using a different equality function like
    # numpy.array_equal or numpy.allclose solve this issue.
    hof = tools.HallOfFame(1, similar=np.array_equal)

    # Per-generation summary of the population's fitness values
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    algorithms.eaSimple(pop, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=ngen,
                        stats=stats, halloffame=hof)

    return pop, stats, hof


# Run the GA; keep the final population, the statistics object and the hall of fame
pop, stats, hof = main()




gen	nevals	avg   	std    	min   	max   
0  	20    	246890	15249.5	213135	278982
1  	11    	233756	12249.3	209868	248230
2  	12    	227314	13326  	209868	251671
3  	13    	221874	13276.3	208320	252809
4  	9     	215181	7587.48	208320	241664
5  	13    	211675	4567.05	208320	228472
6  	7     	210179	2954.8 	208320	217461
7  	14    	211003	4695.54	207512	224317
8  	12    	208763	2562.4 	205854	218638
9  	15    	212833	8246.92	201734	234191
10 	7     	208317	5917.33	200242	226983
11 	14    	211152	12046  	200242	249402
12 	7     	203581	3414.81	197444	211382
13 	9     	203342	7268.7 	197161	224139
14 	11    	203668	9060.25	196872	229232
15 	12    	203706	8660.55	197444	224638
16 	12    	201276	5936.67	197148	217414
17 	11    	200567	6190.93	195366	222350
18 	9     	199363	6294.83	192572	220256
19 	12    	199500	5711.94	192572	215197
20 	14    	198934	6360.18	192572	218163
21 	14    	198166	6806.5 	192572	218961
22 	12    	199476	11779.5	189687	243132
23 	7     	194272	4592.21	189687	210536


KeyboardInterrupt: 

In [None]:
# Best layout recorded in the hall of fame, joined into a single string
''.join(hof[0])

In [4]:
# Hard-coded candidate layout: validate it, then compute its cost on the corpus
candidate_layout = " D#SEAOF,.^PMURTINC?K0<WJYXBQG'HLVH~Z"
check_keyboard(candidate_layout, keys)
compute_cost(candidate_layout, corpus)

144949.99689663752

"OUP EAMQ^.,#SDRTNHL?K 0JBZF'VICG~X<WY" - 150423.74

