In [1]:
import os
from os.path import join, pardir
from collections import Counter
from copy import deepcopy
import numpy as np
from deap import base, creator, algorithms, tools
from dssg_challenge import compute_cost, check_keyboard

# Seed for the notebook-level random generator `rng` created below.
RNG_SEED = 0
# Directory with the key and corpus text files: <parent dir>/data/processed.
DATA_DSSG = join(pardir, 'data', 'processed')

# Dedicated generator; only code that draws from `rng` is governed by RNG_SEED.
rng = np.random.RandomState(RNG_SEED)

In [2]:
# List the files available in the data directory.
os.listdir(DATA_DSSG)

['pt-corpus.txt', '.gitkeep', 'pt-keys.txt', 'en-keys.txt', 'en-corpus.txt']

In [3]:
# Read the valid keys from the English keys file.
with open(join(DATA_DSSG, 'en-keys.txt'), 'r') as file:
    keys = file.read()

# Read the English example corpus.
with open(join(DATA_DSSG, 'en-corpus.txt'), 'r') as file:
    corpus = file.read()

# Drop newlines so that `keys` is one contiguous string of key characters.
keys = ''.join(keys.split('\n'))
# Remove every occurrence of the full (already flattened) key string from the
# corpus, then keep only the text before the first newline. Statement order
# matters: `keys` must be flattened before it is used as the separator here.
# NOTE(review): presumably the corpus file can contain the key string verbatim;
# otherwise the split on `keys` is a no-op — confirm.
corpus = ''.join(corpus.split(keys)).split('\n')[0]

Some keys are used to signal special characters. Namely,

- The ENTER key is represented as 0.
- The shift key for capitalization is represented as ^.
- The backspace key is represented as <.
- All the remaining characters not found in the valid keys are encoded as #.
- Empty keys will contain the character _.


In [4]:
# Show the number of valid keys alongside the key string itself.
len(keys), keys

(34, 'ABCDEFGHIJKLMNOPQ RSTUVWXYZ0.#,^?<')

## The most basic approaches

In [5]:
# Ten most frequent characters in the corpus, with their counts.
Counter(corpus).most_common()[:10]

[(' ', 7888),
 ('E', 4488),
 ('T', 3443),
 ('A', 3234),
 ('I', 2890),
 ('O', 2832),
 ('S', 2806),
 ('N', 2775),
 ('R', 2577),
 ('^', 1565)]

In [6]:
# Frequency-ordered layout: corpus characters from most to least common ...
baseline = ''.join(char for char, _count in Counter(corpus).most_common())
# ... followed by any valid key that never occurs in the corpus ...
baseline += ''.join(key for key in keys if key not in baseline)
# ... and finally three extra trailing keys (two spaces and a 'T').
baseline += '  T'
baseline

' ETAIOSNR^HLDC#UMPGFBWY.V,K0XJQZ?<  T'

In [7]:
# Random permutation of the baseline layout (drawn from the seeded `rng`),
# used as a naive reference point.
shuffled = list(baseline)
rng.shuffle(shuffled)

# Reference layout to compare against; '_' marks an empty key.
anthony = 'EINOA TCGVDURL<^SWH_Z__XJQFPBMY,#.0K?'

# Validate each candidate layout against the valid keys.
check_keyboard(baseline, keys)
check_keyboard(keys+'  T', keys)
check_keyboard(shuffled, keys)
# The '_' placeholders are swapped for spaces before validation.
check_keyboard(''.join([i if i!='_' else ' ' for i in anthony]), keys)

# Cost of typing the corpus with each layout.
# NOTE(review): two of the layouts costed here differ from the ones validated
# above: `keys+' '` is costed while `keys+'  T'` was validated, and the '_'
# placeholders are removed (rather than replaced by spaces) for the last
# layout — confirm this is intended.
print('Shuffled cost:\t\t\t', compute_cost(''.join(shuffled), corpus))
print('Original keys cost:\t\t', compute_cost(keys+' ', corpus))
print('Baseline cost:\t\t\t', compute_cost(baseline, corpus))
print('Anthony Carbajal\'s solution:\t', compute_cost(''.join([i for i in anthony if i!='_']), corpus))

Shuffled cost:			 233432.54787591402
Original keys cost:		 215629.56595268694
Baseline cost:			 170178.49650661062
Anthony Carbajal's solution:	 158733.94185884856


## First attempt with GA

In [8]:
keys_list = list(keys)

def evaluate(individual):
    """
    Fitness function: cost of typing the corpus with the given layout.

    Returns a one-element list holding the cost, or ``[np.inf]`` when an
    ``AssertionError`` is raised while validating or scoring the layout.
    """
    try:
        check_keyboard(individual, keys)
        layout = ''.join(individual)
        cost = compute_cost(layout, corpus)
    except AssertionError:
        # Rejected layouts get the worst possible cost.
        return [np.inf]
    return [cost]

def mutFlip(ind1, ind2):
    """Randomly reassign keys on a copy of ``ind1`` until the layout is valid.

    Every position of the copy is replaced, with probability 0.05, by a key
    drawn uniformly from ``keys_list``. The candidate is then validated with
    ``check_keyboard``; if that raises ``AssertionError`` the candidate is
    discarded and a new one is generated from the unmodified ``ind1``.

    Parameters
    ----------
    ind1 : numpy.ndarray
        Layout to mutate. It is never modified: all changes are applied to a
        copy obtained through ``ind1.copy()``.
    ind2 : object
        Passed through and returned untouched.

    Returns
    -------
    tuple
        ``(mutated_copy_of_ind1, ind2)``.
    """
    while True:
        ind = ind1.copy()
        for x, _ in np.ndenumerate(ind):
            if np.random.random() < .05:
                ind[x] = np.random.choice(keys_list)
        try:
            check_keyboard(ind, keys)
        except AssertionError:
            # Invalid layout: start over from the original individual. The
            # original code retried through a recursive call on an undefined
            # name (`individual`), which raised NameError; a loop also avoids
            # exhausting the call stack on long runs of rejected candidates.
            continue
        return ind, ind2


In [9]:
# Single-objective fitness; the negative weight makes DEAP minimise the cost.
creator.create('FitnessMin', base.Fitness, weights=(-1.0,))
# Individuals are numpy arrays that carry a FitnessMin attribute.
creator.create('Individual', np.ndarray, fitness=creator.FitnessMin)

toolbox = base.Toolbox()

# Initialisation: an individual is a random permutation of the baseline
# layout, and a population is a list of such individuals.
toolbox.register('attribute', np.random.permutation, np.array(list(baseline)))
toolbox.register('individual', tools.initIterate, creator.Individual, toolbox.attribute)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)

# Operators used by the evolutionary loop: cost-based evaluation, one-point
# crossover, index-shuffling mutation and tournament selection.
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxOnePoint)
toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

def main(seed=64, n_pop=20, ngen=1000, mutpb=0.6):
    """Run the mutation-only genetic algorithm and return its results.

    Parameters
    ----------
    seed : int
        Seed applied to both numpy's global generator and Python's ``random``
        module so that the run is reproducible.
    n_pop : int
        Number of individuals in the population.
    ngen : int
        Number of generations to evolve.
    mutpb : float
        Probability of mutating an individual in each generation.

    Returns
    -------
    tuple
        ``(pop, stats, hof)``: the population list handed to ``eaSimple``,
        the ``tools.Statistics`` object, and the ``tools.HallOfFame`` holding
        the single best individual seen.
    """
    # Individuals are initialised through numpy (np.random.permutation), but
    # DEAP's own operators and eaSimple draw from Python's `random` module.
    # Both generators must be seeded, otherwise every run gives a different
    # result despite the numpy seed.
    import random
    random.seed(seed)
    np.random.seed(seed)

    pop = toolbox.population(n=n_pop)

    # Individuals are numpy arrays, so comparing two of them with the default
    # equality operator yields an element-wise array, which cannot be used as
    # a truth value in the hall of fame's similarity check. np.array_equal
    # returns a single boolean instead.
    hof = tools.HallOfFame(1, similar=np.array_equal)

    # Per-generation summary of the fitness values across the population.
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    # cxpb=0 disables crossover: the search is driven by mutation and
    # selection only.
    algorithms.eaSimple(pop, toolbox, cxpb=0, mutpb=mutpb, ngen=ngen,
                        stats=stats, halloffame=hof)

    return pop, stats, hof


pop, stats, hof = main()


gen	nevals	avg   	std    	min   	max   
0  	20    	230711	11730.7	211328	253526
1  	15    	222479	8550.19	209768	236974
2  	12    	214193	5623.62	207266	225510
3  	10    	211550	3468.81	207065	217218
4  	17    	210912	4420.74	202385	220128
5  	15    	210301	5659   	203495	225716
6  	12    	207232	4177.73	201347	222048
7  	13    	204788	3706.67	197492	211939
8  	14    	204308	8192.06	194556	230339
9  	8     	200895	4927.39	194556	214215
10 	12    	199904	6442.03	194556	218699
11 	11    	197678	5048.9 	191092	209712
12 	11    	197839	6423.91	191092	214188
13 	11    	195342	3585.77	191092	206202
14 	13    	197410	7784.7 	184579	221685
15 	15    	194385	3853   	184615	202499
16 	13    	194296	4752.53	184615	210685
17 	10    	192349	4214.85	184615	206343
18 	12    	193349	7550.42	184527	210265
19 	11    	190577	4752.73	184527	198599
20 	10    	187571	6122.38	182779	209967
21 	10    	187279	4831.65	181872	200383
22 	14    	185843	3544.15	181872	196038
23 	9     	186302	6123.17	180905	209643


204	11    	168123	7373.6 	162497	185457
205	17    	171845	8656.36	162497	194369
206	14    	171565	9639.51	162497	194410
207	12    	169627	8375.71	162497	197576
208	13    	169889	7920   	162497	184573
209	14    	170717	9560.51	162497	193510
210	11    	170310	8247.73	162497	193761
211	7     	170961	10538.8	162497	195237
212	14    	169794	10307.5	162497	198587
213	15    	169518	8772.08	162117	188194
214	9     	167078	7762.08	162497	192245
215	12    	166904	7643.39	162497	193584
216	11    	166419	6455.02	162497	185463
217	16    	170053	8640.54	162497	196111
218	13    	170154	10074.8	162497	199321
219	11    	168222	7278.35	162497	184286
220	10    	168783	7209.85	162497	187856
221	12    	168819	8837.02	162497	190394
222	12    	168307	8523.32	162497	191079
223	9     	166410	5646.25	162497	180811
224	13    	168277	7276.26	161887	187600
225	16    	165966	4503.04	162497	175976
226	10    	170256	10745.7	162497	202573
227	12    	170142	8723.94	162497	190712
228	14    	173556	9854.91	162497	191169


409	13    	163179	7329.57	158726	182790
410	10    	161221	3394.85	158726	170581
411	10    	162279	6464.59	158726	178507
412	13    	165214	9255.67	158606	187930
413	15    	171132	12184.3	158606	203496
414	12    	167078	8554.92	158606	185749
415	9     	162452	5682.78	158606	180055
416	13    	163456	7302.52	158606	181890
417	12    	163246	4922.63	158606	172500
418	12    	163836	7620.68	158606	190245
419	15    	166644	7124.78	158511	181114
420	13    	167774	8927.64	158511	189681
421	11    	164740	8699.92	158511	187074
422	10    	163259	6932.77	158511	184163
423	12    	164710	8965.44	158511	187683
424	17    	169047	12068.2	158511	201506
425	13    	167971	10191.4	158511	187441
426	12    	168179	10178.9	158511	194798
427	13    	164506	6686.07	158511	182991
428	9     	166291	11083.3	158511	194679
429	12    	164913	9799.68	158511	196823
430	14    	166267	8475.62	158511	184339
431	12    	164865	8155.23	158511	186783
432	9     	162285	5770.24	158511	177159
433	11    	164015	7555.24	158511	187928


614	13    	167821	10433.5	158588	198075
615	14    	167645	11282  	158588	204088
616	15    	166535	8638.9 	158588	194798
617	11    	164456	8260.63	158588	189841
618	11    	166625	10422.5	158588	188781
619	8     	162544	5566.06	158588	176618
620	14    	168405	10775.5	158588	188535
621	9     	164569	8025.87	158588	180963
622	8     	163599	8558.82	158588	187136
623	16    	166756	8894.46	158588	187401
624	11    	167691	9807.03	158588	189607
625	14    	167183	12183.5	158588	213520
626	14    	165415	5938.22	158588	183544
627	15    	164337	6245.85	158588	183866
628	12    	165044	6970.85	158588	179897
629	12    	166833	11632  	158588	196992
630	13    	166169	8281.86	158588	180420
631	10    	165450	7945.65	158335	185509
632	12    	163943	7220.32	158335	182965
633	13    	162243	6144.69	158335	179588
634	10    	167355	10200  	158335	187199
635	14    	166543	8935.99	158588	185751
636	12    	162303	5543.76	158588	179416
637	13    	165283	8071.51	158588	181565
638	15    	163100	5868.67	158588	180899


819	14    	163306	5840.64	157597	174213
820	16    	165602	6641.71	157597	179996
821	10    	163643	6488.41	157597	177280
822	13    	163969	5802.93	157597	178908
823	12    	163897	5909.36	157597	178511
824	12    	163742	7088.82	157597	184583
825	12    	164556	9933.2 	157597	194429
826	10    	161021	5171.94	157597	175073
827	17    	163661	7094.9 	157355	183847
828	10    	163651	7902.96	157597	184232
829	10    	162577	6814.51	157597	180061
830	14    	163084	7008.82	157597	183880
831	15    	163872	7037.28	157597	181670
832	7     	161758	6877.14	157597	178039
833	7     	160055	5468.76	157597	179250
834	14    	162433	5857.96	157597	174668
835	10    	160850	4231.76	157597	173768
836	17    	167942	10040.2	157597	194366
837	6     	164528	8378.79	157597	181829
838	14    	167082	9919.02	157322	193974
839	8     	163851	6972.61	157322	177239
840	12    	164106	6255.3 	157322	179904
841	16    	165825	6195.23	157322	180086
842	12    	163788	7247.42	157322	184539
843	10    	162647	6312.41	157322	179115


In [10]:
# First entry of the hall of fame, rendered as a layout string.
''.join(list(hof)[0])

'EDRATS ^MO NILHYB#P0JKFUGVCW<TXQZ?,. '

In [12]:
# Validate the hall-of-fame layout, then compute its cost on the corpus.
check_keyboard(''.join(list(hof)[0]), keys)
compute_cost(''.join(list(hof)[0]), corpus)

155258.08759585524

In [11]:
# Validate a hard-coded layout, then compute its cost on the corpus.
check_keyboard(' RDSTOECP#<WINALGYKX , ^0.ZMFHUJVBT?Q', keys)
compute_cost(' RDSTOECP#<WINALGYKX , ^0.ZMFHUJVBT?Q', corpus)

167836.16041230192

## Hall of fame solutions

    ' ONYTIAIMZGBCHEDRSL,P#.^0TQX VK?W<JFU' - 1673.418
    ' REASTO<DGWVPMILNYHTJC ^0. #?X,QUBZFK' - 1637.709
    ' RDSTOECP#<WINALGYKX , ^0.ZMFHUJVBT?Q' - 1582.775
    'T OISLADERNMGW #UYHTVKCFPX<, ?ZJ.0^BQ' - 1597.119
    'OSNA ETM GWYPRLV HI#.0^<J?BKC,FTUDQZX' - 1599.910