In [1]:
import os
from os.path import join, pardir
from collections import Counter
from copy import deepcopy
import numpy as np
from deap import base, creator, algorithms, tools
from dssg_challenge import compute_cost, check_keyboard

# Seed for the notebook-level random generator created below.
RNG_SEED = 0
# Raw challenge data is read from ../data/raw relative to this notebook.
DATA_DSSG = join(pardir, 'data', 'raw')

# Notebook-level generator; later used to shuffle the baseline keyboard.
rng = np.random.RandomState(RNG_SEED)

In [2]:
# List the files available in the raw data directory.
os.listdir(DATA_DSSG)

['pt-corpus.txt', '.gitkeep', 'pt-keys.txt', 'en-keys.txt', 'en-corpus.txt']

In [3]:
# get keys
with open(join(DATA_DSSG, 'en-keys.txt'), 'r') as file:
    keys = file.read()

# get corpus example
with open(join(DATA_DSSG, 'en-corpus.txt'), 'r') as file:
    corpus = file.read()

# Drop line breaks so `keys` is a single string of valid key characters.
keys = ''.join(keys.split('\n'))
# Remove every occurrence of the full key string from the corpus text, then
# keep only the first line as the working example.
# NOTE(review): presumably the corpus file embeds the key list - confirm.
corpus = ''.join(corpus.split(keys)).split('\n')[0]

Some keys are used to signal special characters. Namely,

- The ENTER key is represented as 0.
- The shift key for capitalization is represented as ^.
- The backspace key is represented as <.
- All the remaining characters not found in the valid keys are encoded as #.
- Empty keys will contain the character _.


In [4]:
# Show how many valid keys there are, and the keys themselves.
len(keys), keys

(34, 'ABCDEFGHIJKLMNOPQ RSTUVWXYZ0.#,^?<')

## The most basic approaches

In [5]:
# Ten most frequent characters in the corpus, with their counts.
Counter(corpus).most_common(10)

[(' ', 83),
 ('T', 45),
 ('I', 45),
 ('O', 39),
 ('E', 35),
 ('A', 31),
 ('S', 27),
 ('N', 21),
 ('L', 20),
 ('H', 19)]

In [6]:
# Frequency-ordered layout: corpus characters from most to least common ...
baseline = ''.join(char for char, _count in Counter(corpus).most_common())
# ... followed by the valid keys that never occur in the corpus ...
baseline += ''.join(key for key in keys if key not in baseline)
# ... and three extra trailing keys (two spaces and a second 'T').
baseline += '  T'
baseline

' TIOEASNLHRM<DVW#PYKFGUC0BZ.^J,QX?  T'

In [7]:
# Random permutation of the baseline keys, used as a reference layout.
shuffled = list(baseline)
rng.shuffle(shuffled)

# Hand-written reference layout; '_' marks an empty key.
anthony = 'EINOA TCGVDURL<^SWH_Z__XJQFPBMY,#.0K?'

# Validate each candidate layout against the valid key set. For `anthony`
# the empty-key markers are replaced by spaces before checking.
check_keyboard(baseline, keys)
check_keyboard(keys+'  T', keys)
check_keyboard(shuffled, keys)
check_keyboard(''.join([i if i!='_' else ' ' for i in anthony]), keys)

# Score each layout on the corpus.
# NOTE(review): the original-keys layout is validated as keys+'  T' but scored
# as keys+' ', and `anthony` is validated with '_' replaced by ' ' but scored
# with '_' removed - confirm these differences are intentional.
print('Shuffled cost:\t\t\t', compute_cost(''.join(shuffled), corpus))
print('Original keys cost:\t\t', compute_cost(keys+' ', corpus))
print('Baseline cost:\t\t\t', compute_cost(baseline, corpus))
print('Anthony Carbajal\'s solution:\t', compute_cost(''.join([i for i in anthony if i!='_']), corpus))

Shuffled cost:			 2421.8189712388935
Original keys cost:		 2384.127991417558
Baseline cost:			 1792.1951037269362
Anthony Carbajal's solution:	 1737.8888937148206


## First attempt with GA

In [8]:
# Valid keys as a list, so mutFlip can sample replacement keys from it.
keys_list = list(keys)

def evaluate(individual):
    """
    Fitness function for a single candidate keyboard.

    Returns a one-element list holding ``compute_cost`` of the layout
    against ``corpus``. If validating or scoring the layout raises
    ``AssertionError``, returns ``[np.inf]`` instead.
    """
    try:
        check_keyboard(individual, keys)
        layout = ''.join(individual)
        cost = compute_cost(layout, corpus)
    except AssertionError:
        return [np.inf]
    return [cost]

def mutFlip(ind1, ind2, max_tries=100):
    """Randomly reassign keys of ``ind1``, retrying until the result is valid.

    Each position of a copy of ``ind1`` is replaced, with probability 0.05,
    by a key drawn uniformly from ``keys_list``. The mutated copy is accepted
    only if ``check_keyboard`` does not raise ``AssertionError``; otherwise a
    fresh copy of ``ind1`` is mutated again.

    Parameters
    ----------
    ind1 : array-like with ``.copy()``
        Individual to mutate (expected to be a numpy array, since it is
        traversed with ``np.ndenumerate``). It is never modified in place.
    ind2 : object
        Passed through untouched as the second element of the result.
    max_tries : int, optional
        Maximum number of mutation attempts. This bounds the retry loop so an
        input that can never yield a valid keyboard cannot spin forever.

    Returns
    -------
    tuple
        ``(mutated, ind2)``. If no valid mutation is found within
        ``max_tries`` attempts, ``mutated`` is an unmodified copy of ``ind1``.
    """
    for _ in range(max_tries):
        # Always restart from the original so rejected attempts do not pile up.
        candidate = ind1.copy()
        for position, _value in np.ndenumerate(candidate):
            if np.random.random() < .05:
                candidate[position] = np.random.choice(keys_list)
        try:
            check_keyboard(candidate, keys)
        except AssertionError:
            continue
        return candidate, ind2

    # Every attempt produced an invalid keyboard: fall back to no mutation.
    return ind1.copy(), ind2


In [None]:
# Single-objective fitness with a negative weight, i.e. the cost is minimised.
creator.create('FitnessMin', base.Fitness, weights=(-1.0,))
# Individuals are numpy arrays carrying a FitnessMin `fitness` attribute.
creator.create('Individual', np.ndarray, fitness=creator.FitnessMin)

toolbox = base.Toolbox()

# Initialisation: one draw is a random permutation of the baseline keys,
# an individual wraps one draw, and a population is a list of individuals.
toolbox.register('attribute', np.random.permutation, np.array(list(baseline)))
toolbox.register('individual', tools.initIterate, creator.Individual, toolbox.attribute)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)

# Evaluation, variation and selection operators used by the algorithm.
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxOnePoint)
toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

def main(seed=64, pop_size=10, ngen=1000, cxpb=0, mutpb=0.6):
    """Run the simple evolutionary algorithm on the registered toolbox.

    Parameters
    ----------
    seed : int, optional
        Seed applied to both ``numpy.random`` (used by the registered
        ``attribute`` initialiser) and Python's ``random`` module (used by
        DEAP's variation and selection operators), so a run is repeatable.
    pop_size : int, optional
        Number of individuals in the initial population.
    ngen : int, optional
        Number of generations passed to ``algorithms.eaSimple``.
    cxpb : float, optional
        Crossover probability passed to ``algorithms.eaSimple``.
    mutpb : float, optional
        Mutation probability passed to ``algorithms.eaSimple``.

    Returns
    -------
    tuple
        ``(pop, stats, hof)``: the population list handed to ``eaSimple``,
        the ``tools.Statistics`` object, and the ``tools.HallOfFame``.
    """
    import random  # local import: the notebook's import cell does not load it

    # Seed both generators; seeding numpy alone leaves the evolution loop
    # unseeded and therefore not reproducible.
    np.random.seed(seed)
    random.seed(seed)

    pop = toolbox.population(n=pop_size)

    # Element-wise `==` on numpy arrays breaks the hall of fame's similarity
    # check, so compare whole arrays with np.array_equal instead.
    hof = tools.HallOfFame(1, similar=np.array_equal)

    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    algorithms.eaSimple(pop, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=ngen,
                        stats=stats, halloffame=hof)

    return pop, stats, hof


# Run the GA and keep the population, statistics object and hall of fame.
pop, stats, hof = main()


gen	nevals	avg    	std    	min    	max   
0  	10    	2483.07	116.014	2331.73	2715.8
1  	6     	2399.46	76.1618	2295.27	2546.03
2  	5     	2350.1 	50.7146	2295.27	2427.01
3  	6     	2311.41	43.2119	2258.26	2428.74
4  	7     	2310.79	49.1811	2258.26	2408.03
5  	7     	2293.64	63.8695	2196.51	2417   
6  	3     	2250.35	23.0465	2196.51	2280.14
7  	7     	2277.65	93.3237	2196.51	2529.48
8  	5     	2236.42	36.6565	2196.51	2295.51
9  	5     	2239.25	79.6225	2178.46	2459.1 
10 	7     	2225.3 	54.9709	2148.9 	2338.7 
11 	8     	2217.85	42.1394	2148.9 	2274.79
12 	6     	2192.42	31.6581	2148.9 	2253.98
13 	4     	2192.27	34.4416	2148.9 	2251.37
14 	7     	2177   	33.9997	2141.19	2242.83
15 	7     	2164.36	40.0191	2141.19	2254.29
16 	9     	2159.23	58.9487	2067.23	2295.14
17 	4     	2112.41	51.57  	2000.61	2188.84
18 	5     	2065.36	58.7625	2000.61	2153.97
19 	5     	2021.97	34.2798	1970.9 	2078.17
20 	7     	2038.27	78.1091	1970.9 	2257.2 
21 	7     	2026.52	65.4042	1968.44	2172.46
22 	4     	20

191	4     	1763.54	73.8178	1696.23	1906.97
192	5     	1727.36	54.7568	1696.23	1885.28
193	5     	1751.65	77.5666	1696.23	1937.64
194	7     	1765.65	66.9443	1696.23	1884.62
195	9     	1830.28	128.392	1682.13	2114.31
196	6     	1795.9 	92.3175	1682.13	1929.02
197	7     	1789.88	124.137	1697.29	2119.16
198	9     	1795.14	96.0446	1697.29	2010.7 
199	6     	1776.1 	72.9562	1697.29	1899.96
200	6     	1758.88	111.734	1697.29	1987.38
201	6     	1795.52	126.385	1696.68	2068.58
202	4     	1776.73	99.0661	1696.68	1959.64
203	4     	1758.9 	105.651	1696.68	1990.53
204	6     	1812   	138.713	1696.68	2136   
205	5     	1719.59	49.0496	1684.6 	1859.61
206	4     	1717.72	37.7367	1684.6 	1811.01
207	7     	1756.66	76.4833	1684.6 	1879.92
208	4     	1757.47	113.159	1684.6 	2051.84
209	7     	1703.58	36.7386	1674.08	1800.42
210	10    	1745.73	80.3404	1674.08	1951.8 
211	8     	1787.09	118.466	1674.08	2035.93
212	7     	1795.71	125.91 	1672.51	2049.39


In [29]:
# Show the first hall-of-fame individual as a keyboard string.
''.join(list(hof)[0])

' ONYTIAIMZGBCHEDRSL,P#.^0TQX VK?W<JFU'

In [31]:
# Re-validate and re-score the hall-of-fame layout, pasted in as a literal.
check_keyboard(' ONYTIAIMZGBCHEDRSL,P#.^0TQX VK?W<JFU', keys)
compute_cost(' ONYTIAIMZGBCHEDRSL,P#.^0TQX VK?W<JFU', corpus)

1673.418399379088

## Hall of fame solutions

    ' ONYTIAIMZGBCHEDRSL,P#.^0TQX VK?W<JFU' - 1673.418