In [1]:
!pip install deap -q

In [13]:
import os
from os.path import join, pardir
from collections import Counter
from copy import deepcopy
import numpy as np
from deap import base, creator, algorithms, tools
from dssg_challenge import compute_cost, check_keyboard

# Seed for the notebook-level random generator `rng`.
RNG_SEED = 0
rng = np.random.RandomState(seed=RNG_SEED)

# Raw challenge files: <parent of the working directory>/data/raw.
DATA_DSSG = os.path.join(os.pardir, 'data', 'raw')

In [14]:
# Show which files are present in the raw data directory.
os.listdir(DATA_DSSG)

['pt-corpus.txt', '.gitkeep', 'pt-keys.txt', 'en-keys.txt', 'en-corpus.txt']

In [15]:
# Valid keys: read the file and concatenate its lines into one string.
with open(join(DATA_DSSG, 'en-keys.txt'), 'r') as keys_file:
    keys = keys_file.read().replace('\n', '')

# Example corpus: strip any verbatim occurrence of the key string from the
# raw text, then keep only what precedes the first newline.
with open(join(DATA_DSSG, 'en-corpus.txt'), 'r') as corpus_file:
    corpus_raw = corpus_file.read()

corpus = ''.join(corpus_raw.split(keys)).partition('\n')[0]

Some keys are used to signal special characters. Namely,

- The ENTER key is represented as `0`.
- The shift key for capitalization is represented as `^`.
- The backspace key is represented as `<`.
- All the remaining characters not found in the valid keys are encoded as `#`.
- Empty keys will contain the character `_`.


In [16]:
# Number of valid keys, followed by the key string itself.
len(keys), keys

(34, 'ABCDEFGHIJKLMNOPQ RSTUVWXYZ0.#,^?<')

## The most basic approaches

In [17]:
# Ten most frequent characters of the corpus, with their counts.
Counter(corpus).most_common(10)

[(' ', 83),
 ('T', 45),
 ('I', 45),
 ('O', 39),
 ('E', 35),
 ('A', 31),
 ('S', 27),
 ('N', 21),
 ('L', 20),
 ('H', 19)]

In [22]:
# Characters of the corpus ordered from most to least frequent.
by_frequency = ''.join(char for char, _ in Counter(corpus).most_common())

# Valid keys that never occur in the corpus, in their original order.
unused_keys = ''.join(key for key in keys if key not in by_frequency)

# Frequency-ordered layout, followed by the unused keys and a fixed '  T' suffix.
# NOTE(review): the suffix presumably pads the layout to the keyboard size - confirm.
baseline = by_frequency + unused_keys + '  T'
baseline

' TIOEASNLHRM<DVW#PYKFGUC0BZ.^J,QX? TI'

In [42]:
# Random reference layout: the baseline's characters shuffled in place with `rng`.
shuffled = list(baseline)
rng.shuffle(shuffled)

# Hand-written layout; it contains '_' placeholders.
anthony = 'EINOA TCGVDURL<^SWH_Z__XJQFPBMY,#.0K?'

# Validate every candidate layout against the valid keys
# ('_' placeholders are replaced by spaces for the check).
for layout in (baseline, keys+'  T', shuffled, anthony.replace('_', ' ')):
    check_keyboard(layout, keys)

# NOTE(review): two of the layouts costed below differ from the ones validated
# above: the original keys are costed as keys+' ' (validated as keys+'  T') and
# anthony is costed with '_' removed (validated with '_' replaced by ' ').
# Confirm this is intentional.
for label, layout in (
    ('Shuffled cost:\t\t\t', ''.join(shuffled)),
    ('Original keys cost:\t\t', keys+' '),
    ('Baseline cost:\t\t\t', baseline),
    ('Anthony Carbajal\'s solution:\t', anthony.replace('_', '')),
):
    print(label, compute_cost(layout, corpus))

Shuffled cost:			 2125.932411784575
Original keys cost:		 2384.127991417558
Baseline cost:			 1788.10463736441
Anthony Carbajal's solution:	 1737.8888937148206


## First attempt with GA

In [24]:
# Valid keys as a list of single characters.
keys_list = list(keys)

def evaluate(individual):
    """
    Fitness function: cost of the layout encoded by ``individual``.

    Returns a one-element list holding the value of ``compute_cost`` on the
    corpus, or ``np.inf`` when validation or costing raises an
    ``AssertionError``.
    """
    cost = np.inf
    try:
        check_keyboard(individual, keys)
        cost = compute_cost(''.join(individual), corpus)
    except AssertionError:
        # Layout rejected: keep the infinite cost.
        pass
    return [cost]

def mutFlip(ind1, ind2, max_tries=1000):
    """Mutate a copy of ``ind1`` by randomly replacing some of its keys.

    Each position of the copy is overwritten, with probability 0.05, by a key
    drawn uniformly from ``keys_list``. The result is validated with
    ``check_keyboard``; if validation raises ``AssertionError`` the mutation
    is retried from a fresh copy of ``ind1``.

    Parameters
    ----------
    ind1 : numpy.ndarray
        Individual to mutate. It is never modified; a copy is mutated instead.
    ind2 : object
        Returned unchanged as the second element of the result.
    max_tries : int, optional
        Maximum number of mutation attempts before giving up.

    Returns
    -------
    tuple
        ``(mutated_copy_of_ind1, ind2)``.

    Raises
    ------
    RuntimeError
        If no valid layout is produced within ``max_tries`` attempts.
    """
    # Iterate instead of recursing: a random replacement frequently yields an
    # invalid layout, so many retries can be needed.
    for _ in range(max_tries):
        ind = ind1.copy()
        for x, _value in np.ndenumerate(ind):
            if np.random.random() < .05:
                ind[x] = np.random.choice(keys_list)
        try:
            check_keyboard(ind, keys)
        except AssertionError:
            # Invalid layout: discard it and try again from the original.
            continue
        return ind, ind2

    raise RuntimeError(
        'mutFlip could not produce a valid keyboard in %d attempts' % max_tries
    )


In [25]:
# Single-objective fitness with weight -1.0, and an ndarray-based individual carrying it.
creator.create('FitnessMin', base.Fitness, weights=(-1.0,))
creator.create('Individual', np.ndarray, fitness=creator.FitnessMin)

toolbox = base.Toolbox()

# Genes of a new individual: a random permutation of the baseline layout's characters.
baseline_genes = np.array(list(baseline))
toolbox.register('attribute', np.random.permutation, baseline_genes)

# An individual wraps one call to `attribute`; a population is a list of individuals.
toolbox.register('individual', tools.initIterate, creator.Individual, toolbox.attribute)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)

# Operators used by the evolutionary loop.
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxOnePoint)
toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

def main(seed=64, pop_size=10, ngen=1000, cxpb=0, mutpb=0.6):
    """Run DEAP's ``eaSimple`` with the operators registered on ``toolbox``.

    Parameters
    ----------
    seed : int, optional
        Seed applied to both NumPy's global generator and the standard
        library ``random`` module.
    pop_size : int, optional
        Number of individuals in the initial population.
    ngen : int, optional
        Number of generations to run.
    cxpb : float, optional
        Crossover probability passed to ``eaSimple``.
    mutpb : float, optional
        Mutation probability passed to ``eaSimple``.

    Returns
    -------
    tuple
        ``(pop, stats, hof)``: the population list handed to ``eaSimple``,
        the ``tools.Statistics`` object and the one-slot hall of fame.
    """
    # Function-scope import keeps this cell self-contained.
    import random

    # Seed BOTH generators. The initial population is drawn with
    # np.random.permutation, but DEAP's stock operators used here (tournament
    # selection, index-shuffle mutation, eaSimple's variation step) draw from
    # the standard-library `random` module; seeding NumPy alone leaves the
    # run non-reproducible.
    np.random.seed(seed)
    random.seed(seed)

    pop = toolbox.population(n=pop_size)

    # Comparing two arrays with operator.eq is element-wise, which raises in
    # the hall of fame's `similar()` check; np.array_equal returns a single
    # boolean and solves this.
    hof = tools.HallOfFame(1, similar=np.array_equal)

    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    algorithms.eaSimple(pop, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=ngen,
                        stats=stats, halloffame=hof)

    return pop, stats, hof


# Run the GA and keep the population, the statistics object and the hall of fame.
pop, stats, hof = main()


gen	nevals	avg    	std    	min    	max    
0  	10    	2530.63	172.627	2231.24	2736.13
1  	7     	2376.26	120.098	2216   	2680.66
2  	4     	2315.08	85.1966	2216   	2441.98
3  	5     	2245.41	77.3498	2133.3 	2412.45
4  	6     	2218.84	76.919 	2127.15	2425.72
5  	6     	2169.57	27.9008	2127.15	2216   
6  	6     	2148.47	42.6662	2081.33	2256.03
7  	4     	2142.59	35.687 	2081.33	2216   
8  	4     	2124.56	46.302 	2072.76	2233.45
9  	7     	2141.19	89.3873	2002.22	2261.36
10 	6     	2087.44	66.3696	1996.72	2228.43
11 	3     	2049.2 	62.9812	1996.72	2218.29
12 	9     	2039.58	59.3195	1990.26	2164.81
13 	7     	2005.35	49.7183	1918.32	2131.89
14 	9     	1997.26	50.6478	1918.32	2055.85
15 	5     	1968.36	37.0875	1918.32	2016.4 
16 	8     	1967.53	73.0034	1876.78	2138.18
17 	7     	1940.43	41.0679	1876.78	1996.97
18 	9     	1996.87	62.4342	1918.32	2125.75
19 	6     	1962.8 	53.6196	1892.59	2092.41
20 	6     	1983.39	65.403 	1892.59	2101.52
21 	7     	1984.54	90.975 	1888.37	2138.42
22 	9     	

190	5     	1745.15	106.591	1690.76	2058.78
191	4     	1754.6 	161.3  	1690.76	2236.2 
192	6     	1758.08	83.4416	1690.76	1917.5 
193	6     	1723.88	45.4282	1690.76	1828.79
194	5     	1738.14	79.5496	1690.76	1926.42
195	8     	1806.43	122.888	1690.76	2069.04
196	5     	1747.38	125.577	1690.76	2117.93
197	8     	1732.34	76.3148	1690.76	1952.33
198	5     	1724.95	55.0303	1690.76	1841.26
199	8     	1810.62	142.082	1680.24	2078.55
200	7     	1769.46	91.1828	1690.76	1982.05
201	8     	1748.66	56.1303	1690.76	1877.18
202	5     	1743.89	78.8029	1690.76	1947.53
203	7     	1770.39	85.394 	1690.76	1913.43
204	8     	1734.85	60.8341	1690.76	1860.18
205	4     	1758.23	145.458	1690.76	2179.39
206	5     	1717.2 	28.2564	1690.76	1772.35
207	6     	1737.52	62.6214	1690.76	1859.34
208	5     	1732.5 	85.7831	1690.76	1980.65
209	6     	1777.27	108.834	1690.76	1995.01
210	3     	1769.23	79.3159	1690.76	1920.43
211	6     	1757.79	61.8982	1690.76	1861.55
212	7     	1829.85	177.272	1690.76	2201.97
213	8     	

381	8     	1749.81	54.6749	1690.57	1840.01
382	7     	1796.48	84.6807	1699.57	2007.94
383	6     	1750.63	45.3141	1699.57	1809.89
384	8     	1757.12	67.1385	1699.57	1903.87
385	6     	1753.6 	59.9431	1699.57	1882.2 
386	6     	1774.03	111.913	1699.57	2092.4 
387	7     	1765.31	65.8489	1699.57	1911.52
388	4     	1766.26	87.9829	1699.57	1961.27
389	4     	1740.95	66.8292	1699.57	1904.55
390	7     	1749.56	56.6614	1697.71	1888.05
391	7     	1794.19	125.358	1699.57	2080.2 
392	6     	1737.79	46.8743	1699.57	1839.8 
393	4     	1774.07	118.262	1699.57	2080.71
394	8     	1819.88	86.0124	1699.57	1968.47
395	7     	1775.36	87.2   	1699.57	1959.56
396	7     	1780.2 	80.7894	1687.01	1905.41
397	6     	1772.23	92.0441	1687.01	1999.43
398	6     	1768.99	73.4351	1699.57	1915.37
399	8     	1812.9 	88.7681	1699.57	1992.16
400	5     	1757.22	79.4469	1699.57	1966.83
401	5     	1743.21	81.9205	1699.57	1979.37
402	8     	1748.87	69.9031	1699.57	1914.2 
403	5     	1793.43	136.654	1699.57	2133.85
404	6     	

572	4     	1766.91	51.7801	1726.99	1856.66
573	4     	1750.41	42.2553	1726.99	1868.47
574	5     	1788.29	123.093	1719.07	2067.49
575	7     	1769.97	52.7502	1724.26	1889.97
576	6     	1790.95	85.9194	1724.26	1975.56
577	8     	1832.38	116.017	1724.26	2074.86
578	5     	1846.8 	113.074	1724.26	2060.39
579	5     	1797.16	90.7707	1724.26	1986.44
580	7     	1847.17	135.666	1724.26	2173.81
581	7     	1797.18	97.5685	1724.26	2050.07
582	5     	1794.31	86.0069	1724.26	1964.71
583	6     	1827.72	117.211	1724.26	2063.32
584	5     	1765.09	50.6856	1724.26	1873.96
585	6     	1754.4 	51.6313	1691.96	1873.13
586	6     	1744.73	55.3973	1691.96	1863.95
587	5     	1755.32	54.4849	1691.96	1885.15
588	7     	1849.96	106.583	1703.66	2064.05
589	5     	1810.39	91.2276	1703.66	2044.41
590	7     	1795.76	87.1823	1703.66	1971.63
591	5     	1800.8 	109.976	1703.66	2012.43
592	5     	1832.84	198.635	1703.66	2371.26
593	7     	1767.81	101.908	1693.55	1988.66
594	7     	1788.62	98.0451	1703.66	2029.31
595	5     	

763	8     	1792.45	57.6743	1710.07	1909.2 
764	5     	1756.46	39.1674	1710.07	1810.01
765	4     	1796.5 	118.098	1710.07	2055.54
766	6     	1747.97	36.4625	1705.73	1830.41
767	6     	1772.83	57.1483	1710.07	1863   
768	5     	1786.22	91.9563	1710.07	1965.25
769	8     	1778.98	78.1852	1710.07	1988.77
770	7     	1845.29	117.853	1710.07	2054.77
771	6     	1857.09	121.102	1710.07	2110.76
772	8     	1803.82	90.2509	1710.07	1996.05
773	5     	1744.55	55.6431	1710.07	1905.85
774	5     	1743.63	43.668 	1710.07	1857.18
775	3     	1786.63	115.895	1710.07	2059.01
776	9     	1748.64	63.2618	1696.44	1919.84
777	5     	1739.98	49.7499	1696.44	1875.61
778	4     	1751.56	60.8813	1696.44	1898.42
779	6     	1762.1 	76.9153	1696.44	1927.28
780	9     	1821.35	108.747	1701.87	2022.37
781	4     	1749.87	42.2308	1701.87	1813.59
782	4     	1763.99	84.3174	1701.87	1987.72
783	6     	1789.16	106.788	1701.87	2042.67
784	6     	1751.55	54.9549	1701.87	1863.02
785	7     	1792.87	129.876	1701.87	2134.83
786	6     	

954	7     	1805.97	107.041	1705.47	2010.58
955	9     	1827.29	113.243	1705.47	2067.35
956	7     	1833.5 	160.449	1693.83	2270.51
957	6     	1788.49	96.8741	1693.83	1973.09
958	3     	1790.59	113.942	1693.83	2009.57
959	6     	1741.95	76.2412	1693.83	1958.73
960	8     	1774.21	80.6585	1701.06	1927.33
961	3     	1742.74	62.2528	1701.06	1865.83
962	8     	1792.46	103.815	1701.06	2020.77
963	6     	1731.77	43.1714	1701.06	1815.75
964	6     	1772.25	91.1621	1701.06	2002.22
965	8     	1768.02	75.6199	1701.06	1926.08
966	6     	1788.93	115.402	1700.99	2066.06
967	5     	1761.69	67.6752	1701.06	1916.09
968	5     	1741.89	65.3356	1701.06	1912.72
969	9     	1837.03	121.319	1701.06	2028.71
970	6     	1777.29	67.5489	1694.7 	1880.58
971	6     	1792.07	84.0584	1701.06	1949.44
972	6     	1766.01	79.9833	1701.06	1947.46
973	4     	1739.47	89.7638	1701.06	2004.47
974	4     	1735.69	46.1836	1701.06	1813.68
975	6     	1758.82	69.9805	1701.06	1923.85
976	7     	1782.05	66.5074	1701.06	1905.72
977	5     	

In [29]:
# First hall-of-fame entry, joined back into a layout string.
''.join(hof[0])

' ONYTIAIMZGBCHEDRSL,P#.^0TQX VK?W<JFU'

In [31]:
# Hard-coded layout string; validate it, then display its cost on the corpus.
hof_layout = ' ONYTIAIMZGBCHEDRSL,P#.^0TQX VK?W<JFU'
check_keyboard(hof_layout, keys)
compute_cost(hof_layout, corpus)

1673.418399379088

## Hall of fame solutions

    ' ONYTIAIMZGBCHEDRSL,P#.^0TQX VK?W<JFU' - 1673.418