## Reinforcement Learning with Genetic Algorithm Optimizer

In [22]:
"""
This is a NumPy module for reinforcement learning with a genetic-algorithm optimizer.
"""
import numpy as np
import random

class NonBiasedNN:
    """Two-layer feed-forward policy network without bias terms.

    The network computes ``relu(U @ relu(W @ x))``, turns the result into
    softmax probabilities and reports the index of the most probable output.

    Attributes:
        W: (hidden_size, input_size) input-to-hidden weight matrix.
        U: (output_size, hidden_size) hidden-to-output weight matrix.
    """

    def __init__(self, input_size=None, hidden_size=None, output_size=None):
        # Both weight matrices start with values drawn uniformly from [0, 1).
        self.W = np.random.rand(hidden_size, input_size)
        self.U = np.random.rand(output_size, hidden_size)

    def output(self, input_vector):
        """Return the index of the highest-probability output unit.

        Args:
            input_vector: Array-like holding ``input_size`` values, either
                flat ``(input_size,)`` or as a column ``(input_size, 1)``.

        Returns:
            Integer index (``np.argmax`` result) of the selected output.
        """
        # np.maximum(x, 0) is ReLU and, unlike stacking against a 1-D zero
        # vector, works for both flat and column-vector inputs.
        hidden_output = np.maximum(np.dot(self.W, input_vector), 0)
        nn_output = np.maximum(np.dot(self.U, hidden_output), 0)
        # Shift by the maximum before exponentiating so large activations do
        # not overflow to inf (inf / inf = nan would corrupt the argmax).
        exp_scores = np.exp(nn_output - np.max(nn_output))
        softmax_prob = exp_scores / np.sum(exp_scores)
        return np.argmax(softmax_prob)

class GeneticRL:
    """Genetic-algorithm optimizer over a population of NonBiasedNN agents.

    Higher fitness is treated as better. The caller is expected to fill
    ``self.fitness`` (one score per agent in ``self.gene_pool``) before
    calling ``best_of`` or ``optimizer``.

    Attributes:
        pop_size: Number of agents kept in the population.
        gene_length: Number of weights in one flattened agent (W then U).
        gene_pool: List of NonBiasedNN agents.
        fitness: NumPy array with one fitness score per agent.
        best_network: Agent recorded by the last ``best_of`` call.
        best_fitness: Fitness recorded by the last ``best_of`` call.
    """

    def __init__(self, population_size=100, input_size=None, hidden_size=None, output_size=None):
        self.pop_size = population_size
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.gene_length = self.input_size*self.hidden_size + self.hidden_size*self.output_size

        # Gene pool (stored as a list of networks, i.e. sets of weight matrices)
        self.gene_pool = [
            NonBiasedNN(self.input_size, self.hidden_size, self.output_size)
            for _ in range(self.pop_size)
        ]

        # Record data. Zeros rather than uninitialised memory so the scores
        # are well defined before the caller has evaluated the population.
        self.fitness = np.zeros(self.pop_size)
        self.best_network = NonBiasedNN(self.input_size, self.hidden_size, self.output_size)
        self.best_fitness = 0

    def elitism(self, number_of_elites):
        """Return the ``number_of_elites`` fittest agents, best first.

        Args:
            number_of_elites: How many top agents to keep unchanged.

        Returns:
            List of agents taken from ``self.gene_pool`` (not copied).
        """
        # argsort is ascending, so reverse it to rank the highest fitness first.
        ranked = np.argsort(self.fitness)[::-1]
        elite_index = ranked[:max(number_of_elites, 0)]
        return [self.gene_pool[index] for index in elite_index]

    def parent_genes(self, number_of_parents):
        """Sample distinct parents with probability proportional to fitness.

        Fitness is shifted so the weakest agent scores zero. When fewer agents
        have a positive score than parents are requested (for example when all
        fitness values are equal), a small floor is added so every agent stays
        eligible instead of the sampling failing.

        Args:
            number_of_parents: Number of distinct agents to draw.

        Returns:
            List of ``number_of_parents`` agents from ``self.gene_pool``.

        Raises:
            ValueError: If ``number_of_parents`` exceeds the population size
                (raised by ``np.random.choice``).
        """
        fitness = np.asarray(self.fitness, dtype=float)
        score = fitness - np.amin(fitness)
        if np.count_nonzero(score) < max(number_of_parents, 1):
            peak = np.max(score)
            # A tiny floor relative to the best score keeps selection
            # pressure; with a flat landscape this becomes uniform sampling.
            score = score + (peak * 1e-6 if peak > 0 else 1.0)
        survival_probability = score / np.sum(score)
        chosen = np.random.choice(
            self.pop_size, number_of_parents, replace=False, p=survival_probability
        )
        return [self.gene_pool[choice] for choice in chosen]

    def crossover(self, parent_gene_a, parent_gene_b):
        """Produce two children by two-point crossover.

        Each child copies one parent and has a random contiguous segment
        replaced by the same segment of the other parent. The two children
        use independently drawn cut points.

        INPUT: two (gene_length,) NumPy arrays: parent genes
        OUTPUT: (2, gene_length) NumPy array: the two child genes
        """
        def child(_a, _b):
            # Two distinct cut points, so the donated segment is never empty.
            start, stop = sorted(random.sample(range(0, self.gene_length), 2))
            return np.concatenate((_b[:start], _a[start:stop], _b[stop:]))

        first_child = child(parent_gene_a, parent_gene_b)
        second_child = child(parent_gene_b, parent_gene_a)
        return np.stack((first_child, second_child))

    def mutation(self, parent_gene):
        """Return a mutated copy of ``parent_gene``.

        INPUT: (gene_length,) NumPy array: parent gene
        OUTPUT: (gene_length,) NumPy array: child gene
        """
        # NOTE(review): the noise is uniform on [0, 1), so mutation can only
        # increase weights. Zero-mean noise may be intended -- confirm.
        child_gene = parent_gene + np.random.rand(self.gene_length)
        return child_gene

    def matrix_to_gene(self, nn_object):
        """Flatten a network's W and U matrices into one 1-D gene (W first)."""
        gene = np.reshape(nn_object.W, self.input_size*self.hidden_size)
        gene = np.append(gene, np.reshape(nn_object.U, self.hidden_size*self.output_size))
        return gene

    def gene_to_matrix(self, gene):
        """Rebuild a NonBiasedNN from a flat gene made by ``matrix_to_gene``."""
        split = self.input_size*self.hidden_size
        new_object = NonBiasedNN(self.input_size, self.hidden_size, self.output_size)
        new_object.W = np.reshape(gene[:split], (self.hidden_size, self.input_size))
        new_object.U = np.reshape(gene[split:], (self.output_size, self.hidden_size))
        return new_object

    def best_of(self):
        """Record and print the best fitness and the agent that achieved it."""
        best_index = np.argmax(self.fitness)
        self.best_fitness = np.max(self.fitness)
        self.best_network = self.gene_pool[best_index]
        print("Best Fitness: ", self.best_fitness)

    def optimizer(self, number_of_elites, number_of_parents, crossover_rate = 0.7):
        """Replace the population with the next generation.

        The new generation holds the elites, then up to
        ``int(pop_size * crossover_rate)`` crossover children, and is filled
        up to exactly ``pop_size`` agents with mutated parents.

        Args:
            number_of_elites: Agents carried over unchanged.
            number_of_parents: Size of the fitness-weighted parent pool.
            crossover_rate: Fraction of the population produced by crossover.

        Raises:
            ValueError: If crossover is needed but fewer than two parents are
                available, or mutation is needed with no parents.
        """
        nextgen = self.elitism(number_of_elites)
        parents = self.parent_genes(number_of_parents)

        open_slots = max(self.pop_size - len(nextgen), 0)
        crossover_slots = min(int(self.pop_size*crossover_rate), open_slots)
        crossover_end = len(nextgen) + crossover_slots

        # Performing Crossover. Each call yields two children, so keep only
        # as many as still fit into the crossover quota.
        while len(nextgen) < crossover_end:
            a_index, b_index = random.sample(range(len(parents)), 2)
            a_gene = self.matrix_to_gene(parents[a_index])
            b_gene = self.matrix_to_gene(parents[b_index])
            children = self.crossover(a_gene, b_gene)
            for child_gene in children[:crossover_end - len(nextgen)]:
                nextgen.append(self.gene_to_matrix(child_gene))

        # Performing Mutation: fill whatever is left of the population.
        while len(nextgen) < self.pop_size:
            chosen_parent = random.randrange(len(parents))
            chosen_gene = self.matrix_to_gene(parents[chosen_parent])
            nextgen.append(self.gene_to_matrix(self.mutation(chosen_gene)))

        self.gene_pool = nextgen


In [23]:
# Build a population of 100 agents, each a 3-input / 5-hidden / 2-output network.
test = GeneticRL(
    population_size=100,
    input_size=3,
    hidden_size=5,
    output_size=2,
)

In [24]:
# Inspect the first agent: show its input-to-hidden weights, then the action
# index it selects for a sample 3-element observation.
first_agent = test.gene_pool[0]
print(first_agent.W)
print(first_agent.output([2, 3, 5]))

[[0.0121808  0.6940541  0.60238944]
 [0.46674805 0.22792021 0.71330689]
 [0.61636593 0.92284141 0.73606135]
 [0.24241366 0.16549675 0.74438769]
 [0.21912901 0.63744291 0.49477208]]
1


In [1]:
import gym
# Run 20 CartPole episodes with uniformly random actions, printing every
# observation/action pair and the length of each episode.
env = gym.make('CartPole-v0')
try:
    for i_episode in range(20):
        observation = env.reset()
        # Cap each episode at 1000 steps; it ends earlier once `done` is set.
        for t in range(1000):
            env.render()
            print("obs: ", observation) #Has 4 floating point values
            action = env.action_space.sample()
            print("act: ", action) #Binary
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
finally:
    # Always release the render window and simulator resources, even when the
    # loop is interrupted or a step raises.
    env.close()


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
obs:  [-0.03214913 -0.01295239  0.04426924  0.01892158]
act:  0
obs:  [-0.03240818 -0.20868033  0.04464767  0.32523676]
act:  1
obs:  [-0.03658178 -0.01422158  0.05115241  0.04696137]
act:  1
obs:  [-0.03686622  0.18013098  0.05209163 -0.22915391]
act:  0
obs:  [-0.0332636  -0.01569517  0.04750855  0.0794949 ]
act:  1
obs:  [-0.0335775   0.17871467  0.04909845 -0.19782857]
act:  1
obs:  [-0.03000321  0.37310122  0.04514188 -0.47462788]
act:  1
obs:  [-0.02254118  0.5675576   0.03564932 -0.75274808]
act:  1
obs:  [-0.01119003  0.76217033  0.02059436 -1.03400303]
act:  1
obs:  [ 4.05337752e-03  9.57012445e-01 -8.56983235e-05 -1.32014991e+00]
act:  1
obs:  [ 0.02319363  1.15213548 -0.0264887  -1.61285966]
act:  1
obs:  [ 0.04623634  1.34755989 -0.05874589 -1.91368021]
act:  0
obs:  [ 0.07318753  1.15311778 -0.09701949 -1.63978153]
act:  0
obs:  [ 0.09624989  0.9592575  -0.12981512 -

obs:  [-0.08543393  0.57737784 -0.12773315 -1.3914434 ]
act:  0
obs:  [-0.07388637  0.38405683 -0.15556202 -1.14127582]
act:  1
obs:  [-0.06620523  0.58083144 -0.17838754 -1.47842454]
act:  0
obs:  [-0.0545886   0.38827912 -0.20795603 -1.24634768]
act:  0
Episode finished after 41 timesteps
obs:  [-0.00350814  0.00677543 -0.01461517 -0.02597951]
act:  0
obs:  [-0.00337263 -0.18813391 -0.01513476  0.26205657]
act:  0
obs:  [-0.00713531 -0.38303659 -0.00989363  0.54992765]
act:  1
obs:  [-0.01479604 -0.18777707  0.00110492  0.25414404]
act:  1
obs:  [-0.01855158  0.00732908  0.0061878  -0.03819018]
act:  0
obs:  [-0.018405   -0.18788105  0.005424    0.25643862]
act:  0
obs:  [-0.02216262 -0.38308002  0.01055277  0.55082742]
act:  1
obs:  [-0.02982422 -0.18810787  0.02156932  0.26148793]
act:  0
obs:  [-0.03358638 -0.38353097  0.02679908  0.56089528]
act:  1
obs:  [-0.041257   -0.18879518  0.03801698  0.27677447]
act:  1
obs:  [-0.0450329   0.00576436  0.04355247 -0.00367966]
act:  1
obs:

obs:  [ 0.32348674  1.18001748 -0.2076265  -1.61562795]
act:  1
Episode finished after 43 timesteps
obs:  [-4.52247640e-02 -2.76758943e-02  3.28531454e-05 -2.25987532e-04]
act:  0
obs:  [-4.57782819e-02 -2.22798317e-01  2.83333948e-05  2.92467305e-01]
act:  0
obs:  [-0.05023425 -0.41792067  0.00587768  0.58515917]
act:  0
obs:  [-0.05859266 -0.61312446  0.01758086  0.87968782]
act:  1
obs:  [-0.07085515 -0.41824573  0.03517462  0.5925834 ]
act:  1
obs:  [-0.07922007 -0.22363338  0.04702629  0.31118465]
act:  0
obs:  [-0.08369273 -0.41939267  0.05324998  0.61831959]
act:  1
obs:  [-0.09208059 -0.22505338  0.06561637  0.34287197]
act:  1
obs:  [-0.09658165 -0.03092332  0.07247381  0.07158089]
act:  1
obs:  [-0.09720012  0.16308882  0.07390543 -0.19738472]
act:  1
obs:  [-0.09393834  0.35708013  0.06995773 -0.46586894]
act:  1
obs:  [-0.08679674  0.55114747  0.06064036 -0.7357071 ]
act:  0
obs:  [-0.07577379  0.35524263  0.04592621 -0.42457263]
act:  0
obs:  [-0.06866894  0.15950121  0.03

obs:  [ 0.04351884  0.43815271  0.01377092 -0.52583728]
act:  0
obs:  [ 0.05228189  0.24283971  0.00325418 -0.22884704]
act:  1
obs:  [ 0.05713868  0.43791501 -0.00132276 -0.52050171]
act:  1
obs:  [ 0.06589698  0.63305556 -0.0117328  -0.81360117]
act:  1
obs:  [ 0.07855809  0.82833622 -0.02800482 -1.10995136]
act:  1
obs:  [ 0.09512482  1.02381472 -0.05020385 -1.41128647]
act:  1
obs:  [ 0.11560111  1.21952188 -0.07842958 -1.71923108]
act:  1
obs:  [ 0.13999155  1.41545008 -0.1128142  -2.03525462]
act:  0
obs:  [ 0.16830055  1.22165712 -0.15351929 -1.77951009]
act:  0
obs:  [ 0.1927337   1.02856039 -0.18910949 -1.53822909]
act:  1
Episode finished after 12 timesteps
obs:  [0.01435059 0.04710942 0.00310423 0.01161307]
act:  0
obs:  [ 0.01529278 -0.14805691  0.00333649  0.3052738 ]
act:  1
obs:  [0.01233164 0.04701734 0.00944197 0.01364498]
act:  0
obs:  [ 0.01327198 -0.14823874  0.00971487  0.30929194]
act:  0
obs:  [ 0.01030721 -0.34349776  0.01590071  0.60502277]
act:  1
obs:  [ 0.00