In [1]:
import gym
import numpy as np

In [2]:
# Create the CartPole-v0 environment; the training and evaluation cells share this instance.
env = gym.make('CartPole-v0')

[2018-02-15 18:38:25,704] Making new env: CartPole-v0


In [3]:
class ES():
    """Evolution-strategy agent whose policy is a small tanh network.

    All weights and biases live in one flat vector. ``oldnet`` holds the
    current (parent) parameters that ``update`` modifies; ``net`` is the
    working copy that ``mutate`` perturbs and ``choose_action`` evaluates.
    """

    def __init__(self):
        self.shapes, self.oldnet = self.build_net()
        self.net = np.copy(self.oldnet)

    def build_net(self):
        """Create randomly initialised parameters for a 4 -> 20 -> 1 network.

        Returns:
            A pair ``(shapes, params)``: ``shapes`` is a list of
            ``(n_in, n_out)`` tuples, one per layer, and ``params`` is the
            flat float32 vector laid out as [w0, b0, w1, b1].
        """
        def linear(n_in, n_out):  # network linear layer
            w = np.random.randn(n_in * n_out).astype(np.float32) * .1
            b = np.random.randn(n_out).astype(np.float32) * .1
            return (n_in, n_out), np.concatenate((w, b))
        s0, p0 = linear(4, 20)
        s1, p1 = linear(20, 1)
        return [s0, s1], np.concatenate((p0, p1))

    def reset_net(self):
        """Overwrite the working copy ``net`` with the parent parameters ``oldnet``."""
        self.net = np.copy(self.oldnet)

    def mutate(self, noise_std=None):
        """Add scaled standard-normal noise to ``net`` in place.

        Args:
            noise_std: Scale applied to the noise. When omitted, the
                module-level ``sigma`` is used (the original behaviour), so
                that name must exist before calling ``mutate()`` without
                arguments.

        Returns:
            The unscaled noise vector that was drawn (same length as ``net``).
        """
        if noise_std is None:
            # Backward-compatible fallback: the original code always read the
            # global ``sigma``. Passing noise_std explicitly avoids that
            # hidden dependency on a variable defined in a later cell.
            noise_std = sigma
        noise = np.random.randn(self.net.shape[0])
        self.net += noise_std * noise
        return noise

    def update(self, learning_rate, noises, advs):
        """Move the parent parameters along the weighted sum of noise vectors.

        Args:
            learning_rate: Scalar multiplier applied to the combined direction.
            noises: 2-D array with one noise vector per row.
            advs: 1-D array with one weight per row of ``noises``.
        """
        gradient = np.dot(noises.T, advs)
        self.oldnet += learning_rate * gradient

    def choose_action(self, state):
        """Run ``state`` through the network held in ``net`` and pick an action.

        Args:
            state: 1-D observation array whose length matches the first
                layer's input size.

        Returns:
            1 if the network's single output is positive, otherwise 0.
        """
        start = 0
        state = state[np.newaxis, :]  # add a batch axis: (n,) -> (1, n)
        for s in self.shapes:
            n_w, n_b = s[0] * s[1], s[1]
            weights = self.net[start:start + n_w].reshape(s)
            bias = self.net[start + n_w:start + n_w + n_b]
            state = np.tanh(state.dot(weights) + bias)
            start += n_w + n_b
        # After the last layer ``state`` has shape (1, 1); read the scalar explicitly.
        if state[0, 0] > 0:
            return 1
        return 0

In [15]:
es = ES()

# Hyper-parameters of the evolution strategy.
npop = 10     # number of perturbed children evaluated per generation
sigma = 1e-2  # perturbation scale; read as a global by ES.mutate()
alpha = 1e-2  # base step size, rescaled by 1/(npop*sigma) at update time

# Rank-based utilities: one weight per rank position (best first). The lower
# half of the ranks is clipped to zero before normalising, and subtracting
# 1/npop centres the weights so they sum to zero.
rank = np.arange(1, npop + 1)
util_ = np.maximum(0, np.log(npop / 2 + 1) - np.log(rank))
utility = util_ / util_.sum() - 1 / npop

for e in range(100):
    noises, rewards = [], []
    best_r = 0
    for i in range(npop):
        state = env.reset()
        es.reset_net()         # start this child from the parent parameters
        noise_i = es.mutate()  # perturb the working copy and keep the noise
        r_i = 0
        done = False
        while not done:  # roll out one full episode
            action = es.choose_action(state)
            state, reward, done, _ = env.step(action)
            r_i += reward
        best_r = max(r_i, best_r)
        # record this child's noise and episode return
        noises.append(noise_i)
        rewards.append(r_i)
    print('\rbest reward for ep', e+1, ':', best_r, end=' '*10)
    # Optional early stop (disabled):
    #     if best_r==200:
    #         break

    # Disabled alternative weighting (author's note, translated: "updating with
    # this never converges, not sure why"):
    #     advs = (rewards - np.mean(rewards))/np.std(rewards)
    # NOTE(review): those weights would be in evaluation order, while the noise
    # rows below are re-ordered by rank, and np.std is 0 when all returns are
    # equal; either could explain the failure. Confirm before re-enabling.
    ranks = np.argsort(rewards)[::-1]   # child indices, highest return first
    noises = np.vstack(noises)[ranks]   # rows now line up with `utility`
    es.update(alpha/(npop*sigma), noises, utility)

best reward for ep 99 : 146.0           : 109.0           36 : 115.0           100 : 200.0          

In [16]:
#test
# Evaluate the learned policy for five rendered episodes.
# choose_action() reads es.net, which after training still holds the last
# perturbed child from the final generation. reset_net() copies the updated
# parameters (es.oldnet) into es.net so the evaluation uses what was learned.
es.reset_net()
for e in range(5):
    state = env.reset()
    r = 0
    while True:
        env.render()
        action = es.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        state = next_state
        r += reward
        if done:
            print('reward for ep', e+1, ':', r)
            break

reward for ep 1 : 200.0
reward for ep 2 : 200.0
reward for ep 3 : 174.0
reward for ep 4 : 200.0
reward for ep 5 : 200.0


In [17]:
# Shut the environment down once training and evaluation are finished.
env.close()