In [1]:
import gym
import numpy as np
import time
class BinaryActionLinearPolicy(object):
    """Linear threshold policy choosing between two discrete actions (0 or 1).

    ``theta`` is a flat parameter vector: every entry but the last is a
    weight applied to the observation, and the last entry is the bias.
    """

    def __init__(self, theta):
        # Split the packed parameters into weights and the trailing bias.
        self.w, self.b = theta[:-1], theta[-1]

    def act(self, ob):
        """Return 1 if the affine score of ``ob`` is negative, else 0."""
        score = ob.dot(self.w) + self.b
        is_negative = score < 0
        return int(is_negative)

class ContinuousActionLinearPolicy(object):
    """Affine policy mapping an observation to a continuous action.

    ``theta`` is a flat vector of length ``(n_in + 1) * n_out``: the first
    ``n_in * n_out`` entries are reshaped into the ``(n_in, n_out)`` weight
    matrix and the remaining ``n_out`` entries into the ``(1, n_out)`` bias.
    """

    def __init__(self, theta, n_in, n_out):
        n_weights = n_in * n_out
        assert len(theta) == n_weights + n_out
        self.W = theta[:n_weights].reshape(n_in, n_out)
        self.b = theta[n_weights:].reshape(1, n_out)

    def act(self, ob):
        """Return ``ob . W + b`` as the action."""
        return ob.dot(self.W) + self.b

def cem(f, th_mean, batch_size, n_iter, elite_frac, initial_std=1.0):
    """
    Maximize a black-box function with the cross-entropy method.

    This is a generator: each iteration samples a batch of parameter vectors
    from a diagonal Gaussian, scores them with ``f``, refits the Gaussian's
    mean and standard deviation to the best-scoring samples, and yields a
    summary dict for that batch.

    f: a function mapping from vector -> scalar (higher is better)
    th_mean: initial mean of the sampling distribution (numpy vector)
    batch_size: number of parameter vectors sampled and evaluated per batch
    n_iter: number of batches to run
    elite_frac: fraction of each batch (the top scorers) used for the refit
    initial_std: initial standard deviation applied to every parameter

    Yields a dict with keys:
        'ys'         - scores of every sample in the batch
        'theta_mean' - the refitted mean (mean of the elite samples)
        'y_mean'     - mean score over the whole batch, not just the elites
    """
    th_std = initial_std * np.ones_like(th_mean)
    n_elite = int(np.round(batch_size * elite_frac))

    for _ in range(n_iter):
        # Draw one row of scaled Gaussian noise per candidate.
        noise = np.random.randn(batch_size, th_mean.size)
        perturbations = th_std[None, :] * noise
        ths = np.array([th_mean + delta for delta in perturbations])

        ys = np.array([f(candidate) for candidate in ths])

        # Indices sorted from best to worst score; keep the leading n_elite.
        ranked = np.argsort(ys)[::-1]
        elite_ths = ths[ranked[:n_elite]]

        th_mean = elite_ths.mean(axis=0)
        th_std = elite_ths.std(axis=0)
        yield {'ys': ys, 'theta_mean': th_mean, 'y_mean': ys.mean()}
        
        
def do_rollout(agent, env, num_steps, render=False):
    """
    Run one episode of ``agent`` in ``env`` for at most ``num_steps`` steps.

    agent: object with an ``act(ob)`` method returning an action
    env: environment with ``reset()``, ``step(action)`` and ``render()``;
        ``step`` must return the 4-tuple ``(ob, reward, done, info)``
    num_steps: maximum number of steps to take before stopping
    render: when true, render every third step

    Returns ``(total_reward, steps_taken)``. If ``num_steps`` is zero or
    negative the environment is still reset and ``(0, 0)`` is returned.
    """
    total_rew = 0
    # Tracked explicitly so the return value is defined even when the loop
    # body never executes (num_steps <= 0).
    steps_taken = 0
    ob = env.reset()
    for t in range(num_steps):
        a = agent.act(ob)
        ob, reward, done, _info = env.step(a)
        total_rew += reward
        steps_taken = t + 1
        if render and t % 3 == 0:
            env.render()
        if done:
            break
    return total_rew, steps_taken

def noisy_evaluation(theta):
    """Score ``theta`` by the total reward of a single rollout.

    Wraps ``theta`` in a BinaryActionLinearPolicy and runs it on the
    module-level ``env`` for at most 200 steps; only the reward is returned.
    """
    policy = BinaryActionLinearPolicy(theta)
    total_reward, _n_steps = do_rollout(policy, env, 200)
    return total_reward

# Create the environment and seed both the environment and NumPy RNGs.
env = gym.make("CartPole-v0")
env.seed(0)
np.random.seed(0)
params = dict(n_iter=100, batch_size=400, elite_frac=0.2)
num_steps = 200

# Optimize the policy parameters with CEM: one weight per observation
# dimension plus a trailing bias, starting from all zeros.
for (i, iterdata) in enumerate(
        cem(noisy_evaluation, np.zeros(env.observation_space.shape[0] + 1), **params)):
    print('Iteration %2i. Episode mean reward: %7.3f' % (i, iterdata['y_mean']))

# Only the final mean parameter vector is used for the demonstration, so the
# agent is built once here instead of on every iteration.
agent = BinaryActionLinearPolicy(iterdata['theta_mean'])

# Parenthesized single-argument form prints identically on Python 2 and 3.
print(iterdata['theta_mean'])

# Render one episode with the learned policy and report its wall-clock length.
ob = env.reset()
done = False
start_t = time.time()
while not done:
    a = agent.act(ob)
    (ob, reward, done, _info) = env.step(a)
    env.render()
end_t = time.time()
print('stay up time: ' + str(end_t - start_t))


[2017-06-14 22:56:58,271] Making new env: CartPole-v0


Iteration  0. Episode mean reward:  20.455
Iteration  1. Episode mean reward:  53.888
Iteration  2. Episode mean reward: 107.483
Iteration  3. Episode mean reward: 161.607
Iteration  4. Episode mean reward: 184.425
Iteration  5. Episode mean reward: 191.695
Iteration  6. Episode mean reward: 193.685
Iteration  7. Episode mean reward: 195.037
Iteration  8. Episode mean reward: 195.692
Iteration  9. Episode mean reward: 197.135
Iteration 10. Episode mean reward: 199.030
Iteration 11. Episode mean reward: 197.393
Iteration 12. Episode mean reward: 199.072
Iteration 13. Episode mean reward: 199.218
Iteration 14. Episode mean reward: 199.882
Iteration 15. Episode mean reward: 199.905
Iteration 16. Episode mean reward: 199.028
Iteration 17. Episode mean reward: 199.610
Iteration 18. Episode mean reward: 199.898
Iteration 19. Episode mean reward: 199.755
Iteration 20. Episode mean reward: 199.815
Iteration 21. Episode mean reward: 199.895
Iteration 22. Episode mean reward: 199.820
Iteration 2