In [1]:
import numpy as np
import gym

In [2]:
# Environment to train on. Exactly one assignment should be active; the
# alternatives are kept commented out so they can be swapped in.
# ENV_NAME = 'CartPole-v0'
# ENV_NAME = 'MountainCar-v0'
ENV_NAME = 'LunarLander-v2'
# Reward discount factor.
# NOTE(review): not referenced by any code visible in this notebook.
DISCOUNT_FACTOR = 0.99

In [3]:
def softmax(z):
    """Map an array of scores to a probability distribution.

    The largest score is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (which would produce ``nan`` probabilities) when scores are large.

    Args:
        z: Array-like of scores. Normalisation runs over every element.

    Returns:
        ``numpy.ndarray`` of probabilities that sum to 1, with singleton
        dimensions squeezed out.
    """
    z = np.asarray(z)
    if z.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.squeeze(np.exp(z))
    # Shift so the largest exponent is exp(0) == 1.
    e_z = np.exp(z - np.max(z))
    return np.squeeze(e_z / np.sum(e_z))

In [4]:
def calculate_action_probs(state, theta):
    """Return the linear policy's action probabilities for ``state``.

    Args:
        state: Observation vector.
        theta: Weight matrix of the policy.

    Returns:
        The result of ``softmax`` applied to ``np.dot(state, theta)``.
    """
    # Feed forward through the linear layer, then map the resulting scores
    # to probabilities with softmax.
    return softmax(np.dot(state, theta))

In [5]:
def run_ep(env, theta, render=False):
    """Play one episode, sampling actions from the policy given by ``theta``.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``, ``render()``
            and ``action_space.n``. ``step`` must return the 4-tuple
            ``(next_state, reward, done, info)``.
        theta: Policy parameters forwarded to ``calculate_action_probs``.
        render: When true, ``env.render()`` is called before every step.

    Returns:
        The undiscounted sum of the rewards collected during the episode.
    """
    episode_rewards = []
    observation = env.reset()

    while True:
        if render:
            env.render()

        # Sample an action index according to the policy's probabilities.
        probs = calculate_action_probs(observation, theta)
        chosen = np.random.choice(env.action_space.n, p=probs)

        observation, step_reward, finished, _ = env.step(chosen)
        episode_rewards.append(step_reward)
        if finished:
            break

    return np.sum(episode_rewards)

In [6]:
def create_parameters(size, means, std_devs):
    """Sample ``size`` parameter vectors from independent Gaussians.

    Each ``(mean, std_dev)`` pair taken from ``zip(means, std_devs)`` defines
    one Gaussian, and ``size`` values are drawn from it.

    Args:
        size: Number of samples to draw per parameter.
        means: Iterable with the mean of each parameter.
        std_devs: Iterable with the standard deviation of each parameter.

    Returns:
        Array with one row per sample and one column per parameter (for an
        integer ``size``).
    """
    per_parameter = []
    for mean, std_dev in zip(means, std_devs):
        # One draw call per parameter, in order.
        per_parameter.append(np.random.normal(loc=mean, scale=std_dev, size=size))
    return np.transpose(np.array(per_parameter))

In [7]:
def optimize(env, num_episodes, pop_size=32, top_pct=0.1, noise_steps=0.1):
    """Search for a linear softmax policy with the cross-entropy method.

    Every iteration samples ``pop_size`` parameter sets from per-parameter
    Gaussians, runs one episode with each, and refits the Gaussians to the
    best-scoring ``top_pct`` fraction of the population.

    Args:
        env: Environment with a 1-D ``observation_space`` and a discrete
            ``action_space`` (``.n`` is read).
        num_episodes: Number of sample/evaluate/refit iterations.
        pop_size: Number of parameter sets sampled per iteration.
        top_pct: Fraction of the population used to refit the Gaussians.
        noise_steps: Currently unused; kept so existing callers keep working.

    Returns:
        Tuple ``(means, std_devs)`` of the fitted per-parameter Gaussians,
        each a flat array of length ``num_obs * num_actions``.
    """
    # env information
    num_obs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    # Number of selected policies. Always keep at least one: with zero
    # elites the mean/std below are taken over an empty array and turn
    # into nan, which poisons every later iteration.
    num_idxs = min(pop_size, max(1, int(top_pct * pop_size)))
    # One mean and std_dev for each parameter
    means = np.random.uniform(size=num_obs * num_actions)
    std_devs = np.random.uniform(size=num_obs * num_actions)

    for i_episode in range(num_episodes):
        thetas = create_parameters(pop_size, means, std_devs).reshape(pop_size, num_obs, num_actions)
        returns = [run_ep(env, theta) for theta in thetas]

        # Indices of the best returns, highest first
        top_idxs = np.argsort(returns)[::-1][:num_idxs]
        best_thetas = thetas[top_idxs]

        # New means and std_devs based on top policies
        means = np.mean(best_thetas, axis=0).reshape(-1)
        # Add some noise because this method converges too quickly. The
        # extra std-dev decays linearly from 0.5 to 0 over the first half
        # of the run and stays at 0 afterwards.
        noise = max(((num_episodes / 2) - i_episode) / num_episodes, 0)
        std_devs = np.std(best_thetas, axis=0).reshape(-1) + noise

        print('Episode: {}'.format(i_episode + 1))
        print('Theta mean: {}'.format(np.mean(means)))
        print('Reward mean: {}'.format(np.mean(returns)))
        print()

    # Hand the fitted distribution back so the learned policy can be reused.
    return means, std_devs

In [8]:
# Build the environment selected by ENV_NAME.
env = gym.make(ENV_NAME)
if 'CartPole' in ENV_NAME:
    # Overrides a private attribute on the object returned by gym.make.
    # Presumably this raises the per-episode step cap to 5000 for CartPole;
    # TODO confirm against the installed gym version.
    env._max_episode_steps = 5000

[2017-09-01 23:55:31,527] Making new env: LunarLander-v2


In [9]:
# Run 100 optimisation iterations with the default population settings
# (pop_size=32, top_pct=0.1); progress is printed once per iteration.
optimize(env, 100)

Episode: 1
Theta mean: 0.5384641091612756
Reward mean: -239.6347200764219

Episode: 2
Theta mean: 0.7426287055091624
Reward mean: -339.3707995380438

Episode: 3
Theta mean: 0.8376276856190654
Reward mean: -343.65047887393354

Episode: 4
Theta mean: 0.8537331322308048
Reward mean: -291.1159082332284

Episode: 5
Theta mean: 0.7444581870644468
Reward mean: -305.8787272701893

Episode: 6
Theta mean: 0.6781371029423133
Reward mean: -288.743269864029

Episode: 7
Theta mean: 0.8626742304255437
Reward mean: -229.5446793036068

Episode: 8
Theta mean: 0.6349169191031888
Reward mean: -203.20513870680415

Episode: 9
Theta mean: 0.7019893858813642
Reward mean: -133.09763408262575

Episode: 10
Theta mean: 0.8264337195121649
Reward mean: -160.19364408722345

Episode: 11
Theta mean: 0.9217584070399235
Reward mean: -146.28843801626084

Episode: 12
Theta mean: 1.1367826017251346
Reward mean: -208.0786079438294

Episode: 13
Theta mean: 1.032969653436359
Reward mean: -138.53732146862026

Episode: 14
Theta