# Hill climbing

In [1]:
import numpy as np
import gym

In [2]:
# Build the Gym environment that the hill-climbing loop interacts with.
ENV_NAME = 'CartPole-v0'
env = gym.make(ENV_NAME)

[2016-11-13 20:46:44,552] Making new env: CartPole-v0


In [3]:
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``.

    The value is evaluated as ``exp(-log(1 + exp(-x)))`` via ``np.logaddexp``
    so that large negative inputs do not overflow ``np.exp`` (the direct
    formula triggers an overflow RuntimeWarning in that range).

    Args:
        x: Real scalar or NumPy array.

    Returns:
        The element-wise logistic value(s), each in the closed range [0, 1].
    """
    # log(1 + exp(-x)) == logaddexp(0, -x), which is computed without overflow.
    return np.exp(-np.logaddexp(0, -x))

In [4]:
# Hill-climbing search over a linear policy.
# Each iteration plays one episode with a candidate weight vector (`new_W`),
# keeps it as the incumbent (`W`) only if its episode reward beats the best
# reward seen so far, then draws the next candidate by adding Gaussian noise
# to the incumbent.
episode = 0
ep_reward = 0
best_reward = 0
converged = False

# Episode reward above which the search is considered converged.
# NOTE(review): if the installed gym caps the episode length for this
# environment, this target may be unreachable and the loop would never
# terminate -- confirm against the gym version in use.
REWARD_TARGET = 1200

# Weights: one per observation component (the policy is obs.dot(weights)).
W = np.random.normal(0.0, 0.1, 4)
new_W = W  # first candidate is the initial incumbent; only ever rebound, never mutated
learning_rate = 1e-1  # scale of the Gaussian noise added to the incumbent

try:
    while not converged:
        # Reset episode
        done = False
        ep_reward = 0
        obs = env.reset()

        # Keep taking new actions until episode is done
        while not done:
            # Get action probability
            action = sigmoid(obs.dot(new_W))
            # Round the probability to an action index (0 or 1)
            action = int(np.round(action))

            # Perform the action
            obs, reward, done, info = env.step(action)
            ep_reward += reward
            # Render only once this episode has beaten the best so far
            if ep_reward > best_reward:
                env.render()
            # Check if converged
            if ep_reward > REWARD_TARGET:
                converged = True
                break

        # "Train" process: accept the candidate only if it improved on the best
        if ep_reward > best_reward:
            W = new_W
            best_reward = ep_reward
        # Add noise to the current weight to obtain the next candidate
        new_W = W + learning_rate * np.random.randn(4)

        # Print episode statistics
        print('Epoch: {} | Episode Reward: {} | Best Reward: {}'.format(episode, ep_reward, best_reward))
        episode += 1

    print('Converged in {} episodes!'.format(episode))
    print('Final weights: {}'.format(W))
finally:
    # Always release the environment (and its render window), even when the
    # loop is interrupted or an error is raised mid-episode.
    env.close()

Epoch: 0 | Episode Reward: 9.0 | Best Reward: 9.0
Epoch: 1 | Episode Reward: 11.0 | Best Reward: 11.0
Epoch: 2 | Episode Reward: 262.0 | Best Reward: 262.0
Epoch: 3 | Episode Reward: 21.0 | Best Reward: 262.0
Epoch: 4 | Episode Reward: 93.0 | Best Reward: 262.0
Epoch: 5 | Episode Reward: 8.0 | Best Reward: 262.0
Epoch: 6 | Episode Reward: 10.0 | Best Reward: 262.0
Epoch: 7 | Episode Reward: 10.0 | Best Reward: 262.0
Epoch: 8 | Episode Reward: 186.0 | Best Reward: 262.0
Epoch: 9 | Episode Reward: 53.0 | Best Reward: 262.0
Epoch: 10 | Episode Reward: 134.0 | Best Reward: 262.0
Epoch: 11 | Episode Reward: 77.0 | Best Reward: 262.0
Epoch: 12 | Episode Reward: 11.0 | Best Reward: 262.0
Epoch: 13 | Episode Reward: 754.0 | Best Reward: 754.0
Epoch: 14 | Episode Reward: 251.0 | Best Reward: 754.0
Epoch: 15 | Episode Reward: 536.0 | Best Reward: 754.0
Epoch: 16 | Episode Reward: 477.0 | Best Reward: 754.0
Epoch: 17 | Episode Reward: 153.0 | Best Reward: 754.0
Epoch: 18 | Episode Reward: 289.0 |