In [1]:
from __future__ import absolute_import
import gym
import numpy as np
import random
import time
import matplotlib.pyplot as plt

%matplotlib inline
# Notebook-wide matplotlib defaults; every figure created after this cell picks them up.
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
# Show image pixels as-is (no smoothing between neighbouring pixels).
plt.rcParams['image.interpolation'] = 'nearest'
# Default colormap used when displaying images.
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2



In [2]:
# Classical Control
## Note: the controller's output is discrete, which complicates the original intention of using classical PID control.

## HyperParameters
# When True, the classical-control loop in the next cell calls env.render() on every step.
render = True

# https://gym.openai.com/docs

# Observation layout for CartPole-v0, per the Gym CartPole source. (The layout originally
# guessed here from https://gym.openai.com/evaluations/eval_VQwN8kRESjakUPwJbRlq5Q was incorrect.)
# observation[0] = cart position
# observation[1] = cart velocity
# observation[2] = pole angle
# observation[3] = pole angular velocity

In [3]:
# Main: drive CartPole with a hand-written bang-bang rule (no learning) and
# report the reward collected in each episode.
env = gym.make('CartPole-v0')
num_episodes = 20
total_time = 100  # step budget per episode; also the denominator in the report below

try:
    for i_episode in range(num_episodes):
        observation = env.reset()
        total_reward = 0
        for t in range(total_time):
            if render:
                env.render()

            # Threshold rule on observation[2] (pole angle) and observation[3]:
            # push right when the angle is positive unless observation[3] is
            # already below -1, or whenever observation[3] exceeds 1.
            if (observation[2] > 0 and observation[3] > -1) or observation[3] > 1:
                action = 1 # right
            else:
                action = 0 # left

            observation, reward, done, info = env.step(action)

            total_reward += reward

            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break

        print("Total Reward for Episode {} is {} / {}".format(i_episode, total_reward, total_time))
finally:
    # No monitor is started in this cell, so closing `env.monitor` did not
    # clean anything up. env.close() is the environment's own cleanup hook and
    # now runs even when the loop is interrupted.
    env.close()


[2016-07-08 17:14:55,950] Making new env: CartPole-v0


Total Reward for Episode 0 is 100.0 / 100
Total Reward for Episode 1 is 100.0 / 100
Total Reward for Episode 2 is 100.0 / 100
Total Reward for Episode 3 is 100.0 / 100
Total Reward for Episode 4 is 100.0 / 100
Total Reward for Episode 5 is 100.0 / 100
Total Reward for Episode 6 is 100.0 / 100
Total Reward for Episode 7 is 100.0 / 100
Total Reward for Episode 8 is 100.0 / 100
Total Reward for Episode 9 is 100.0 / 100
Total Reward for Episode 10 is 100.0 / 100
Total Reward for Episode 11 is 100.0 / 100
Total Reward for Episode 12 is 100.0 / 100
Total Reward for Episode 13 is 100.0 / 100
Total Reward for Episode 14 is 100.0 / 100
Total Reward for Episode 15 is 100.0 / 100
Total Reward for Episode 16 is 100.0 / 100
Total Reward for Episode 17 is 100.0 / 100
Total Reward for Episode 18 is 100.0 / 100
Total Reward for Episode 19 is 100.0 / 100


In [18]:
# This cell was written by Marco Tamassia; All credit to Marco: https://bitbucket.org/marcotamassia/deep-rl
# Ported from Python3 using 3to2: 3to2 -w foo.py
# Minor edits are marked below.

from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from itertools import izip
from keras.optimizers import SGD


class RingBuffer(object):
    u"""
    A multi-field ring buffer using numpy arrays.

    Adapted from https://scimusing.wordpress.com/2013/10/25/ring-buffers-in-pythonnumpy/

    One 2-D array is allocated per field, each with ``memory_size`` rows. Row
    ``i`` of every array belongs to the same logical entry. Once the buffer is
    full, each new entry overwrites the oldest one.

    Attributes:
        index: physical row the next ``append`` will write to.
        size: number of entries currently stored (at most ``max_size``).
        max_size: capacity of the buffer in entries.
        data: tuple of numpy arrays, one per field.
    """
    def __init__(self, memory_size, entries_shape):
        u"""
        Args:
            memory_size: maximum number of entries kept.
            entries_shape: iterable of ``(width, dtype)`` pairs, one per field.
        """
        self.index = 0
        self.size = 0
        self.data = tuple(
            np.zeros((memory_size, size), dtype=dtype)
            for size, dtype in entries_shape
        )
        self.max_size = memory_size

    def append(self, row):
        u"""Store one entry; ``row`` supplies one value (or sequence) per field."""
        # Built-in zip instead of itertools.izip: identical result for this
        # short pairing, and it does not depend on a Python-2-only name.
        for data, new_data in zip(self.data, row):
            data[self.index, :] = new_data
        self.index = (self.index + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def __getitem__(self, indices):
        u"""
        Return the entries at logical position(s) ``indices`` as a tuple of arrays.

        Position 0 is the oldest stored entry and -1 the newest. ``indices``
        may be an int or a numpy integer array; values wrap modulo ``size``.

        Raises:
            IndexError: if the buffer holds no entries yet.
        """
        if self.size == 0:
            raise IndexError(u"ring buffer is empty")
        # The oldest entry sits `size` rows behind the write head. Counting
        # from there (not from the write head itself) keeps non-negative
        # indices away from unwritten rows while the buffer is still filling.
        oldest = self.index - self.size
        positions = (oldest + indices % self.size) % self.max_size
        return tuple(data[positions, :] for data in self.data)

    def __get__(self, indices):
        u"""
        Deprecated spelling of ``buffer[indices]``.

        ``__get__`` is the descriptor hook, not the indexing hook, so the
        original name never enabled ``buffer[i]``. It is kept only so that
        existing direct calls keep working.
        """
        return self[indices]

    def get_random_entries(self, n):
        u"""Return ``n`` stored entries sampled uniformly at random, with replacement."""
        indices = np.random.randint(0, self.size, n)
        return tuple(data[indices, :] for data in self.data)


class Experience(object):
    u"""
    Experience pool, used to generate batches of random past experience.

    Each remembered transition is ``(state, action, reward, new_state,
    game_over)``. ``get_batch`` turns a random sample of transitions into
    Q-learning regression targets for a model exposing ``predict``.
    """
    def __init__(self, state_size, memory_size, discount):
        u"""
        Args:
            state_size: number of values in one state vector.
            memory_size: maximum number of transitions kept.
            discount: factor applied to the bootstrapped next-state value.
        """
        self.discount = discount
        self.memory = RingBuffer(
            memory_size=memory_size,
            entries_shape=(
                (state_size, float),
                (1, int),
                (1, float),
                (state_size, float),
                (1, bool)
            )
        )

    def remember(self, state, action, reward, new_state, game_over):
        u"""Store one transition in the pool."""
        self.memory.append((state, action, reward, new_state, game_over))

    def get_batch(self, model, batch_size):
        u"""
        Sample up to ``batch_size`` transitions and build training targets.

        Targets start as the model's current predictions. Only the entry for
        the action actually taken is replaced: by the reward, plus
        ``discount * max(model.predict(next_state))`` when the transition was
        not terminal.

        Args:
            model: object with ``predict(states)`` returning one row of
                per-action values per input row.
            batch_size: upper bound on the number of sampled transitions.

        Returns:
            ``(inputs, targets)``: the sampled states and their target rows.
        """
        n_rows = min(self.memory.size, batch_size)

        states, actions, rewards, next_states, game_over = \
            self.memory.get_random_entries(n_rows)
        actions, rewards = actions.flatten(), rewards.flatten()
        # Explicit row numbers of non-terminal transitions. The original
        # indexed with the tuple returned by np.where, which only worked
        # through implicit coercion to a (1, k) index array.
        live_rows = np.where(~game_over.flatten())[0]

        targets = model.predict(states)
        targets[np.arange(len(actions)), actions] = rewards
        # Skip the second forward pass when every sampled transition is
        # terminal; there is nothing to bootstrap from.
        if live_rows.size:
            best_next = np.max(model.predict(next_states[live_rows, :]), axis=1)
            targets[live_rows, actions[live_rows]] += self.discount * best_next
        return states, targets


def make_net(num_actions, state_size, hidden_size):
    u"""
    Build and compile the action-value network.

    Layers, in order: a dense layer of ``hidden_size`` units over a
    ``state_size`` input, ReLU, dropout at 0.2, and a dense output layer with
    one unit per action.

    Returns:
        ``(model, optimizer)``: the compiled model and its SGD optimizer.
    """
    network = Sequential([
        Dense(hidden_size, input_shape=(state_size,)),
        Activation(u'relu'),
        Dropout(0.2),
        Dense(num_actions),
    ])
    # The optimizer is built explicitly (an addition to the ported code) so it
    # can be handed back to the caller alongside the model.
    optimizer = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    network.compile(optimizer=optimizer, loss=u"MSE")
    return network, optimizer


def do_the_thing(episodes=500, batch_size=100, epsilon=0.1, epsilon_decay=1e-03,
                 hidden_size=1000, experience_pool_size=3000, discount=0.99,
                 render=True):
    u"""
    Train a Q-network on CartPole-v0 with epsilon-greedy exploration and
    experience replay, printing one summary line per episode.

    Calling with no arguments reproduces the original hard-coded settings.

    Args:
        episodes: number of episodes to run (changed from 1000 to 500 when ported).
        batch_size: maximum number of replayed transitions per training step.
        epsilon: initial probability of taking a random action.
        epsilon_decay: after every step, epsilon is multiplied by
            ``1 - epsilon_decay``.
        hidden_size: units in the network's hidden layer.
        experience_pool_size: capacity of the replay memory.
        discount: discount factor used when building training targets.
        render: draw the environment on every step when True.
    """
    env = gym.make(u"CartPole-v0")
    # Take the state width from the declared observation space instead of the
    # environment's internal `state` attribute.
    state_size = env.observation_space.shape[0]
    exp = Experience(state_size, experience_pool_size, discount)
    model, _ = make_net(
        num_actions=env.action_space.n,
        state_size=state_size,
        hidden_size=hidden_size
    )

    try:
        for ep_n in range(episodes):
            state = env.reset()
            game_over = False
            loss, ret, steps = 0.0, 0, 0
            while not game_over:
                if render:
                    env.render()

                # Choose an epsilon-greedy action
                if random.random() <= epsilon:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(model.predict(state[np.newaxis,:])[0])
                # Exploration decays per step, not per episode.
                epsilon *= (1 - epsilon_decay)

                # Collect experience
                new_state, reward, game_over, info = env.step(action)
                # Shaped reward replaces the environment's own: -1 on
                # termination, otherwise 1 minus ten times |new_state[2]|.
                reward = -1 if game_over else 1 - abs(new_state[2])*10
                exp.remember(state, action, reward, new_state, game_over)
                state = new_state

                # Train model and update stats
                inputs, targets = exp.get_batch(model, batch_size=batch_size)
                loss += model.train_on_batch(inputs, targets)
                ret += reward
                steps += 1
            # Single-argument print(...) is valid under both Python 2 and 3.
            print(u"Episode {:03d}/{:03d} | Loss {:.3f} | Return {:.3f} | Steps {:d}".format(ep_n+1, episodes, loss, ret, steps))
    finally:
        # Run the environment's cleanup hook even if training is interrupted.
        env.close()


In [19]:
# Start training. The run renders every step and spans hundreds of episodes,
# so interrupt the kernel to stop it early.
do_the_thing()

[2016-07-08 17:40:17,871] Making new env: CartPole-v0


Episode 001/500 | Loss 1.349 | Return -0.024 | Steps 9
0.00999999977648
Episode 002/500 | Loss 1.938 | Return 2.450 | Steps 11
0.00999999977648
Episode 003/500 | Loss 1.309 | Return 2.200 | Steps 9
0.00999999977648
Episode 004/500 | Loss 1.264 | Return 0.896 | Steps 8
0.00999999977648
Episode 005/500 | Loss 1.364 | Return 2.788 | Steps 11
0.00999999977648
Episode 006/500 | Loss 1.114 | Return 2.927 | Steps 11
0.00999999977648
Episode 007/500 | Loss 0.932 | Return 1.107 | Steps 10
0.00999999977648
Episode 008/500 | Loss 1.382 | Return 0.816 | Steps 11
0.00999999977648
Episode 009/500 | Loss 0.986 | Return 0.221 | Steps 9
0.00999999977648
Episode 010/500 | Loss 1.290 | Return 1.317 | Steps 10
0.00999999977648
Episode 011/500 | Loss 1.319 | Return 1.924 | Steps 10
0.00999999977648
Episode 012/500 | Loss 1.480 | Return 2.150 | Steps 12
0.00999999977648
Episode 013/500 | Loss 1.304 | Return 2.883 | Steps 12
0.00999999977648
Episode 014/500 | Loss 2.506 | Return 3.975 | Steps 14
0.0099999997

[2016-07-08 17:40:29,717] Observation '[ 2.45108652  2.36256872  0.09687993  0.03139206]' is not contained within observation space 'Box(4,)'.


Episode 048/500 | Loss 16.409 | Return -17.310 | Steps 105
0.00999999977648
Episode 049/500 | Loss 23.231 | Return 47.928 | Steps 122
0.00999999977648
Episode 050/500 | Loss 11.790 | Return 20.335 | Steps 38
0.00999999977648
Episode 051/500 | Loss 6.207 | Return 0.527 | Steps 13
0.00999999977648
Episode 052/500 | Loss 8.541 | Return 2.389 | Steps 11
0.00999999977648
Episode 053/500 | Loss 9.500 | Return 1.359 | Steps 10
0.00999999977648
Episode 054/500 | Loss 15.141 | Return 3.575 | Steps 10
0.00999999977648
Episode 055/500 | Loss 21.992 | Return 0.728 | Steps 9
0.00999999977648
Episode 056/500 | Loss 117.714 | Return 3.425 | Steps 10
0.00999999977648
Episode 057/500 | Loss 176.230 | Return 3.177 | Steps 10
0.00999999977648
Episode 058/500 | Loss 136.694 | Return 6.820 | Steps 14
0.00999999977648
Episode 059/500 | Loss 231.249 | Return 1.179 | Steps 12
0.00999999977648
Episode 060/500 | Loss 49.024 | Return 7.061 | Steps 24
0.00999999977648
Episode 061/500 | Loss 28.627 | Return 5.575 

KeyboardInterrupt: 