In [1]:
import numpy as np
import gym

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

Using Theano backend.


In [7]:
ENV_NAME = 'CartPole-v0'
SEED = 123

# Build the environment first, then seed NumPy's global RNG and the
# environment with the same value.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)

# Size of the environment's action space; reused below as the width of the
# network's output layer and passed to the agent.
nb_actions = env.action_space.n

[2017-08-10 15:53:35,580] Making new env: CartPole-v0


In [8]:
# Minimal Q-network: flatten the observation, one hidden layer of 16 ReLU
# units, then one linear output per action.
model = Sequential()
# The leading (1,) axis lines up with window_length=1 used for the agent's
# memory; Flatten collapses it so Dense sees a flat observation vector.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_4 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
# Epsilon-greedy exploration, constructed with the library's default settings.
policy = EpsGreedyQPolicy()
# Experience memory; window_length=1 matches the leading (1,) axis of the
# model's input_shape.
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    # NOTE(review): presumably a value < 1 selects soft target-network
    # updates in keras-rl -- confirm against the DQNAgent documentation.
    target_model_update=1e-2,
    policy=policy,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train the agent. Rendering (visualize=True) is kept for demonstration but
# slows training down considerably. The returned History object is bound to
# a name so the per-episode metrics stay available and its repr is not
# echoed as cell output.
train_history = dqn.fit(env, nb_steps=5000, visualize=True, verbose=2)

Training for 5000 steps ...
   10/5000: episode: 1, duration: 0.212s, episode steps: 10, steps per second: 47, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.132 [-1.967, 3.014], loss: --, mean_absolute_error: --, mean_q: --




   19/5000: episode: 2, duration: 0.768s, episode steps: 9, steps per second: 12, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.111 [0.000, 1.000], mean observation: 0.161 [-1.326, 2.303], loss: 0.504959, mean_absolute_error: 0.616374, mean_q: 0.271113




   33/5000: episode: 3, duration: 0.234s, episode steps: 14, steps per second: 60, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.214 [0.000, 1.000], mean observation: 0.125 [-1.517, 2.553], loss: 0.436633, mean_absolute_error: 0.580476, mean_q: 0.333729
   42/5000: episode: 4, duration: 0.152s, episode steps: 9, steps per second: 59, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.137 [-1.783, 2.784], loss: 0.405209, mean_absolute_error: 0.571296, mean_q: 0.396374
   53/5000: episode: 5, duration: 0.183s, episode steps: 11, steps per second: 60, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.091 [0.000, 1.000], mean observation: 0.148 [-1.915, 2.968], loss: 0.374215, mean_absolute_error: 0.557008, mean_q: 0.457941
   65/5000: episode: 6, duration: 0.200s, episode steps: 12, steps per second: 60, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0

  322/5000: episode: 33, duration: 0.150s, episode steps: 9, steps per second: 60, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.145 [-1.737, 2.764], loss: 0.340623, mean_absolute_error: 1.074081, mean_q: 2.470546
  331/5000: episode: 34, duration: 0.154s, episode steps: 9, steps per second: 58, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.156 [-1.712, 2.836], loss: 0.314549, mean_absolute_error: 1.073343, mean_q: 2.458197
  339/5000: episode: 35, duration: 0.132s, episode steps: 8, steps per second: 61, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.144 [-1.590, 2.565], loss: 0.334980, mean_absolute_error: 1.093307, mean_q: 2.421076
  350/5000: episode: 36, duration: 0.180s, episode steps: 11, steps per second: 61, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0

  630/5000: episode: 64, duration: 0.166s, episode steps: 10, steps per second: 60, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.100 [0.000, 1.000], mean observation: 0.135 [-1.523, 2.481], loss: 0.289703, mean_absolute_error: 1.816219, mean_q: 3.848078
  641/5000: episode: 65, duration: 0.188s, episode steps: 11, steps per second: 58, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.182 [0.000, 1.000], mean observation: 0.118 [-1.578, 2.436], loss: 0.255801, mean_absolute_error: 1.835965, mean_q: 3.895579
  651/5000: episode: 66, duration: 0.162s, episode steps: 10, steps per second: 62, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.100 [0.000, 1.000], mean observation: 0.132 [-1.548, 2.481], loss: 0.231920, mean_absolute_error: 1.842907, mean_q: 3.988597
  661/5000: episode: 67, duration: 0.164s, episode steps: 10, steps per second: 61, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean act

  925/5000: episode: 93, duration: 0.167s, episode steps: 10, steps per second: 60, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.113 [-1.564, 2.350], loss: 0.241356, mean_absolute_error: 2.470480, mean_q: 5.040570
  935/5000: episode: 94, duration: 0.166s, episode steps: 10, steps per second: 60, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.128 [-1.585, 2.415], loss: 0.227599, mean_absolute_error: 2.482512, mean_q: 5.075848
  943/5000: episode: 95, duration: 0.133s, episode steps: 8, steps per second: 60, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.125 [0.000, 1.000], mean observation: 0.160 [-1.142, 2.028], loss: 0.213274, mean_absolute_error: 2.459292, mean_q: 5.005486
  952/5000: episode: 96, duration: 0.152s, episode steps: 9, steps per second: 59, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action:

 1222/5000: episode: 123, duration: 0.163s, episode steps: 10, steps per second: 61, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.900 [0.000, 1.000], mean observation: -0.162 [-2.614, 1.531], loss: 1.215921, mean_absolute_error: 3.270322, mean_q: 6.131730
 1233/5000: episode: 124, duration: 0.185s, episode steps: 11, steps per second: 60, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.142 [-3.306, 2.149], loss: 1.508476, mean_absolute_error: 3.302009, mean_q: 6.162461
 1242/5000: episode: 125, duration: 0.149s, episode steps: 9, steps per second: 61, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.138 [-2.783, 1.756], loss: 0.897659, mean_absolute_error: 3.392080, mean_q: 6.305478
 1250/5000: episode: 126, duration: 0.130s, episode steps: 8, steps per second: 62, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean 

 2375/5000: episode: 152, duration: 0.667s, episode steps: 40, steps per second: 60, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.450 [0.000, 1.000], mean observation: -0.168 [-1.136, 0.733], loss: 2.861567, mean_absolute_error: 6.101355, mean_q: 11.344931
 2423/5000: episode: 153, duration: 0.799s, episode steps: 48, steps per second: 60, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.458 [0.000, 1.000], mean observation: -0.140 [-0.906, 0.659], loss: 2.120062, mean_absolute_error: 6.239829, mean_q: 11.729821
 2471/5000: episode: 154, duration: 0.803s, episode steps: 48, steps per second: 60, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.458 [0.000, 1.000], mean observation: -0.152 [-0.901, 0.562], loss: 2.615871, mean_absolute_error: 6.364263, mean_q: 11.911275
 2523/5000: episode: 155, duration: 0.859s, episode steps: 52, steps per second: 61, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000]

 3605/5000: episode: 181, duration: 0.613s, episode steps: 37, steps per second: 60, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.459 [0.000, 1.000], mean observation: -0.112 [-0.659, 0.420], loss: 3.488212, mean_absolute_error: 8.999782, mean_q: 17.196478
 3632/5000: episode: 182, duration: 0.447s, episode steps: 27, steps per second: 60, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.444 [0.000, 1.000], mean observation: -0.112 [-0.690, 0.357], loss: 4.030760, mean_absolute_error: 9.083974, mean_q: 17.324644
 3657/5000: episode: 183, duration: 0.418s, episode steps: 25, steps per second: 60, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.440 [0.000, 1.000], mean observation: -0.112 [-0.633, 0.375], loss: 3.510044, mean_absolute_error: 9.089281, mean_q: 17.313381
 3686/5000: episode: 184, duration: 0.476s, episode steps: 29, steps per second: 61, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000]

 4595/5000: episode: 210, duration: 0.581s, episode steps: 35, steps per second: 60, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: -0.078 [-0.962, 0.343], loss: 4.577616, mean_absolute_error: 10.823954, mean_q: 20.768909
 4632/5000: episode: 211, duration: 0.620s, episode steps: 37, steps per second: 60, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.459 [0.000, 1.000], mean observation: -0.124 [-0.627, 0.355], loss: 4.812831, mean_absolute_error: 10.816339, mean_q: 20.779085
 4673/5000: episode: 212, duration: 0.683s, episode steps: 41, steps per second: 60, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: -0.101 [-0.737, 0.348], loss: 4.687599, mean_absolute_error: 10.842155, mean_q: 20.817657
 4706/5000: episode: 213, duration: 0.549s, episode steps: 33, steps per second: 60, episode reward: 33.000, mean reward: 1.000 [1.000, 1.0

<keras.callbacks.History at 0x208ca18bf28>

In [6]:
# Evaluate the trained agent for five rendered episodes.
# NOTE(review): this cell's execution counter (6) is lower than the training
# cell's (9), so the output recorded below came from an earlier kernel state;
# re-run after training (Restart & Run All) before trusting the numbers.
# The returned History object is bound to a name so the episode results stay
# available and its repr is not echoed as cell output.
test_history = dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 81.000, steps: 81
Episode 2: reward: 70.000, steps: 70
Episode 3: reward: 79.000, steps: 79
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200


<keras.callbacks.History at 0x2083faad588>