In [2]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory


ENV_NAME = 'CartPole-v0'


# Create the environment, seed NumPy and the environment with the same fixed
# value to make runs more repeatable, and read the number of discrete actions
# (used below as the width of the network's output layer).
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Build a very simple fully connected model: the observation is flattened, passed
# through three 16-unit ReLU layers, and mapped to one linear output per action.
# The leading (1,) in the input shape lines up with window_length=1 used for the
# replay memory below.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit an extra "None" line).
model.summary()

# Configure and compile the agent. Any built-in Keras optimizer and metrics can
# be used here.
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
# `learning_rate` is the current keyword; `lr` is deprecated and triggers a
# warning on recent Keras versions.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

# Train the agent. Visualization is turned off because rendering every step
# slows training down quite a lot. Training can be aborted prematurely with
# Ctrl + C.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, save the final weights (any existing file with the
# same name is overwritten).
dqn.save_weights(f'dqn_{ENV_NAME}_weights.h5f', overwrite=True)

# Finally, evaluate the trained agent for 5 episodes, rendering the environment.
dqn.test(env, nb_episodes=5, visualize=True)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 4)                 0         
                                                                 
 dense_4 (Dense)             (None, 16)                80        
                                                                 
 activation_4 (Activation)   (None, 16)                0         
                                                                 
 dense_5 (Dense)             (None, 16)                272       
                                                                 
 activation_5 (Activation)   (None, 16)                0         
                                                                 
 dense_6 (Dense)             (None, 16)                272       
                                                                 
 activation_6 (Activation)   (None, 16)               

  super().__init__(name, **kwargs)


Training for 50000 steps ...


  updates=self.state_updates,


    26/50000: episode: 1, duration: 0.450s, episode steps:  26, steps per second:  58, episode reward: 26.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.538 [0.000, 1.000],  loss: 0.475732, mae: 0.510283, mean_q: 0.051673
    60/50000: episode: 2, duration: 0.079s, episode steps:  34, steps per second: 428, episode reward: 34.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.618 [0.000, 1.000],  loss: 0.318179, mae: 0.533546, mean_q: 0.293212
    81/50000: episode: 3, duration: 0.050s, episode steps:  21, steps per second: 417, episode reward: 21.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.524 [0.000, 1.000],  loss: 0.112251, mae: 0.600926, mean_q: 0.790559
    96/50000: episode: 4, duration: 0.036s, episode steps:  15, steps per second: 412, episode reward: 15.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.400 [0.000, 1.000],  loss: 0.068393, mae: 0.686725, mean_q: 1.081965
   105/50000: episode: 5, duration: 0.023s, episode steps:   9, step



   124/50000: episode: 6, duration: 0.048s, episode steps:  19, steps per second: 397, episode reward: 19.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.737 [0.000, 1.000],  loss: 0.044988, mae: 0.755872, mean_q: 1.314899
   141/50000: episode: 7, duration: 0.041s, episode steps:  17, steps per second: 415, episode reward: 17.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.647 [0.000, 1.000],  loss: 0.037116, mae: 0.784144, mean_q: 1.405212
   162/50000: episode: 8, duration: 0.050s, episode steps:  21, steps per second: 420, episode reward: 21.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.524 [0.000, 1.000],  loss: 0.034732, mae: 0.854447, mean_q: 1.602230
   173/50000: episode: 9, duration: 0.027s, episode steps:  11, steps per second: 405, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.273 [0.000, 1.000],  loss: 0.025379, mae: 0.894612, mean_q: 1.717820
   186/50000: episode: 10, duration: 0.032s, episode steps:  13, ste

   888/50000: episode: 42, duration: 0.048s, episode steps:  20, steps per second: 415, episode reward: 20.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 0.316441, mae: 3.584287, mean_q: 6.866067
   937/50000: episode: 43, duration: 0.115s, episode steps:  49, steps per second: 426, episode reward: 49.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.490 [0.000, 1.000],  loss: 0.309258, mae: 3.723918, mean_q: 7.134957
   966/50000: episode: 44, duration: 0.068s, episode steps:  29, steps per second: 425, episode reward: 29.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.414 [0.000, 1.000],  loss: 0.313841, mae: 3.875646, mean_q: 7.420795
   988/50000: episode: 45, duration: 0.053s, episode steps:  22, steps per second: 414, episode reward: 22.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.409 [0.000, 1.000],  loss: 0.354139, mae: 4.013767, mean_q: 7.749032
  1000/50000: episode: 46, duration: 0.030s, episode steps:  12,

  4164/50000: episode: 77, duration: 0.463s, episode steps: 200, steps per second: 432, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 2.275429, mae: 17.958168, mean_q: 36.497547
  4354/50000: episode: 78, duration: 0.457s, episode steps: 190, steps per second: 416, episode reward: 190.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.532 [0.000, 1.000],  loss: 2.505318, mae: 18.907856, mean_q: 38.503838
  4554/50000: episode: 79, duration: 0.501s, episode steps: 200, steps per second: 400, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.515 [0.000, 1.000],  loss: 2.701282, mae: 19.726913, mean_q: 40.109444
  4754/50000: episode: 80, duration: 0.464s, episode steps: 200, steps per second: 431, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.510 [0.000, 1.000],  loss: 2.671194, mae: 20.706013, mean_q: 42.036533
  4954/50000: episode: 81, duration: 0.462s, episode

 11028/50000: episode: 112, duration: 0.466s, episode steps: 200, steps per second: 429, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.530 [0.000, 1.000],  loss: 4.998827, mae: 36.866112, mean_q: 74.456787
 11209/50000: episode: 113, duration: 0.421s, episode steps: 181, steps per second: 430, episode reward: 181.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.536 [0.000, 1.000],  loss: 7.997814, mae: 36.872288, mean_q: 74.297134
 11409/50000: episode: 114, duration: 0.466s, episode steps: 200, steps per second: 429, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.525 [0.000, 1.000],  loss: 6.145566, mae: 37.232002, mean_q: 75.010452
 11600/50000: episode: 115, duration: 0.446s, episode steps: 191, steps per second: 429, episode reward: 191.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.534 [0.000, 1.000],  loss: 4.482464, mae: 37.321278, mean_q: 75.318260
 11777/50000: episode: 116, duration: 0.413s, ep

 17898/50000: episode: 147, duration: 0.473s, episode steps: 200, steps per second: 423, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 6.504152, mae: 42.329334, mean_q: 85.315254
 18098/50000: episode: 148, duration: 0.470s, episode steps: 200, steps per second: 426, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.530 [0.000, 1.000],  loss: 10.849400, mae: 42.813080, mean_q: 86.206642
 18298/50000: episode: 149, duration: 0.471s, episode steps: 200, steps per second: 425, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.510 [0.000, 1.000],  loss: 5.694038, mae: 42.834942, mean_q: 86.388573
 18498/50000: episode: 150, duration: 0.470s, episode steps: 200, steps per second: 426, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 6.326753, mae: 42.421688, mean_q: 85.533966
 18698/50000: episode: 151, duration: 0.471s, e

 24836/50000: episode: 182, duration: 0.422s, episode steps: 177, steps per second: 420, episode reward: 177.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.458 [0.000, 1.000],  loss: 9.069106, mae: 46.216587, mean_q: 92.790894
 25029/50000: episode: 183, duration: 0.459s, episode steps: 193, steps per second: 420, episode reward: 193.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.461 [0.000, 1.000],  loss: 10.635406, mae: 46.326035, mean_q: 92.987144
 25210/50000: episode: 184, duration: 0.445s, episode steps: 181, steps per second: 407, episode reward: 181.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.486 [0.000, 1.000],  loss: 7.264589, mae: 45.869221, mean_q: 91.999306
 25410/50000: episode: 185, duration: 0.478s, episode steps: 200, steps per second: 419, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.490 [0.000, 1.000],  loss: 4.790083, mae: 46.071991, mean_q: 92.471893
 25610/50000: episode: 186, duration: 0.477s, e

 31705/50000: episode: 217, duration: 0.484s, episode steps: 200, steps per second: 413, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.495 [0.000, 1.000],  loss: 5.796358, mae: 48.008507, mean_q: 96.641533
 31905/50000: episode: 218, duration: 0.482s, episode steps: 200, steps per second: 415, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.490 [0.000, 1.000],  loss: 4.380370, mae: 48.127960, mean_q: 96.991676
 32105/50000: episode: 219, duration: 0.481s, episode steps: 200, steps per second: 416, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.490 [0.000, 1.000],  loss: 7.839823, mae: 48.875908, mean_q: 98.412804
 32305/50000: episode: 220, duration: 0.482s, episode steps: 200, steps per second: 415, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.490 [0.000, 1.000],  loss: 4.086870, mae: 48.653229, mean_q: 98.197853
 32505/50000: episode: 221, duration: 0.482s, ep

 38693/50000: episode: 252, duration: 0.489s, episode steps: 200, steps per second: 409, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.490 [0.000, 1.000],  loss: 8.962475, mae: 49.747837, mean_q: 99.997025
 38893/50000: episode: 253, duration: 0.487s, episode steps: 200, steps per second: 410, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 5.334423, mae: 49.353340, mean_q: 99.257469
 39093/50000: episode: 254, duration: 0.489s, episode steps: 200, steps per second: 409, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 7.620332, mae: 49.972355, mean_q: 100.383545
 39293/50000: episode: 255, duration: 0.489s, episode steps: 200, steps per second: 409, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.490 [0.000, 1.000],  loss: 8.875725, mae: 50.141342, mean_q: 100.671898
 39493/50000: episode: 256, duration: 0.489s, 

 45693/50000: episode: 287, duration: 0.496s, episode steps: 200, steps per second: 403, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 9.314844, mae: 47.117695, mean_q: 94.379509
 45893/50000: episode: 288, duration: 0.495s, episode steps: 200, steps per second: 404, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 7.247159, mae: 47.329552, mean_q: 94.751129
 46093/50000: episode: 289, duration: 0.496s, episode steps: 200, steps per second: 404, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 12.560661, mae: 47.278534, mean_q: 94.414787
 46293/50000: episode: 290, duration: 0.497s, episode steps: 200, steps per second: 402, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 9.273166, mae: 47.309994, mean_q: 94.671310
 46493/50000: episode: 291, duration: 0.496s, e

<keras.callbacks.History at 0x7fcf257734f0>