In [1]:
import gym

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

In [3]:
from rl.agents.dqn import DQNAgent

In [4]:
# Gym environment id; passed to gym.make() when the environment is created.
env_name = "CartPole-v0"

In [5]:
# Instantiate the environment; this same `env` object is reused for the random
# rollout, training, and testing cells below.
env = gym.make(env_name)

In [6]:
# Sanity check: drive the environment with uniformly sampled actions for 200
# rendered steps before any training happens.
env.reset()
try:
    for step in range(200):
        env.render(mode="human")
        action = env.action_space.sample()
        # Index 2 of the step() result is the episode-finished flag.
        done = env.step(action)[2]
        if done:
            # Stepping a finished episode is undefined behaviour in gym (and
            # emits a warning), so begin a fresh episode instead.
            env.reset()
finally:
    # Always tear down the render window, even if the loop is interrupted.
    env.close()



In [7]:
# Size of the discrete action space; used as the width of the network's output
# layer and as DQNAgent's nb_actions.
num_actions = env.action_space.n

In [8]:
# NOTE: despite the name this is the observation *shape* (a tuple), not a
# count; it is concatenated with (1,) to form the model's input_shape.
num_observations = env.observation_space.shape

In [9]:
# Q-network: flatten the (1, *observation_shape) input, run it through two
# ReLU hidden layers (16 and 32 units), and emit one linear output per action.
model = Sequential([
    Flatten(input_shape=(1,) + num_observations),
    Dense(16),
    Activation("relu"),
    Dense(32),
    Activation("relu"),
    Dense(num_actions),
    Activation("linear"),
])


In [10]:
# Print the layer-by-layer output shapes and parameter counts of the network.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 4)                 0         
_________________________________________________________________
dense (Dense)                (None, 16)                80        
_________________________________________________________________
activation (Activation)      (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                544       
_________________________________________________________________
activation_1 (Activation)    (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0

In [11]:
from rl.memory import SequentialMemory

In [12]:
# Experience replay buffer holding up to 20000 entries. window_length=1
# matches the leading 1 in the model's input_shape.
memory = SequentialMemory(
    limit=20000,
    window_length=1,
)

In [13]:
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy

In [14]:
# Epsilon-greedy exploration whose `eps` attribute is annealed linearly from
# 1.0 down to 0.1 over 20000 steps; 0.05 is the value configured for testing.
policy = LinearAnnealedPolicy(
    inner_policy=EpsGreedyQPolicy(),
    attr="eps",
    value_max=1.0,
    value_min=0.1,
    value_test=0.05,
    nb_steps=20000,
)

In [15]:
# Assemble the DQN agent from the Q-network, replay memory and exploration
# policy defined in the cells above.
dqn = DQNAgent(
    model=model,
    nb_actions=num_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=100,
    policy=policy,
)

In [16]:
# Compile the agent with an Adam optimizer (learning rate 1e-3) and track mean
# absolute error as an extra metric in the training log.
dqn.compile(
    optimizer=Adam(learning_rate=1e-3),
    metrics=["mae"],
)

In [18]:
# Train for 20000 environment steps without rendering; verbose=2 prints one
# log line per finished episode.
dqn.fit(
    env,
    nb_steps=20000,
    visualize=False,
    verbose=2,
)

Training for 20000 steps ...




    11/20000: episode: 1, duration: 3.651s, episode steps:  11, steps per second:   3, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.364 [0.000, 1.000],  loss: --, mae: --, mean_q: --, mean_eps: --




    36/20000: episode: 2, duration: 0.433s, episode steps:  25, steps per second:  58, episode reward: 25.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.600 [0.000, 1.000],  loss: 0.482980, mae: 0.550593, mean_q: 0.179799, mean_eps: 0.998965
    56/20000: episode: 3, duration: 0.260s, episode steps:  20, steps per second:  77, episode reward: 20.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.400 [0.000, 1.000],  loss: 0.291637, mae: 0.528614, mean_q: 0.411374, mean_eps: 0.997952
    71/20000: episode: 4, duration: 0.176s, episode steps:  15, steps per second:  85, episode reward: 15.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.600 [0.000, 1.000],  loss: 0.154829, mae: 0.539721, mean_q: 0.677994, mean_eps: 0.997165
    86/20000: episode: 5, duration: 0.173s, episode steps:  15, steps per second:  87, episode reward: 15.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.533 [0.000, 1.000],  loss: 0.070728, mae: 0.574117, mean_q: 0.949365, mean_ep

   722/20000: episode: 35, duration: 0.341s, episode steps:  22, steps per second:  64, episode reward: 22.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 0.764654, mae: 4.002561, mean_q: 7.493076, mean_eps: 0.968028
   739/20000: episode: 36, duration: 0.180s, episode steps:  17, steps per second:  95, episode reward: 17.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.353 [0.000, 1.000],  loss: 0.808484, mae: 4.046058, mean_q: 7.659423, mean_eps: 0.967150
   761/20000: episode: 37, duration: 0.232s, episode steps:  22, steps per second:  95, episode reward: 22.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.591 [0.000, 1.000],  loss: 0.869872, mae: 4.035508, mean_q: 7.558168, mean_eps: 0.966273
   781/20000: episode: 38, duration: 0.256s, episode steps:  20, steps per second:  78, episode reward: 20.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.600 [0.000, 1.000],  loss: 0.703331, mae: 4.025028, mean_q: 7.664268, mea

  1518/20000: episode: 69, duration: 0.291s, episode steps:  29, steps per second:  99, episode reward: 29.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.552 [0.000, 1.000],  loss: 1.226316, mae: 6.773291, mean_q: 13.250874, mean_eps: 0.932365
  1543/20000: episode: 70, duration: 0.260s, episode steps:  25, steps per second:  96, episode reward: 25.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.440 [0.000, 1.000],  loss: 1.186306, mae: 6.909901, mean_q: 13.662804, mean_eps: 0.931150
  1567/20000: episode: 71, duration: 0.258s, episode steps:  24, steps per second:  93, episode reward: 24.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.458 [0.000, 1.000],  loss: 1.234279, mae: 6.954340, mean_q: 13.740529, mean_eps: 0.930048
  1589/20000: episode: 72, duration: 0.338s, episode steps:  22, steps per second:  65, episode reward: 22.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 1.255546, mae: 6.905722, mean_q: 13.634317,

  2305/20000: episode: 102, duration: 0.231s, episode steps:  21, steps per second:  91, episode reward: 21.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.619 [0.000, 1.000],  loss: 1.254059, mae: 9.930593, mean_q: 20.095972, mean_eps: 0.896770
  2337/20000: episode: 103, duration: 0.341s, episode steps:  32, steps per second:  94, episode reward: 32.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 1.269908, mae: 10.519327, mean_q: 21.401091, mean_eps: 0.895578
  2353/20000: episode: 104, duration: 0.163s, episode steps:  16, steps per second:  98, episode reward: 16.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.375 [0.000, 1.000],  loss: 1.151393, mae: 10.390877, mean_q: 21.222381, mean_eps: 0.894497
  2369/20000: episode: 105, duration: 0.178s, episode steps:  16, steps per second:  90, episode reward: 16.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.625 [0.000, 1.000],  loss: 2.356087, mae: 10.563463, mean_q: 21.

  3301/20000: episode: 134, duration: 0.302s, episode steps:  27, steps per second:  89, episode reward: 27.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.519 [0.000, 1.000],  loss: 2.101375, mae: 15.051253, mean_q: 30.737646, mean_eps: 0.852085
  3312/20000: episode: 135, duration: 0.132s, episode steps:  11, steps per second:  83, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.455 [0.000, 1.000],  loss: 1.765360, mae: 15.207394, mean_q: 30.888405, mean_eps: 0.851230
  3344/20000: episode: 136, duration: 0.385s, episode steps:  32, steps per second:  83, episode reward: 32.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.438 [0.000, 1.000],  loss: 1.779641, mae: 15.182363, mean_q: 31.041633, mean_eps: 0.850262
  3364/20000: episode: 137, duration: 0.217s, episode steps:  20, steps per second:  92, episode reward: 20.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.450 [0.000, 1.000],  loss: 2.225126, mae: 14.941394, mean_q: 30

  4490/20000: episode: 166, duration: 0.940s, episode steps:  85, steps per second:  90, episode reward: 85.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.553 [0.000, 1.000],  loss: 3.780527, mae: 21.632459, mean_q: 44.539071, mean_eps: 0.799885
  4522/20000: episode: 167, duration: 0.365s, episode steps:  32, steps per second:  88, episode reward: 32.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.469 [0.000, 1.000],  loss: 4.183976, mae: 22.085423, mean_q: 45.599017, mean_eps: 0.797253
  4531/20000: episode: 168, duration: 0.100s, episode steps:   9, steps per second:  90, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.333 [0.000, 1.000],  loss: 3.814010, mae: 21.754590, mean_q: 44.960398, mean_eps: 0.796330
  4546/20000: episode: 169, duration: 0.177s, episode steps:  15, steps per second:  85, episode reward: 15.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.400 [0.000, 1.000],  loss: 3.539459, mae: 22.641770, mean_q: 46

  5613/20000: episode: 198, duration: 0.367s, episode steps:  33, steps per second:  90, episode reward: 33.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.606 [0.000, 1.000],  loss: 7.672724, mae: 29.039780, mean_q: 59.603796, mean_eps: 0.748180
  5637/20000: episode: 199, duration: 0.263s, episode steps:  24, steps per second:  91, episode reward: 24.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.458 [0.000, 1.000],  loss: 4.999712, mae: 29.166821, mean_q: 59.990496, mean_eps: 0.746898
  5652/20000: episode: 200, duration: 0.174s, episode steps:  15, steps per second:  86, episode reward: 15.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.400 [0.000, 1.000],  loss: 8.399810, mae: 29.490272, mean_q: 60.835766, mean_eps: 0.746020
  5682/20000: episode: 201, duration: 0.330s, episode steps:  30, steps per second:  91, episode reward: 30.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.467 [0.000, 1.000],  loss: 9.627362, mae: 29.580751, mean_q: 60

  7035/20000: episode: 230, duration: 0.493s, episode steps:  41, steps per second:  83, episode reward: 41.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.488 [0.000, 1.000],  loss: 15.562431, mae: 39.014076, mean_q: 79.827145, mean_eps: 0.684370
  7118/20000: episode: 231, duration: 0.892s, episode steps:  83, steps per second:  93, episode reward: 83.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.494 [0.000, 1.000],  loss: 14.181838, mae: 39.529860, mean_q: 80.962977, mean_eps: 0.681580
  7152/20000: episode: 232, duration: 0.373s, episode steps:  34, steps per second:  91, episode reward: 34.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.441 [0.000, 1.000],  loss: 21.405275, mae: 39.640001, mean_q: 80.962864, mean_eps: 0.678947
  7181/20000: episode: 233, duration: 0.316s, episode steps:  29, steps per second:  92, episode reward: 29.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.621 [0.000, 1.000],  loss: 11.687950, mae: 39.538839, mean_q

  9974/20000: episode: 262, duration: 2.152s, episode steps: 200, steps per second:  93, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.510 [0.000, 1.000],  loss: 30.107876, mae: 60.259772, mean_q: 123.173327, mean_eps: 0.555692
 10085/20000: episode: 263, duration: 1.194s, episode steps: 111, steps per second:  93, episode reward: 111.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.532 [0.000, 1.000],  loss: 30.226055, mae: 61.229475, mean_q: 125.461075, mean_eps: 0.548695
 10135/20000: episode: 264, duration: 0.542s, episode steps:  50, steps per second:  92, episode reward: 50.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.520 [0.000, 1.000],  loss: 34.520819, mae: 62.173015, mean_q: 127.481331, mean_eps: 0.545072
 10276/20000: episode: 265, duration: 1.518s, episode steps: 141, steps per second:  93, episode reward: 141.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.539 [0.000, 1.000],  loss: 26.974656, mae: 62.703698, 

 14523/20000: episode: 294, duration: 2.150s, episode steps: 200, steps per second:  93, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.515 [0.000, 1.000],  loss: 35.384257, mae: 85.070506, mean_q: 173.565745, mean_eps: 0.350987
 14723/20000: episode: 295, duration: 2.127s, episode steps: 200, steps per second:  94, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.520 [0.000, 1.000],  loss: 47.378766, mae: 85.191408, mean_q: 173.758348, mean_eps: 0.341987
 14923/20000: episode: 296, duration: 2.181s, episode steps: 200, steps per second:  92, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.515 [0.000, 1.000],  loss: 43.784523, mae: 85.289763, mean_q: 173.807012, mean_eps: 0.332987
 15123/20000: episode: 297, duration: 2.121s, episode steps: 200, steps per second:  94, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.505 [0.000, 1.000],  loss: 49.772763, mae: 85.170283,

<keras.callbacks.History at 0x2178cd2e0b8>

In [19]:
# Persist the trained weights, replacing any existing file at this path.
# (Plain string literal: the original f-string had no placeholders.)
dqn.save_weights("my_weights_cartpole.h5f", overwrite=True)

In [20]:
# Run the agent for five rendered evaluation episodes.
try:
    dqn.test(env, nb_episodes=5, visualize=True)
finally:
    # Close the render window even if evaluation is interrupted or fails.
    env.close()

Testing for 5 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
