In [2]:
import gym
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation,Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent

In [10]:
env_name = 'CartPole-v1'
env = gym.make(env_name)


In [11]:
env.reset()

array([-0.04065783,  0.001136  ,  0.01273832, -0.031902  ], dtype=float32)

In [13]:
nb_actions = env.action_space.n
nb_obs = env.observation_space.shape

In [14]:
model = Sequential()

model.add(Flatten(input_shape=(1,)+nb_obs))
model.add(Dense(16))
model.add(Activation('relu'))

model.add(Dense(32))
model.add(Activation('relu'))

model.add(Dense(nb_actions))
model.add(Activation('linear'))

In [15]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                544       
_________________________________________________________________
activation_4 (Activation)    (None, 32)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 66        
_________________________________________________________________
activation_5 (Activation)    (None, 2)                

In [16]:
from rl.memory import SequentialMemory

In [17]:
memory = SequentialMemory(limit = 20000,window_length = 1)

In [18]:
from rl.policy import LinearAnnealedPolicy,EpsGreedyQPolicy

In [20]:
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),attr='eps',value_max=1.0,value_min=0.1,value_test=0.05,nb_steps=20000)

In [22]:
dqn = DQNAgent(model=model,nb_actions=nb_actions,memory=memory,nb_steps_warmup=10,target_model_update=100,policy=policy)

In [24]:
dqn.compile(Adam(learning_rate=1e-3),metrics=['mae'])

In [26]:
dqn.fit(env,nb_steps=20000,visualize=False,verbose=2)

Training for 20000 steps ...


  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=s

    65/20000: episode: 1, duration: 0.732s, episode steps:  65, steps per second:  89, episode reward: 65.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.508 [0.000, 1.000],  loss: 0.392155, mae: 0.504465, mean_q: 0.170860, mean_eps: 0.998313
    92/20000: episode: 2, duration: 0.182s, episode steps:  27, steps per second: 148, episode reward: 27.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.593 [0.000, 1.000],  loss: 0.161734, mae: 0.502069, mean_q: 0.567746, mean_eps: 0.996490
   114/20000: episode: 3, duration: 0.148s, episode steps:  22, steps per second: 148, episode reward: 22.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.545 [0.000, 1.000],  loss: 0.342728, mae: 0.779947, mean_q: 0.956096, mean_eps: 0.995387
   160/20000: episode: 4, duration: 0.386s, episode steps:  46, steps per second: 119, episode reward: 46.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.522 [0.000, 1.000],  loss: 0.130413, mae: 1.004086, mean_q: 1.680221, mean_ep

   802/20000: episode: 34, duration: 0.165s, episode steps:  33, steps per second: 200, episode reward: 33.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.394 [0.000, 1.000],  loss: 0.843919, mae: 4.231310, mean_q: 8.080342, mean_eps: 0.964675
   814/20000: episode: 35, duration: 0.070s, episode steps:  12, steps per second: 171, episode reward: 12.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.250 [0.000, 1.000],  loss: 0.705325, mae: 4.536696, mean_q: 8.633029, mean_eps: 0.963662
   828/20000: episode: 36, duration: 0.066s, episode steps:  14, steps per second: 213, episode reward: 14.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.643 [0.000, 1.000],  loss: 1.100514, mae: 4.746116, mean_q: 9.175126, mean_eps: 0.963078
   848/20000: episode: 37, duration: 0.095s, episode steps:  20, steps per second: 211, episode reward: 20.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.600 [0.000, 1.000],  loss: 1.067743, mae: 4.583682, mean_q: 8.720623, mea

  1574/20000: episode: 68, duration: 0.246s, episode steps:  50, steps per second: 203, episode reward: 50.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.520 [0.000, 1.000],  loss: 1.200610, mae: 7.154123, mean_q: 14.194646, mean_eps: 0.930318
  1586/20000: episode: 69, duration: 0.060s, episode steps:  12, steps per second: 199, episode reward: 12.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.333 [0.000, 1.000],  loss: 1.211736, mae: 7.140108, mean_q: 14.268094, mean_eps: 0.928923
  1604/20000: episode: 70, duration: 0.086s, episode steps:  18, steps per second: 209, episode reward: 18.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.389 [0.000, 1.000],  loss: 1.682487, mae: 7.297267, mean_q: 14.405947, mean_eps: 0.928248
  1616/20000: episode: 71, duration: 0.059s, episode steps:  12, steps per second: 205, episode reward: 12.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.583 [0.000, 1.000],  loss: 1.534702, mae: 7.605048, mean_q: 15.054353,

  2356/20000: episode: 103, duration: 0.159s, episode steps:  34, steps per second: 214, episode reward: 34.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.471 [0.000, 1.000],  loss: 1.515708, mae: 10.072885, mean_q: 20.355918, mean_eps: 0.894767
  2380/20000: episode: 104, duration: 0.117s, episode steps:  24, steps per second: 204, episode reward: 24.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.542 [0.000, 1.000],  loss: 1.731526, mae: 10.080506, mean_q: 20.476546, mean_eps: 0.893462
  2410/20000: episode: 105, duration: 0.144s, episode steps:  30, steps per second: 209, episode reward: 30.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.467 [0.000, 1.000],  loss: 1.709024, mae: 10.067819, mean_q: 20.501375, mean_eps: 0.892247
  2441/20000: episode: 106, duration: 0.145s, episode steps:  31, steps per second: 213, episode reward: 31.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.581 [0.000, 1.000],  loss: 1.853502, mae: 10.726273, mean_q: 21

  3389/20000: episode: 135, duration: 0.300s, episode steps:  56, steps per second: 187, episode reward: 56.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.554 [0.000, 1.000],  loss: 2.610476, mae: 14.647402, mean_q: 29.914601, mean_eps: 0.848777
  3435/20000: episode: 136, duration: 0.395s, episode steps:  46, steps per second: 116, episode reward: 46.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.435 [0.000, 1.000],  loss: 2.681443, mae: 15.172610, mean_q: 30.867996, mean_eps: 0.846482
  3468/20000: episode: 137, duration: 0.209s, episode steps:  33, steps per second: 158, episode reward: 33.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.545 [0.000, 1.000],  loss: 3.072232, mae: 15.345224, mean_q: 31.222156, mean_eps: 0.844705
  3505/20000: episode: 138, duration: 0.234s, episode steps:  37, steps per second: 158, episode reward: 37.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.568 [0.000, 1.000],  loss: 1.925163, mae: 15.527442, mean_q: 31

  4532/20000: episode: 167, duration: 0.240s, episode steps:  43, steps per second: 179, episode reward: 43.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.581 [0.000, 1.000],  loss: 3.473812, mae: 20.297570, mean_q: 41.695896, mean_eps: 0.797050
  4553/20000: episode: 168, duration: 0.117s, episode steps:  21, steps per second: 180, episode reward: 21.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.571 [0.000, 1.000],  loss: 3.351155, mae: 20.684738, mean_q: 42.539342, mean_eps: 0.795610
  4601/20000: episode: 169, duration: 0.271s, episode steps:  48, steps per second: 177, episode reward: 48.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.458 [0.000, 1.000],  loss: 3.847457, mae: 20.578981, mean_q: 42.386427, mean_eps: 0.794057
  4668/20000: episode: 170, duration: 0.376s, episode steps:  67, steps per second: 178, episode reward: 67.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.478 [0.000, 1.000],  loss: 4.711332, mae: 21.070056, mean_q: 43

  5851/20000: episode: 199, duration: 0.199s, episode steps:  35, steps per second: 176, episode reward: 35.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.514 [0.000, 1.000],  loss: 6.218999, mae: 27.383760, mean_q: 56.259870, mean_eps: 0.737515
  5920/20000: episode: 200, duration: 0.386s, episode steps:  69, steps per second: 179, episode reward: 69.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.551 [0.000, 1.000],  loss: 4.715307, mae: 27.955151, mean_q: 57.984375, mean_eps: 0.735175
  5959/20000: episode: 201, duration: 0.219s, episode steps:  39, steps per second: 178, episode reward: 39.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.436 [0.000, 1.000],  loss: 7.794355, mae: 28.358964, mean_q: 58.653961, mean_eps: 0.732745
  6004/20000: episode: 202, duration: 0.250s, episode steps:  45, steps per second: 180, episode reward: 45.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.489 [0.000, 1.000],  loss: 6.781299, mae: 28.821886, mean_q: 59

  7438/20000: episode: 231, duration: 0.392s, episode steps:  71, steps per second: 181, episode reward: 71.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.535 [0.000, 1.000],  loss: 13.017635, mae: 37.047396, mean_q: 76.195785, mean_eps: 0.666910
  7499/20000: episode: 232, duration: 0.344s, episode steps:  61, steps per second: 177, episode reward: 61.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.475 [0.000, 1.000],  loss: 9.769186, mae: 37.472089, mean_q: 77.078676, mean_eps: 0.663940
  7515/20000: episode: 233, duration: 0.092s, episode steps:  16, steps per second: 174, episode reward: 16.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.375 [0.000, 1.000],  loss: 8.064774, mae: 37.786335, mean_q: 77.631427, mean_eps: 0.662208
  7529/20000: episode: 234, duration: 0.085s, episode steps:  14, steps per second: 165, episode reward: 14.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.571 [0.000, 1.000],  loss: 12.874736, mae: 38.299038, mean_q: 

 10587/20000: episode: 263, duration: 0.693s, episode steps: 126, steps per second: 182, episode reward: 126.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 17.021018, mae: 56.708253, mean_q: 116.351348, mean_eps: 0.526442
 10671/20000: episode: 264, duration: 0.464s, episode steps:  84, steps per second: 181, episode reward: 84.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.524 [0.000, 1.000],  loss: 18.562899, mae: 57.647798, mean_q: 118.195018, mean_eps: 0.521717
 10968/20000: episode: 265, duration: 1.607s, episode steps: 297, steps per second: 185, episode reward: 297.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.492 [0.000, 1.000],  loss: 24.310780, mae: 58.692091, mean_q: 119.797981, mean_eps: 0.513145
 11249/20000: episode: 266, duration: 1.544s, episode steps: 281, steps per second: 182, episode reward: 281.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.502 [0.000, 1.000],  loss: 22.790349, mae: 60.529353, 

 17478/20000: episode: 295, duration: 1.138s, episode steps: 201, steps per second: 177, episode reward: 201.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.522 [0.000, 1.000],  loss: 37.555728, mae: 82.060479, mean_q: 167.244494, mean_eps: 0.218035
 17719/20000: episode: 296, duration: 1.355s, episode steps: 241, steps per second: 178, episode reward: 241.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.519 [0.000, 1.000],  loss: 39.321987, mae: 81.483270, mean_q: 166.067117, mean_eps: 0.208090
 17965/20000: episode: 297, duration: 1.368s, episode steps: 246, steps per second: 180, episode reward: 246.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.520 [0.000, 1.000],  loss: 41.517352, mae: 81.420026, mean_q: 165.521575, mean_eps: 0.197132
 18239/20000: episode: 298, duration: 1.510s, episode steps: 274, steps per second: 181, episode reward: 274.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.515 [0.000, 1.000],  loss: 38.114417, mae: 81.579115,

<keras.callbacks.History at 0x21b77b00ca0>

In [27]:
dqn.save_weights(f'my_weights_cartpole.h5f',overwrite=True)

In [28]:
dqn.test(env,nb_episodes=5,visualize=True)
env.close()

Testing for 5 episodes ...
Episode 1: reward: 500.000, steps: 500
Episode 2: reward: 401.000, steps: 401
Episode 3: reward: 500.000, steps: 500
Episode 4: reward: 500.000, steps: 500
Episode 5: reward: 368.000, steps: 368
