# Training a basic setting with a Deep Q Network (DQN) #

Import statements

In [22]:
import json
import os

import numpy as np

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rc
rc('text', usetex=True)
%matplotlib inline

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam, SGD

In [3]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory  # For experience replay!

In [15]:
from gym_environment_ncml import *
from learning import *

Useful numbers

In [5]:
# Step-count scales used when sizing training runs (all plain ints).
MILLION = 10 ** 6
HTHOUSAND = 10 ** 5
THOUSAND = 10 ** 3

## 1. Create environment ##

In [6]:
# Instantiate the grid-world environment used for training and testing below.
# NOTE(review): the class is not imported by name, so it presumably comes from the
# `gym_environment_ncml` star import — confirm.
env = GridworldMultiAgentv25()



In [7]:
# First dimension of the observation space's shape; passed to build_model below.
states = env.observation_space.shape[0]
# Value of the action space's `n` attribute; passed to build_model and build_agent below.
actions = env.action_space.n

In [8]:
states, actions

(10, 25)

## 2. Create a Deep Learning Model with Keras ##

In [9]:
# Build the Q-network through the project helper build_model, passing the layer sizes
# [32, 16] and a 'relu' activation for each. The summary printed in the next cell shows the
# resulting stack: Flatten -> Dense(32) -> Dense(16) -> Dense(25).
model = build_model(states, actions, [32, 16], ['relu', 'relu'])

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 10)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                352       
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 25)                425       
Total params: 1,305
Trainable params: 1,305
Non-trainable params: 0
_________________________________________________________________


## 3. Build Agent with Keras-RL ##

In [11]:
# Wrap the Keras model in an agent via the project helper build_agent, with an
# epsilon-greedy exploration policy constructed with its default arguments.
# NOTE(review): 0.01 and 50000 presumably correspond to the 'tmu0.01' and 'ml50K' tags in
# the run name defined in the next cell — confirm their meaning against build_agent.
dqn = build_agent(model, actions, 0.01, EpsGreedyQPolicy(), 50000)
# Compile the agent with Adam (lr=1e-3) and track mean absolute error as a metric.
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
# Alternative configuration kept for reference (lr=1e-2, 'mse' metric):
# dqn.compile(Adam(lr=1e-2), metrics=['mse'])

In [17]:
# Run identifier; later cells use it to build the 'agents/<name>' directory and the paths
# returned by get_training_path / get_agent_path.
# NOTE(review): the tags look like an encoding of the run's settings (e.g. 'lr0.001' matches
# Adam(lr=1e-3), 'ns5M' matches nb_steps=5*MILLION). The '3236' tag does not match the
# [32, 16] layer sizes passed to build_model — confirm whether that is a typo before renaming,
# since changing the string changes where files are written.
name = 'dqn25_5b5_3236_adam_lr0.001_tmu0.01_ml50K_ns5M_eps0.1'

In [12]:
# Train the agent on `env` for 5 * MILLION steps with rendering disabled. The captured
# output below reports progress in intervals of 10000 steps. The returned object's
# `.history` dict is what the later cells convert, save to JSON and plot.
history = dqn.fit(env, nb_steps=5*MILLION, visualize=False, verbose=1)

Training for 5000000 steps ...
Interval 1 (0 steps performed)
200 episodes - episode_reward: -32.360 [-58.000, 26.000] - loss: 1.490 - mae: 2.423 - mean_q: -1.613

Interval 2 (10000 steps performed)
200 episodes - episode_reward: -7.170 [-54.000, 100.000] - loss: 2.356 - mae: 3.513 - mean_q: 1.236

Interval 3 (20000 steps performed)
200 episodes - episode_reward: 7.540 [-50.000, 112.000] - loss: 5.485 - mae: 13.538 - mean_q: 15.368

Interval 4 (30000 steps performed)
200 episodes - episode_reward: 31.270 [-60.000, 132.000] - loss: 10.715 - mae: 24.133 - mean_q: 26.708

Interval 5 (40000 steps performed)
200 episodes - episode_reward: 38.660 [-50.000, 128.000] - loss: 18.953 - mae: 35.181 - mean_q: 38.591

Interval 6 (50000 steps performed)
200 episodes - episode_reward: 50.750 [-50.000, 178.000] - loss: 27.165 - mae: 44.527 - mean_q: 48.613

Interval 7 (60000 steps performed)
200 episodes - episode_reward: 62.180 [-40.000, 158.000] - loss: 38.242 - mae: 54.374 - mean_q: 59.330

Interva

200 episodes - episode_reward: 72.780 [-52.000, 174.000] - loss: 91.400 - mae: 86.022 - mean_q: 93.887

Interval 40 (390000 steps performed)
200 episodes - episode_reward: 81.260 [-36.000, 184.000] - loss: 90.340 - mae: 86.119 - mean_q: 93.914

Interval 41 (400000 steps performed)
200 episodes - episode_reward: 79.350 [-50.000, 188.000] - loss: 94.505 - mae: 88.172 - mean_q: 96.125

Interval 42 (410000 steps performed)
200 episodes - episode_reward: 77.690 [-50.000, 148.000] - loss: 92.616 - mae: 88.335 - mean_q: 96.202

Interval 43 (420000 steps performed)
200 episodes - episode_reward: 71.470 [-42.000, 170.000] - loss: 92.930 - mae: 87.130 - mean_q: 95.052

Interval 44 (430000 steps performed)
200 episodes - episode_reward: 84.230 [-16.000, 190.000] - loss: 93.140 - mae: 86.983 - mean_q: 94.900

Interval 45 (440000 steps performed)
200 episodes - episode_reward: 84.670 [-40.000, 192.000] - loss: 93.259 - mae: 87.360 - mean_q: 95.347

Interval 46 (450000 steps performed)
200 episodes 

200 episodes - episode_reward: 81.880 [-54.000, 176.000] - loss: 91.268 - mae: 86.514 - mean_q: 94.350

Interval 78 (770000 steps performed)
200 episodes - episode_reward: 78.600 [-50.000, 180.000] - loss: 91.824 - mae: 86.491 - mean_q: 94.227

Interval 79 (780000 steps performed)
200 episodes - episode_reward: 81.800 [-34.000, 184.000] - loss: 90.039 - mae: 85.583 - mean_q: 93.144

Interval 80 (790000 steps performed)
200 episodes - episode_reward: 70.070 [-54.000, 178.000] - loss: 90.460 - mae: 85.775 - mean_q: 93.308

Interval 81 (800000 steps performed)
200 episodes - episode_reward: 79.610 [-40.000, 178.000] - loss: 89.023 - mae: 84.609 - mean_q: 92.175

Interval 82 (810000 steps performed)
200 episodes - episode_reward: 89.880 [-24.000, 198.000] - loss: 88.076 - mae: 83.844 - mean_q: 91.327

Interval 83 (820000 steps performed)
200 episodes - episode_reward: 90.560 [-40.000, 198.000] - loss: 92.905 - mae: 86.509 - mean_q: 94.323

Interval 84 (830000 steps performed)
200 episodes 

200 episodes - episode_reward: 86.510 [-36.000, 202.000] - loss: 95.987 - mae: 88.125 - mean_q: 95.484

Interval 116 (1150000 steps performed)
200 episodes - episode_reward: 92.650 [-40.000, 186.000] - loss: 98.425 - mae: 89.776 - mean_q: 97.409

Interval 117 (1160000 steps performed)
200 episodes - episode_reward: 90.050 [-50.000, 178.000] - loss: 98.658 - mae: 89.772 - mean_q: 97.376

Interval 118 (1170000 steps performed)
200 episodes - episode_reward: 98.360 [-50.000, 220.000] - loss: 98.840 - mae: 89.670 - mean_q: 97.536

Interval 119 (1180000 steps performed)
200 episodes - episode_reward: 95.830 [-34.000, 186.000] - loss: 102.857 - mae: 92.126 - mean_q: 100.118

Interval 120 (1190000 steps performed)
200 episodes - episode_reward: 91.830 [-50.000, 194.000] - loss: 105.667 - mae: 92.699 - mean_q: 100.687

Interval 121 (1200000 steps performed)
200 episodes - episode_reward: 96.250 [-46.000, 242.000] - loss: 108.598 - mae: 94.757 - mean_q: 102.879

Interval 122 (1210000 steps perf

200 episodes - episode_reward: 106.440 [-50.000, 214.000] - loss: 148.418 - mae: 111.413 - mean_q: 120.964

Interval 153 (1520000 steps performed)
200 episodes - episode_reward: 113.890 [-30.000, 212.000] - loss: 145.538 - mae: 109.174 - mean_q: 118.405

Interval 154 (1530000 steps performed)
200 episodes - episode_reward: 116.460 [-30.000, 256.000] - loss: 137.034 - mae: 105.642 - mean_q: 114.735

Interval 155 (1540000 steps performed)
200 episodes - episode_reward: 123.670 [-8.000, 224.000] - loss: 139.053 - mae: 107.181 - mean_q: 116.410

Interval 156 (1550000 steps performed)
200 episodes - episode_reward: 119.260 [-64.000, 244.000] - loss: 138.456 - mae: 107.153 - mean_q: 116.211

Interval 157 (1560000 steps performed)
200 episodes - episode_reward: 113.280 [-18.000, 214.000] - loss: 135.555 - mae: 106.082 - mean_q: 114.972

Interval 158 (1570000 steps performed)
200 episodes - episode_reward: 116.890 [-48.000, 230.000] - loss: 135.681 - mae: 106.056 - mean_q: 114.828

Interval 15

200 episodes - episode_reward: 130.300 [-40.000, 240.000] - loss: 156.973 - mae: 114.489 - mean_q: 124.743

Interval 190 (1890000 steps performed)
200 episodes - episode_reward: 132.690 [-12.000, 262.000] - loss: 162.767 - mae: 116.100 - mean_q: 126.289

Interval 191 (1900000 steps performed)
200 episodes - episode_reward: 130.830 [-50.000, 226.000] - loss: 159.785 - mae: 114.718 - mean_q: 124.714

Interval 192 (1910000 steps performed)
200 episodes - episode_reward: 120.990 [-20.000, 216.000] - loss: 157.952 - mae: 113.683 - mean_q: 123.457

Interval 193 (1920000 steps performed)
200 episodes - episode_reward: 123.120 [8.000, 238.000] - loss: 149.295 - mae: 110.597 - mean_q: 119.889

Interval 194 (1930000 steps performed)
200 episodes - episode_reward: 122.070 [-40.000, 292.000] - loss: 145.814 - mae: 109.833 - mean_q: 119.104

Interval 195 (1940000 steps performed)
200 episodes - episode_reward: 127.470 [-40.000, 244.000] - loss: 144.168 - mae: 108.839 - mean_q: 118.003

Interval 196

200 episodes - episode_reward: 128.490 [-10.000, 238.000] - loss: 147.781 - mae: 111.471 - mean_q: 120.704

Interval 227 (2260000 steps performed)
200 episodes - episode_reward: 121.710 [-50.000, 244.000] - loss: 148.974 - mae: 112.339 - mean_q: 121.507

Interval 228 (2270000 steps performed)
200 episodes - episode_reward: 130.520 [-16.000, 240.000] - loss: 148.772 - mae: 111.881 - mean_q: 121.124

Interval 229 (2280000 steps performed)
200 episodes - episode_reward: 129.840 [-10.000, 252.000] - loss: 153.616 - mae: 113.517 - mean_q: 122.943

Interval 230 (2290000 steps performed)
200 episodes - episode_reward: 132.880 [-26.000, 256.000] - loss: 151.748 - mae: 112.918 - mean_q: 122.141

Interval 231 (2300000 steps performed)
200 episodes - episode_reward: 129.940 [-36.000, 268.000] - loss: 156.362 - mae: 112.371 - mean_q: 121.532

Interval 232 (2310000 steps performed)
200 episodes - episode_reward: 122.190 [-22.000, 228.000] - loss: 153.495 - mae: 112.441 - mean_q: 121.756

Interval 2

200 episodes - episode_reward: 128.220 [-58.000, 240.000] - loss: 143.883 - mae: 108.542 - mean_q: 117.592

Interval 264 (2630000 steps performed)
200 episodes - episode_reward: 133.920 [-56.000, 246.000] - loss: 142.676 - mae: 108.828 - mean_q: 117.937

Interval 265 (2640000 steps performed)
200 episodes - episode_reward: 134.380 [-40.000, 246.000] - loss: 143.056 - mae: 109.456 - mean_q: 118.724

Interval 266 (2650000 steps performed)
200 episodes - episode_reward: 133.520 [-40.000, 242.000] - loss: 151.834 - mae: 112.344 - mean_q: 121.738

Interval 267 (2660000 steps performed)
200 episodes - episode_reward: 123.790 [-40.000, 254.000] - loss: 150.563 - mae: 112.105 - mean_q: 121.469

Interval 268 (2670000 steps performed)
200 episodes - episode_reward: 129.290 [-36.000, 232.000] - loss: 154.860 - mae: 113.063 - mean_q: 122.839

Interval 269 (2680000 steps performed)
200 episodes - episode_reward: 123.670 [-28.000, 252.000] - loss: 153.251 - mae: 112.901 - mean_q: 122.712

Interval 2

200 episodes - episode_reward: 139.380 [-40.000, 280.000] - loss: 158.896 - mae: 114.622 - mean_q: 124.270

Interval 301 (3000000 steps performed)
200 episodes - episode_reward: 140.020 [-54.000, 232.000] - loss: 157.685 - mae: 113.943 - mean_q: 123.458

Interval 302 (3010000 steps performed)
200 episodes - episode_reward: 128.500 [-54.000, 258.000] - loss: 158.513 - mae: 115.308 - mean_q: 124.772

Interval 303 (3020000 steps performed)
200 episodes - episode_reward: 122.910 [-38.000, 228.000] - loss: 153.167 - mae: 111.918 - mean_q: 121.057

Interval 304 (3030000 steps performed)
200 episodes - episode_reward: 133.230 [8.000, 256.000] - loss: 147.419 - mae: 110.163 - mean_q: 119.328

Interval 305 (3040000 steps performed)
200 episodes - episode_reward: 134.470 [-22.000, 226.000] - loss: 149.422 - mae: 112.614 - mean_q: 121.991

Interval 306 (3050000 steps performed)
200 episodes - episode_reward: 125.370 [-40.000, 236.000] - loss: 155.478 - mae: 113.565 - mean_q: 122.865

Interval 307

200 episodes - episode_reward: 132.010 [2.000, 230.000] - loss: 156.017 - mae: 114.541 - mean_q: 124.186

Interval 338 (3370000 steps performed)
200 episodes - episode_reward: 136.640 [12.000, 272.000] - loss: 155.796 - mae: 115.041 - mean_q: 124.669

Interval 339 (3380000 steps performed)
200 episodes - episode_reward: 136.850 [-24.000, 286.000] - loss: 155.550 - mae: 114.939 - mean_q: 124.312

Interval 340 (3390000 steps performed)
200 episodes - episode_reward: 134.700 [-2.000, 252.000] - loss: 154.940 - mae: 113.597 - mean_q: 123.016

Interval 341 (3400000 steps performed)
200 episodes - episode_reward: 136.130 [-26.000, 244.000] - loss: 153.104 - mae: 114.133 - mean_q: 123.665

Interval 342 (3410000 steps performed)
200 episodes - episode_reward: 135.690 [6.000, 232.000] - loss: 156.441 - mae: 114.915 - mean_q: 124.509

Interval 343 (3420000 steps performed)
200 episodes - episode_reward: 134.570 [-12.000, 250.000] - loss: 155.318 - mae: 114.726 - mean_q: 124.179

Interval 344 (34

200 episodes - episode_reward: 133.640 [-48.000, 244.000] - loss: 160.525 - mae: 116.808 - mean_q: 126.577

Interval 375 (3740000 steps performed)
200 episodes - episode_reward: 131.430 [-40.000, 226.000] - loss: 155.887 - mae: 115.035 - mean_q: 124.660

Interval 376 (3750000 steps performed)
200 episodes - episode_reward: 132.130 [-36.000, 250.000] - loss: 157.268 - mae: 114.586 - mean_q: 124.206

Interval 377 (3760000 steps performed)
200 episodes - episode_reward: 126.310 [-40.000, 230.000] - loss: 153.209 - mae: 111.499 - mean_q: 120.807

Interval 378 (3770000 steps performed)
200 episodes - episode_reward: 130.480 [-42.000, 240.000] - loss: 148.697 - mae: 112.538 - mean_q: 121.937

Interval 379 (3780000 steps performed)
200 episodes - episode_reward: 138.510 [-22.000, 268.000] - loss: 151.959 - mae: 112.134 - mean_q: 121.300

Interval 380 (3790000 steps performed)
200 episodes - episode_reward: 141.100 [-22.000, 234.000] - loss: 149.708 - mae: 110.926 - mean_q: 120.188

Interval 3

200 episodes - episode_reward: 130.980 [-12.000, 256.000] - loss: 137.790 - mae: 107.126 - mean_q: 115.704

Interval 412 (4110000 steps performed)
200 episodes - episode_reward: 126.850 [0.000, 246.000] - loss: 136.668 - mae: 107.130 - mean_q: 115.591

Interval 413 (4120000 steps performed)
200 episodes - episode_reward: 130.180 [6.000, 238.000] - loss: 138.724 - mae: 107.705 - mean_q: 116.212

Interval 414 (4130000 steps performed)
200 episodes - episode_reward: 129.670 [-8.000, 266.000] - loss: 140.693 - mae: 108.757 - mean_q: 117.377

Interval 415 (4140000 steps performed)
200 episodes - episode_reward: 135.670 [24.000, 252.000] - loss: 145.495 - mae: 110.836 - mean_q: 119.739

Interval 416 (4150000 steps performed)
200 episodes - episode_reward: 134.930 [-30.000, 256.000] - loss: 153.090 - mae: 112.556 - mean_q: 121.579

Interval 417 (4160000 steps performed)
200 episodes - episode_reward: 131.460 [8.000, 236.000] - loss: 154.147 - mae: 113.177 - mean_q: 122.415

Interval 418 (4170

200 episodes - episode_reward: 88.150 [-50.000, 232.000] - loss: 168.413 - mae: 120.655 - mean_q: 129.850

Interval 449 (4480000 steps performed)
200 episodes - episode_reward: 119.960 [-30.000, 248.000] - loss: 143.629 - mae: 109.790 - mean_q: 118.452

Interval 450 (4490000 steps performed)
200 episodes - episode_reward: 118.210 [-58.000, 244.000] - loss: 133.005 - mae: 105.813 - mean_q: 114.321

Interval 451 (4500000 steps performed)
200 episodes - episode_reward: 126.110 [10.000, 230.000] - loss: 124.975 - mae: 102.167 - mean_q: 110.433

Interval 452 (4510000 steps performed)
200 episodes - episode_reward: 123.870 [-38.000, 264.000] - loss: 124.050 - mae: 100.509 - mean_q: 108.809

Interval 453 (4520000 steps performed)
200 episodes - episode_reward: 122.530 [-36.000, 260.000] - loss: 129.289 - mae: 101.980 - mean_q: 110.316

Interval 454 (4530000 steps performed)
200 episodes - episode_reward: 118.200 [-36.000, 228.000] - loss: 129.668 - mae: 102.841 - mean_q: 111.225

Interval 455

200 episodes - episode_reward: 142.780 [-50.000, 246.000] - loss: 139.904 - mae: 107.288 - mean_q: 116.464

Interval 486 (4850000 steps performed)
200 episodes - episode_reward: 151.810 [34.000, 274.000] - loss: 146.263 - mae: 109.165 - mean_q: 118.502

Interval 487 (4860000 steps performed)
200 episodes - episode_reward: 145.180 [-12.000, 250.000] - loss: 151.269 - mae: 112.686 - mean_q: 122.281

Interval 488 (4870000 steps performed)
200 episodes - episode_reward: 132.560 [-40.000, 268.000] - loss: 160.915 - mae: 116.622 - mean_q: 126.323

Interval 489 (4880000 steps performed)
200 episodes - episode_reward: 142.000 [52.000, 246.000] - loss: 163.119 - mae: 117.634 - mean_q: 127.287

Interval 490 (4890000 steps performed)
200 episodes - episode_reward: 139.520 [-50.000, 258.000] - loss: 160.698 - mae: 115.649 - mean_q: 125.045

Interval 491 (4900000 steps performed)
200 episodes - episode_reward: 141.290 [30.000, 304.000] - loss: 153.917 - mae: 113.922 - mean_q: 123.067

Interval 492 

In [26]:
# Cast every logged value to a built-in float/int before the history is written with json.
# NOTE(review): presumably the raw values are NumPy scalars that json cannot serialise — confirm.
# `data` is the same dict object as history.history, so the casts are visible through both names.
data = history.history
data['episode_reward'] = list(map(float, data['episode_reward']))
data['nb_episode_steps'] = list(map(int, data['nb_episode_steps']))
data['nb_steps'] = list(map(int, data['nb_steps']))

In [27]:
# NOTE(review): echoing `data` prints the full per-episode history (see the long list in the
# output below); consider displaying a short slice or summary instead.
data

{'episode_reward': [-50.0,
  -50.0,
  -36.0,
  -40.0,
  -40.0,
  -40.0,
  -50.0,
  -50.0,
  -42.0,
  -26.0,
  -46.0,
  -40.0,
  -52.0,
  -20.0,
  -40.0,
  -6.0,
  20.0,
  -50.0,
  -12.0,
  -40.0,
  -50.0,
  -20.0,
  -50.0,
  -40.0,
  -32.0,
  -40.0,
  -6.0,
  -46.0,
  -46.0,
  -50.0,
  -22.0,
  -6.0,
  -26.0,
  -40.0,
  -32.0,
  -40.0,
  -26.0,
  -40.0,
  -40.0,
  -46.0,
  -32.0,
  -2.0,
  -20.0,
  -30.0,
  -30.0,
  -36.0,
  -30.0,
  -50.0,
  -40.0,
  14.0,
  -30.0,
  -50.0,
  -54.0,
  -16.0,
  -26.0,
  -46.0,
  -8.0,
  -50.0,
  -22.0,
  -50.0,
  -40.0,
  -44.0,
  -46.0,
  -8.0,
  -24.0,
  -50.0,
  18.0,
  -28.0,
  -10.0,
  -50.0,
  10.0,
  -50.0,
  -50.0,
  -32.0,
  -36.0,
  -52.0,
  0.0,
  -40.0,
  -40.0,
  -42.0,
  -40.0,
  -40.0,
  -40.0,
  -40.0,
  -26.0,
  -50.0,
  -50.0,
  -50.0,
  -30.0,
  -50.0,
  -50.0,
  -20.0,
  -26.0,
  -50.0,
  -30.0,
  -50.0,
  -2.0,
  -38.0,
  -50.0,
  -36.0,
  -36.0,
  20.0,
  -50.0,
  -20.0,
  -50.0,
  -40.0,
  -50.0,
  -8.0,
  -2.0,
  -40.0,
  -40.0,

In [28]:
# Make sure the run's output directory exists before writing into it.
# os.makedirs(..., exist_ok=True) also creates a missing 'agents' parent and is a no-op when
# the directory is already there, so this cell can be re-run; the previous os.mkdir call
# raised FileExistsError on a second run and FileNotFoundError without the parent.
# NOTE(review): assumes get_training_path(name) points inside 'agents/<name>' — confirm.
os.makedirs('agents/{}'.format(name), exist_ok=True)
# Persist the (already cast) training history as JSON.
with open(get_training_path(name), 'w') as f:
    json.dump(data, f)

In [None]:
# Learning curve: the logged 'episode_reward' values against the logged 'nb_steps' values.
fig, ax = plt.subplots(figsize=(10,7))

ax.plot(history.history['nb_steps'], history.history['episode_reward'])

# Label the figure so it can be read on its own. The labels are plain text without
# underscores or other LaTeX special characters because rc('text', usetex=True) is active.
ax.set_xlabel('Training steps')
ax.set_ylabel('Episode reward')
ax.set_title('DQN training reward per episode')

fig.tight_layout()
plt.show()

In [None]:
# Run the trained agent for 10 test episodes with rendering disabled, then print the mean
# of the recorded per-episode rewards.
scores = dqn.test(env, nb_episodes=10, visualize=False)
print(np.mean(scores.history['episode_reward']))

Save the agent's weights to disk

In [29]:
# Write the agent's weights to the path returned by get_agent_path(name); overwrite=True
# replaces a file left there by an earlier run.
dqn.save_weights(get_agent_path(name), overwrite=True)

## 4. Reloading training from Memory ##