# Training a basic setting with a Deep Q Network (DQN) #

Import statements

In [17]:
import json
import os

import numpy as np

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rc
# Render all figure text through LaTeX.
# NOTE(review): matplotlib's usetex mode needs a working LaTeX installation on the machine — confirm before running.
rc('text', usetex=True)
# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam, SGD

In [3]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory  # For experience replay!

In [4]:
from gym_environment_ncml import *
from learning import *

pygame 2.0.1 (SDL 2.0.14, Python 3.7.10)
Hello from the pygame community. https://www.pygame.org/contribute.html


Useful numbers

In [5]:
# Step/episode count units used when sizing training and test runs.
# Digit separators make the magnitudes readable at a glance.
MILLION = 1_000_000
HTHOUSAND = 100_000  # one hundred thousand
THOUSAND = 1_000

## 1. Create environment ##

In [6]:
# Training environment, constructed with its default arguments (no explicit seed here,
# unlike the test environment created later with seed=2).
env = GridworldMultiAgentv1()

In [7]:
# Network input size (length of the first observation dimension) and
# output size (number of discrete actions), read from the environment's spaces.
states, actions = env.observation_space.shape[0], env.action_space.n

In [8]:
# Display the (observation size, number of actions) pair as a sanity check.
states, actions

(8, 25)

## 2. Create a Deep Learning Model with Keras ##

In [9]:
# Q-network built by the project's build_model helper. The summary printed by the
# next cell shows two hidden Dense layers of 32 and 16 units and one output per action.
# NOTE(review): ['relu', 'relu'] is presumably the per-layer activation list — confirm in build_model.
model = build_model(states, actions, [32, 16], ['relu', 'relu'])

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [10]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 32)                288       
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 25)                425       
Total params: 1,241
Trainable params: 1,241
Non-trainable params: 0
_________________________________________________________________


## 3. Build Agent with Keras-RL ##

In [11]:
# Build the DQN agent around the Q-network and compile it with Adam (lr=1e-3),
# tracking mean absolute error as an extra metric.
# NOTE(review): the experiment name chosen for this run encodes the settings as
# "tmu0.01", "ml50K" and "eps0.1", so 0.01 is presumably the target-model update
# rate and 50000 the replay-memory limit — confirm against build_agent.
# eps is spelled out so the exploration rate is visible here instead of relying
# on EpsGreedyQPolicy's default value (0.1 in keras-rl).
dqn = build_agent(model, actions, 0.01, EpsGreedyQPolicy(eps=0.1), 50000)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [14]:
# Experiment identifier; it is used to build the output paths for the training
# history, the saved weights and the test rewards.
# NOTE(review): the fields appear to encode the setup (layers 32/16, Adam, lr 0.001,
# tmu 0.01, memory 50K, 5M steps, eps 0.1) — keep it in sync with the cells above by hand.
name = 'dqn1_5b5_3216_adam_lr0.001_tmu0.01_ml50K_ns5M_eps0.1'

In [13]:
# Train for 5 million environment steps without rendering. With verbose=1 the
# agent logs one summary line per interval (every 10000 steps in the output below).
# The returned history object is what gets serialised to JSON further down.
history = dqn.fit(env, nb_steps=5*MILLION, visualize=False, verbose=1)

Training for 5000000 steps ...
Interval 1 (0 steps performed)
200 episodes - episode_reward: -21.850 [-50.000, 80.000] - loss: 3.741 - mae: 9.218 - mean_q: 11.168

Interval 2 (10000 steps performed)
200 episodes - episode_reward: -1.200 [-50.000, 120.000] - loss: 9.952 - mae: 21.438 - mean_q: 24.182

Interval 3 (20000 steps performed)
200 episodes - episode_reward: -2.250 [-50.000, 100.000] - loss: 15.679 - mae: 31.044 - mean_q: 34.276

Interval 4 (30000 steps performed)
200 episodes - episode_reward: 8.250 [-50.000, 130.000] - loss: 17.189 - mae: 32.623 - mean_q: 35.896

Interval 5 (40000 steps performed)
200 episodes - episode_reward: 12.400 [-50.000, 170.000] - loss: 18.865 - mae: 34.486 - mean_q: 37.835

Interval 6 (50000 steps performed)
200 episodes - episode_reward: 11.750 [-50.000, 150.000] - loss: 20.752 - mae: 35.270 - mean_q: 38.622

Interval 7 (60000 steps performed)
200 episodes - episode_reward: 15.950 [-50.000, 190.000] - loss: 23.286 - mae: 37.709 - mean_q: 41.240

Inte

200 episodes - episode_reward: 150.600 [-20.000, 270.000] - loss: 286.223 - mae: 156.156 - mean_q: 169.725

Interval 39 (380000 steps performed)
200 episodes - episode_reward: 149.350 [-20.000, 260.000] - loss: 299.244 - mae: 160.773 - mean_q: 174.601

Interval 40 (390000 steps performed)
200 episodes - episode_reward: 139.850 [-10.000, 280.000] - loss: 313.565 - mae: 165.469 - mean_q: 179.531

Interval 41 (400000 steps performed)
200 episodes - episode_reward: 146.050 [-50.000, 280.000] - loss: 311.253 - mae: 163.974 - mean_q: 177.593

Interval 42 (410000 steps performed)
200 episodes - episode_reward: 152.100 [-50.000, 290.000] - loss: 310.279 - mae: 164.136 - mean_q: 178.161

Interval 43 (420000 steps performed)
200 episodes - episode_reward: 145.500 [-40.000, 290.000] - loss: 307.751 - mae: 162.180 - mean_q: 175.793

Interval 44 (430000 steps performed)
200 episodes - episode_reward: 147.250 [-40.000, 280.000] - loss: 291.994 - mae: 157.274 - mean_q: 170.681

Interval 45 (440000 st

200 episodes - episode_reward: 162.650 [20.000, 310.000] - loss: 317.789 - mae: 164.248 - mean_q: 178.265

Interval 76 (750000 steps performed)
200 episodes - episode_reward: 174.300 [-10.000, 280.000] - loss: 328.654 - mae: 166.855 - mean_q: 181.134

Interval 77 (760000 steps performed)
200 episodes - episode_reward: 173.750 [-20.000, 280.000] - loss: 327.048 - mae: 168.857 - mean_q: 183.444

Interval 78 (770000 steps performed)
200 episodes - episode_reward: 157.050 [-20.000, 310.000] - loss: 332.565 - mae: 170.224 - mean_q: 184.729

Interval 79 (780000 steps performed)
200 episodes - episode_reward: 165.400 [-10.000, 290.000] - loss: 329.721 - mae: 167.111 - mean_q: 181.347

Interval 80 (790000 steps performed)
200 episodes - episode_reward: 164.700 [-30.000, 280.000] - loss: 322.262 - mae: 166.916 - mean_q: 181.053

Interval 81 (800000 steps performed)
200 episodes - episode_reward: 159.200 [-40.000, 260.000] - loss: 324.994 - mae: 166.436 - mean_q: 180.477

Interval 82 (810000 ste

200 episodes - episode_reward: 176.300 [-10.000, 300.000] - loss: 314.998 - mae: 162.542 - mean_q: 176.604

Interval 113 (1120000 steps performed)
200 episodes - episode_reward: 170.000 [-30.000, 310.000] - loss: 324.603 - mae: 167.102 - mean_q: 181.278

Interval 114 (1130000 steps performed)
200 episodes - episode_reward: 164.150 [-50.000, 290.000] - loss: 324.525 - mae: 165.928 - mean_q: 180.116

Interval 115 (1140000 steps performed)
200 episodes - episode_reward: 125.200 [-40.000, 250.000] - loss: 336.396 - mae: 170.207 - mean_q: 184.655

Interval 116 (1150000 steps performed)
200 episodes - episode_reward: 106.950 [-50.000, 260.000] - loss: 311.870 - mae: 163.388 - mean_q: 176.809

Interval 117 (1160000 steps performed)
200 episodes - episode_reward: 134.950 [-20.000, 270.000] - loss: 276.037 - mae: 151.957 - mean_q: 164.375

Interval 118 (1170000 steps performed)
200 episodes - episode_reward: 144.350 [-40.000, 270.000] - loss: 256.364 - mae: 147.191 - mean_q: 159.395

Interval 1

200 episodes - episode_reward: 180.600 [-40.000, 300.000] - loss: 296.506 - mae: 162.104 - mean_q: 175.660

Interval 150 (1490000 steps performed)
200 episodes - episode_reward: 160.500 [-30.000, 290.000] - loss: 307.810 - mae: 162.470 - mean_q: 175.982

Interval 151 (1500000 steps performed)
200 episodes - episode_reward: 163.550 [-20.000, 290.000] - loss: 293.765 - mae: 157.618 - mean_q: 171.114

Interval 152 (1510000 steps performed)
200 episodes - episode_reward: 168.600 [-50.000, 310.000] - loss: 311.460 - mae: 163.999 - mean_q: 177.821

Interval 153 (1520000 steps performed)
200 episodes - episode_reward: 152.800 [-50.000, 280.000] - loss: 319.341 - mae: 163.482 - mean_q: 177.492

Interval 154 (1530000 steps performed)
200 episodes - episode_reward: 157.200 [-30.000, 280.000] - loss: 311.225 - mae: 163.961 - mean_q: 177.707

Interval 155 (1540000 steps performed)
200 episodes - episode_reward: 156.150 [-10.000, 260.000] - loss: 303.253 - mae: 161.377 - mean_q: 174.682

Interval 1

200 episodes - episode_reward: 167.750 [-40.000, 280.000] - loss: 261.774 - mae: 150.248 - mean_q: 162.667

Interval 187 (1860000 steps performed)
200 episodes - episode_reward: 155.450 [-40.000, 240.000] - loss: 274.518 - mae: 153.299 - mean_q: 165.820

Interval 188 (1870000 steps performed)
200 episodes - episode_reward: 170.450 [10.000, 280.000] - loss: 265.382 - mae: 151.606 - mean_q: 163.969

Interval 189 (1880000 steps performed)
200 episodes - episode_reward: 176.800 [-40.000, 260.000] - loss: 272.974 - mae: 153.373 - mean_q: 165.973

Interval 190 (1890000 steps performed)
200 episodes - episode_reward: 170.800 [-10.000, 270.000] - loss: 284.931 - mae: 155.783 - mean_q: 168.698

Interval 191 (1900000 steps performed)
200 episodes - episode_reward: 178.450 [20.000, 280.000] - loss: 290.400 - mae: 158.810 - mean_q: 171.876

Interval 192 (1910000 steps performed)
200 episodes - episode_reward: 174.850 [-10.000, 270.000] - loss: 295.674 - mae: 160.143 - mean_q: 173.378

Interval 193

200 episodes - episode_reward: 191.050 [0.000, 300.000] - loss: 311.043 - mae: 165.067 - mean_q: 178.185

Interval 224 (2230000 steps performed)
200 episodes - episode_reward: 178.750 [-50.000, 280.000] - loss: 316.299 - mae: 166.864 - mean_q: 180.131

Interval 225 (2240000 steps performed)
200 episodes - episode_reward: 148.750 [-50.000, 270.000] - loss: 365.163 - mae: 172.008 - mean_q: 185.704

Interval 226 (2250000 steps performed)
200 episodes - episode_reward: 172.050 [-40.000, 290.000] - loss: 367.363 - mae: 173.792 - mean_q: 187.643

Interval 227 (2260000 steps performed)
200 episodes - episode_reward: 201.000 [0.000, 320.000] - loss: 323.681 - mae: 168.888 - mean_q: 182.293

Interval 228 (2270000 steps performed)
200 episodes - episode_reward: 187.400 [-30.000, 320.000] - loss: 319.363 - mae: 167.602 - mean_q: 180.773

Interval 229 (2280000 steps performed)
200 episodes - episode_reward: 190.300 [-50.000, 290.000] - loss: 322.663 - mae: 167.969 - mean_q: 181.355

Interval 230 (

200 episodes - episode_reward: 197.050 [10.000, 290.000] - loss: 346.782 - mae: 173.408 - mean_q: 187.702

Interval 261 (2600000 steps performed)
200 episodes - episode_reward: 195.900 [-20.000, 310.000] - loss: 349.648 - mae: 174.219 - mean_q: 188.441

Interval 262 (2610000 steps performed)
200 episodes - episode_reward: 187.600 [-20.000, 350.000] - loss: 345.133 - mae: 174.286 - mean_q: 188.044

Interval 263 (2620000 steps performed)
200 episodes - episode_reward: 200.800 [10.000, 300.000] - loss: 339.457 - mae: 172.226 - mean_q: 185.566

Interval 264 (2630000 steps performed)
200 episodes - episode_reward: 191.300 [50.000, 280.000] - loss: 341.380 - mae: 171.305 - mean_q: 184.817

Interval 265 (2640000 steps performed)
200 episodes - episode_reward: 195.400 [30.000, 310.000] - loss: 338.871 - mae: 170.670 - mean_q: 184.212

Interval 266 (2650000 steps performed)
200 episodes - episode_reward: 190.700 [-10.000, 290.000] - loss: 342.784 - mae: 173.306 - mean_q: 186.939

Interval 267 (

200 episodes - episode_reward: 219.450 [70.000, 300.000] - loss: 353.150 - mae: 176.006 - mean_q: 190.787

Interval 298 (2970000 steps performed)
200 episodes - episode_reward: 205.700 [60.000, 330.000] - loss: 360.830 - mae: 175.630 - mean_q: 190.333

Interval 299 (2980000 steps performed)
200 episodes - episode_reward: 209.550 [-20.000, 320.000] - loss: 356.955 - mae: 175.688 - mean_q: 190.101

Interval 300 (2990000 steps performed)
200 episodes - episode_reward: 205.750 [50.000, 310.000] - loss: 347.215 - mae: 170.406 - mean_q: 184.162

Interval 301 (3000000 steps performed)
200 episodes - episode_reward: 189.950 [0.000, 290.000] - loss: 340.222 - mae: 170.077 - mean_q: 183.622

Interval 302 (3010000 steps performed)
200 episodes - episode_reward: 193.550 [-30.000, 290.000] - loss: 336.376 - mae: 172.422 - mean_q: 185.980

Interval 303 (3020000 steps performed)
200 episodes - episode_reward: 190.300 [-10.000, 330.000] - loss: 330.782 - mae: 170.520 - mean_q: 183.722

Interval 304 (3

200 episodes - episode_reward: 173.000 [-20.000, 310.000] - loss: 255.042 - mae: 149.751 - mean_q: 161.548

Interval 335 (3340000 steps performed)
200 episodes - episode_reward: 163.750 [-30.000, 250.000] - loss: 251.123 - mae: 145.901 - mean_q: 157.549

Interval 336 (3350000 steps performed)
200 episodes - episode_reward: 165.950 [-30.000, 280.000] - loss: 239.893 - mae: 144.508 - mean_q: 155.820

Interval 337 (3360000 steps performed)
200 episodes - episode_reward: 170.450 [10.000, 290.000] - loss: 239.589 - mae: 144.670 - mean_q: 155.896

Interval 338 (3370000 steps performed)
200 episodes - episode_reward: 168.300 [-40.000, 320.000] - loss: 240.855 - mae: 142.919 - mean_q: 154.086

Interval 339 (3380000 steps performed)
200 episodes - episode_reward: 157.550 [-30.000, 290.000] - loss: 255.303 - mae: 148.215 - mean_q: 159.914

Interval 340 (3390000 steps performed)
200 episodes - episode_reward: 144.600 [-40.000, 280.000] - loss: 262.018 - mae: 150.329 - mean_q: 162.023

Interval 34

200 episodes - episode_reward: 159.800 [-50.000, 270.000] - loss: 244.813 - mae: 144.532 - mean_q: 156.216

Interval 372 (3710000 steps performed)
200 episodes - episode_reward: 173.950 [20.000, 270.000] - loss: 259.477 - mae: 149.041 - mean_q: 161.428

Interval 373 (3720000 steps performed)
200 episodes - episode_reward: 175.550 [50.000, 270.000] - loss: 270.364 - mae: 151.723 - mean_q: 164.141

Interval 374 (3730000 steps performed)
200 episodes - episode_reward: 157.300 [-20.000, 300.000] - loss: 270.598 - mae: 152.341 - mean_q: 164.732

Interval 375 (3740000 steps performed)
200 episodes - episode_reward: 149.850 [-40.000, 260.000] - loss: 258.584 - mae: 149.675 - mean_q: 161.664

Interval 376 (3750000 steps performed)
200 episodes - episode_reward: 153.550 [10.000, 270.000] - loss: 245.276 - mae: 144.669 - mean_q: 156.321

Interval 377 (3760000 steps performed)
200 episodes - episode_reward: 164.200 [20.000, 280.000] - loss: 234.576 - mae: 140.353 - mean_q: 151.622

Interval 378 (

200 episodes - episode_reward: 146.050 [-30.000, 310.000] - loss: 240.470 - mae: 143.821 - mean_q: 154.787

Interval 409 (4080000 steps performed)
200 episodes - episode_reward: 164.050 [-10.000, 310.000] - loss: 243.218 - mae: 143.398 - mean_q: 154.464

Interval 410 (4090000 steps performed)
200 episodes - episode_reward: 178.750 [-40.000, 300.000] - loss: 246.541 - mae: 144.980 - mean_q: 156.342

Interval 411 (4100000 steps performed)
200 episodes - episode_reward: 179.550 [-40.000, 280.000] - loss: 248.551 - mae: 144.811 - mean_q: 156.369

Interval 412 (4110000 steps performed)
200 episodes - episode_reward: 183.300 [20.000, 300.000] - loss: 260.625 - mae: 147.761 - mean_q: 159.828

Interval 413 (4120000 steps performed)
200 episodes - episode_reward: 175.500 [-30.000, 260.000] - loss: 268.639 - mae: 151.724 - mean_q: 163.828

Interval 414 (4130000 steps performed)
200 episodes - episode_reward: 126.300 [-50.000, 290.000] - loss: 287.521 - mae: 158.202 - mean_q: 170.685

Interval 41

200 episodes - episode_reward: 157.400 [-40.000, 270.000] - loss: 216.796 - mae: 136.889 - mean_q: 147.518

Interval 446 (4450000 steps performed)
200 episodes - episode_reward: 150.450 [-20.000, 270.000] - loss: 226.058 - mae: 139.280 - mean_q: 149.820

Interval 447 (4460000 steps performed)
200 episodes - episode_reward: 154.500 [-50.000, 250.000] - loss: 218.298 - mae: 136.522 - mean_q: 147.089

Interval 448 (4470000 steps performed)
200 episodes - episode_reward: 162.150 [-40.000, 280.000] - loss: 214.885 - mae: 136.448 - mean_q: 147.161

Interval 449 (4480000 steps performed)
200 episodes - episode_reward: 183.600 [40.000, 320.000] - loss: 223.076 - mae: 137.252 - mean_q: 148.116

Interval 450 (4490000 steps performed)
200 episodes - episode_reward: 169.800 [-40.000, 290.000] - loss: 230.061 - mae: 140.169 - mean_q: 151.126

Interval 451 (4500000 steps performed)
200 episodes - episode_reward: 172.850 [-50.000, 260.000] - loss: 232.696 - mae: 142.926 - mean_q: 154.317

Interval 45

200 episodes - episode_reward: 82.550 [-50.000, 220.000] - loss: 188.313 - mae: 125.904 - mean_q: 135.319

Interval 483 (4820000 steps performed)
200 episodes - episode_reward: 90.550 [-50.000, 210.000] - loss: 165.403 - mae: 116.776 - mean_q: 125.572

Interval 484 (4830000 steps performed)
200 episodes - episode_reward: 89.300 [-50.000, 220.000] - loss: 136.148 - mae: 105.969 - mean_q: 113.965

Interval 485 (4840000 steps performed)
200 episodes - episode_reward: 96.450 [-50.000, 240.000] - loss: 121.800 - mae: 99.634 - mean_q: 107.228

Interval 486 (4850000 steps performed)
200 episodes - episode_reward: 102.150 [-40.000, 220.000] - loss: 112.350 - mae: 96.859 - mean_q: 104.128

Interval 487 (4860000 steps performed)
200 episodes - episode_reward: 96.100 [-40.000, 220.000] - loss: 116.674 - mae: 98.929 - mean_q: 106.319

Interval 488 (4870000 steps performed)
200 episodes - episode_reward: 106.150 [-50.000, 230.000] - loss: 113.613 - mae: 97.801 - mean_q: 105.223

Interval 489 (48800

In [15]:
# Convert the per-episode training metrics to built-in Python numbers, in place
# on the history dict, before it is written out with json.dump.
# NOTE(review): presumably the raw values are NumPy scalars, which json cannot serialise — confirm.
data = history.history
for metric, to_builtin in (
    ('episode_reward', float),
    ('nb_episode_steps', int),
    ('nb_steps', int),
):
    data[metric] = [to_builtin(value) for value in data[metric]]

In [18]:
# Make sure the agent's output directory exists before writing into it.
# makedirs with exist_ok=True also creates the parent 'agents/' folder and keeps
# this cell re-runnable; os.mkdir raised FileExistsError on a second run and
# FileNotFoundError when 'agents/' was missing.
os.makedirs('agents/{}'.format(name), exist_ok=True)
# Persist the training history as JSON at the path given by get_training_path.
with open(get_training_path(name), 'w') as f:
    json.dump(data, f)

Save the trained agent's weights to disk

In [19]:
# Write the trained network weights to the path given by get_agent_path,
# replacing any existing file (overwrite=True).
dqn.save_weights(get_agent_path(name), overwrite=True)

## 4. Reload the agent from disk and test it ##

In [20]:
# Fresh environment for evaluation, created with a fixed seed (seed=2).
# Note this rebinds `env`, replacing the training environment.
env = GridworldMultiAgentv1(seed=2)

In [21]:
# Rebuild the network and agent with the same settings used for training so the
# saved weights can be loaded into them.
states = env.observation_space.shape[0]
actions = env.action_space.n
model = build_model(states, actions, [32, 16], ['relu', 'relu'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()
# eps=0 removes random exploration from the epsilon-greedy policy.
dqn = build_agent(model, actions, 0.01, EpsGreedyQPolicy(eps=0), 50000)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Load weights
dqn.load_weights(get_agent_path(name))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 8)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                288       
_________________________________________________________________
dense_4 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_5 (Dense)              (None, 25)                425       
Total params: 1,241
Trainable params: 1,241
Non-trainable params: 0
_________________________________________________________________
None


In [24]:
# Number of evaluation episodes. It is passed to dqn.test and also read by
# get_test_path to name the output file.
nb_episodes = 1*THOUSAND

In [23]:
# Evaluate the reloaded agent for nb_episodes episodes, without rendering or logging.
# The returned history holds the per-episode rewards used below.
scores = dqn.test(env, nb_episodes=nb_episodes, visualize=False, verbose=0)

In [25]:
def get_test_path(name, episodes=None):
    """Return the path of the text file holding an agent's test rewards.

    Args:
        name: Experiment/agent identifier; used both as the sub-directory of
            'agents/' and as the file-name prefix.
        episodes: Number of test episodes to encode in the file name. When
            omitted, the notebook-level ``nb_episodes`` is used, which keeps
            existing one-argument calls working.

    Returns:
        A relative path of the form 'agents/<name>/<name>_test_<episodes>episodes.txt'.
    """
    if episodes is None:
        # Backward-compatible fallback to the global set in the cell above.
        episodes = nb_episodes
    return 'agents/{}/{}_test_{}episodes.txt'.format(name, name, episodes)

In [30]:
# Per-episode test rewards as a NumPy array, ready to be saved with np.savetxt.
rewards = np.array(scores.history['episode_reward'])

In [32]:
# Write the test rewards as plain text inside the agent's directory.
np.savetxt(get_test_path(name), rewards)