# Training a basic setting with a Deep Q Network (DQN) #

Import statements

In [1]:
import json
import os

import numpy as np

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rc
# Render figure text through LaTeX. NOTE: usetex=True needs a working LaTeX installation
# on the machine running the notebook; plotting fails without one.
rc('text', usetex=True)
%matplotlib inline

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam, SGD

In [3]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory  # For experience replay!

In [4]:
from gym_environment_ncml import *
from learning import *

pygame 2.0.1 (SDL 2.0.14, Python 3.7.10)
Hello from the pygame community. https://www.pygame.org/contribute.html


Useful numbers

In [5]:
# Step-count shorthands, smallest to largest (digit-grouped for readability).
THOUSAND = 1_000
HTHOUSAND = 100_000  # one hundred thousand
MILLION = 1_000_000

## 1. Create environment ##

In [6]:
# Instantiate the multi-agent gridworld environment with its default settings.
# The class is pulled in by one of the wildcard imports above (presumably gym_environment_ncml -- confirm).
env = GridworldMultiAgentv15()



In [7]:
# Length of the observation vector (first dimension of the observation space);
# passed to build_model as the network input size.
states = env.observation_space.shape[0]
# Number of discrete actions; passed to build_model and build_agent.
actions = env.action_space.n

In [8]:
# Display both sizes as a quick sanity check.
states, actions

(8, 25)

## 2. Create a Deep Learning Model with Keras ##

In [9]:
# Build the Q-network. According to the recorded model.summary() output this yields
# Flatten -> Dense(32) -> Dense(16) -> Dense(actions); the 'relu' entries are presumably the
# hidden-layer activations (build_model is not defined in this notebook -- confirm).
model = build_model(states, actions, [32, 16], ['relu', 'relu'])

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [10]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 32)                288       
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 25)                425       
Total params: 1,241
Trainable params: 1,241
Non-trainable params: 0
_________________________________________________________________


## 3. Build Agent with Keras-RL ##

In [11]:
# Wrap the network in a DQN agent that uses an epsilon-greedy policy with its default settings.
# The positional 0.01 and 50000 presumably correspond to the target-model update rate ("tmu0.01")
# and the replay-memory limit ("ml50K") encoded in the run name -- TODO confirm against
# build_agent, which is not defined in this notebook.
dqn = build_agent(model, actions, 0.01, EpsGreedyQPolicy(), 50000)
# Compile with Adam at learning rate 1e-3 and track mean absolute error as an extra metric.
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
# Alternative configuration kept for reference (higher learning rate, MSE metric):
# dqn.compile(Adam(lr=1e-2), metrics=['mse'])

In [14]:
# Run identifier; it is passed to get_training_path / get_agent_path and used for the
# 'agents/<name>' output directory. The hyperparameters are encoded by hand, so they must be
# kept in sync with the model/agent cells manually.
# NOTE(review): "3236" does not obviously match the [32, 16] hidden layers given to build_model -- confirm.
name = 'dqn15_5b5_3236_adam_lr0.001_tmu0.01_ml50K_ns7M_eps0.1'

In [13]:
# Train for 7 million environment steps without rendering. This is a long-running cell:
# with verbose=1 the recorded output shows one summary line per 10000-step interval.
# The returned object's .history dict is what gets saved afterwards.
history = dqn.fit(env, nb_steps=7*MILLION, visualize=False, verbose=1)

Training for 7000000 steps ...
Interval 1 (0 steps performed)
200 episodes - episode_reward: -0.400 [-50.000, 120.000] - loss: 5.089 - mae: 9.467 - mean_q: 11.273

Interval 2 (10000 steps performed)
200 episodes - episode_reward: 9.950 [-50.000, 150.000] - loss: 14.845 - mae: 27.670 - mean_q: 30.798

Interval 3 (20000 steps performed)
200 episodes - episode_reward: 13.700 [-50.000, 140.000] - loss: 21.550 - mae: 37.070 - mean_q: 40.664

Interval 4 (30000 steps performed)
200 episodes - episode_reward: 15.200 [-50.000, 160.000] - loss: 25.897 - mae: 40.404 - mean_q: 44.201

Interval 5 (40000 steps performed)
200 episodes - episode_reward: 32.350 [-50.000, 180.000] - loss: 28.149 - mae: 42.560 - mean_q: 46.517

Interval 6 (50000 steps performed)
200 episodes - episode_reward: 36.100 [-50.000, 170.000] - loss: 31.780 - mae: 46.317 - mean_q: 50.542

Interval 7 (60000 steps performed)
200 episodes - episode_reward: 50.650 [-50.000, 200.000] - loss: 37.967 - mae: 50.476 - mean_q: 55.064

Int

200 episodes - episode_reward: 159.000 [-50.000, 260.000] - loss: 319.628 - mae: 165.070 - mean_q: 178.934

Interval 39 (380000 steps performed)
200 episodes - episode_reward: 155.900 [-30.000, 300.000] - loss: 319.300 - mae: 166.167 - mean_q: 180.127

Interval 40 (390000 steps performed)
200 episodes - episode_reward: 166.700 [0.000, 280.000] - loss: 320.034 - mae: 167.128 - mean_q: 181.249

Interval 41 (400000 steps performed)
200 episodes - episode_reward: 159.700 [-50.000, 270.000] - loss: 322.223 - mae: 166.117 - mean_q: 179.967

Interval 42 (410000 steps performed)
200 episodes - episode_reward: 164.150 [-50.000, 300.000] - loss: 320.980 - mae: 167.174 - mean_q: 181.229

Interval 43 (420000 steps performed)
200 episodes - episode_reward: 162.250 [-40.000, 280.000] - loss: 326.796 - mae: 168.622 - mean_q: 182.933

Interval 44 (430000 steps performed)
200 episodes - episode_reward: 158.350 [-50.000, 270.000] - loss: 335.964 - mae: 168.840 - mean_q: 183.463

Interval 45 (440000 step

200 episodes - episode_reward: 171.250 [-40.000, 300.000] - loss: 378.328 - mae: 181.475 - mean_q: 196.995

Interval 76 (750000 steps performed)
200 episodes - episode_reward: 182.050 [0.000, 300.000] - loss: 378.091 - mae: 183.778 - mean_q: 199.360

Interval 77 (760000 steps performed)
200 episodes - episode_reward: 167.900 [-50.000, 280.000] - loss: 371.074 - mae: 179.186 - mean_q: 194.114

Interval 78 (770000 steps performed)
200 episodes - episode_reward: 175.000 [-30.000, 290.000] - loss: 361.923 - mae: 177.361 - mean_q: 192.239

Interval 79 (780000 steps performed)
200 episodes - episode_reward: 183.100 [0.000, 280.000] - loss: 350.744 - mae: 172.969 - mean_q: 187.370

Interval 80 (790000 steps performed)
200 episodes - episode_reward: 191.750 [20.000, 320.000] - loss: 354.972 - mae: 177.146 - mean_q: 192.267

Interval 81 (800000 steps performed)
200 episodes - episode_reward: 191.250 [0.000, 320.000] - loss: 370.049 - mae: 179.130 - mean_q: 194.408

Interval 82 (810000 steps per

200 episodes - episode_reward: 186.400 [40.000, 310.000] - loss: 372.057 - mae: 178.624 - mean_q: 194.151

Interval 113 (1120000 steps performed)
200 episodes - episode_reward: 198.550 [-50.000, 350.000] - loss: 368.855 - mae: 178.706 - mean_q: 194.331

Interval 114 (1130000 steps performed)
200 episodes - episode_reward: 186.600 [10.000, 300.000] - loss: 386.386 - mae: 182.103 - mean_q: 198.078

Interval 115 (1140000 steps performed)
200 episodes - episode_reward: 178.250 [-10.000, 310.000] - loss: 374.725 - mae: 179.420 - mean_q: 194.902

Interval 116 (1150000 steps performed)
200 episodes - episode_reward: 197.750 [-20.000, 350.000] - loss: 365.738 - mae: 177.519 - mean_q: 192.872

Interval 117 (1160000 steps performed)
200 episodes - episode_reward: 185.600 [-50.000, 290.000] - loss: 366.406 - mae: 177.352 - mean_q: 192.217

Interval 118 (1170000 steps performed)
200 episodes - episode_reward: 173.400 [-50.000, 280.000] - loss: 358.514 - mae: 175.068 - mean_q: 189.741

Interval 119

200 episodes - episode_reward: 175.000 [-40.000, 270.000] - loss: 341.567 - mae: 172.547 - mean_q: 186.656

Interval 150 (1490000 steps performed)
200 episodes - episode_reward: 184.500 [-10.000, 300.000] - loss: 344.020 - mae: 172.229 - mean_q: 186.406

Interval 151 (1500000 steps performed)
200 episodes - episode_reward: 192.900 [-30.000, 290.000] - loss: 332.097 - mae: 169.203 - mean_q: 183.259

Interval 152 (1510000 steps performed)
200 episodes - episode_reward: 198.800 [-10.000, 280.000] - loss: 350.688 - mae: 174.236 - mean_q: 188.831

Interval 153 (1520000 steps performed)
200 episodes - episode_reward: 209.100 [-20.000, 300.000] - loss: 363.877 - mae: 176.658 - mean_q: 191.500

Interval 154 (1530000 steps performed)
200 episodes - episode_reward: 203.950 [-20.000, 330.000] - loss: 377.150 - mae: 180.370 - mean_q: 195.886

Interval 155 (1540000 steps performed)
200 episodes - episode_reward: 197.150 [-50.000, 330.000] - loss: 378.401 - mae: 181.976 - mean_q: 197.027

Interval 1

200 episodes - episode_reward: 198.050 [0.000, 330.000] - loss: 353.046 - mae: 175.302 - mean_q: 189.821

Interval 187 (1860000 steps performed)
200 episodes - episode_reward: 190.200 [-30.000, 310.000] - loss: 371.128 - mae: 178.432 - mean_q: 193.071

Interval 188 (1870000 steps performed)
200 episodes - episode_reward: 187.800 [-50.000, 330.000] - loss: 360.425 - mae: 175.001 - mean_q: 189.311

Interval 189 (1880000 steps performed)
200 episodes - episode_reward: 182.150 [-40.000, 300.000] - loss: 346.858 - mae: 173.210 - mean_q: 187.362

Interval 190 (1890000 steps performed)
200 episodes - episode_reward: 187.300 [-40.000, 290.000] - loss: 342.726 - mae: 172.519 - mean_q: 186.453

Interval 191 (1900000 steps performed)
200 episodes - episode_reward: 192.900 [-10.000, 280.000] - loss: 340.353 - mae: 172.916 - mean_q: 186.796

Interval 192 (1910000 steps performed)
200 episodes - episode_reward: 192.500 [-30.000, 300.000] - loss: 345.077 - mae: 172.224 - mean_q: 186.354

Interval 193

200 episodes - episode_reward: 181.950 [-50.000, 290.000] - loss: 305.177 - mae: 159.903 - mean_q: 173.629

Interval 224 (2230000 steps performed)
200 episodes - episode_reward: 169.350 [-30.000, 300.000] - loss: 315.444 - mae: 164.436 - mean_q: 178.611

Interval 225 (2240000 steps performed)
200 episodes - episode_reward: 197.150 [-10.000, 310.000] - loss: 315.134 - mae: 165.584 - mean_q: 179.673

Interval 226 (2250000 steps performed)
200 episodes - episode_reward: 181.250 [-40.000, 290.000] - loss: 320.361 - mae: 165.319 - mean_q: 179.193

Interval 227 (2260000 steps performed)
200 episodes - episode_reward: 193.450 [-10.000, 310.000] - loss: 319.325 - mae: 166.652 - mean_q: 180.658

Interval 228 (2270000 steps performed)
200 episodes - episode_reward: 186.800 [-50.000, 320.000] - loss: 322.135 - mae: 165.723 - mean_q: 179.555

Interval 229 (2280000 steps performed)
200 episodes - episode_reward: 176.400 [-50.000, 280.000] - loss: 323.002 - mae: 165.804 - mean_q: 179.843

Interval 2

200 episodes - episode_reward: 195.600 [10.000, 310.000] - loss: 336.068 - mae: 170.427 - mean_q: 184.819

Interval 261 (2600000 steps performed)
200 episodes - episode_reward: 196.350 [20.000, 290.000] - loss: 336.125 - mae: 170.859 - mean_q: 185.342

Interval 262 (2610000 steps performed)
200 episodes - episode_reward: 188.600 [0.000, 300.000] - loss: 334.830 - mae: 170.253 - mean_q: 184.663

Interval 263 (2620000 steps performed)
200 episodes - episode_reward: 193.550 [-50.000, 310.000] - loss: 330.464 - mae: 169.254 - mean_q: 183.187

Interval 264 (2630000 steps performed)
200 episodes - episode_reward: 206.450 [50.000, 320.000] - loss: 344.503 - mae: 172.158 - mean_q: 186.192

Interval 265 (2640000 steps performed)
200 episodes - episode_reward: 207.100 [30.000, 350.000] - loss: 339.388 - mae: 173.017 - mean_q: 187.003

Interval 266 (2650000 steps performed)
200 episodes - episode_reward: 186.550 [-30.000, 310.000] - loss: 348.108 - mae: 172.191 - mean_q: 186.013

Interval 267 (26

200 episodes - episode_reward: 193.350 [-50.000, 300.000] - loss: 311.405 - mae: 162.387 - mean_q: 175.584

Interval 298 (2970000 steps performed)
200 episodes - episode_reward: 192.050 [20.000, 320.000] - loss: 312.561 - mae: 165.715 - mean_q: 179.266

Interval 299 (2980000 steps performed)
200 episodes - episode_reward: 185.400 [20.000, 270.000] - loss: 319.424 - mae: 166.215 - mean_q: 179.438

Interval 300 (2990000 steps performed)
200 episodes - episode_reward: 185.800 [-10.000, 290.000] - loss: 305.445 - mae: 161.892 - mean_q: 174.742

Interval 301 (3000000 steps performed)
200 episodes - episode_reward: 173.050 [10.000, 300.000] - loss: 307.233 - mae: 163.315 - mean_q: 176.283

Interval 302 (3010000 steps performed)
200 episodes - episode_reward: 179.750 [-10.000, 300.000] - loss: 304.150 - mae: 162.522 - mean_q: 175.460

Interval 303 (3020000 steps performed)
200 episodes - episode_reward: 190.150 [-20.000, 300.000] - loss: 299.715 - mae: 159.940 - mean_q: 172.733

Interval 304 

200 episodes - episode_reward: 196.650 [-30.000, 290.000] - loss: 325.685 - mae: 167.250 - mean_q: 181.074

Interval 335 (3340000 steps performed)
200 episodes - episode_reward: 199.500 [-50.000, 310.000] - loss: 313.027 - mae: 163.023 - mean_q: 176.296

Interval 336 (3350000 steps performed)
200 episodes - episode_reward: 176.850 [-30.000, 290.000] - loss: 317.154 - mae: 166.866 - mean_q: 180.272

Interval 337 (3360000 steps performed)
200 episodes - episode_reward: 192.850 [-30.000, 330.000] - loss: 317.296 - mae: 164.477 - mean_q: 177.795

Interval 338 (3370000 steps performed)
200 episodes - episode_reward: 191.300 [30.000, 310.000] - loss: 318.936 - mae: 166.473 - mean_q: 179.968

Interval 339 (3380000 steps performed)
200 episodes - episode_reward: 198.700 [-20.000, 310.000] - loss: 319.470 - mae: 166.109 - mean_q: 179.446

Interval 340 (3390000 steps performed)
200 episodes - episode_reward: 189.200 [-20.000, 300.000] - loss: 316.410 - mae: 165.624 - mean_q: 178.983

Interval 34

200 episodes - episode_reward: 185.700 [-40.000, 310.000] - loss: 315.227 - mae: 165.259 - mean_q: 178.340

Interval 372 (3710000 steps performed)
200 episodes - episode_reward: 194.000 [10.000, 310.000] - loss: 317.717 - mae: 166.692 - mean_q: 179.972

Interval 373 (3720000 steps performed)
200 episodes - episode_reward: 206.700 [-50.000, 320.000] - loss: 312.524 - mae: 164.095 - mean_q: 177.144

Interval 374 (3730000 steps performed)
200 episodes - episode_reward: 191.050 [-30.000, 310.000] - loss: 319.197 - mae: 168.506 - mean_q: 182.211

Interval 375 (3740000 steps performed)
200 episodes - episode_reward: 195.800 [-10.000, 340.000] - loss: 331.263 - mae: 168.881 - mean_q: 182.590

Interval 376 (3750000 steps performed)
200 episodes - episode_reward: 196.200 [-10.000, 310.000] - loss: 317.471 - mae: 165.485 - mean_q: 179.063

Interval 377 (3760000 steps performed)
200 episodes - episode_reward: 189.100 [20.000, 290.000] - loss: 318.708 - mae: 165.644 - mean_q: 179.206

Interval 378

200 episodes - episode_reward: 191.350 [0.000, 320.000] - loss: 338.992 - mae: 170.708 - mean_q: 184.531

Interval 409 (4080000 steps performed)
200 episodes - episode_reward: 185.700 [-10.000, 320.000] - loss: 320.789 - mae: 166.486 - mean_q: 179.620

Interval 410 (4090000 steps performed)
200 episodes - episode_reward: 205.200 [10.000, 310.000] - loss: 318.286 - mae: 166.111 - mean_q: 179.375

Interval 411 (4100000 steps performed)
200 episodes - episode_reward: 196.300 [-40.000, 310.000] - loss: 316.954 - mae: 167.023 - mean_q: 180.205

Interval 412 (4110000 steps performed)
200 episodes - episode_reward: 202.950 [0.000, 310.000] - loss: 334.275 - mae: 169.354 - mean_q: 182.794

Interval 413 (4120000 steps performed)
200 episodes - episode_reward: 202.100 [20.000, 320.000] - loss: 328.678 - mae: 168.022 - mean_q: 181.280

Interval 414 (4130000 steps performed)
200 episodes - episode_reward: 207.300 [20.000, 310.000] - loss: 335.787 - mae: 170.863 - mean_q: 184.564

Interval 415 (414

200 episodes - episode_reward: 198.850 [50.000, 310.000] - loss: 342.153 - mae: 173.292 - mean_q: 187.152

Interval 446 (4450000 steps performed)
200 episodes - episode_reward: 211.500 [0.000, 310.000] - loss: 342.225 - mae: 173.615 - mean_q: 187.554

Interval 447 (4460000 steps performed)
200 episodes - episode_reward: 211.350 [10.000, 340.000] - loss: 341.521 - mae: 172.180 - mean_q: 186.205

Interval 448 (4470000 steps performed)
200 episodes - episode_reward: 187.100 [0.000, 330.000] - loss: 345.911 - mae: 172.949 - mean_q: 187.187

Interval 449 (4480000 steps performed)
200 episodes - episode_reward: 164.450 [-50.000, 300.000] - loss: 364.616 - mae: 178.717 - mean_q: 193.232

Interval 450 (4490000 steps performed)
200 episodes - episode_reward: 207.350 [70.000, 320.000] - loss: 350.992 - mae: 175.546 - mean_q: 189.734

Interval 451 (4500000 steps performed)
200 episodes - episode_reward: 208.200 [10.000, 310.000] - loss: 346.900 - mae: 173.210 - mean_q: 187.324

Interval 452 (4510

200 episodes - episode_reward: 186.800 [0.000, 300.000] - loss: 341.157 - mae: 172.680 - mean_q: 186.319

Interval 483 (4820000 steps performed)
200 episodes - episode_reward: 200.700 [-10.000, 340.000] - loss: 335.261 - mae: 171.528 - mean_q: 185.160

Interval 484 (4830000 steps performed)
200 episodes - episode_reward: 210.200 [10.000, 300.000] - loss: 347.576 - mae: 172.395 - mean_q: 186.269

Interval 485 (4840000 steps performed)
200 episodes - episode_reward: 212.700 [10.000, 310.000] - loss: 332.870 - mae: 170.121 - mean_q: 183.625

Interval 486 (4850000 steps performed)
200 episodes - episode_reward: 213.500 [-40.000, 340.000] - loss: 329.148 - mae: 169.312 - mean_q: 182.825

Interval 487 (4860000 steps performed)
200 episodes - episode_reward: 209.950 [-50.000, 310.000] - loss: 333.655 - mae: 168.656 - mean_q: 182.116

Interval 488 (4870000 steps performed)
200 episodes - episode_reward: 196.300 [-30.000, 320.000] - loss: 338.075 - mae: 172.907 - mean_q: 186.581

Interval 489 (

200 episodes - episode_reward: 214.100 [-10.000, 310.000] - loss: 346.758 - mae: 173.765 - mean_q: 187.822

Interval 520 (5190000 steps performed)
200 episodes - episode_reward: 214.100 [-20.000, 320.000] - loss: 353.667 - mae: 175.314 - mean_q: 189.433

Interval 521 (5200000 steps performed)
200 episodes - episode_reward: 206.050 [-10.000, 320.000] - loss: 342.299 - mae: 172.764 - mean_q: 186.634

Interval 522 (5210000 steps performed)
200 episodes - episode_reward: 195.200 [20.000, 310.000] - loss: 339.491 - mae: 172.389 - mean_q: 185.935

Interval 523 (5220000 steps performed)
200 episodes - episode_reward: 180.650 [-40.000, 310.000] - loss: 328.542 - mae: 167.568 - mean_q: 180.553

Interval 524 (5230000 steps performed)
200 episodes - episode_reward: 202.100 [20.000, 300.000] - loss: 319.516 - mae: 165.241 - mean_q: 178.490

Interval 525 (5240000 steps performed)
200 episodes - episode_reward: 195.450 [-50.000, 310.000] - loss: 322.874 - mae: 167.899 - mean_q: 181.414

Interval 526

200 episodes - episode_reward: 181.450 [-20.000, 300.000] - loss: 257.439 - mae: 150.669 - mean_q: 162.586

Interval 557 (5560000 steps performed)
200 episodes - episode_reward: 188.650 [-20.000, 310.000] - loss: 261.629 - mae: 150.922 - mean_q: 162.983

Interval 558 (5570000 steps performed)
200 episodes - episode_reward: 189.350 [-30.000, 290.000] - loss: 268.189 - mae: 151.250 - mean_q: 163.254

Interval 559 (5580000 steps performed)
200 episodes - episode_reward: 192.400 [0.000, 280.000] - loss: 280.955 - mae: 155.011 - mean_q: 167.441

Interval 560 (5590000 steps performed)
200 episodes - episode_reward: 202.950 [10.000, 320.000] - loss: 299.166 - mae: 161.198 - mean_q: 174.029

Interval 561 (5600000 steps performed)
200 episodes - episode_reward: 213.850 [-20.000, 320.000] - loss: 324.048 - mae: 167.272 - mean_q: 180.721

Interval 562 (5610000 steps performed)
200 episodes - episode_reward: 195.800 [-10.000, 330.000] - loss: 332.087 - mae: 169.963 - mean_q: 183.133

Interval 563 

200 episodes - episode_reward: 201.950 [30.000, 320.000] - loss: 331.443 - mae: 169.224 - mean_q: 182.746

Interval 594 (5930000 steps performed)
200 episodes - episode_reward: 205.250 [10.000, 310.000] - loss: 330.095 - mae: 169.408 - mean_q: 182.796

Interval 595 (5940000 steps performed)
200 episodes - episode_reward: 211.650 [50.000, 340.000] - loss: 329.222 - mae: 169.820 - mean_q: 183.339

Interval 596 (5950000 steps performed)
200 episodes - episode_reward: 185.250 [-10.000, 300.000] - loss: 329.156 - mae: 169.102 - mean_q: 182.434

Interval 597 (5960000 steps performed)
200 episodes - episode_reward: 173.000 [-40.000, 300.000] - loss: 320.602 - mae: 168.004 - mean_q: 180.871

Interval 598 (5970000 steps performed)
200 episodes - episode_reward: 146.500 [-40.000, 260.000] - loss: 301.645 - mae: 162.649 - mean_q: 175.045

Interval 599 (5980000 steps performed)
200 episodes - episode_reward: 132.100 [-50.000, 250.000] - loss: 310.632 - mae: 164.742 - mean_q: 177.070

Interval 600 

200 episodes - episode_reward: 208.200 [-10.000, 300.000] - loss: 294.491 - mae: 159.705 - mean_q: 172.111

Interval 631 (6300000 steps performed)
200 episodes - episode_reward: 211.500 [0.000, 320.000] - loss: 307.209 - mae: 161.874 - mean_q: 174.631

Interval 632 (6310000 steps performed)
200 episodes - episode_reward: 205.650 [-30.000, 310.000] - loss: 308.797 - mae: 163.894 - mean_q: 176.515

Interval 633 (6320000 steps performed)
200 episodes - episode_reward: 180.050 [-10.000, 300.000] - loss: 312.780 - mae: 165.658 - mean_q: 178.207

Interval 634 (6330000 steps performed)
200 episodes - episode_reward: 187.300 [-20.000, 280.000] - loss: 300.550 - mae: 162.964 - mean_q: 175.344

Interval 635 (6340000 steps performed)
200 episodes - episode_reward: 197.650 [-50.000, 280.000] - loss: 302.094 - mae: 161.125 - mean_q: 173.426

Interval 636 (6350000 steps performed)
200 episodes - episode_reward: 191.450 [0.000, 300.000] - loss: 294.574 - mae: 162.048 - mean_q: 174.443

Interval 637 (

200 episodes - episode_reward: 157.150 [-40.000, 270.000] - loss: 242.750 - mae: 146.216 - mean_q: 157.111

Interval 668 (6670000 steps performed)
200 episodes - episode_reward: 175.000 [-10.000, 250.000] - loss: 227.302 - mae: 140.806 - mean_q: 151.131

Interval 669 (6680000 steps performed)
200 episodes - episode_reward: 184.650 [-10.000, 290.000] - loss: 225.221 - mae: 139.439 - mean_q: 149.866

Interval 670 (6690000 steps performed)
200 episodes - episode_reward: 180.700 [10.000, 310.000] - loss: 225.036 - mae: 139.473 - mean_q: 149.675

Interval 671 (6700000 steps performed)
200 episodes - episode_reward: 188.900 [90.000, 280.000] - loss: 234.107 - mae: 143.309 - mean_q: 153.840

Interval 672 (6710000 steps performed)
200 episodes - episode_reward: 178.000 [-40.000, 310.000] - loss: 237.807 - mae: 143.980 - mean_q: 154.675

Interval 673 (6720000 steps performed)
200 episodes - episode_reward: 178.650 [-20.000, 270.000] - loss: 243.702 - mae: 144.650 - mean_q: 155.824

Interval 674

In [15]:
# Pull the metrics dict out of the object returned by dqn.fit.
data = history.history
# Cast every logged value to a built-in float/int before serialisation -- presumably they
# arrive as NumPy scalars, which json.dump cannot handle (TODO confirm).
data['episode_reward'] = list(map(float, data['episode_reward']))
data['nb_episode_steps'] = list(map(int, data['nb_episode_steps']))
data['nb_steps'] = list(map(int, data['nb_steps']))

In [None]:
# Inspect the history dict. NOTE(review): for a long run this dumps every logged value,
# so the output can be very large; the cell was left unexecuted in the saved notebook.
data

In [16]:
# Make sure the run's output directory exists before writing into it. Unlike os.mkdir,
# os.makedirs(..., exist_ok=True) also creates the parent 'agents' folder when it is missing
# and does not raise when the directory is already there, so this cell can be re-run safely.
os.makedirs(os.path.join('agents', name), exist_ok=True)
# Persist the training history as JSON. get_training_path is not defined in this notebook;
# it presumably resolves to a file inside the directory created above -- confirm.
with open(get_training_path(name), 'w') as f:
    json.dump(data, f)

Save the agent's weights to disk

In [17]:
# Write the trained network weights to the path derived from the run name;
# overwrite=True replaces an existing weights file instead of refusing to write.
dqn.save_weights(get_agent_path(name), overwrite=True)

## 4. Reloading training from disk ##