"""
Implementation of DeepMind's Deep Q-Learning by Fabio M. Graetz, 2018
If you have questions or suggestions, send me an email: fabiograetzatgooglemaildotcom
"""
import os
import random
# import gym
import tensorflow as tf
import numpy as np
import imageio
from skimage.transform import resize
from conv_env import MyApp as conv_env
import pdb
import cv2
import time
TRAIN = True
TEST = True
class ProcessFrame:
"""Resizes and converts RGB Env frames to grayscale"""
def __init__(self, frame_height=84, frame_width=84):
"""
Args:
frame_height: Integer, Height of a frame of an Env game
frame_width: Integer, Width of a frame of an Env game
"""
self.frame_height = frame_height
self.frame_width = frame_width
self.frame = tf.placeholder(shape=[self.frame_height, self.frame_width, 3], dtype=tf.uint8)
self.processed = tf.image.rgb_to_grayscale(self.frame)
def process(self, session, frame):
"""
Args:
session: A Tensorflow session object
frame: A (84, 84, 3) frame of an Env game in BGR
Returns:
A processed (84, 84, 1) frame in grayscale
"""
return session.run(self.processed, feed_dict={self.frame:frame})
class DQN:
"""Implements a Deep Q Network"""
# pylint: disable=too-many-instance-attributes
def __init__(self, n_actions, hidden=1024, learning_rate=0.00001,
frame_height=84, frame_width=84, agent_history_length=4):
"""
Args:
n_actions: Integer, number of possible actions
hidden: Integer, Number of filters in the final convolutional layer.
This is different from the DeepMind implementation
learning_rate: Float, Learning rate for the Adam optimizer
frame_height: Integer, Height of a frame of an Env game
frame_width: Integer, Width of a frame of an Env game
agent_history_length: Integer, Number of frames stacked together to create a state
"""
self.n_actions = n_actions
self.hidden = hidden
self.learning_rate = learning_rate
self.frame_height = frame_height
self.frame_width = frame_width
self.agent_history_length = agent_history_length
self.input = tf.placeholder(shape=[None, self.frame_height,
self.frame_width, self.agent_history_length],
dtype=tf.float32)
# Normalizing the input
self.inputscaled = self.input/255
# Convolutional layers
self.conv1 = tf.layers.conv2d(
inputs=self.inputscaled, filters=32, kernel_size=[8, 8], strides=4,
kernel_initializer=tf.variance_scaling_initializer(scale=2),
padding="valid", activation=tf.nn.relu, use_bias=False, name='conv1')
self.conv2 = tf.layers.conv2d(
inputs=self.conv1, filters=64, kernel_size=[4, 4], strides=2,
kernel_initializer=tf.variance_scaling_initializer(scale=2),
padding="valid", activation=tf.nn.relu, use_bias=False, name='conv2')
self.conv3 = tf.layers.conv2d(
inputs=self.conv2, filters=64, kernel_size=[3, 3], strides=1,
kernel_initializer=tf.variance_scaling_initializer(scale=2),
padding="valid", activation=tf.nn.relu, use_bias=False, name='conv3')
self.conv4 = tf.layers.conv2d(
inputs=self.conv3, filters=hidden, kernel_size=[7, 7], strides=1,
kernel_initializer=tf.variance_scaling_initializer(scale=2),
padding="valid", activation=tf.nn.relu, use_bias=False, name='conv4')
# Splitting into value and advantage stream
self.valuestream, self.advantagestream = tf.split(self.conv4, 2, 3)
self.valuestream = tf.layers.flatten(self.valuestream)
self.advantagestream = tf.layers.flatten(self.advantagestream)
self.advantage = tf.layers.dense(
inputs=self.advantagestream, units=self.n_actions,
kernel_initializer=tf.variance_scaling_initializer(scale=2), name="advantage")
self.value = tf.layers.dense(
inputs=self.valuestream, units=1,
kernel_initializer=tf.variance_scaling_initializer(scale=2), name='value')
# Combining value and advantage into Q-values (dueling architecture): Q(s,a) = V(s) + A(s,a) - mean_a A(s,a)
self.q_values = self.value + tf.subtract(self.advantage, tf.reduce_mean(self.advantage, axis=1, keepdims=True))
self.best_action = tf.argmax(self.q_values, 1)
# The next lines perform the parameter update.
# targetQ according to Bellman equation:
# Q = r + gamma*max Q', calculated in the function learn()
self.target_q = tf.placeholder(shape=[None], dtype=tf.float32)
# Action that was performed
self.action = tf.placeholder(shape=[None], dtype=tf.int32)
# Q value of the action that was performed
self.Q = tf.reduce_sum(tf.multiply(self.q_values, tf.one_hot(self.action, self.n_actions, dtype=tf.float32)), axis=1)
# Parameter updates
self.loss = tf.reduce_mean(tf.losses.huber_loss(labels=self.target_q, predictions=self.Q))
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
self.update = self.optimizer.minimize(self.loss)
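# Usage sketch (illustrative only; `sess` and `state` are hypothetical names, not defined here):
#   with tf.variable_scope('mainDQN'):
#       main_dqn = DQN(n_actions=4)
#   best_a = sess.run(main_dqn.best_action, feed_dict={main_dqn.input: [state]})
# where `state` would be an (84, 84, 4) stack of grayscale frames.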
class ActionGetter:
"""Determines an action according to an epsilon greedy strategy with annealing epsilon"""
def __init__(self, n_actions, eps_initial=1, eps_final=0.1, eps_final_frame=0.01,
eps_evaluation=0.0, eps_annealing_frames=1000000,
replay_memory_start_size=50000, max_frames=25000000):
"""
Args:
n_actions: Integer, number of possible actions
eps_initial: Float, Exploration probability for the first
replay_memory_start_size frames
eps_final: Float, Exploration probability after
replay_memory_start_size + eps_annealing_frames frames
eps_final_frame: Float, Exploration probability after max_frames frames
eps_evaluation: Float, Exploration probability during evaluation
eps_annealing_frames: Int, Number of frames over which the
exploration probability is annealed from eps_initial to eps_final
replay_memory_start_size: Integer, Number of frames during
which the agent only explores
max_frames: Integer, Total number of frames shown to the agent
"""
self.n_actions = n_actions
self.eps_initial = eps_initial
self.eps_final = eps_final
self.eps_final_frame = eps_final_frame
self.eps_evaluation = eps_evaluation
self.eps_annealing_frames = eps_annealing_frames
self.replay_memory_start_size = replay_memory_start_size
self.max_frames = max_frames
# Slopes and intercepts for exploration decrease
self.slope = -(self.eps_initial - self.eps_final)/self.eps_annealing_frames
self.intercept = self.eps_initial - self.slope*self.replay_memory_start_size
self.slope_2 = -(self.eps_final - self.eps_final_frame)/(self.max_frames - self.eps_annealing_frames - self.replay_memory_start_size)
self.intercept_2 = self.eps_final_frame - self.slope_2*self.max_frames
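# Resulting schedule with the default arguments (for illustration): eps stays at
# eps_initial=1.0 for the first 50,000 frames, is annealed linearly to 0.1 by frame
# 1,050,000, and then decays linearly to eps_final_frame=0.01 at max_frames.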
def get_action(self, session, frame_number, state, main_dqn, evaluation=False):
"""
Args:
session: A tensorflow session object
frame_number: Integer, number of the current frame
state: A (84, 84, 4) sequence of frames of an Env game in grayscale
main_dqn: A DQN object
evaluation: A boolean saying whether the agent is being evaluated
Returns:
An integer between 0 and n_actions - 1 determining the action the agent performs next
"""
if evaluation:
eps = self.eps_evaluation
elif frame_number < self.replay_memory_start_size:
eps = self.eps_initial
elif frame_number >= self.replay_memory_start_size and frame_number < self.replay_memory_start_size + self.eps_annealing_frames:
eps = self.slope*frame_number + self.intercept
elif frame_number >= self.replay_memory_start_size + self.eps_annealing_frames:
eps = self.slope_2*frame_number + self.intercept_2
if np.random.rand(1) < eps:
return np.random.randint(0, self.n_actions)
return session.run(main_dqn.best_action, feed_dict={main_dqn.input:[state]})[0]
class ReplayMemory:
"""Replay Memory that stores the last size=1,000,000 transitions"""
def __init__(self, size=1000000, frame_height=84, frame_width=84,
agent_history_length=4, batch_size=32):
"""
Args:
size: Integer, Number of stored transitions
frame_height: Integer, Height of a frame of an Env game
frame_width: Integer, Width of a frame of an Env game
agent_history_length: Integer, Number of frames stacked together to create a state
batch_size: Integer, Number of transitions returned in a minibatch
"""
self.size = size
self.frame_height = frame_height
self.frame_width = frame_width
self.agent_history_length = agent_history_length
self.batch_size = batch_size
self.count = 0
self.current = 0
# Pre-allocate memory
self.actions = np.empty(self.size, dtype=np.int32)
self.rewards = np.empty(self.size, dtype=np.float32)
self.frames = np.empty((self.size, self.frame_height, self.frame_width), dtype=np.uint8)
self.terminal_flags = np.empty(self.size, dtype=np.bool_)
# Pre-allocate memory for the states and new_states in a minibatch
self.states = np.empty((self.batch_size, self.agent_history_length,
self.frame_height, self.frame_width), dtype=np.uint8)
self.new_states = np.empty((self.batch_size, self.agent_history_length,
self.frame_height, self.frame_width), dtype=np.uint8)
self.indices = np.empty(self.batch_size, dtype=np.int32)
def add_experience(self, action, frame, reward, terminal):
"""
Args:
action: An integer between 0 and n_actions - 1
determining the action the agent performed
frame: A (84, 84, 1) frame of an Env game in grayscale
reward: A float determining the reward the agent received for performing an action
terminal: A bool stating whether the episode terminated
"""
if frame.shape != (self.frame_height, self.frame_width):
raise ValueError('Dimension of frame is wrong!')
self.actions[self.current] = action
self.frames[self.current, ...] = frame
self.rewards[self.current] = reward
self.terminal_flags[self.current] = terminal
self.count = max(self.count, self.current+1)
self.current = (self.current + 1) % self.size
def _get_state(self, index):
if self.count == 0:
raise ValueError("The replay memory is empty!")
if index < self.agent_history_length - 1:
raise ValueError("Index must be min 3")
return self.frames[index-self.agent_history_length+1:index+1, ...]
def _get_valid_indices(self):
for i in range(self.batch_size):
while True:
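# Resample until the index yields a usable transition: reject indices whose
# history window would wrap around the current write position (self.current)
# or would cross an episode boundary (a terminal frame).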
index = random.randint(self.agent_history_length, self.count - 1)
if index < self.agent_history_length:
continue
if index >= self.current and index - self.agent_history_length <= self.current:
continue
if self.terminal_flags[index - self.agent_history_length:index].any():
continue
break
self.indices[i] = index
def get_minibatch(self):
"""
Returns a minibatch of self.batch_size = 32 transitions
"""
if self.count < self.agent_history_length:
raise ValueError('Not enough memories to get a minibatch')
self._get_valid_indices()
for i, idx in enumerate(self.indices):
self.states[i] = self._get_state(idx - 1)
self.new_states[i] = self._get_state(idx)
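# States are stored as (batch, history, height, width); transpose them to
# (batch, height, width, history) to match the network's input layout.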
return np.transpose(self.states, axes=(0, 2, 3, 1)), self.actions[self.indices], self.rewards[self.indices], np.transpose(self.new_states, axes=(0, 2, 3, 1)), self.terminal_flags[self.indices]
def learn(session, replay_memory, main_dqn, target_dqn, batch_size, gamma):
"""
Args:
session: A tensorflow session object
replay_memory: A ReplayMemory object
main_dqn: A DQN object
target_dqn: A DQN object
batch_size: Integer, Batch size
gamma: Float, discount factor for the Bellman equation
Returns:
loss: The loss of the minibatch, for tensorboard
Draws a minibatch from the replay memory, calculates the
target Q-value that the prediction Q-value is regressed to.
Then a parameter update is performed on the main DQN.
"""
# Draw a minibatch from the replay memory
states, actions, rewards, new_states, terminal_flags = replay_memory.get_minibatch()
# The main network estimates which action is best (in the next
# state s', new_states is passed!)
# for every transition in the minibatch
arg_q_max = session.run(main_dqn.best_action, feed_dict={main_dqn.input:new_states})
# The target network estimates the Q-values (in the next state s', new_states is passed!)
# for every transition in the minibatch
q_vals = session.run(target_dqn.q_values, feed_dict={target_dqn.input:new_states})
double_q = q_vals[range(batch_size), arg_q_max]
# Bellman equation. Multiplication with (1-terminal_flags) makes sure that
# if the game is over, targetQ=rewards
target_q = rewards + (gamma*double_q * (1-terminal_flags))
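# Worked example with illustrative numbers: for a non-terminal transition with
# reward 1.0, gamma 0.99 and double_q 10.0, target_q = 1.0 + 0.99 * 10.0 = 10.9;
# for a terminal transition the (1 - terminal_flags) factor gives target_q = 1.0.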
# Gradient descent step to update the parameters of the main network
loss, _ = session.run([main_dqn.loss, main_dqn.update],
feed_dict={main_dqn.input:states,
main_dqn.target_q:target_q,
main_dqn.action:actions})
return loss
class TargetNetworkUpdater:
"""Copies the parameters of the main DQN to the target DQN"""
def __init__(self, main_dqn_vars, target_dqn_vars):
"""
Args:
main_dqn_vars: A list of tensorflow variables belonging to the main DQN network
target_dqn_vars: A list of tensorflow variables belonging to the target DQN network
"""
self.main_dqn_vars = main_dqn_vars
self.target_dqn_vars = target_dqn_vars
def _update_target_vars(self):
update_ops = []
for i, var in enumerate(self.main_dqn_vars):
copy_op = self.target_dqn_vars[i].assign(var.value())
update_ops.append(copy_op)
return update_ops
def update_networks(self, sess):
"""
Args:
sess: A Tensorflow session object
Assigns the values of the parameters of the main network to the
parameters of the target network
"""
update_ops = self._update_target_vars()
for copy_op in update_ops:
sess.run(copy_op)
def generate_gif(frame_number, frames_for_gif, reward, path):
"""
Args:
frame_number: Integer, determining the number of the current frame
frames_for_gif: A sequence of (84, 84, 3) frames of an Env game in RGB
reward: Integer, Total reward of the episode that is output as a gif
path: String, path where gif is saved
"""
for idx, frame_idx in enumerate(frames_for_gif):
frames_for_gif[idx] = resize(frame_idx, (320, 320, 3),
preserve_range=True, order=0).astype(np.uint8)
imageio.mimsave(f'{path}ATARI_frame_{frame_number}_reward_{reward}.gif',
frames_for_gif, duration=1/30)
def generate_movie(file_name, clip, reward, path):
"""
Args:
file_name: String, name (including path) of the output movie file
clip: A sequence of (84, 84, 3) frames of an Env game
reward: Integer, Total reward of the episode (not used here)
path: String, path where the gif would be saved (not used here)
"""
for idx, frame_idx in enumerate(clip):
clip[idx] = resize(frame_idx, (320, 320, 3), preserve_range=True, order=0).astype(np.uint8)
for idx, frame_idx in enumerate(clip):
clip[idx] = cv2.cvtColor(frame_idx, cv2.COLOR_BGR2RGB)
fps = 20
imageio.mimwrite(file_name, clip, fps=fps)
class Env:
"""Wrapper for the imported environment"""
def __init__(self, no_op_steps=300, agent_history_length=4):
# self.env = gym.make(envName)
self.env = conv_env()
self.frame_processor = ProcessFrame()
self.state = None
self.last_lives = 0
self.no_op_steps = no_op_steps
self.agent_history_length = agent_history_length
def reset(self, sess, evaluation=False):
"""
Args:
sess: A Tensorflow session object
evaluation: A boolean saying whether the agent is evaluating or training
Resets the environment and stacks four frames on top of each other to
create the first state
"""
frame = self.env.reset()
self.last_lives = 0
terminal_life_lost = True # Set to true so that the agent starts
# with a 'FIRE' action when evaluating
if evaluation:
for _ in range(random.randint(1, self.no_op_steps)):
frame, _, _ = self.env.step(1) # Action 'Fire'
processed_frame = self.frame_processor.process(sess, frame) # (★★★)
self.state = np.repeat(processed_frame, self.agent_history_length, axis=2)
return terminal_life_lost
def step(self, sess, action):
"""
Args:
sess: A Tensorflow session object
action: Integer, action the agent performs
Performs an action and observes the reward and terminal state from the environment
"""
new_frame, reward, terminal = self.env.step(action) # (5★)
terminal_life_lost = terminal
self.last_lives = 10
processed_new_frame = self.frame_processor.process(sess, new_frame) # (6★)
new_state = np.append(self.state[:, :, 1:], processed_new_frame, axis=2) # (6★)
self.state = new_state
return processed_new_frame, reward, terminal, terminal_life_lost, new_frame
tf.reset_default_graph()
# Control parameters
MAX_EPISODE_LENGTH = 18000 # Equivalent of 5 minutes of gameplay at 60 frames per second
EVAL_FREQUENCY = 200000 # Number of frames the agent sees between evaluations
EVAL_STEPS = 10000 # Number of frames for one evaluation
NETW_UPDATE_FREQ = 10000 # Number of chosen actions between updating the target network.
# According to Mnih et al. 2015 this is measured in the number of
# parameter updates (every four actions), however, in the
# DeepMind code, it is clearly measured in the number
# of actions the agent chooses
DISCOUNT_FACTOR = 0.99 # gamma in the Bellman equation
REPLAY_MEMORY_START_SIZE = 50000 # Number of completely random actions,
# before the agent starts learning
MAX_FRAMES = 10000000 # Total number of frames the agent sees
MEMORY_SIZE = 1000000 # Number of transitions stored in the replay memory
NO_OP_STEPS = 10 # Number of 'NOOP' or 'FIRE' actions at the beginning of an
# evaluation episode
UPDATE_FREQ = 4 # Every four actions a gradient descent step is performed
HIDDEN = 1024 # Number of filters in the final convolutional layer. The output
# has the shape (1,1,1024) which is split into two streams. Both
# the advantage stream and value stream have the shape
# (1,1,512). This is slightly different from the original
# implementation but tests I did with the environment Pong
# have shown that this way the score increases more quickly
LEARNING_RATE = 0.00001 # Set to 0.00025 in Pong for quicker results.
# Hessel et al. 2017 used 0.0000625
BS = 32 # Batch size
SUMMARIES = "summaries" # logdir for tensorboard
RUNID = 'run_4'
PATH = f'models/{RUNID}/' # checkpoints will be saved here
os.makedirs(PATH, exist_ok=True)
os.makedirs(os.path.join(SUMMARIES, RUNID), exist_ok=True)
SUMM_WRITER = tf.summary.FileWriter(os.path.join(SUMMARIES, RUNID))
sim_env = Env()
# print("The environment has the following {} actions: {}".format(sim_env.env.action_space.n,
# sim_env.env.unwrapped.get_action_meanings()))
# main DQN and target DQN networks:
with tf.variable_scope('mainDQN'):
MAIN_DQN = DQN(sim_env.env.actions, HIDDEN, LEARNING_RATE) # (★★)
with tf.variable_scope('targetDQN'):
TARGET_DQN = DQN(sim_env.env.actions, HIDDEN) # (★★)
init = tf.global_variables_initializer()
saver = tf.train.Saver(save_relative_paths=True)
MAIN_DQN_VARS = tf.trainable_variables(scope='mainDQN')
TARGET_DQN_VARS = tf.trainable_variables(scope='targetDQN')
LAYER_IDS = ["conv1", "conv2", "conv3", "conv4", "denseAdvantage",
"denseAdvantageBias", "denseValue", "denseValueBias"]
# Scalar summaries for tensorboard: loss, average reward and evaluation score
with tf.name_scope('Performance'):
LOSS_PH = tf.placeholder(tf.float32, shape=None, name='loss_summary')
LOSS_SUMMARY = tf.summary.scalar('loss', LOSS_PH)
REWARD_PH = tf.placeholder(tf.float32, shape=None, name='reward_summary')
REWARD_SUMMARY = tf.summary.scalar('reward', REWARD_PH)
EVAL_SCORE_PH = tf.placeholder(tf.float32, shape=None, name='evaluation_summary')
EVAL_SCORE_SUMMARY = tf.summary.scalar('evaluation_score', EVAL_SCORE_PH)
PERFORMANCE_SUMMARIES = tf.summary.merge([LOSS_SUMMARY, REWARD_SUMMARY])
# Histogram summaries for tensorboard: parameters
with tf.name_scope('Parameters'):
ALL_PARAM_SUMMARIES = []
for i, Id in enumerate(LAYER_IDS):
with tf.name_scope('mainDQN/'):
MAIN_DQN_KERNEL = tf.summary.histogram(Id, tf.reshape(MAIN_DQN_VARS[i], shape=[-1]))
ALL_PARAM_SUMMARIES.extend([MAIN_DQN_KERNEL])
PARAM_SUMMARIES = tf.summary.merge(ALL_PARAM_SUMMARIES)
def train():
"""Contains the training and evaluation loops"""
my_replay_memory = ReplayMemory(size=MEMORY_SIZE, batch_size=BS) # (★)
network_updater = TargetNetworkUpdater(MAIN_DQN_VARS, TARGET_DQN_VARS)
action_getter = ActionGetter(sim_env.env.actions,
replay_memory_start_size=REPLAY_MEMORY_START_SIZE,
max_frames=MAX_FRAMES)
with tf.Session() as sess:
sess.run(init)
frame_number = 0
rewards = []
loss_list = []
while frame_number < MAX_FRAMES:
########################
####### Training #######
########################
epoch_frame = 0
while epoch_frame < EVAL_FREQUENCY:
terminal_life_lost = sim_env.reset(sess, evaluation=True)
episode_reward_sum = 0
for _ in range(MAX_EPISODE_LENGTH):
# (4★)
action = action_getter.get_action(sess, frame_number, sim_env.state, MAIN_DQN)
# (5★)
processed_new_frame, reward, terminal, terminal_life_lost, _ = sim_env.step(sess, action)
frame_number += 1
epoch_frame += 1
episode_reward_sum += reward
# (7★) Store transition in the replay memory
my_replay_memory.add_experience(action=action,
frame=processed_new_frame[:, :, 0],
reward=reward,
terminal=terminal_life_lost)
if frame_number % UPDATE_FREQ == 0 and frame_number > REPLAY_MEMORY_START_SIZE:
loss = learn(sess, my_replay_memory, MAIN_DQN, TARGET_DQN,
BS, gamma=DISCOUNT_FACTOR) # (8★)
loss_list.append(loss)
if frame_number % NETW_UPDATE_FREQ == 0 and frame_number > REPLAY_MEMORY_START_SIZE:
network_updater.update_networks(sess) # (9★)
if terminal:
terminal = False
break
rewards.append(episode_reward_sum)
# Output the progress:
if len(rewards) % 10 == 0:
# Scalar summaries for tensorboard
if frame_number > REPLAY_MEMORY_START_SIZE:
summ = sess.run(PERFORMANCE_SUMMARIES,
feed_dict={LOSS_PH:np.mean(loss_list),
REWARD_PH:np.mean(rewards[-100:])})
SUMM_WRITER.add_summary(summ, frame_number)
loss_list = []
# Histogram summaries for tensorboard
summ_param = sess.run(PARAM_SUMMARIES)
SUMM_WRITER.add_summary(summ_param, frame_number)
print(len(rewards), frame_number, np.mean(rewards[-100:]))
with open('rewards.dat', 'a') as reward_file:
print(len(rewards), frame_number,
np.mean(rewards[-100:]), file=reward_file)
########################
###### Evaluation ######
########################
terminal = True
gif = True
frames_for_gif = []
eval_rewards = []
evaluate_frame_number = 0
action = 1
for _ in range(200):
processed_new_frame, reward, terminal, terminal_life_lost, new_frame = sim_env.step(sess, action)
for _ in range(EVAL_STEPS):
if terminal:
terminal_life_lost = sim_env.reset(sess, evaluation=True)
episode_reward_sum = 0
terminal = False
# Fire (action 1), when a life was lost or the game just started,
# so that the agent does not stand around doing nothing. When playing
# with other environments, you might want to change this...
action = 1 if terminal_life_lost else action_getter.get_action(sess, frame_number,
sim_env.state,
MAIN_DQN,
evaluation=True)
processed_new_frame, reward, terminal, terminal_life_lost, new_frame = sim_env.step(sess, action)
evaluate_frame_number += 1
episode_reward_sum += reward
# Only save the first 600 frames of the first run
if evaluate_frame_number > 600 and gif is True:
terminal = True
if gif:
frames_for_gif.append(new_frame)
if terminal:
eval_rewards.append(episode_reward_sum)
gif = False # Save only the first game of the evaluation as a gif
eval_rewards.append(episode_reward_sum)
print("Evaluation score:\n", np.mean(eval_rewards))
os.makedirs(f'./demos/{RUNID}/', exist_ok=True)
try:
generate_movie(f'demos/{RUNID}/{frame_number}.mp4', frames_for_gif, eval_rewards[0], PATH)
except IndexError:
print("No evaluation game finished")
#Save the network parameters
saver.save(sess, PATH+'/my_model', global_step=frame_number)
frames_for_gif = []
# Show the evaluation score in tensorboard
summ = sess.run(EVAL_SCORE_SUMMARY, feed_dict={EVAL_SCORE_PH:np.mean(eval_rewards)})
SUMM_WRITER.add_summary(summ, frame_number)
with open('rewardsEval.dat', 'a') as eval_reward_file:
print(frame_number, np.mean(eval_rewards), file=eval_reward_file)
if TRAIN:
train()
if TEST:
gif_path = "GIF/"
os.makedirs(gif_path,exist_ok=True)
trained_path = f'models/{RUNID}/'
save_file = "my_model-3240000.meta"
action_getter = ActionGetter(sim_env.env.actions,
replay_memory_start_size=REPLAY_MEMORY_START_SIZE,
max_frames=MAX_FRAMES)
with tf.Session() as sess:
saver = tf.train.import_meta_graph(trained_path+save_file)
saver.restore(sess,tf.train.latest_checkpoint(trained_path))
frames_for_gif = []
terminal_live_lost = sim_env.reset(sess, evaluation = True)
episode_reward_sum = 0
action = 1
k = 0
processed_new_frame, reward, terminal, terminal_live_lost, new_frame = sim_env.step(sess, action)
while True:
k += 1
action = action_getter.get_action(sess, 0, sim_env.state, MAIN_DQN, evaluation=True)
processed_new_frame, reward, terminal, terminal_live_lost, new_frame = sim_env.step(sess, action)
episode_reward_sum += reward
frames_for_gif.append(new_frame)
if k > 6000:
break
print("The total reward is {}".format(episode_reward_sum))
print("Creating movie...")
os.makedirs('showcase/',exist_ok=True)
generate_movie(f'showcase/{RUNID}.mp4', frames_for_gif, episode_reward_sum, gif_path)