In [1]:
import os
import random
from collections import deque

import gym
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras.layers import Dense, Flatten
from keras.layers.convolutional import Conv2D
from keras.models import Sequential
from keras.optimizers import RMSprop
from skimage.color import rgb2gray
from skimage.transform import resize

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Display the installed TensorFlow version; the agent below relies on the
# TF 1.x session/placeholder API (tf.InteractiveSession, tf.placeholder).
tf.__version__

'1.13.1'

In [3]:
# Number of episodes the main training loop iterates over.
EPISODES = 50000

In [4]:
class DQNAgent:
    """Deep Q-Network agent with a replay memory and a separate target network.

    The agent owns two identically shaped convolutional Q-networks
    (``model`` and ``target_model``), an epsilon-greedy policy whose epsilon
    is annealed linearly, a bounded replay memory, a hand-built Huber-loss
    training function, and TensorBoard summary plumbing.
    """

    def __init__(self, action_size):
        """Build the networks, the training function, and the TF session.

        Args:
            action_size: number of discrete actions (width of the Q output).
        """
        self.render = False
        self.load_model = False
        # State and action dimensions: 84x84 frames, 4 stacked per state.
        self.state_size = (84, 84, 4)
        self.action_size = action_size
        # DQN hyper-parameters.
        self.epsilon = 1.
        self.epsilon_start, self.epsilon_end = 1.0, 0.1
        self.exploration_steps = 1000000.
        # Amount subtracted from epsilon on every training step.
        self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \
                                  / self.exploration_steps
        self.batch_size = 32
        self.train_start = 50000
        self.update_target_rate = 10000
        self.discount_factor = 0.99
        # Replay memory, capped at 400000 transitions.
        self.memory = deque(maxlen=400000)
        self.no_op_steps = 30
        # Online network and target network (synchronised further below).
        self.model = self.build_model()
        self.target_model = self.build_model()

        # NOTE: this rebinds ``self.optimizer`` from the bound method to the
        # compiled training function it returns; ``train_model`` calls it.
        self.optimizer = self.optimizer()

        # Session shared by Keras and the TensorBoard summary ops.
        self.sess = tf.InteractiveSession()
        K.set_session(self.sess)

        self.avg_q_max, self.avg_loss = 0, 0
        self.summary_placeholders, self.update_ops, self.summary_op = \
            self.setup_summary()
        self.summary_writer = tf.summary.FileWriter(
            'summary/breakout_dqn', self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

        if self.load_model:
            self.model.load_weights("./save_model/breakout_dqn.h5")

        # Synchronise the target network only now: the initializer run above
        # (re)initialises every variable in this session, and load_weights may
        # have replaced the online weights, so an earlier copy would be lost.
        self.update_target_model()

    def optimizer(self):
        """Build the training function that minimises a Huber loss.

        Returns:
            A Keras backend function taking ``[states, actions, targets]`` and
            returning ``[loss]`` while applying one RMSprop update to
            ``self.model``.
        """
        a = K.placeholder(shape=(None,), dtype='int32')
        y = K.placeholder(shape=(None,), dtype='float32')

        prediction = self.model.output

        # Select the Q-value of the action that was actually taken.
        a_one_hot = K.one_hot(a, self.action_size)
        q_value = K.sum(prediction * a_one_hot, axis=1)
        error = K.abs(y - q_value)

        # Huber loss: quadratic for |error| <= 1, linear beyond that.
        quadratic_part = K.clip(error, 0.0, 1.0)
        linear_part = error - quadratic_part
        loss = K.mean(0.5 * K.square(quadratic_part) + linear_part)

        optimizer = RMSprop(lr=0.00025, epsilon=0.01)
        updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
        train = K.function([self.model.input, a, y], [loss], updates=updates)

        return train

    def build_model(self):
        """Create the Q-network: three conv layers, then two dense layers.

        Input has shape ``self.state_size``; the output layer has one linear
        unit per action. The model summary is printed as a side effect.
        """
        model = Sequential()
        model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
                         input_shape=self.state_size))
        model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
        model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(self.action_size))
        model.summary()
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, history):
        """Choose an action with the epsilon-greedy policy.

        Args:
            history: stacked frames with raw pixel values; scaled by 1/255
                before being passed to the network.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # Normalise only on the greedy branch; the random branch never needs it.
        q_value = self.model.predict(np.float32(history / 255.0))
        return np.argmax(q_value[0])

    def append_sample(self, history, action, reward, next_history, dead):
        """Store one transition <s, a, r, s', dead> in the replay memory."""
        self.memory.append((history, action, reward, next_history, dead))

    def train_model(self):
        """Anneal epsilon and run one training step on a random mini-batch.

        Samples ``batch_size`` transitions from the replay memory, builds the
        targets from the target network, applies one optimizer update, and
        adds the resulting loss to ``self.avg_loss``.
        """
        # Linear annealing, clamped so epsilon never drops below its floor.
        if self.epsilon > self.epsilon_end:
            self.epsilon = max(self.epsilon_end,
                               self.epsilon - self.epsilon_decay_step)

        mini_batch = random.sample(self.memory, self.batch_size)

        batch_shape = (self.batch_size, self.state_size[0],
                       self.state_size[1], self.state_size[2])
        history = np.zeros(batch_shape)
        next_history = np.zeros(batch_shape)
        target = np.zeros((self.batch_size,))
        action, reward, dead = [], [], []

        for i, sample in enumerate(mini_batch):
            # Stored frames hold raw pixel values; scale them to [0, 1].
            history[i] = np.float32(sample[0] / 255.)
            next_history[i] = np.float32(sample[3] / 255.)
            action.append(sample[1])
            reward.append(sample[2])
            dead.append(sample[4])

        target_value = self.target_model.predict(next_history)

        for i in range(self.batch_size):
            if dead[i]:
                # No bootstrapping from the next state after a lost life.
                target[i] = reward[i]
            else:
                target[i] = reward[i] + self.discount_factor * \
                    np.amax(target_value[i])

        loss = self.optimizer([history, action, target])
        self.avg_loss += loss[0]

    def setup_summary(self):
        """Create the TensorBoard scalars recorded once per episode.

        Returns:
            ``(summary_placeholders, update_ops, summary_op)``: one float
            placeholder and one assign op per tracked variable (total reward,
            average max Q, duration, average loss), plus the merged summary op.
        """
        episode_total_reward = tf.Variable(0.)
        episode_avg_max_q = tf.Variable(0.)
        episode_duration = tf.Variable(0.)
        episode_avg_loss = tf.Variable(0.)

        tf.summary.scalar('Total Reward/Episode', episode_total_reward)
        tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q)
        tf.summary.scalar('Duration/Episode', episode_duration)
        tf.summary.scalar('Average Loss/Episode', episode_avg_loss)

        summary_vars = [episode_total_reward, episode_avg_max_q,
                        episode_duration, episode_avg_loss]
        summary_placeholders = [tf.placeholder(tf.float32) for _ in
                                range(len(summary_vars))]
        update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in
                      range(len(summary_vars))]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op


In [5]:
def pre_processing(observe):
    """Turn a raw RGB frame into an 84x84 grayscale ``uint8`` image.

    The frame is converted to grayscale, resized to 84x84, rescaled by 255
    and cast to ``uint8``.
    """
    gray_frame = rgb2gray(observe)
    small_frame = resize(gray_frame, (84, 84), mode='constant')
    return np.uint8(small_frame * 255)

In [None]:
if __name__ == "__main__":
    # Create the environment and the DQN agent.
    env = gym.make('BreakoutDeterministic-v4')
    agent = DQNAgent(action_size=3)

    # The first checkpoint is written right after episode 0, so the target
    # directory must exist before training starts.
    os.makedirs("./save_model", exist_ok=True)

    global_step = 0

    for e in range(EPISODES):
        done = False
        dead = False

        step, score, start_life = 0, 0, 5
        observe = env.reset()

        # Take a random number of initial steps with action 1 so that
        # episodes do not all start from the same frame.
        for _ in range(random.randint(1, agent.no_op_steps)):
            observe, _, _, _ = env.step(1)

        # The initial state is the first processed frame repeated four times.
        state = pre_processing(observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 84, 84, 4))

        while not done:
            if agent.render:
                env.render()
            global_step += 1
            step += 1

            # Choose an action from the current stack of four frames.
            action = agent.get_action(history)
            # Map the agent's 3 outputs onto environment actions 1..3
            # (in ALE Breakout: 1 = FIRE, 2 = RIGHT, 3 = LEFT; NOOP is unused).
            if action == 0:
                real_action = 1
            elif action == 1:
                real_action = 2
            else:
                real_action = 3

            # Advance the environment by one time step.
            observe, reward, done, info = env.step(real_action)
            # Pre-process the new frame and push it to the front of the
            # stack, dropping the oldest frame (last channel).
            next_state = pre_processing(observe)
            next_state = np.reshape([next_state], (1, 84, 84, 1))
            next_history = np.append(next_state, history[:, :, :, :3], axis=3)

            # Accumulate the max predicted Q-value for the episode statistics.
            agent.avg_q_max += np.amax(
                agent.model.predict(np.float32(history / 255.))[0])

            # A drop in the remaining-lives counter marks this step as a death.
            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']

            reward = np.clip(reward, -1., 1.)
            # Store the transition <s, a, r, s'> and train once enough
            # samples have been collected.
            agent.append_sample(history, action, reward, next_history, dead)

            if len(agent.memory) >= agent.train_start:
                agent.train_model()

            # Periodically copy the online weights into the target network.
            if global_step % agent.update_target_rate == 0:
                agent.update_target_model()

            score += reward

            # After a lost life the current frame stack is kept for the next
            # step and the flag is cleared; otherwise advance to the new stack.
            if dead:
                dead = False
            else:
                history = next_history

            if done:
                # Record per-episode statistics once training has started.
                if global_step > agent.train_start:
                    stats = [score, agent.avg_q_max / float(step), step,
                             agent.avg_loss / float(step)]
                    for i in range(len(stats)):
                        agent.sess.run(agent.update_ops[i], feed_dict={
                            agent.summary_placeholders[i]: float(stats[i])
                        })
                    summary_str = agent.sess.run(agent.summary_op)
                    agent.summary_writer.add_summary(summary_str, e + 1)

                print("episode:", e, "  score:", score, "  memory length:",
                      len(agent.memory), "  epsilon:", agent.epsilon,
                      "  global_step:", global_step, "  average_q:",
                      agent.avg_q_max / float(step), "  average loss:",
                      agent.avg_loss / float(step))

                agent.avg_q_max, agent.avg_loss = 0, 0

        # Save the model weights every 1000 episodes (including episode 0).
        if e % 1000 == 0:
            agent.model.save_weights("./save_model/breakout_dqn.h5")

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 20, 20, 32)        8224      
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________________________________
flatten_3 (Flatten)          (None, 3136)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 512)               1606144   
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 1539      
Total params: 1,685,667
Trainable params: 1,685,667
Non-trainable params: 0
____________________________________________



episode: 0   score: 2.0   memory length: 187   epsilon: 1.0   global_step: 187   average_q: 0.055928201758287804   average loss: 0.0
episode: 1   score: 0.0   memory length: 324   epsilon: 1.0   global_step: 324   average_q: 0.05418188086826436   average loss: 0.0
episode: 2   score: 1.0   memory length: 456   epsilon: 1.0   global_step: 456   average_q: 0.05448061377374512   average loss: 0.0
episode: 3   score: 0.0   memory length: 560   epsilon: 1.0   global_step: 560   average_q: 0.054643740268567435   average loss: 0.0
episode: 4   score: 3.0   memory length: 806   epsilon: 1.0   global_step: 806   average_q: 0.05326916284980328   average loss: 0.0
episode: 5   score: 2.0   memory length: 992   epsilon: 1.0   global_step: 992   average_q: 0.053562487004905616   average loss: 0.0
episode: 6   score: 3.0   memory length: 1251   epsilon: 1.0   global_step: 1251   average_q: 0.0540279652419928   average loss: 0.0
episode: 7   score: 0.0   memory length: 1356   epsilon: 1.0   global_st

episode: 61   score: 0.0   memory length: 10190   epsilon: 1.0   global_step: 10190   average_q: 0.05450849422942037   average loss: 0.0
episode: 62   score: 1.0   memory length: 10346   epsilon: 1.0   global_step: 10346   average_q: 0.05396023337753155   average loss: 0.0
episode: 63   score: 0.0   memory length: 10468   epsilon: 1.0   global_step: 10468   average_q: 0.054362713985267236   average loss: 0.0
episode: 64   score: 0.0   memory length: 10582   epsilon: 1.0   global_step: 10582   average_q: 0.05441282408540709   average loss: 0.0
episode: 65   score: 0.0   memory length: 10677   epsilon: 1.0   global_step: 10677   average_q: 0.05412467890664151   average loss: 0.0
episode: 66   score: 0.0   memory length: 10789   epsilon: 1.0   global_step: 10789   average_q: 0.054213654748829346   average loss: 0.0
episode: 67   score: 0.0   memory length: 10916   epsilon: 1.0   global_step: 10916   average_q: 0.054336711647003655   average loss: 0.0
episode: 68   score: 3.0   memory leng

episode: 121   score: 0.0   memory length: 19256   epsilon: 1.0   global_step: 19256   average_q: 0.05443866563037159   average loss: 0.0
episode: 122   score: 1.0   memory length: 19394   epsilon: 1.0   global_step: 19394   average_q: 0.05500205509040667   average loss: 0.0
episode: 123   score: 3.0   memory length: 19636   epsilon: 1.0   global_step: 19636   average_q: 0.05353745882791921   average loss: 0.0
episode: 124   score: 2.0   memory length: 19839   epsilon: 1.0   global_step: 19839   average_q: 0.05296144643660836   average loss: 0.0
episode: 125   score: 0.0   memory length: 19967   epsilon: 1.0   global_step: 19967   average_q: 0.05426781336427666   average loss: 0.0
episode: 126   score: 0.0   memory length: 20072   epsilon: 1.0   global_step: 20072   average_q: 0.05437924567432631   average loss: 0.0
episode: 127   score: 0.0   memory length: 20215   epsilon: 1.0   global_step: 20215   average_q: 0.05431485988877036   average loss: 0.0
episode: 128   score: 1.0   memory

episode: 181   score: 2.0   memory length: 27959   epsilon: 1.0   global_step: 27959   average_q: 0.05457256240658013   average loss: 0.0
episode: 182   score: 2.0   memory length: 28145   epsilon: 1.0   global_step: 28145   average_q: 0.05367715256188505   average loss: 0.0
episode: 183   score: 4.0   memory length: 28426   epsilon: 1.0   global_step: 28426   average_q: 0.05646209877474877   average loss: 0.0
episode: 184   score: 0.0   memory length: 28531   epsilon: 1.0   global_step: 28531   average_q: 0.05450075456783885   average loss: 0.0
episode: 185   score: 1.0   memory length: 28690   epsilon: 1.0   global_step: 28690   average_q: 0.05377074424763146   average loss: 0.0
episode: 186   score: 1.0   memory length: 28840   epsilon: 1.0   global_step: 28840   average_q: 0.05340245594580968   average loss: 0.0
episode: 187   score: 0.0   memory length: 28936   epsilon: 1.0   global_step: 28936   average_q: 0.05470425014694532   average loss: 0.0
episode: 188   score: 4.0   memory

episode: 241   score: 3.0   memory length: 37983   epsilon: 1.0   global_step: 37983   average_q: 0.05352748327525744   average loss: 0.0
episode: 242   score: 0.0   memory length: 38110   epsilon: 1.0   global_step: 38110   average_q: 0.05450245391900145   average loss: 0.0
episode: 243   score: 2.0   memory length: 38325   epsilon: 1.0   global_step: 38325   average_q: 0.05506754462109056   average loss: 0.0
episode: 244   score: 0.0   memory length: 38428   epsilon: 1.0   global_step: 38428   average_q: 0.05436151673637547   average loss: 0.0
episode: 245   score: 1.0   memory length: 38581   epsilon: 1.0   global_step: 38581   average_q: 0.05445251102541007   average loss: 0.0
episode: 246   score: 4.0   memory length: 38857   epsilon: 1.0   global_step: 38857   average_q: 0.05506368399854156   average loss: 0.0
episode: 247   score: 3.0   memory length: 39089   epsilon: 1.0   global_step: 39089   average_q: 0.05438977255132692   average loss: 0.0
episode: 248   score: 1.0   memory

episode: 301   score: 1.0   memory length: 47887   epsilon: 1.0   global_step: 47887   average_q: 0.05374687315513756   average loss: 0.0
episode: 302   score: 0.0   memory length: 47995   epsilon: 1.0   global_step: 47995   average_q: 0.05456676937777687   average loss: 0.0
episode: 303   score: 0.0   memory length: 48100   epsilon: 1.0   global_step: 48100   average_q: 0.0544438919141179   average loss: 0.0
episode: 304   score: 0.0   memory length: 48229   epsilon: 1.0   global_step: 48229   average_q: 0.054283129094645034   average loss: 0.0
episode: 305   score: 1.0   memory length: 48393   epsilon: 1.0   global_step: 48393   average_q: 0.0544323759244346   average loss: 0.0
episode: 306   score: 0.0   memory length: 48501   epsilon: 1.0   global_step: 48501   average_q: 0.05455646301723189   average loss: 0.0
episode: 307   score: 2.0   memory length: 48701   epsilon: 1.0   global_step: 48701   average_q: 0.05375468919053674   average loss: 0.0
episode: 308   score: 1.0   memory 

episode: 353   score: 0.0   memory length: 56159   epsilon: 0.9944560000001825   global_step: 56159   average_q: 1.25528757331463   average loss: 0.2943397562671285
episode: 354   score: 0.0   memory length: 56282   epsilon: 0.9943453000001862   global_step: 56282   average_q: 1.0768641829490662   average loss: 0.35229788450640154
episode: 355   score: 0.0   memory length: 56394   epsilon: 0.9942445000001895   global_step: 56394   average_q: 1.1477046800511224   average loss: 0.32659680172296895
episode: 356   score: 1.0   memory length: 56557   epsilon: 0.9940978000001943   global_step: 56557   average_q: 1.1686802879432958   average loss: 0.3593454441621484
episode: 357   score: 0.0   memory length: 56676   epsilon: 0.9939907000001978   global_step: 56676   average_q: 1.085946948087516   average loss: 0.3350365368552743
episode: 358   score: 1.0   memory length: 56825   epsilon: 0.9938566000002023   global_step: 56825   average_q: 1.0652496810727472   average loss: 0.3437883355716853

episode: 403   score: 1.0   memory length: 63822   epsilon: 0.9875593000004096   global_step: 63822   average_q: 1.0553265957832336   average loss: 0.3703358793479456
episode: 404   score: 3.0   memory length: 64058   epsilon: 0.9873469000004166   global_step: 64058   average_q: 1.0436009406033209   average loss: 0.32877289841592766
episode: 405   score: 1.0   memory length: 64211   epsilon: 0.9872092000004211   global_step: 64211   average_q: 0.932579706696903   average loss: 0.3502327280542399
episode: 406   score: 2.0   memory length: 64386   epsilon: 0.9870517000004263   global_step: 64386   average_q: 1.1065963544164386   average loss: 0.32055959457059346
episode: 407   score: 0.0   memory length: 64495   epsilon: 0.9869536000004295   global_step: 64495   average_q: 0.9753270865580358   average loss: 0.3195186687813692
episode: 408   score: 1.0   memory length: 64663   epsilon: 0.9868024000004345   global_step: 64663   average_q: 1.1273219344161807   average loss: 0.33194523780294

episode: 453   score: 1.0   memory length: 71768   epsilon: 0.980407900000645   global_step: 71768   average_q: 1.3177009042766359   average loss: 0.31233734174956457
episode: 454   score: 1.0   memory length: 71931   epsilon: 0.9802612000006499   global_step: 71931   average_q: 1.1127909425577502   average loss: 0.3523158963189473
episode: 455   score: 3.0   memory length: 72145   epsilon: 0.9800686000006562   global_step: 72145   average_q: 0.9694431958911575   average loss: 0.33660342557721984
episode: 456   score: 0.0   memory length: 72275   epsilon: 0.9799516000006601   global_step: 72275   average_q: 1.1145275152646579   average loss: 0.3171461684647511
episode: 457   score: 2.0   memory length: 72513   epsilon: 0.9797374000006671   global_step: 72513   average_q: 0.9168947073591858   average loss: 0.3429583695450819
episode: 458   score: 2.0   memory length: 72713   epsilon: 0.979557400000673   global_step: 72713   average_q: 1.0106545153260231   average loss: 0.346903985411522

episode: 503   score: 2.0   memory length: 79886   epsilon: 0.9731017000008856   global_step: 79886   average_q: 0.9351798140484354   average loss: 0.3742850733203678
episode: 504   score: 1.0   memory length: 80058   epsilon: 0.9729469000008907   global_step: 80058   average_q: 1.0533598973307499   average loss: 0.33293158933581657
episode: 505   score: 1.0   memory length: 80219   epsilon: 0.9728020000008955   global_step: 80219   average_q: 1.1332029940919106   average loss: 0.33523288299307336
episode: 506   score: 0.0   memory length: 80336   epsilon: 0.9726967000008989   global_step: 80336   average_q: 1.0993367586380396   average loss: 0.3406309032158252
episode: 507   score: 0.0   memory length: 80451   epsilon: 0.9725932000009023   global_step: 80451   average_q: 0.9397517105807429   average loss: 0.34592469370523443
episode: 508   score: 2.0   memory length: 80640   epsilon: 0.9724231000009079   global_step: 80640   average_q: 1.1367104772537473   average loss: 0.309177827229

episode: 553   score: 1.0   memory length: 88261   epsilon: 0.9655642000011337   global_step: 88261   average_q: 0.9479818139990716   average loss: 0.3288322331702509
episode: 554   score: 1.0   memory length: 88424   epsilon: 0.9654175000011386   global_step: 88424   average_q: 0.9901186458172242   average loss: 0.3564374738978396
episode: 555   score: 0.0   memory length: 88542   epsilon: 0.9653113000011421   global_step: 88542   average_q: 1.0397510366924738   average loss: 0.2895920877346067
episode: 556   score: 1.0   memory length: 88684   epsilon: 0.9651835000011463   global_step: 88684   average_q: 1.0617747227071037   average loss: 0.3425052356122821
episode: 557   score: 2.0   memory length: 88880   epsilon: 0.9650071000011521   global_step: 88880   average_q: 1.0765321564917663   average loss: 0.31119772298112025
episode: 558   score: 1.0   memory length: 89023   epsilon: 0.9648784000011563   global_step: 89023   average_q: 0.9626508455176453   average loss: 0.32460173352984

episode: 603   score: 3.0   memory length: 95977   epsilon: 0.9586198000013624   global_step: 95977   average_q: 1.0889765978371124   average loss: 0.34854109662392185
episode: 604   score: 1.0   memory length: 96121   epsilon: 0.9584902000013666   global_step: 96121   average_q: 1.071459776825375   average loss: 0.3446814049028646
episode: 605   score: 3.0   memory length: 96346   epsilon: 0.9582877000013733   global_step: 96346   average_q: 1.0742094575034247   average loss: 0.3179370602737524
episode: 606   score: 1.0   memory length: 96479   epsilon: 0.9581680000013773   global_step: 96479   average_q: 1.1518287658691406   average loss: 0.3569599032921531
episode: 607   score: 0.0   memory length: 96607   epsilon: 0.958052800001381   global_step: 96607   average_q: 1.122153892647475   average loss: 0.33790365991239923
episode: 608   score: 2.0   memory length: 96800   epsilon: 0.9578791000013868   global_step: 96800   average_q: 1.0761034692507334   average loss: 0.2959597035092126

episode: 652   score: 1.0   memory length: 103800   epsilon: 0.9515791000015942   global_step: 103800   average_q: 0.9529510800902908   average loss: 0.3602550102431623
episode: 653   score: 0.0   memory length: 103926   epsilon: 0.9514657000015979   global_step: 103926   average_q: 0.9056249989403619   average loss: 0.37384620046387207
episode: 654   score: 0.0   memory length: 104043   epsilon: 0.9513604000016014   global_step: 104043   average_q: 0.9505304149073414   average loss: 0.3084030988667677
episode: 655   score: 0.0   memory length: 104144   epsilon: 0.9512695000016044   global_step: 104144   average_q: 1.0399456024169922   average loss: 0.34086388698663783
episode: 656   score: 2.0   memory length: 104337   epsilon: 0.9510958000016101   global_step: 104337   average_q: 1.0958186591845103   average loss: 0.30515579638666057
episode: 657   score: 4.0   memory length: 104631   epsilon: 0.9508312000016188   global_step: 104631   average_q: 1.1389627367460808   average loss: 0.

episode: 701   score: 2.0   memory length: 111695   epsilon: 0.9444736000018281   global_step: 111695   average_q: 0.9391219548734964   average loss: 0.351072381667486
episode: 702   score: 0.0   memory length: 111811   epsilon: 0.9443692000018316   global_step: 111811   average_q: 1.030829447610625   average loss: 0.3142164689620194
episode: 703   score: 1.0   memory length: 111979   epsilon: 0.9442180000018365   global_step: 111979   average_q: 1.0507152080535889   average loss: 0.3491724040547681
episode: 704   score: 1.0   memory length: 112115   epsilon: 0.9440956000018406   global_step: 112115   average_q: 1.082830835791195   average loss: 0.38548730562642425
episode: 705   score: 0.0   memory length: 112229   epsilon: 0.9439930000018439   global_step: 112229   average_q: 1.0709246261078014   average loss: 0.31848354187927563
episode: 706   score: 0.0   memory length: 112347   epsilon: 0.9438868000018474   global_step: 112347   average_q: 1.1177681330907143   average loss: 0.3515

episode: 750   score: 0.0   memory length: 119311   epsilon: 0.9376192000020538   global_step: 119311   average_q: 1.1290677371232405   average loss: 0.3639036422794996
episode: 751   score: 4.0   memory length: 119591   epsilon: 0.9373672000020621   global_step: 119591   average_q: 1.0125963134425027   average loss: 0.34190809589535903
episode: 752   score: 0.0   memory length: 119715   epsilon: 0.9372556000020658   global_step: 119715   average_q: 1.0858602350757969   average loss: 0.3084455179374022
episode: 753   score: 0.0   memory length: 119830   epsilon: 0.9371521000020692   global_step: 119830   average_q: 1.0499923016714012   average loss: 0.36506470274117697
episode: 754   score: 0.0   memory length: 119946   epsilon: 0.9370477000020726   global_step: 119946   average_q: 1.023375647335217   average loss: 0.35794170235774325
episode: 755   score: 1.0   memory length: 120095   epsilon: 0.936913600002077   global_step: 120095   average_q: 1.0265170247762796   average loss: 0.29

episode: 799   score: 0.0   memory length: 127606   epsilon: 0.9301537000022996   global_step: 127606   average_q: 1.0943009531497956   average loss: 0.29735458097800804
episode: 800   score: 1.0   memory length: 127731   epsilon: 0.9300412000023033   global_step: 127731   average_q: 1.0818850078582765   average loss: 0.3159372687723926
episode: 801   score: 1.0   memory length: 127889   epsilon: 0.929899000002308   global_step: 127889   average_q: 1.07094942127602   average loss: 0.3517275062595167
episode: 802   score: 0.0   memory length: 128015   epsilon: 0.9297856000023117   global_step: 128015   average_q: 1.0448312759399414   average loss: 0.32868636035340565
episode: 803   score: 1.0   memory length: 128158   epsilon: 0.9296569000023159   global_step: 128158   average_q: 1.014678708323232   average loss: 0.29247156331591456
episode: 804   score: 1.0   memory length: 128291   epsilon: 0.9295372000023199   global_step: 128291   average_q: 0.9847404831334164   average loss: 0.3921

episode: 848   score: 1.0   memory length: 134777   epsilon: 0.9236998000025121   global_step: 134777   average_q: 1.1121286948521931   average loss: 0.34088601886491776
episode: 849   score: 3.0   memory length: 134993   epsilon: 0.9235054000025185   global_step: 134993   average_q: 1.259709201477192   average loss: 0.3471845905059362
episode: 850   score: 1.0   memory length: 135140   epsilon: 0.9233731000025228   global_step: 135140   average_q: 1.0623972752467306   average loss: 0.2982079734128321
episode: 851   score: 1.0   memory length: 135276   epsilon: 0.9232507000025268   global_step: 135276   average_q: 0.9871305415735525   average loss: 0.31496300823298096
episode: 852   score: 0.0   memory length: 135371   epsilon: 0.9231652000025297   global_step: 135371   average_q: 0.825004528070751   average loss: 0.34914797570458367
episode: 853   score: 0.0   memory length: 135503   epsilon: 0.9230464000025336   global_step: 135503   average_q: 1.022575035239711   average loss: 0.348

episode: 897   score: 1.0   memory length: 142944   epsilon: 0.9163495000027541   global_step: 142944   average_q: 1.0944313441112543   average loss: 0.3268773453871803
episode: 898   score: 0.0   memory length: 143057   epsilon: 0.9162478000027574   global_step: 143057   average_q: 1.2219734044201607   average loss: 0.3550346893792516
episode: 899   score: 0.0   memory length: 143175   epsilon: 0.9161416000027609   global_step: 143175   average_q: 1.0938844781810955   average loss: 0.35686821566883536
episode: 900   score: 2.0   memory length: 143373   epsilon: 0.9159634000027668   global_step: 143373   average_q: 1.0280481023017807   average loss: 0.34498556174466255
episode: 901   score: 1.0   memory length: 143540   epsilon: 0.9158131000027717   global_step: 143540   average_q: 1.0947447743958343   average loss: 0.3663022770516885
episode: 902   score: 0.0   memory length: 143661   epsilon: 0.9157042000027753   global_step: 143661   average_q: 1.0544086783385473   average loss: 0.2

episode: 946   score: 1.0   memory length: 150886   epsilon: 0.9092017000029894   global_step: 150886   average_q: 0.9737171500351778   average loss: 0.3363841881067708
episode: 947   score: 0.0   memory length: 151011   epsilon: 0.9090892000029931   global_step: 151011   average_q: 1.0555195136070252   average loss: 0.3666384541380685
episode: 948   score: 0.0   memory length: 151115   epsilon: 0.9089956000029962   global_step: 151115   average_q: 1.0231933880310793   average loss: 0.36280888541202516
episode: 949   score: 0.0   memory length: 151240   epsilon: 0.9088831000029999   global_step: 151240   average_q: 1.0786598796844482   average loss: 0.32363066024352155
episode: 950   score: 2.0   memory length: 151424   epsilon: 0.9087175000030053   global_step: 151424   average_q: 1.0949884400419567   average loss: 0.3414988365534292
episode: 951   score: 2.0   memory length: 151605   epsilon: 0.9085546000030107   global_step: 151605   average_q: 1.0492415849675132   average loss: 0.3

episode: 995   score: 2.0   memory length: 158899   epsilon: 0.9019900000032268   global_step: 158899   average_q: 0.959354168304833   average loss: 0.3346544887150543
episode: 996   score: 0.0   memory length: 159024   epsilon: 0.9018775000032305   global_step: 159024   average_q: 0.9499951481819153   average loss: 0.31105369544580025
episode: 997   score: 0.0   memory length: 159134   epsilon: 0.9017785000032338   global_step: 159134   average_q: 0.9416741132736206   average loss: 0.326205951446371
episode: 998   score: 1.0   memory length: 159275   epsilon: 0.901651600003238   global_step: 159275   average_q: 1.0142769707855603   average loss: 0.3623634459796475
episode: 999   score: 3.0   memory length: 159498   epsilon: 0.9014509000032446   global_step: 159498   average_q: 1.0407296253426728   average loss: 0.3230213720447693
episode: 1000   score: 0.0   memory length: 159606   epsilon: 0.9013537000032478   global_step: 159606   average_q: 1.022337054764783   average loss: 0.34143

episode: 1044   score: 2.0   memory length: 166746   epsilon: 0.8949277000034593   global_step: 166746   average_q: 1.0451815567206388   average loss: 0.3329703099366836
episode: 1045   score: 1.0   memory length: 166907   epsilon: 0.8947828000034641   global_step: 166907   average_q: 1.106299769063914   average loss: 0.32104303532227424
episode: 1046   score: 3.0   memory length: 167121   epsilon: 0.8945902000034704   global_step: 167121   average_q: 1.1540144670789487   average loss: 0.3327010553520582
episode: 1047   score: 0.0   memory length: 167246   epsilon: 0.8944777000034742   global_step: 167246   average_q: 1.1409078512191773   average loss: 0.3255572577842486
episode: 1048   score: 1.0   memory length: 167398   epsilon: 0.8943409000034787   global_step: 167398   average_q: 1.0267618267159713   average loss: 0.3571033328386387
episode: 1049   score: 2.0   memory length: 167583   epsilon: 0.8941744000034841   global_step: 167583   average_q: 1.0318943258878348   average loss:

episode: 1093   score: 1.0   memory length: 174373   epsilon: 0.8880634000036853   global_step: 174373   average_q: 1.0992108552079451   average loss: 0.3537842723968173
episode: 1094   score: 0.0   memory length: 174496   epsilon: 0.887952700003689   global_step: 174496   average_q: 1.0330001929911172   average loss: 0.34055840287551403
episode: 1095   score: 0.0   memory length: 174621   epsilon: 0.8878402000036927   global_step: 174621   average_q: 1.034287308692932   average loss: 0.33064608132155954
episode: 1096   score: 0.0   memory length: 174728   epsilon: 0.8877439000036959   global_step: 174728   average_q: 1.0356276024167783   average loss: 0.3322440319468017
episode: 1097   score: 4.0   memory length: 174966   epsilon: 0.8875297000037029   global_step: 174966   average_q: 0.9849931835627356   average loss: 0.32270109755406146
episode: 1098   score: 1.0   memory length: 175149   epsilon: 0.8873650000037083   global_step: 175149   average_q: 0.9871932261628531   average loss

episode: 1142   score: 3.0   memory length: 182910   epsilon: 0.8803801000039383   global_step: 182910   average_q: 1.0625706432167619   average loss: 0.3337345204964831
episode: 1143   score: 2.0   memory length: 183092   epsilon: 0.8802163000039437   global_step: 183092   average_q: 1.0615934906425057   average loss: 0.32924670694427066
episode: 1144   score: 0.0   memory length: 183212   epsilon: 0.8801083000039472   global_step: 183212   average_q: 1.067831673224767   average loss: 0.3234586868038605
episode: 1145   score: 0.0   memory length: 183321   epsilon: 0.8800102000039505   global_step: 183321   average_q: 1.0663022787199108   average loss: 0.33441121357109277
episode: 1146   score: 0.0   memory length: 183427   epsilon: 0.8799148000039536   global_step: 183427   average_q: 1.0312047443299923   average loss: 0.32172698668676636
episode: 1147   score: 1.0   memory length: 183557   epsilon: 0.8797978000039575   global_step: 183557   average_q: 1.004210870541059   average loss

episode: 1191   score: 0.0   memory length: 190071   epsilon: 0.8739352000041505   global_step: 190071   average_q: 1.1848138322924624   average loss: 0.3778725874110005
episode: 1192   score: 1.0   memory length: 190244   epsilon: 0.8737795000041556   global_step: 190244   average_q: 1.1838510036468506   average loss: 0.33948192164126534
episode: 1193   score: 0.0   memory length: 190345   epsilon: 0.8736886000041586   global_step: 190345   average_q: 1.1166451638287838   average loss: 0.327459192067199
episode: 1194   score: 2.0   memory length: 190525   epsilon: 0.8735266000041639   global_step: 190525   average_q: 1.0896991087330712   average loss: 0.3462124035258999
episode: 1195   score: 2.0   memory length: 190702   epsilon: 0.8733673000041692   global_step: 190702   average_q: 1.1644537388268164   average loss: 0.3190908431582611
episode: 1196   score: 0.0   memory length: 190801   epsilon: 0.8732782000041721   global_step: 190801   average_q: 1.1732455118738039   average loss:

episode: 1240   score: 0.0   memory length: 197997   epsilon: 0.8668018000043853   global_step: 197997   average_q: 0.9982359532047721   average loss: 0.3598167362525003
episode: 1241   score: 0.0   memory length: 198104   epsilon: 0.8667055000043885   global_step: 198104   average_q: 1.000250659256338   average loss: 0.31239511592575053
episode: 1242   score: 2.0   memory length: 198272   epsilon: 0.8665543000043935   global_step: 198272   average_q: 0.9994468014864695   average loss: 0.3396010515452816
episode: 1243   score: 1.0   memory length: 198446   epsilon: 0.8663977000043986   global_step: 198446   average_q: 0.9447036090938524   average loss: 0.3277892053568697
episode: 1244   score: 6.0   memory length: 198780   epsilon: 0.8660971000044085   global_step: 198780   average_q: 0.9924375664688156   average loss: 0.3352926848147302
episode: 1245   score: 0.0   memory length: 198880   epsilon: 0.8660071000044115   global_step: 198880   average_q: 1.0172085654735565   average loss: