# DQN on Breakout

Implemented with reference to Denny Britz's DQN implementation, which can be found at
https://github.com/dennybritz/reinforcement-learning/blob/master/DQN/Deep%20Q%20Learning.ipynb

In [1]:
import tensorflow as tf
import numpy as np
import gym
from gym.wrappers import Monitor
import itertools
import os

In [2]:
class ReplayBuffer:
    """
    A fixed-capacity replay buffer for implementing experience replay.

    Transitions are kept in a plain list that is used as a ring buffer: once
    the buffer is full, each new transition overwrites the oldest one, so
    eviction is FIFO while both insertion and random access stay O(1).
    """
    def __init__(self, size):
        """
        Args:
            size: the size of the replay buffer, items will be evicted in a FIFO manner

        Raises:
            ValueError: if size is smaller than 1
        """
        if size < 1:
            raise ValueError("size must be a positive integer, got {}".format(size))
        self.size = size
        self.buffer = []
        # slot that the next transition will be written to once the buffer is full
        self._write_idx = 0

    def add(self, state, action, reward, next_state, is_terminal):
        """
        Args:
            state
            action
            reward
            next_state
            is_terminal

        Store experience into replay buffer, evicting old experience if buffer is full
        """
        transition = [state, action, reward, next_state, is_terminal]
        if len(self.buffer) < self.size:
            self.buffer.append(transition)
        else:
            # overwrite the oldest entry instead of shifting the whole list
            self.buffer[self._write_idx] = transition
        self._write_idx = (self._write_idx + 1) % self.size

    def sample(self, no_of_samples):
        """
        Args:
            no_of_samples: number of samples desired (an integer)

        Return:
            an object array of shape [no_of_samples, 5]; each row is
            [state, action, reward, next_state, is_terminal]

        Raises:
            ValueError: if the buffer is empty

        Samples uniformly, with replacement, from the replay buffer
        """
        if not self.buffer:
            raise ValueError("cannot sample from an empty replay buffer")
        idx = np.random.choice(len(self.buffer), no_of_samples)
        # Only the drawn transitions are gathered; the buffer itself is never
        # converted to an array.
        batch = np.empty((len(idx), 5), dtype=object)
        for row, buffer_idx in enumerate(idx):
            for col, item in enumerate(self.buffer[buffer_idx]):
                batch[row, col] = item
        return batch

In [3]:
# from https://github.com/dennybritz/reinforcement-learning/blob/master/DQN/Deep%20Q%20Learning.ipynb
class StateProcessor():
    """
    Preprocesses a raw Atari frame: grayscale conversion, cropping, then
    down-scaling to 84x84.
    """
    def __init__(self):
        # Assemble the preprocessing ops once, at construction time
        with tf.variable_scope("state_processor"):
            self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)
            gray = tf.image.rgb_to_grayscale(self.input_state)
            # keep the 160x160 window that starts 34 rows from the top
            cropped = tf.image.crop_to_bounding_box(gray, 34, 0, 160, 160)
            resized = tf.image.resize_images(
                cropped, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            # drop the singleton channel dimension
            self.output = tf.squeeze(resized)

    def process(self, sess, state):
        """
        Args:
            sess: A Tensorflow session object
            state: A [210, 160, 3] Atari RGB State

        Returns:
            A processed [84, 84] state representing grayscale values
            (the channel dimension is squeezed out).
        """
        feed = { self.input_state: state }
        return sess.run(self.output, feed)

In [4]:
def weights_init(shape):
    """
    Get the variable named 'W' in the current variable scope, initialised
    from a truncated normal (mean 0, stddev 0.1) of the given shape.
    """
    initial_value = tf.truncated_normal(shape, mean=0, stddev=0.1)
    return tf.get_variable('W', initializer=initial_value)

def bias_init(shape):
    """
    Get the variable named 'b' in the current variable scope, initialised
    to a constant 0.1 of the given shape.
    """
    initial_value = tf.constant(0.1, shape=shape)
    return tf.get_variable('b', initializer=initial_value)

In [5]:
class Qnetwork:
    """
    Convolutional Q-value network (three conv layers followed by two fully
    connected layers) together with its loss, train op and Tensorboard
    summaries. Used for both the estimator and the target network.
    """
    def __init__(self, frame_size, no_of_frame, no_of_actions, global_step, scope='Estimator', summaries_dir=None):
        """
        Args:
            frame_size: width and height of a single frame
            no_of_frame: number of frames stacked
            no_of_actions: number of actions (i.e output neurons), this varies from game to game
            global_step: step-counter variable; it is passed to the optimizer's
                minimize() and fetched in update() to index summaries
            scope: name of scope. Used to distinguish target and estimator network
            summaries_dir: if given, Tensorboard summaries are written to
                <summaries_dir>/summaries_<scope>; otherwise no writer is created
        """
        self.scope = scope
        self.summary_writer = None
        self.step = 0
        # Keep the step tensor on the instance so update() does not depend on
        # a notebook-level global of the same name.
        self.global_step = global_step
        with tf.variable_scope(scope):
            # build summary writer
            if summaries_dir:
                summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
                if not os.path.exists(summary_dir):
                    os.makedirs(summary_dir)
                self.summary_writer = tf.summary.FileWriter(summary_dir)
            
            
            self.X = tf.placeholder(tf.uint8, shape=[None, frame_size, frame_size, no_of_frame], name='X')
            self.targets = tf.placeholder(tf.float32, shape=[None], name='targets')
            self.selected_actions = tf.placeholder(tf.int32, shape=[None], name='actions')
            # scale raw pixel values into [0, 1]
            X = tf.to_float(self.X) / 255.0
            batch_size = tf.shape(self.X)[0]
            
            with tf.variable_scope('conv1'):
                W1 = weights_init([8,8, no_of_frame, 32])
                b1 = bias_init([32])
                conv1 = tf.nn.relu(tf.nn.conv2d(X, W1, strides=[1,4,4,1], padding='VALID') + b1)
            with tf.variable_scope('conv2'):
                W2 = weights_init([4,4, 32, 64])
                b2 = bias_init([64])
                conv2 = tf.nn.relu(tf.nn.conv2d(conv1, W2, strides=[1,2,2,1], padding='VALID') + b2)
            with tf.variable_scope('conv3'):
                W3 = weights_init([3,3, 64, 64])
                b3 = bias_init([64])
                conv3 = tf.nn.relu(tf.nn.conv2d(conv2, W3, strides=[1,1,1,1], padding='VALID') + b3)
                # spatial width after the three VALID convolutions above
                final_conv_width = (((frame_size - 8)//4 + 1 - 4)//2 + 1 - 3) + 1
            with tf.variable_scope('fc4'):
                W4 = weights_init([final_conv_width**2 * 64, 512])
                b4 = bias_init([512])
                flattened = tf.reshape(conv3, [-1, final_conv_width**2 * 64])
                fc4 = tf.nn.relu(tf.matmul(flattened, W4) + b4)
            with tf.variable_scope('fc5'):
                W5 = weights_init([512, no_of_actions])
                b5 = bias_init([no_of_actions])
                self.predictions = tf.matmul(fc4, W5) + b5

            # compute loss
            # we need to extract the action values of selected actions
            # to do that, we will flatten the predictions into a 1d array
            # we then transform the action index to an index compatible with this 1d array
            # we transform the action index by adding the action index to the offset for each row
            # reference from
            # https://github.com/dennybritz/reinforcement-learning/blob/master/DQN/Deep%20Q%20Learning.ipynb
            with tf.variable_scope('loss'):
                gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.selected_actions
                self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)

                self.losses = tf.squared_difference(self.targets, self.action_predictions)
                self.loss = tf.reduce_mean(self.losses)

                # create train op
                # I am neglecting error clipping that was used in the paper
                # NOTE(review): the optimizer is deliberately left inside the
                # 'loss' scope, as in the original; moving it would presumably
                # rename the optimizer's variables and break existing
                # checkpoints -- confirm before relocating.
                self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
                self.train_op = self.optimizer.minimize(self.loss, 
                                                        var_list=[W1, b1, W2, b2, W3, b3, W4, b4, W5, b5],
                                                        global_step=global_step)
            
            # Summaries for Tensorboard
            self.summaries = tf.summary.merge([
                tf.summary.scalar("loss", self.loss),
                tf.summary.histogram("loss_hist", self.losses),
                tf.summary.histogram("q_values_hist", self.predictions),
                tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions))
            ])
            
    def predict(self, sess, s):
        """
        Predicts action values.

        Args:
          sess: Tensorflow session
          s: State input of shape [batch_size, frame_size, frame_size, no_of_frames]

        Returns:
          Array of shape [batch_size, no_of_actions] containing the estimated 
          action values.
        """
        return sess.run(self.predictions, { self.X: s })

    def update(self, sess, s, a, targets):
        """
        Updates the network towards the given targets.

        Runs one step of the train op and, when a summary writer was created,
        records the merged summaries at the fetched global step.

        Args:
          sess: Tensorflow session object
          s: State input of shape [batch_size, frame_size, frame_size, no_of_frames]
          a: Chosen actions of shape [batch_size]
          targets: Targets of shape [batch_size]

        Returns:
          The calculated loss on the batch.
        """
        feed_dict = { self.X: s, self.targets: targets, self.selected_actions: a }
        summaries, _, loss, step = sess.run(
            [self.summaries, self.train_op, self.loss, self.global_step],
            feed_dict)
        if self.summary_writer:
            self.summary_writer.add_summary(summaries, step)
        return loss


In [6]:
# from https://github.com/dennybritz/reinforcement-learning/blob/master/DQN/Deep%20Q%20Learning.ipynb
def copy_model_parameters(sess, network1, network2):
    """
    Copies the model parameters of one network to another.

    Trainable variables are selected by variable-scope prefix, sorted by name
    and paired up positionally, so both networks must define the same
    variables under their respective scopes.

    Args:
      sess: Tensorflow session instance
      network1: Network to copy the parameters from
      network2: Network to copy the parameters to

    Raises:
      ValueError: if the two scopes do not hold the same number of trainable
        variables

    NOTE: every call adds fresh assign ops to the graph.
    """
    def scoped_params(network):
        # Match on "<scope>/" so that a scope which is merely a prefix of
        # another scope's name does not pick up that scope's variables.
        prefix = network.scope + "/"
        params = [v for v in tf.trainable_variables() if v.name.startswith(prefix)]
        return sorted(params, key=lambda v: v.name)

    n1_params = scoped_params(network1)
    n2_params = scoped_params(network2)

    # zip() would silently drop the unmatched tail; fail loudly instead
    if len(n1_params) != len(n2_params):
        raise ValueError(
            "cannot copy parameters: scope '{}' has {} trainable variables but scope '{}' has {}".format(
                network1.scope, len(n1_params), network2.scope, len(n2_params)))

    update_ops = [n2_v.assign(n1_v) for n1_v, n2_v in zip(n1_params, n2_params)]

    sess.run(update_ops)

In [7]:
def select_action(sess, env, q_network, state, epsilon, e_greedy=True):
    """
    Pick an action for a single state.

    Args:
        sess: Tensorflow session handed to q_network.predict
        env: environment; its action_space is sampled for random actions
        q_network: network whose predict() gives the action values
        state: a single (un-batched) state
        epsilon: probability of taking a random action when e_greedy is set
        e_greedy: if False, the greedy action is always returned

    Returns:
        A sampled action with probability epsilon (when e_greedy is True),
        otherwise the index of the largest predicted action value.
    """
    explore = e_greedy and np.random.uniform() < epsilon
    if explore:
        return env.action_space.sample()

    # the network expects a batch, so add a leading batch dimension of 1
    batched_state = np.expand_dims(state, 0)
    q_values = q_network.predict(sess, batched_state)
    return np.argmax(q_values)

In [8]:
def train_dqn(sess,
              env,
              q_network,
              target_network,
              state_processor,
              num_episodes,
              global_step,
              experiment_dir,
              replay_buffer,
              buffer_init_size,
              target_interval,
              frame_skip,
              epsilon_start,
              epsilon_end,
              epsilon_decay_length,
              gamma,
              batch_size,
              record_video=False):

    """
    Trains q_network with DQN: experience replay plus a periodically synced
    target network.

    Args:
        sess: Tensorflow session
        env: gym environment; it is wrapped in a Monitor once the replay
            buffer has been populated
        q_network: network being trained
        target_network: network used to compute the bootstrap targets
        state_processor: object whose process(sess, frame) preprocesses a raw frame
        num_episodes: number of episodes to run
        global_step: step-counter variable; its current value seeds the local
            step count (so training resumes after a checkpoint restore)
        experiment_dir: directory holding the "checkpoints" and "monitor" sub-directories
        replay_buffer: buffer exposing add(...) and sample(n)
        buffer_init_size: no of frames used to initialize replay buffer
        target_interval: the target network is synced every this many steps
        frame_skip: no. of frames to skip between decision (paper used 4)
        epsilon_start: initial exploration rate
        epsilon_end: final exploration rate
        epsilon_decay_length: number of steps over which epsilon is linearly annealed
        gamma: discount factor
        batch_size: number of transitions sampled per update
        record_video: when True, the Monitor may record video during the last
            4 episodes and every 5000th episode. Defaults to False, which
            records nothing.

    Returns:
        (episode_rewards, episode_lengths): two arrays of length num_episodes
    """
    
    # Keeps track of useful statistics
    episode_lengths=np.zeros(num_episodes)
    episode_rewards=np.zeros(num_episodes)

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)
        
    total_t = sess.run(global_step)
    
    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_length)

    def reset_frames(environment):
        # Reset the environment and start the frame history with four
        # references to the first processed frame. The frames stay numpy
        # arrays (no .tolist()) so stacked states keep the processor's dtype
        # instead of being widened to Python ints.
        first_frame = state_processor.process(sess, environment.reset())
        return [first_frame] * 4
    
    # init replay buffer
    print("Populating replay memory...")
    last_4_frame = reset_frames(env)
    
    for i in range(buffer_init_size):
        if i % 1000 == 0:
            print("\rpopulating buffer: %d" %i, end="")
        current_state = np.stack(last_4_frame, axis=2)
        action = env.action_space.sample()
        observation, reward, done, _ = env.step(action)
        observation = state_processor.process(sess, observation)
        last_4_frame.pop(0)
        last_4_frame.append(observation)
        next_state = np.stack(last_4_frame, axis=2)
        replay_buffer.add(current_state, action, reward, next_state, done)
        if done:
            last_4_frame = reset_frames(env)
            
        
    print("\nPopulated replay memory")
    
    # this can only be done after populating replay buffer
    should_record = False
    # The callable reads should_record at call time, so the per-episode
    # assignment below controls it. Monitor output goes to the monitor
    # directory created above.
    env = Monitor(env, directory=monitor_path, resume=True, video_callable=lambda count: should_record and count % 10 == 0)
    
    
    action = None

    for i_episode in range(num_episodes):
        # Save the current checkpoint
        saver.save(sess, checkpoint_path)
        
        # allow recording during the last 4 episodes and every 5000th episode,
        # but only when the caller opted in
        should_record = record_video and (
            i_episode >= num_episodes - 4 or i_episode % 5000 == 0)

        # Reset the environment
        last_4_frame = reset_frames(env)
        loss = None
        current_state = np.stack(last_4_frame, axis=2)
        for t in itertools.count():
            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_length-1)]

            # Add epsilon to Tensorboard (only if the network has a writer)
            if q_network.summary_writer is not None:
                episode_summary = tf.Summary()
                episode_summary.value.add(simple_value=epsilon, tag="epsilon")
                q_network.summary_writer.add_summary(episode_summary, total_t)
            
            # update the target network
            if total_t % target_interval == 0:
                copy_model_parameters(sess, q_network, target_network)
                
            
            if t % frame_skip == 0:
                # only make decision every k (frame_skip) steps
                action = select_action(sess, env, q_network, current_state, epsilon, e_greedy=True)
                
            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                    t, total_t, i_episode + 1, num_episodes, loss), end="")
            
            observation, reward, done, _ = env.step(action)
            observation = state_processor.process(sess, observation)
            last_4_frame.pop(0)
            last_4_frame.append(observation)
            next_state = np.stack(last_4_frame, axis=2)
            replay_buffer.add(current_state, action, reward, next_state, done)
            
            # Update statistics
            episode_rewards[i_episode] += reward
            episode_lengths[i_episode] = t
            
            # train
            samples = replay_buffer.sample(batch_size)
            states, actions, rewards, next_states, dones = zip(*samples)
            # one batched forward pass of the target network instead of one
            # session call per sampled transition
            next_q = target_network.predict(sess, np.stack(next_states))
            # clip rewards to -1 / 0 / +1
            clipped_rewards = np.sign(np.asarray(rewards, dtype=np.float32))
            # terminal transitions do not bootstrap from the next state
            not_done = 1.0 - np.asarray(dones, dtype=np.float32)
            targets = clipped_rewards + gamma * not_done * np.max(next_q, axis=1)
            loss = q_network.update(sess,
                                    np.stack(states),
                                    np.asarray(actions, dtype=np.int32),
                                    targets)
                
            current_state = next_state
            total_t += 1
            
            if done:
                break
        print("\nEpisode {}, Reward: {}".format(i_episode + 1, episode_rewards[i_episode]))
        # Add summaries to tensorboard
        if q_network.summary_writer is not None:
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward")
            episode_summary.value.add(simple_value=episode_lengths[i_episode], node_name="episode_length", tag="episode_length")
            q_network.summary_writer.add_summary(episode_summary, total_t)
            q_network.summary_writer.flush()
    
    env.close()
    return episode_rewards, episode_lengths

In [None]:
# Start from an empty graph so re-running this cell does not collide with
# variables created by a previous run.
tf.reset_default_graph()

# Step counter handed to both networks and to the training loop
global_step = tf.Variable(0, trainable=False, name='global_step')

env = gym.make('Breakout-v0')

# Where we save our checkpoints and graphs
experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id))

# Both networks take 4 stacked 84x84 frames; only the estimator writes summaries
no_of_actions = env.action_space.n
q_network = Qnetwork(84, 4, no_of_actions, global_step, scope='estimator', summaries_dir=experiment_dir)
target_network = Qnetwork(84, 4, no_of_actions, global_step, scope='target')

state_processor = StateProcessor()

replay_buffer = ReplayBuffer(size=500000)

# Hyperparameters of the run, gathered in one place
training_config = dict(
    num_episodes=5000,
    buffer_init_size=10000,
    target_interval=10000,
    frame_skip=1,
    epsilon_start=1,
    epsilon_end=0.1,
    epsilon_decay_length=500000,
    gamma=0.99,
    batch_size=32,
)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # copy network to target
    copy_model_parameters(sess, q_network, target_network)
    rewards, lengths = train_dqn(
        sess=sess,
        env=env,
        q_network=q_network,
        target_network=target_network,
        state_processor=state_processor,
        global_step=global_step,
        experiment_dir=experiment_dir,
        replay_buffer=replay_buffer,
        **training_config)

[2018-01-05 12:46:33,837] Making new env: Breakout-v0
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Loading model checkpoint /home/kai/rl/gym/dqn/experiments/Breakout-v0/checkpoints/model...

INFO:tensorflow:Restoring parameters from /home/kai/rl/gym/dqn/experiments/Breakout-v0/checkpoints/model


[2018-01-05 12:46:35,456] Restoring parameters from /home/kai/rl/gym/dqn/experiments/Breakout-v0/checkpoints/model


Populating replay memory...
populating buffer: 9000
Populated replay memory
Step 482 (527197) @ Episode 1/5000, loss: 0.020171679556369788
Episode 1, Reward: 6.0
Step 462 (527660) @ Episode 2/5000, loss: 0.023435108363628387
Episode 2, Reward: 6.0
Step 420 (528081) @ Episode 3/5000, loss: 0.092802062630653383
Episode 3, Reward: 5.0
Step 400 (528482) @ Episode 4/5000, loss: 0.0097585208714008335
Episode 4, Reward: 5.0
Step 326 (528809) @ Episode 5/5000, loss: 0.047685708850622187
Episode 5, Reward: 4.0
Step 311 (529121) @ Episode 6/5000, loss: 0.0228598229587078176
Episode 6, Reward: 3.0
Step 344 (529466) @ Episode 7/5000, loss: 0.027841633185744286
Episode 7, Reward: 4.0
Step 328 (529795) @ Episode 8/5000, loss: 0.009873446077108383
Episode 8, Reward: 4.0
Step 419 (530215) @ Episode 9/5000, loss: 0.017611037939786918
Episode 9, Reward: 6.0
Step 396 (530612) @ Episode 10/5000, loss: 0.014920121058821678
Episode 10, Reward: 4.0
Step 292 (530905) @ Episode 11/5000, loss: 0.066748477518558

Step 624 (569015) @ Episode 93/5000, loss: 0.034873709082603455
Episode 93, Reward: 9.0
Step 281 (569297) @ Episode 94/5000, loss: 0.012183896265923977
Episode 94, Reward: 3.0
Step 446 (569744) @ Episode 95/5000, loss: 0.015408316627144814
Episode 95, Reward: 5.0
Step 413 (570158) @ Episode 96/5000, loss: 0.031660355627536774
Episode 96, Reward: 5.0
Step 462 (570621) @ Episode 97/5000, loss: 0.0232321005314588556
Episode 97, Reward: 6.0
Step 633 (571255) @ Episode 98/5000, loss: 0.0199116282165050585
Episode 98, Reward: 8.0
Step 439 (571695) @ Episode 99/5000, loss: 0.137241601943969734
Episode 99, Reward: 3.0
Step 571 (572267) @ Episode 100/5000, loss: 0.0205158106982707984
Episode 100, Reward: 7.0
Step 387 (572655) @ Episode 101/5000, loss: 0.0274873264133930265
Episode 101, Reward: 4.0
Step 464 (573120) @ Episode 102/5000, loss: 0.0180018544197082525
Episode 102, Reward: 5.0
Step 366 (573487) @ Episode 103/5000, loss: 0.009720508009195328
Episode 103, Reward: 4.0
Step 423 (573911) @

Step 726 (612566) @ Episode 184/5000, loss: 0.027775853872299194
Episode 184, Reward: 9.0
Step 423 (612990) @ Episode 185/5000, loss: 0.0163397751748561865
Episode 185, Reward: 4.0
Step 338 (613329) @ Episode 186/5000, loss: 0.028557980433106422
Episode 186, Reward: 2.0
Step 616 (613946) @ Episode 187/5000, loss: 0.0123034743592143064
Episode 187, Reward: 7.0
Step 384 (614331) @ Episode 188/5000, loss: 0.015041835606098175
Episode 188, Reward: 2.0
Step 423 (614755) @ Episode 189/5000, loss: 0.024739298969507217
Episode 189, Reward: 3.0
Step 542 (615298) @ Episode 190/5000, loss: 0.008627180010080338
Episode 190, Reward: 6.0
Step 612 (615911) @ Episode 191/5000, loss: 0.131799504160881044
Episode 191, Reward: 7.0
Step 604 (616516) @ Episode 192/5000, loss: 0.018043542280793193
Episode 192, Reward: 5.0
Step 482 (616999) @ Episode 193/5000, loss: 0.0123821198940277146
Episode 193, Reward: 5.0
Step 484 (617484) @ Episode 194/5000, loss: 0.0234638769179582685
Episode 194, Reward: 4.0
Step 4

Step 555 (656249) @ Episode 274/5000, loss: 0.0269485805183649065
Episode 274, Reward: 4.0
Step 469 (656719) @ Episode 275/5000, loss: 0.0312965884804725655
Episode 275, Reward: 5.0
Step 647 (657367) @ Episode 276/5000, loss: 0.0161986574530601585
Episode 276, Reward: 8.0
Step 755 (658123) @ Episode 277/5000, loss: 0.0380167216062545865
Episode 277, Reward: 11.0
Step 396 (658520) @ Episode 278/5000, loss: 0.0112065542489290245
Episode 278, Reward: 4.0
Step 502 (659023) @ Episode 279/5000, loss: 0.0191525612026453025
Episode 279, Reward: 6.0
Step 725 (659749) @ Episode 280/5000, loss: 0.0233547445386648185
Episode 280, Reward: 5.0
Step 373 (660123) @ Episode 281/5000, loss: 0.0224698334932327275
Episode 281, Reward: 4.0
Step 491 (660615) @ Episode 282/5000, loss: 0.018876731395721436
Episode 282, Reward: 4.0
Step 484 (661100) @ Episode 283/5000, loss: 0.0152114965021610265
Episode 283, Reward: 3.0
Step 514 (661615) @ Episode 284/5000, loss: 0.0120402090251445775
Episode 284, Reward: 5.0

Step 367 (698437) @ Episode 364/5000, loss: 0.0081276874989271165
Episode 364, Reward: 4.0
Step 392 (698830) @ Episode 365/5000, loss: 0.5594772100448608716
Episode 365, Reward: 4.0
Step 416 (699247) @ Episode 366/5000, loss: 0.0079654064029455185
Episode 366, Reward: 2.0
Step 413 (699661) @ Episode 367/5000, loss: 0.0081207714974880225
Episode 367, Reward: 3.0
Step 566 (700228) @ Episode 368/5000, loss: 0.0085666589438915254
Episode 368, Reward: 5.0
Step 324 (700553) @ Episode 369/5000, loss: 0.041761044412851334
Episode 369, Reward: 2.0
Step 338 (700892) @ Episode 370/5000, loss: 0.012551049701869488
Episode 370, Reward: 0.0
Step 286 (701179) @ Episode 371/5000, loss: 0.0210445765405893335
Episode 371, Reward: 2.0
Step 727 (701907) @ Episode 372/5000, loss: 0.0209124572575092384
Episode 372, Reward: 5.0
Step 441 (702349) @ Episode 373/5000, loss: 0.0250997561961412434
Episode 373, Reward: 3.0
Step 415 (702765) @ Episode 374/5000, loss: 0.014849367551505566
Episode 374, Reward: 4.0
St

Step 410 (739323) @ Episode 454/5000, loss: 0.0106315175071358685
Episode 454, Reward: 2.0
Step 523 (739847) @ Episode 455/5000, loss: 0.0090227723121643075
Episode 455, Reward: 2.0
Step 395 (740243) @ Episode 456/5000, loss: 0.0071770329959690575
Episode 456, Reward: 1.0
Step 249 (740493) @ Episode 457/5000, loss: 0.012532478198409081
Episode 457, Reward: 1.0
Step 303 (740797) @ Episode 458/5000, loss: 0.0099153397604823115
Episode 458, Reward: 1.0
Step 549 (741347) @ Episode 459/5000, loss: 0.0100947860628366476
Episode 459, Reward: 5.0
Step 307 (741655) @ Episode 460/5000, loss: 0.0084083741530776024
Episode 460, Reward: 2.0
Step 405 (742061) @ Episode 461/5000, loss: 0.0194065924733877186
Episode 461, Reward: 2.0
Step 394 (742456) @ Episode 462/5000, loss: 0.1147618070244789164
Episode 462, Reward: 3.0
Step 448 (742905) @ Episode 463/5000, loss: 0.0403978750109672555
Episode 463, Reward: 4.0
Step 403 (743309) @ Episode 464/5000, loss: 0.0051942486315965655
Episode 464, Reward: 4.0


Step 230 (772136) @ Episode 544/5000, loss: 0.0107081308960914615
Episode 544, Reward: 1.0
Step 419 (772556) @ Episode 545/5000, loss: 0.1240744888782501224
Episode 545, Reward: 4.0
Step 177 (772734) @ Episode 546/5000, loss: 0.013065954670310028
Episode 546, Reward: 0.0
Step 161 (772896) @ Episode 547/5000, loss: 0.0123016797006130225
Episode 547, Reward: 0.0
Step 268 (773165) @ Episode 548/5000, loss: 0.0051203146576881415
Episode 548, Reward: 2.0
Step 316 (773482) @ Episode 549/5000, loss: 0.0090865353122353555
Episode 549, Reward: 2.0
Step 209 (773692) @ Episode 550/5000, loss: 0.0191698148846626284
Episode 550, Reward: 1.0
Step 295 (773988) @ Episode 551/5000, loss: 0.0626999884843826375
Episode 551, Reward: 3.0
Step 490 (774479) @ Episode 552/5000, loss: 0.0133648253977298745
Episode 552, Reward: 5.0
Step 178 (774658) @ Episode 553/5000, loss: 0.014660583809018135
Episode 553, Reward: 0.0
Step 268 (774927) @ Episode 554/5000, loss: 0.0135845802724361425
Episode 554, Reward: 1.0
S

Step 381 (798749) @ Episode 634/5000, loss: 0.0079557355493307115
Episode 634, Reward: 4.0
Step 238 (798988) @ Episode 635/5000, loss: 0.0073031042702496054
Episode 635, Reward: 2.0
Step 380 (799369) @ Episode 636/5000, loss: 0.0135032236576080325
Episode 636, Reward: 4.0
Step 287 (799657) @ Episode 637/5000, loss: 0.0100009683519601825
Episode 637, Reward: 2.0
Step 300 (799958) @ Episode 638/5000, loss: 0.0396181046962738045
Episode 638, Reward: 3.0
Step 199 (800158) @ Episode 639/5000, loss: 0.0087552117183804516
Episode 639, Reward: 1.0
Step 387 (800546) @ Episode 640/5000, loss: 0.0093411002308130265
Episode 640, Reward: 4.0
Step 195 (800742) @ Episode 641/5000, loss: 0.294731765985488915
Episode 641, Reward: 0.0
Step 346 (801089) @ Episode 642/5000, loss: 0.0063033560290932655
Episode 642, Reward: 1.0
Step 303 (801393) @ Episode 643/5000, loss: 0.0079282186925411226
Episode 643, Reward: 2.0
Step 432 (801826) @ Episode 644/5000, loss: 0.0085461838170886045
Episode 644, Reward: 4.0


Step 184 (826390) @ Episode 724/5000, loss: 0.0066099595278501514
Episode 724, Reward: 0.0
Step 275 (826666) @ Episode 725/5000, loss: 0.0106710968539118772
Episode 725, Reward: 2.0
Step 247 (826914) @ Episode 726/5000, loss: 0.0345377586781978625
Episode 726, Reward: 2.0
Step 465 (827380) @ Episode 727/5000, loss: 0.0097657013684511185
Episode 727, Reward: 3.0
Step 326 (827707) @ Episode 728/5000, loss: 0.0123656606301665375
Episode 728, Reward: 3.0
Step 169 (827877) @ Episode 729/5000, loss: 0.0115889627486467365
Episode 729, Reward: 0.0
Step 317 (828195) @ Episode 730/5000, loss: 0.0379930622875690464
Episode 730, Reward: 3.0
Step 159 (828355) @ Episode 731/5000, loss: 0.016204245388507843
Episode 731, Reward: 0.0
Step 380 (828736) @ Episode 732/5000, loss: 0.0056877844035625465
Episode 732, Reward: 4.0
Step 281 (829018) @ Episode 733/5000, loss: 0.0194552931934595117
Episode 733, Reward: 3.0
Step 356 (829375) @ Episode 734/5000, loss: 0.0070737283676862724
Episode 734, Reward: 3.0


In [148]:
# Display `rewards`, the first value returned by train_dqn in the training cell.
rewards

array([ 470.])