In [1]:
%matplotlib inline

import gym
import itertools
import numpy as np
import os
import random
import sys
import tensorflow as tf
from gym.wrappers import Monitor
import time

if "../" not in sys.path:
  sys.path.append("../")

from lib import plotting
from collections import deque, namedtuple

In [2]:
env = gym.envs.make("Breakout-v0")

[2017-06-21 13:56:49,892] Making new env: Breakout-v0


In [3]:
# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions
VALID_ACTIONS = [0, 1, 2, 3]

In [4]:
class StateProcessor():
    """
    Processes a raw Atari iamges. Resizes it and converts it to grayscale.
    """
    def __init__(self):
        # Build the Tensorflow graph
        with tf.variable_scope("state_processor"):
            self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)
            self.output = tf.image.rgb_to_grayscale(self.input_state)
            self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160)
            self.output = tf.image.resize_images(
                self.output, np.array([84, 84],dtype=np.int32), method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            self.output = tf.squeeze(self.output)

    def process(self, sess, state):
        """
        Args:
            sess: A Tensorflow session object
            state: A [210, 160, 3] Atari RGB State

        Returns:
            A processed [84, 84, 1] state representing grayscale values.
        """
        return sess.run(self.output, { self.input_state: state })

In [5]:
class Estimator():
    """Q-Value Estimator neural network.

    This network is used for both the Q-Network and the Target Network.
    """

    def __init__(self, scope="estimator", summaries_dir=None):
        self.scope = scope
        # Writes Tensorboard summaries to disk
        self.summary_writer = None
        with tf.variable_scope(scope):
            # Build the graph
            self._build_model()
            if summaries_dir:
                summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
                if not os.path.exists(summary_dir):
                    os.makedirs(summary_dir)
                self.summary_writer = tf.summary.FileWriter(summary_dir)

    def _build_model(self):
        """
        Builds the Tensorflow graph.
        """

        # Placeholders for our input
        # Our input are 4 RGB frames of shape 160, 160 each
        self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")
        # The TD target value
        self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
        # Integer id of which action was selected
        self.actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

        X = tf.to_float(self.X_pl) / 255.0
        batch_size = tf.shape(self.X_pl)[0]

        # Three convolutional layers
        conv1 = tf.contrib.layers.conv2d(
            X, 32, 8, 4, activation_fn=tf.nn.relu)
        conv2 = tf.contrib.layers.conv2d(
            conv1, 64, 4, 2, activation_fn=tf.nn.relu)
        conv3 = tf.contrib.layers.conv2d(
            conv2, 64, 3, 1, activation_fn=tf.nn.relu)

        # Fully connected layers
        flattened = tf.contrib.layers.flatten(conv3)
        fc1 = tf.contrib.layers.fully_connected(flattened, 512)
        self.predictions = tf.contrib.layers.fully_connected(fc1, len(VALID_ACTIONS))

        # Get the predictions for the chosen actions only
        gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl
        self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)

        # Calcualte the loss
        self.losses = tf.squared_difference(self.y_pl, self.action_predictions)
        self.loss = tf.reduce_mean(self.losses)

        # Optimizer Parameters from original paper
        self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())

        # Summaries for Tensorboard
        self.summaries = tf.summary.merge([
            tf.summary.scalar("loss", self.loss),
            tf.summary.histogram("loss_hist", self.losses),
            tf.summary.histogram("q_values_hist", self.predictions),
            tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions))
        ])


    def predict(self, sess, s):
        """
        Predicts action values.

        Args:
          sess: Tensorflow session
          s: State input of shape [batch_size, 4, 160, 160, 3]

        Returns:
          Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated 
          action values.
        """
        return sess.run(self.predictions, { self.X_pl: s })

    def update(self, sess, s, a, y):
        """
        Updates the estimator towards the given targets.

        Args:
          sess: Tensorflow session object
          s: State input of shape [batch_size, 4, 160, 160, 3]
          a: Chosen actions of shape [batch_size]
          y: Targets of shape [batch_size]

        Returns:
          The calculated loss on the batch.
        """
        feed_dict = { self.X_pl: s, self.y_pl: y, self.actions_pl: a }
        summaries, global_step, _, loss = sess.run(
            [self.summaries, tf.contrib.framework.get_global_step(), self.train_op, self.loss],
            feed_dict)
        if self.summary_writer:
            self.summary_writer.add_summary(summaries, global_step)
        return loss

In [6]:
# For Testing....

tf.reset_default_graph()
global_step = tf.Variable(0, name="global_step", trainable=False)

e = Estimator(scope="test")
sp = StateProcessor()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Example observation batch
    observation = env.reset()
    
    observation_p = sp.process(sess, observation)
    observation = np.stack([observation_p] * 4, axis=2)
    observations = np.array([observation] * 2)
    
    # Test Prediction
    print(e.predict(sess, observations))

    # Test training step
    y = np.array([10.0, 10.0])
    a = np.array([1, 3])
    print(e.update(sess, observations, a, y))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[[ 0.          0.05673014  0.01246074  0.        ]
 [ 0.          0.05673014  0.01246074  0.        ]]
99.4343


In [7]:
def copy_model_parameters(sess, estimator1, estimator2):
    """
    Copies the model parameters of one estimator to another.

    Args:
      sess: Tensorflow session instance
      estimator1: Estimator to copy the paramters from
      estimator2: Estimator to copy the parameters to
    """
    e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]
    e1_params = sorted(e1_params, key=lambda v: v.name)
    e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]
    e2_params = sorted(e2_params, key=lambda v: v.name)

    update_ops = []
    for e1_v, e2_v in zip(e1_params, e2_params):
        op = e2_v.assign(e1_v)
        update_ops.append(op)

    sess.run(update_ops)

In [8]:
def make_epsilon_greedy_policy(estimator, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function approximator and epsilon.

    Args:
        estimator: An estimator that returns q values for a given state
        nA: Number of actions in the environment.

    Returns:
        A function that takes the (sess, observation, epsilon) as an argument and returns
        the probabilities for each action in the form of a numpy array of length nA.

    """
    def policy_fn(sess, observation, epsilon):
        A = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(sess, np.expand_dims(observation, 0))[0]
        best_action = np.argmax(q_values)
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

In [None]:
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=1000):
    """
    Q-Learning algorithm for fff-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sampel when initializing 
          the reply memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the 
          target estimator every N steps
        discount_factor: Lambda time discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    Transition = namedtuple("Transition", ["state", "action", "reward", "successor", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)
    
    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(
        q_estimator,
        len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state, epsilons[min(total_t,epsilon_decay_steps-1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        successor, reward, done, _ = env.step(action)
        successor = state_processor.process(sess, successor)
        successor = np.append(state[:,:,1:], np.expand_dims(successor, 2), axis=2)
        replay_memory.append(Transition(state,action,reward,successor,done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
            break
        else:
            state = successor

    # Record videos
    env = Monitor(env, 
                  directory=monitor_path, 
                  video_callable=lambda count: count % record_video_every == 0, 
                  resume=True)

    for i_episode in range(num_episodes):
        
        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None
        
        # Start timer
        time0 = time.time()

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                    t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            # Take a step in the environment
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            successor, reward, done, _ = env.step(VALID_ACTIONS[action])
            successor = state_processor.process(sess, successor)
            successor = np.append(state[:,:,1:], np.expand_dims(successor, 2), axis=2)
            
            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(Transition(state,action,reward,successor,done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            sample = random.sample(replay_memory, batch_size)
            state_batch, action_batch, reward_batch, successor_batch, done_batch = map(np.array,zip(*sample))
            
            # Calculate q values and targets
            next_q_values = q_estimator.predict(sess, successor_batch)
            best_actions = np.argmax(next_q_values, axis=1)
            next_q_values_target = target_estimator.predict(sess, successor_batch)
            target_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                discount_factor * next_q_values_target[np.arange(batch_size), best_actions]

            
            # Perform gradient descent update
            states_batch = np.array(state_batch)
            loss = q_estimator.update(sess, state_batch, action_batch, target_batch)

            if done:
                print("\nEpisode time: ",time.time()-time0)
                break

            state = successor
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], 
                                  node_name="episode_reward", tag="episode_reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], 
                                  node_name="episode_length", tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode+1],
            episode_rewards=stats.episode_rewards[:i_episode+1])

    env.monitor.close()
    return stats

In [None]:
tf.reset_default_graph()

# Where we save our checkpoints and graphs
experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id))

# Create a global step variable
global_step = tf.Variable(0, name='global_step', trainable=False)
    
# Create estimators
q_estimator = Estimator(scope="q", summaries_dir=experiment_dir)
target_estimator = Estimator(scope="target_q")

# State processor
state_processor = StateProcessor()

# GPU options
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.5)

# Run it!
with tf.Session(config = tf.ConfigProto(gpu_options = gpu_options)) as sess:
    sess.run(tf.global_variables_initializer())
    for t, stats in deep_q_learning(sess,
                                    env,
                                    q_estimator=q_estimator,
                                    target_estimator=target_estimator,
                                    state_processor=state_processor,
                                    experiment_dir=experiment_dir,
                                    num_episodes=10000,
                                    replay_memory_size=500000,
                                    replay_memory_init_size=50000,
                                    update_target_estimator_every=10000,
                                    epsilon_start=1.0,
                                    epsilon_end=0.1,
                                    epsilon_decay_steps=500000,
                                    discount_factor=0.99,
                                    batch_size=32):

        print("\nEpisode Reward: {}".format(stats.episode_rewards[-1]))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Populating replay memory...


[2017-06-21 13:57:00,724] Starting new video recorder writing to /home/jason/reinforcement-learning/DQN/experiments/Breakout-v0/monitor/openaigym.video.0.947.video000000.mp4


Step 167 (167) @ Episode 1/10000, loss: 0.00044260203139856465
Episode time:  1.5968477725982666

Episode Reward: 0.0
Step 390 (557) @ Episode 2/10000, loss: 0.00041565168066881597
Episode time:  3.2760391235351562

Episode Reward: 4.0
Step 171 (728) @ Episode 3/10000, loss: 0.00036551491939462723
Episode time:  1.4342994689941406

Episode Reward: 0.0
Step 253 (981) @ Episode 4/10000, loss: 0.03247578814625743356
Episode time:  2.0825939178466797

Episode Reward: 2.0
Step 281 (1262) @ Episode 5/10000, loss: 0.00098028453066945086
Episode time:  2.316087007522583

Episode Reward: 2.0
Step 285 (1547) @ Episode 6/10000, loss: 0.00026012700982391834
Episode time:  2.3570570945739746

Episode Reward: 2.0
Step 242 (1789) @ Episode 7/10000, loss: 0.00103185453917831185
Episode time:  2.080019474029541

Episode Reward: 1.0
Step 166 (1955) @ Episode 8/10000, loss: 6.300773384282365e-052
Episode time:  1.4234669208526611

Episode Reward: 0.0
Step 239 (2194) @ Episode 9/10000, loss: 0.00011750348

Step 286 (33946) @ Episode 137/10000, loss: 0.00338972685858607395
Episode time:  2.319105386734009

Episode Reward: 2.0
Step 178 (34124) @ Episode 138/10000, loss: 0.00013206343282945454
Episode time:  1.5094728469848633

Episode Reward: 0.0
Step 159 (34283) @ Episode 139/10000, loss: 0.00268301181495189678
Episode time:  1.3441781997680664

Episode Reward: 0.0
Step 165 (34448) @ Episode 140/10000, loss: 0.00069462257670238619
Episode time:  1.3692662715911865

Episode Reward: 0.0
Step 270 (34718) @ Episode 141/10000, loss: 0.00015279636136256158
Episode time:  2.250886917114258

Episode Reward: 2.0
Step 320 (35038) @ Episode 142/10000, loss: 7.581556565128267e-053
Episode time:  2.644479990005493

Episode Reward: 3.0
Step 180 (35218) @ Episode 143/10000, loss: 7.963180541992188e-059
Episode time:  1.4584927558898926

Episode Reward: 0.0
Step 174 (35392) @ Episode 144/10000, loss: 0.00118351029232144365
Episode time:  1.4710712432861328

Episode Reward: 0.0
Step 232 (35624) @ Episode 

Step 243 (65990) @ Episode 271/10000, loss: 0.00033490956411696972
Episode time:  2.0928738117218018

Episode Reward: 1.0
Step 180 (66170) @ Episode 272/10000, loss: 0.00020299530297052115
Episode time:  1.513012170791626

Episode Reward: 0.0
Step 336 (66506) @ Episode 273/10000, loss: 0.00066751620033755953
Episode time:  2.724555253982544

Episode Reward: 3.0
Step 186 (66692) @ Episode 274/10000, loss: 0.00419335765764117222
Episode time:  1.53346586227417

Episode Reward: 0.0
Step 304 (66996) @ Episode 275/10000, loss: 0.00074159412179142246
Episode time:  2.488110065460205

Episode Reward: 2.0
Step 186 (67182) @ Episode 276/10000, loss: 0.00011765379895223305
Episode time:  1.5313680171966553

Episode Reward: 0.0
Step 312 (67494) @ Episode 277/10000, loss: 6.59866927890107e-0507
Episode time:  2.5517895221710205

Episode Reward: 2.0
Step 167 (67661) @ Episode 278/10000, loss: 0.00014445174019783735
Episode time:  1.3535189628601074

Episode Reward: 0.0
Step 178 (67839) @ Episode 27

Step 166 (98288) @ Episode 405/10000, loss: 0.00152380345389246945
Episode time:  1.394737958908081

Episode Reward: 0.0
Step 376 (98664) @ Episode 406/10000, loss: 0.00017786932585295295
Episode time:  3.128807544708252

Episode Reward: 3.0
Step 284 (98948) @ Episode 407/10000, loss: 0.00095317518571391764
Episode time:  2.334948778152466

Episode Reward: 2.0
Step 375 (99323) @ Episode 408/10000, loss: 0.00093580700922757392
Episode time:  3.144123077392578

Episode Reward: 4.0
Step 309 (99632) @ Episode 409/10000, loss: 0.00166956847533583644
Episode time:  2.5335936546325684

Episode Reward: 2.0
Step 251 (99883) @ Episode 410/10000, loss: 0.00012417364632710814
Episode time:  2.0572993755340576

Episode Reward: 1.0
Step 239 (100122) @ Episode 411/10000, loss: 0.00398227851837873571
Episode time:  2.0039350986480713

Episode Reward: 1.0
Step 333 (100455) @ Episode 412/10000, loss: 0.00053983583347871935
Episode time:  2.7118990421295166

Episode Reward: 3.0
Step 219 (100674) @ Episod

Step 371 (115793) @ Episode 472/10000, loss: 0.00114879745524376635
Episode time:  3.0617926120758057

Episode Reward: 3.0
Step 415 (116208) @ Episode 473/10000, loss: 0.00014180010475683957
Episode time:  3.4689974784851074

Episode Reward: 4.0
Step 187 (116395) @ Episode 474/10000, loss: 0.00057641143212094911
Episode time:  1.5682423114776611

Episode Reward: 0.0
Step 231 (116626) @ Episode 475/10000, loss: 0.00039534160168841484
Episode time:  1.8995506763458252

Episode Reward: 1.0
Step 312 (116938) @ Episode 476/10000, loss: 0.00024472663062624633
Episode time:  2.577471971511841

Episode Reward: 2.0
Step 214 (117152) @ Episode 477/10000, loss: 0.00190938287414610395
Episode time:  1.750591516494751

Episode Reward: 1.0
Step 178 (117330) @ Episode 478/10000, loss: 0.00013441787450574338
Episode time:  1.4863026142120361

Episode Reward: 0.0
Step 175 (117505) @ Episode 479/10000, loss: 0.00452344864606857333
Episode time:  1.4661738872528076

Episode Reward: 0.0
Step 285 (117790) 

Step 253 (132200) @ Episode 539/10000, loss: 0.00175839778967201717
Episode time:  2.1787354946136475

Episode Reward: 1.0
Step 177 (132377) @ Episode 540/10000, loss: 0.00027147325454279783
Episode time:  1.4463231563568115

Episode Reward: 0.0
Step 181 (132558) @ Episode 541/10000, loss: 0.00021666147222276777
Episode time:  1.5451619625091553

Episode Reward: 0.0
Step 250 (132808) @ Episode 542/10000, loss: 0.00157705205492675367
Episode time:  2.089266777038574

Episode Reward: 2.0
Step 263 (133071) @ Episode 543/10000, loss: 7.365572673734277e-055
Episode time:  2.2064368724823

Episode Reward: 1.0
Step 167 (133238) @ Episode 544/10000, loss: 0.00155666819773614426
Episode time:  1.3651769161224365

Episode Reward: 0.0
Step 180 (133418) @ Episode 545/10000, loss: 0.00097472965717315672
Episode time:  1.4621918201446533

Episode Reward: 0.0
Step 301 (133719) @ Episode 546/10000, loss: 0.00020829719142057002
Episode time:  2.478956699371338

Episode Reward: 2.0
Step 287 (134006) @ E

Step 218 (148539) @ Episode 606/10000, loss: 0.00105129205621778966
Episode time:  1.8038403987884521

Episode Reward: 1.0
Step 175 (148714) @ Episode 607/10000, loss: 0.00032053334871307015
Episode time:  1.4770255088806152

Episode Reward: 0.0
Step 261 (148975) @ Episode 608/10000, loss: 0.00030905331368558117
Episode time:  2.161329507827759

Episode Reward: 2.0
Step 241 (149216) @ Episode 609/10000, loss: 0.00012097330181859434
Episode time:  1.9525549411773682

Episode Reward: 1.0
Step 175 (149391) @ Episode 610/10000, loss: 0.00037792162038385877
Episode time:  1.4940342903137207

Episode Reward: 0.0
Step 299 (149690) @ Episode 611/10000, loss: 0.00085546367336064584
Episode time:  2.4128623008728027

Episode Reward: 2.0
Step 230 (149920) @ Episode 612/10000, loss: 0.00080174481263384222
Episode time:  1.886411190032959

Episode Reward: 1.0
Step 171 (150091) @ Episode 613/10000, loss: 0.00031195706105791033
Episode time:  1.4493505954742432

Episode Reward: 0.0
Step 275 (150366) 

Step 210 (164348) @ Episode 673/10000, loss: 0.00036017072852700955
Episode time:  1.7257342338562012

Episode Reward: 1.0
Step 206 (164554) @ Episode 674/10000, loss: 0.00077040679752826695
Episode time:  1.702218770980835

Episode Reward: 1.0
Step 213 (164767) @ Episode 675/10000, loss: 0.00291286315768957145
Episode time:  1.8032238483428955

Episode Reward: 1.0
Step 315 (165082) @ Episode 676/10000, loss: 0.00026743556372821332
Episode time:  2.5840024948120117

Episode Reward: 3.0
Step 163 (165245) @ Episode 677/10000, loss: 0.00125967455096542842
Episode time:  1.3506534099578857

Episode Reward: 0.0
Step 164 (165409) @ Episode 678/10000, loss: 0.00098429049830883744
Episode time:  1.377333641052246

Episode Reward: 0.0
Step 264 (165673) @ Episode 679/10000, loss: 0.00014292608830146492
Episode time:  2.166863441467285

Episode Reward: 2.0
Step 305 (165978) @ Episode 680/10000, loss: 0.00058126926887780436
Episode time:  2.4958181381225586

Episode Reward: 2.0
Step 286 (166264) @

Step 197 (179936) @ Episode 740/10000, loss: 0.00084315740969032056
Episode time:  1.641322374343872

Episode Reward: 0.0
Step 247 (180183) @ Episode 741/10000, loss: 0.00082481233403086662
Episode time:  2.0722694396972656

Episode Reward: 1.0
Step 240 (180423) @ Episode 742/10000, loss: 0.00390229746699333287
Episode time:  2.0404140949249268

Episode Reward: 1.0
Step 210 (180633) @ Episode 743/10000, loss: 0.00017916290380526334
Episode time:  1.7021830081939697

Episode Reward: 1.0
Step 219 (180852) @ Episode 744/10000, loss: 0.00015931553207337856
Episode time:  1.79384183883667

Episode Reward: 1.0
Step 210 (181062) @ Episode 745/10000, loss: 0.00030986563069745967
Episode time:  1.7627811431884766

Episode Reward: 1.0
Step 234 (181296) @ Episode 746/10000, loss: 0.00084008654812350874
Episode time:  2.010796070098877

Episode Reward: 1.0
Step 169 (181465) @ Episode 747/10000, loss: 0.00077637820504605775
Episode time:  1.3907594680786133

Episode Reward: 0.0
Step 217 (181682) @ 

Step 208 (195888) @ Episode 807/10000, loss: 0.00020260846940800548
Episode time:  1.7229118347167969

Episode Reward: 1.0
Step 219 (196107) @ Episode 808/10000, loss: 0.00026856304612010725
Episode time:  1.796428918838501

Episode Reward: 1.0
Step 335 (196442) @ Episode 809/10000, loss: 0.00016417025472037494
Episode time:  2.7668583393096924

Episode Reward: 3.0
Step 271 (196713) @ Episode 810/10000, loss: 0.00016936937754508108
Episode time:  2.256859302520752

Episode Reward: 2.0
Step 246 (196959) @ Episode 811/10000, loss: 5.820461592520587e-056
Episode time:  2.055802583694458

Episode Reward: 1.0
Step 229 (197188) @ Episode 812/10000, loss: 0.00056218355894088755
Episode time:  1.9001047611236572

Episode Reward: 1.0
Step 182 (197370) @ Episode 813/10000, loss: 0.00024197291349992156
Episode time:  1.5193963050842285

Episode Reward: 0.0
Step 323 (197693) @ Episode 814/10000, loss: 0.00056256691459566356
Episode time:  2.648214817047119

Episode Reward: 2.0
Step 292 (197985) @ 

Step 241 (212636) @ Episode 874/10000, loss: 0.00030580794555135073
Episode time:  2.0005505084991455

Episode Reward: 1.0
Step 229 (212865) @ Episode 875/10000, loss: 0.00026755221188068395
Episode time:  1.9350345134735107

Episode Reward: 1.0
Step 237 (213102) @ Episode 876/10000, loss: 0.00010447549720993266
Episode time:  1.967808723449707

Episode Reward: 1.0
Step 258 (213360) @ Episode 877/10000, loss: 0.00012178663018858065
Episode time:  2.224397659301758

Episode Reward: 2.0
Step 178 (213538) @ Episode 878/10000, loss: 0.00022049547987990086
Episode time:  1.4631128311157227

Episode Reward: 0.0
Step 316 (213854) @ Episode 879/10000, loss: 0.00011244054621784016
Episode time:  2.592271566390991

Episode Reward: 3.0
Step 181 (214035) @ Episode 880/10000, loss: 5.576618787017651e-057
Episode time:  1.514509677886963

Episode Reward: 0.0
Step 235 (214270) @ Episode 881/10000, loss: 0.00030519580468535423
Episode time:  1.946953535079956

Episode Reward: 1.0
Step 216 (214486) @ E

Step 276 (228872) @ Episode 941/10000, loss: 0.00060213258257135757
Episode time:  2.2688980102539062

Episode Reward: 2.0
Step 438 (229310) @ Episode 942/10000, loss: 0.00012476465781219304
Episode time:  3.5506820678710938

Episode Reward: 4.0
Step 180 (229490) @ Episode 943/10000, loss: 0.00010820443276315928
Episode time:  1.4653422832489014

Episode Reward: 0.0
Step 262 (229752) @ Episode 944/10000, loss: 0.00036374205956235536
Episode time:  2.1498122215270996

Episode Reward: 2.0
Step 197 (229949) @ Episode 945/10000, loss: 0.00052299944218248136
Episode time:  1.6005010604858398

Episode Reward: 0.0
Step 165 (230114) @ Episode 946/10000, loss: 0.00011503857967909425
Episode time:  1.4284825325012207

Episode Reward: 0.0
Step 421 (230535) @ Episode 947/10000, loss: 0.00039528700290247868
Episode time:  3.485931158065796

Episode Reward: 4.0
Step 172 (230707) @ Episode 948/10000, loss: 8.400627120863646e-057
Episode time:  1.4428153038024902

Episode Reward: 0.0
Step 280 (230987)

[2017-06-21 14:33:52,051] Starting new video recorder writing to /home/jason/reinforcement-learning/DQN/experiments/Breakout-v0/monitor/openaigym.video.0.947.video001000.mp4


Step 232 (243078) @ Episode 1001/10000, loss: 0.00011191579687874764
Episode time:  2.0589687824249268

Episode Reward: 1.0
Step 268 (243346) @ Episode 1002/10000, loss: 0.00023137171228881925
Episode time:  2.1791136264801025

Episode Reward: 2.0
Step 251 (243597) @ Episode 1003/10000, loss: 0.00121603952720761312
Episode time:  2.0992634296417236

Episode Reward: 1.0
Step 306 (243903) @ Episode 1004/10000, loss: 0.00034874092671088874
Episode time:  2.5029489994049072

Episode Reward: 3.0
Step 189 (244092) @ Episode 1005/10000, loss: 0.00011907457519555464
Episode time:  1.5946288108825684

Episode Reward: 0.0
Step 259 (244351) @ Episode 1006/10000, loss: 0.00050559669034555553
Episode time:  2.170520782470703

Episode Reward: 1.0
Step 175 (244526) @ Episode 1007/10000, loss: 0.00073003291618078955
Episode time:  1.4512474536895752

Episode Reward: 0.0
Step 433 (244959) @ Episode 1008/10000, loss: 0.00050191010814160114
Episode time:  3.5479838848114014

Episode Reward: 4.0
Step 216 

Step 229 (277555) @ Episode 1133/10000, loss: 0.00034988910192623734
Episode time:  1.867152452468872

Episode Reward: 1.0
Step 279 (277834) @ Episode 1134/10000, loss: 9.582270286045969e-052
Episode time:  2.31048583984375

Episode Reward: 2.0
Step 344 (278178) @ Episode 1135/10000, loss: 0.00045744740054942674
Episode time:  2.8208038806915283

Episode Reward: 3.0
Step 389 (278567) @ Episode 1136/10000, loss: 0.00078899634536355734
Episode time:  3.1677260398864746

Episode Reward: 4.0
Step 233 (278800) @ Episode 1137/10000, loss: 0.00120959384366869936
Episode time:  1.9506144523620605

Episode Reward: 1.0
Step 185 (278985) @ Episode 1138/10000, loss: 0.00951975304633379974
Episode time:  1.554614543914795

Episode Reward: 0.0
Step 171 (279156) @ Episode 1139/10000, loss: 0.00040964037179946902
Episode time:  1.470191478729248

Episode Reward: 0.0
Step 167 (279323) @ Episode 1140/10000, loss: 0.00111667322926223283
Episode time:  1.463719367980957

Episode Reward: 0.0
Step 277 (2796

Step 356 (317453) @ Episode 1265/10000, loss: 0.00144409947097301484
Episode time:  2.9180748462677

Episode Reward: 4.0
Step 190 (317643) @ Episode 1266/10000, loss: 0.00052317051449790677
Episode time:  1.6221747398376465

Episode Reward: 0.0
Step 418 (318061) @ Episode 1267/10000, loss: 0.00223793671466410166
Episode time:  3.4730637073516846

Episode Reward: 4.0
Step 279 (318340) @ Episode 1268/10000, loss: 0.00095064286142587667
Episode time:  2.2863874435424805

Episode Reward: 2.0
Step 345 (318685) @ Episode 1269/10000, loss: 0.00054492906201630838
Episode time:  2.952160358428955

Episode Reward: 3.0
Step 478 (319163) @ Episode 1270/10000, loss: 0.00054083205759525362
Episode time:  3.9625611305236816

Episode Reward: 6.0
Step 308 (319471) @ Episode 1271/10000, loss: 0.00133706862106919296
Episode time:  2.6014597415924072

Episode Reward: 2.0
Step 319 (319790) @ Episode 1272/10000, loss: 0.00221121311187744147
Episode time:  2.606674909591675

Episode Reward: 3.0
Step 259 (320

Step 548 (371716) @ Episode 1397/10000, loss: 0.00123993912711739544
Episode time:  4.473617076873779

Episode Reward: 5.0
Step 483 (372199) @ Episode 1398/10000, loss: 0.00081123539712280044
Episode time:  3.964867115020752

Episode Reward: 7.0
Step 493 (372692) @ Episode 1399/10000, loss: 0.00180198438465595253
Episode time:  4.129009485244751

Episode Reward: 5.0
Step 519 (373211) @ Episode 1400/10000, loss: 0.00099310476798564237
Episode time:  4.219712972640991

Episode Reward: 6.0
Step 394 (373605) @ Episode 1401/10000, loss: 0.00409443490207195375
Episode time:  3.2662200927734375

Episode Reward: 4.0
Step 474 (374079) @ Episode 1402/10000, loss: 0.00120954273734241724
Episode time:  3.930798292160034

Episode Reward: 5.0
Step 294 (374373) @ Episode 1403/10000, loss: 0.00079533003736287363
Episode time:  2.4207704067230225

Episode Reward: 3.0
Step 453 (374826) @ Episode 1404/10000, loss: 0.00177026982419192865
Episode time:  3.723527431488037

Episode Reward: 5.0
Step 440 (3752

Step 794 (461726) @ Episode 1529/10000, loss: 0.00121385673992335865
Episode time:  6.475345849990845

Episode Reward: 14.0
Step 940 (462666) @ Episode 1530/10000, loss: 0.00324797187931835654
Episode time:  7.669115304946899

Episode Reward: 19.0
Step 1181 (463847) @ Episode 1531/10000, loss: 0.00224665296263992857
Episode time:  9.591545104980469

Episode Reward: 21.0
Step 925 (464772) @ Episode 1532/10000, loss: 0.00086443597683683045
Episode time:  7.556511878967285

Episode Reward: 20.0
Step 945 (465717) @ Episode 1533/10000, loss: 0.00147589237894862977
Episode time:  7.666053771972656

Episode Reward: 23.0
Step 1090 (466807) @ Episode 1534/10000, loss: 0.0012897185515612364
Episode time:  8.804104566574097

Episode Reward: 23.0
Step 751 (467558) @ Episode 1535/10000, loss: 0.0008303842623718083
Episode time:  6.145002126693726

Episode Reward: 12.0
Step 926 (468484) @ Episode 1536/10000, loss: 0.00304828933440148836
Episode time:  7.5900750160217285

Episode Reward: 14.0
Step 98

Step 1204 (602929) @ Episode 1661/10000, loss: 0.0020331749692559242
Episode time:  10.312902212142944

Episode Reward: 22.0
Step 955 (603884) @ Episode 1662/10000, loss: 0.0041888272389769552
Episode time:  8.143864393234253

Episode Reward: 17.0
Step 894 (604778) @ Episode 1663/10000, loss: 0.00343843479640781965
Episode time:  7.734127521514893

Episode Reward: 15.0
Step 1019 (605797) @ Episode 1664/10000, loss: 0.0014492591144517064
Episode time:  8.761394262313843

Episode Reward: 20.0
Step 1331 (607128) @ Episode 1665/10000, loss: 0.0012776169460266829
Episode time:  11.498779296875

Episode Reward: 30.0
Step 1162 (608290) @ Episode 1666/10000, loss: 0.0019783438183367252
Episode time:  9.982787370681763

Episode Reward: 27.0
Step 1233 (609523) @ Episode 1667/10000, loss: 0.0015934640541672707
Episode time:  10.682124614715576

Episode Reward: 24.0
Step 1337 (610860) @ Episode 1668/10000, loss: 0.0012013043742626905
Episode time:  11.456190347671509

Episode Reward: 41.0
Step 668