In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import tensorflow as tf
import numpy as np
import tensorflow.contrib.layers as layers

In [3]:
import datetime
import os
import random

from collections import deque, namedtuple

In [4]:
from environment import CubeEnvironment

## Experience replay

In [12]:
class PrioritizedExperienceReplay():
    '''
    Fixed-capacity replay memory with priority-proportional sampling.

    alpha: tradeoff between sampling high priority transitions and random sampling
    beta: used to compute importance sampling weights, increased from beta_start to 1.0 over the course of beta_steps
    TL;DR of PER: https://medium.com/arxiv-bytes/summary-prioritized-experience-replay-e5f9257cef2d

    Priorities are stored already raised to the power alpha. New transitions
    receive the current maximum priority; this class has no method that changes
    a priority afterwards, so until one is added all stored priorities are equal.
    '''

    def __init__(self, max_size=500000, alpha=0.6, beta_start=0.4, beta_steps=100000):
        '''
        max_size: capacity; once reached, the oldest transitions are overwritten
        alpha: priority exponent
        beta_start: initial importance-sampling exponent
        beta_steps: number of `sample` calls over which beta grows linearly to 1.0
        '''
        self._min_size = 10
        self.max_size = max_size
        self.replay_memory = []
        self.priorities = np.zeros((max_size,), dtype=np.float32)

        self.alpha = alpha
        self.beta = beta_start
        self.beta_incr = (1.0 - beta_start) / beta_steps

        # next slot to write; wraps around once the memory is full
        self.index = 0
        # seed slot 0 so the first `insert` finds a non-zero maximum priority
        self.priorities[0] = 1.0**alpha

    @property
    def min_size(self):
        '''
        number of transitions used to pre-populate the memory
        '''
        return self._min_size

    def update_beta(self):
        '''
        move beta one increment towards 1.0 (never beyond)
        '''
        self.beta = min(1.0, self.beta + self.beta_incr)

    def get_probabilities(self):
        '''
        turn current priorities in probabilities

        Returns a float64 array with one entry per stored transition that sums
        to 1, or an empty array when nothing has been stored yet.
        '''
        # Every slot below `size` holds a valid transition, also once the memory
        # is full and `self.index` has wrapped around.
        size = len(self.replay_memory)
        if size == 0:
            return np.zeros((0,), dtype=np.float64)

        # normalise in float64 so the result passes np.random.choice's sum check
        prios = self.priorities[:size].astype(np.float64)
        return prios / prios.sum()

    def insert(self, transition):
        '''
        store a transition with the current maximum priority, overwriting the
        oldest entry when the memory is full
        '''
        # add the transition to the memory
        if len(self.replay_memory) < self.max_size:
            self.replay_memory.append(transition)
        else:
            self.replay_memory[self.index] = transition

        # update priorities and index
        self.priorities[self.index] = self.priorities.max()
        self.index = (self.index + 1) % self.max_size

    def sample(self, batch_size):
        '''
        sample a batch of transitions

        Returns (samples, is_weights): a list of `batch_size` transitions drawn
        with replacement according to their probabilities, and the matching
        importance-sampling weights scaled into (0, 1].
        Raises ValueError when the memory is empty.
        '''
        current_size = len(self.replay_memory)
        if current_size == 0:
            raise ValueError("cannot sample from an empty replay memory")

        # samples
        probs = self.get_probabilities()
        indices = np.random.choice(current_size, batch_size, p=probs)
        samples = [self.replay_memory[i] for i in indices]

        # importance sampling weights: w_i = (N * P(i)) ** -beta
        # the least likely transition has the largest weight; dividing by it
        # ensures no weight is > 1
        max_weight = (current_size * probs.min()) ** (-self.beta)
        is_weights = (current_size * probs[indices]) ** (-self.beta)
        is_weights /= max_weight

        self.update_beta()
        return samples, is_weights
        

In [48]:
def make_epsilon_greedy_policy(estimator, nb_actions):
    """
    Build an epsilon-greedy policy on top of a Q-value estimator.

    estimator: object whose predict(sess, batch) returns Q-values for a batch of states
    nb_actions: number of possible actions in the environment

    Returns:
        A function taking (sess, observed_state, epsilon) and returning a numpy
        array of length nb_actions with the probability of picking each action.

    """
    def policy_fn(sess, observed_state, epsilon):
        # every action first gets an equal share of the exploration mass epsilon
        probabilities = np.ones(nb_actions, dtype=float) * epsilon / nb_actions

        # add a leading batch axis, then an axis at position 3 (the trailing
        # one for a 2-D state) before querying the estimator
        batched_state = np.expand_dims(observed_state, 0)
        network_input = np.expand_dims(batched_state, 3)
        state_q_values = estimator.predict(sess, network_input)[0]

        # the remaining (1 - epsilon) goes to the highest-valued action
        greedy_action = np.argmax(state_q_values)
        probabilities[greedy_action] += (1.0 - epsilon)
        return probabilities

    return policy_fn


def greedy_policy(sess, estimator, observed_state, nb_actions, epsilon):
    """
    Action probabilities for one observed state.

    Despite the name, the result is epsilon-greedy: each action gets
    epsilon / nb_actions and the action with the highest estimated Q-value
    gets an additional (1 - epsilon).

    sess: session handed through to estimator.predict
    estimator: object whose predict(sess, batch) returns Q-values for a batch of states
    observed_state: single state; a batch axis and an axis at position 3 are added
    nb_actions: number of possible actions in the environment
    epsilon: probability mass spread uniformly over all actions

    Returns:
        numpy array of length nb_actions.
    """
    probabilities = np.ones(nb_actions, dtype=float) * epsilon / nb_actions

    batched_state = np.expand_dims(observed_state, 0)
    network_input = np.expand_dims(batched_state, 3)
    state_q_values = estimator.predict(sess, network_input)[0]

    greedy_action = np.argmax(state_q_values)
    probabilities[greedy_action] += (1.0 - epsilon)
    return probabilities

## Estimator

In [7]:
class Estimator():
    '''
    Q-Value Estimator neural network.

    This network is used for both the Q-Network and the Target Network;
    each instance builds its own copy of the graph under its own variable scope.
    '''

    def __init__(self, actions, scope="estimator"):
        '''
        actions: sequence of valid actions; its length sets the number of network outputs
        scope: variable scope under which the graph is built
        '''
        self.valid_actions = actions
        self.scope = scope

        with tf.variable_scope(scope):
            # Build the graph
            self._build_model()

    def _build_model(self):
        '''
        Builds the graph: three [1, 3] convolutions over the state, one hidden
        fully connected layer and a linear output with one Q-value per action,
        plus the squared-error loss and the RMSProp training op.
        '''

        # Placeholders for our input
        self.X_pl = tf.placeholder(shape=[None, 1, 54, 1], dtype=tf.uint8, name="X")
        # The TD target value
        self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
        # Integer id of which action was selected
        self.actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

        batch_size = tf.shape(self.X_pl)[0]
        X = tf.to_float(self.X_pl)

        # conv
        conv1 = layers.conv2d(X, 32, [1, 3], activation_fn=tf.nn.relu)
        conv2 = layers.conv2d(conv1, 64, [1, 3], activation_fn=tf.nn.relu)
        conv3 = layers.conv2d(conv2, 64, [1, 3], activation_fn=tf.nn.relu)

        # fc
        flattened = layers.flatten(conv3)
        fc1 = layers.fully_connected(flattened, 512)
        # The output layer must be linear: fully_connected applies ReLU by
        # default, which would make negative Q-values unrepresentable.
        self.predictions = layers.fully_connected(
            fc1, len(self.valid_actions), activation_fn=None)

        # Get the predictions for the chosen actions only: flatten the
        # [batch, actions] matrix and pick element row * n_actions + action
        gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl
        self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)

        # Calculate the loss
        self.losses = tf.squared_difference(self.y_pl, self.action_predictions)
        self.loss = tf.reduce_mean(self.losses)

        # Optimizer Parameters from original paper
        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=0.00025, decay=0.99, momentum=0.0, epsilon=1e-6)
        self.train_op = self.optimizer.minimize(self.loss, global_step=tf.train.get_global_step())

    def predict(self, sess, s):
        """
        Predicts action values.

        Args:
          sess: Tensorflow session
          s: State input of shape [batch_size, 1, 54, 1]

        Returns:
          Array of shape [batch_size, len(valid_actions)] containing the
          estimated action values.
        """
        return sess.run(self.predictions, { self.X_pl: s })

    def update(self, sess, s, a, y):
        """
        Updates the estimator towards the given targets.

        Args:
          sess: Tensorflow session object
          s: State input of shape [batch_size, 1, 54, 1]
          a: Chosen actions of shape [batch_size]
          y: Targets of shape [batch_size]

        Returns:
          The calculated loss on the batch.
        """
        feed_dict = { self.X_pl: s, self.y_pl: y, self.actions_pl: a }
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss

## Main DQN function

In [52]:
def learning(sess,
             env,
             epsilon_start=1.0,
             epsilon_end=0.1,
             epsilon_decay_steps=500000):
    '''
    Set up DQN training on `env` and start the episode loop.

    As visible here the function creates the experiment directory, builds the
    online and target estimators, restores a checkpoint if one is found,
    pre-populates the replay memory and enters the episode loop. The per-step
    body of that loop only computes epsilon; nothing is returned or yielded.

    sess: TensorFlow session used to initialise, restore and evaluate the graph
    env: environment; get_action_space() and init_cube() are called on it here
    epsilon_start: first value of the linear epsilon schedule
    epsilon_end: last value of the linear epsilon schedule
    epsilon_decay_steps: number of points in the schedule

    NOTE(review): `num_episodes`, `checkpoint_path`, `total_t` and `itertools`
    are used below but are not defined or imported anywhere in the visible
    notebook -- confirm where they are supposed to come from.
    '''
    
    # Local to this function, so code outside `learning` cannot see this binding.
    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    
    # saving things (from docker)
    # NOTE(review): the directory name carries a minute-resolution timestamp, so
    # presumably each run gets a fresh, empty checkpoint directory -- confirm
    # that the checkpoint lookup below can ever find anything.
    exp_dir = "/workspace/experiments/exp%s" % datetime.datetime.now().strftime("%Y%m%d%H%M")
    checkpoint_dir = os.path.join(exp_dir, "checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    valid_actions = env.get_action_space()
    
    # estimators
    q_estimator = Estimator(valid_actions, scope="q_estimator")
    target_estimator = Estimator(valid_actions, scope="target_q")
    # initialise the variables created by the two estimators above
    sess.run(tf.global_variables_initializer())
    
    saver = tf.train.Saver()
    
    # loading checkpoint
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint %s ..." % latest_checkpoint)
        saver.restore(sess, latest_checkpoint)
        
    # the policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(valid_actions))
    
    # epsilon decay schedule for epsilon-greedy policy
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
    
    # replay memory
    replay = PrioritizedExperienceReplay()
    global_step = sess.run(tf.train.get_global_step())
    # clamp to the last schedule entry once the global step exceeds the schedule
    epsilon = epsilons[min(global_step, epsilon_decay_steps-1)]
    populate_replay(replay, sess, env, q_estimator, epsilon, len(valid_actions))
    
    # main loop for episodes
    # NOTE(review): `num_episodes` is not a parameter and is not defined in view.
    for i_episode in range(num_episodes):
        print("Starting episode %s" % i_episode)
        
        # save checkpoint for each episode
        # NOTE(review): `checkpoint_path` is not defined in view; this also uses
        # the default session rather than the `sess` argument.
        saver.save(tf.get_default_session(), checkpoint_path)
        
        state = env.init_cube()
        
        # one step in the environment
        # NOTE(review): unbounded counter with no break in the visible body, and
        # `itertools` is not imported in the visible cells.
        for t in itertools.count():

            # Epsilon for this time step
            # NOTE(review): `total_t` is never assigned in the visible code.
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

In [53]:
# One step of experience as stored in the replay memory. Defined at module
# level because populate_replay looks the name up globally.
Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])


def populate_replay(replay, sess, env, q_estimator, epsilon, nb_actions):
    '''
    populate replay memory with initial experience 
    with more and more distance from the solved cube

    Takes replay.min_size steps in the environment, choosing each action from
    the distribution returned by greedy_policy, and inserts every step into
    the replay memory as a Transition.

    replay: replay memory providing min_size and insert(transition)
    sess: TensorFlow session handed through to greedy_policy
    env: environment providing init_cube() and take_action(action)
    q_estimator: Q-value estimator handed through to greedy_policy
    epsilon: exploration rate handed through to greedy_policy
    nb_actions: number of possible actions in the environment
    '''
    print("Populating experience replay...")
    state = env.init_cube()

    for _ in range(replay.min_size):
        action_probs = greedy_policy(sess, q_estimator, state, nb_actions, epsilon)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, solved = env.take_action(action)

        replay.insert(Transition(state, action, reward, next_state, solved))

        # we start again if the cube was solved
        state = env.init_cube() if solved else next_state

In [54]:
def copy_model_parameters(sess, estimator1, estimator2):
    '''
    Copy the parameters from estimator 1 to estimator 2

    Trainable variables are matched by sorting each estimator's variables
    (those whose name starts with the estimator's scope) by name and pairing
    them up in that order; all assignments run in a single sess.run call.
    '''
    def scoped_variables(estimator):
        # trainable variables whose name begins with the estimator's scope, ordered by name
        matching = [var for var in tf.trainable_variables() if var.name.startswith(estimator.scope)]
        matching.sort(key=lambda var: var.name)
        return matching

    source_vars = scoped_variables(estimator1)
    target_vars = scoped_variables(estimator2)

    assign_ops = [target.assign(source) for source, target in zip(source_vars, target_vars)]
    sess.run(assign_ops)

## Main entrypoint

In [55]:
# Start from a fresh default graph so that re-running this cell does not add
# to the graph built by a previous run.
tf.reset_default_graph()

env = CubeEnvironment()

# global step variable
# Created before learning() builds the estimators, which look it up through
# tf.train.get_global_step().
global_step = tf.Variable(0, name='global_step', trainable=False)

# main run
with tf.Session() as sess:
    # Only variables that exist at this point are initialised here; learning()
    # runs the initializer again after building the estimators.
    sess.run(tf.global_variables_initializer())
    # NOTE(review): this loop expects learning() to yield (t, stats) pairs, but
    # the visible definition of learning() contains no yield or return -- confirm.
    for t, stats in learning(sess,
                             env):
        print(stats)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Populating experience replay...


FailedPreconditionError: Attempting to use uninitialized value q_estimator/Conv/weights
	 [[Node: q_estimator/Conv/weights/read = Identity[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](q_estimator/Conv/weights)]]

Caused by op 'q_estimator/Conv/weights/read', defined at:
  File "/usr/local/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/local/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 497, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 151, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 433, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 465, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 407, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/usr/local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2901, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-55-fb4e113a5beb>", line 12, in <module>
    env):
  File "<ipython-input-52-96e962e3c4b9>", line 17, in learning
    q_estimator = Estimator(valid_actions, scope="q_estimator")
  File "<ipython-input-7-57354a74934f>", line 14, in __init__
    self._build_model()
  File "<ipython-input-7-57354a74934f>", line 32, in _build_model
    conv1 = layers.conv2d(X, 32, [1,3] , activation_fn=tf.nn.relu)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 183, in func_with_args
    return func(*args, **current_args)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1154, in convolution2d
    conv_dims=2)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 183, in func_with_args
    return func(*args, **current_args)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1057, in convolution
    outputs = layer.apply(inputs)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py", line 805, in apply
    return self.__call__(inputs, *args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/layers/base.py", line 362, in __call__
    outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py", line 728, in __call__
    self.build(input_shapes)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/keras/layers/convolutional.py", line 161, in build
    dtype=self.dtype)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/layers/base.py", line 276, in add_weight
    getter=vs.get_variable)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py", line 565, in add_weight
    aggregation=aggregation)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/training/checkpointable/base.py", line 535, in _add_variable_with_custom_getter
    **kwargs_for_getter)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 1467, in get_variable
    aggregation=aggregation)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 1217, in get_variable
    aggregation=aggregation)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 510, in get_variable
    return custom_getter(**custom_getter_kwargs)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1744, in layer_variable_getter
    return _model_variable_getter(getter, *args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1735, in _model_variable_getter
    use_resource=use_resource)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 183, in func_with_args
    return func(*args, **current_args)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/framework/python/ops/variables.py", line 297, in model_variable
    use_resource=use_resource)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 183, in func_with_args
    return func(*args, **current_args)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/framework/python/ops/variables.py", line 252, in variable
    use_resource=use_resource)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 481, in _true_getter
    aggregation=aggregation)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 903, in _get_single_variable
    aggregation=aggregation)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 2443, in variable
    aggregation=aggregation)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 2425, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 2406, in default_variable_creator
    constraint=constraint)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 259, in __init__
    constraint=constraint)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 422, in _init_from_args
    self._snapshot = array_ops.identity(self._variable, name="read")
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 80, in identity
    return gen_array_ops.identity(input, name=name)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 3264, in identity
    "Identity", input=input, name=name)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 454, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3155, in create_op
    op_def=op_def)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1717, in __init__
    self._traceback = tf_stack.extract_stack()

FailedPreconditionError (see above for traceback): Attempting to use uninitialized value q_estimator/Conv/weights
	 [[Node: q_estimator/Conv/weights/read = Identity[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](q_estimator/Conv/weights)]]


In [None]:
# Debug helper: display the module search path of the running kernel.
# NOTE(review): presumably used to check that `environment` is importable -- confirm, or remove the cell.
import sys
sys.path