In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import tensorflow as tf
import numpy as np
import tensorflow.contrib.layers as layers

In [3]:
import datetime
import os
import random

from collections import deque, namedtuple

In [59]:
from environment import CubeEnvironment

In [61]:
Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

## Experience replay

In [75]:
class PrioritizedExperienceReplay():
    '''
    alpha: tradeoff between sampling high priority transitions and random sampling
    beta: used to compute importance sampling weights, increased from beta_start to 1.0 over the course of beta_steps
    TL;DR of PER: https://medium.com/arxiv-bytes/summary-prioritized-experience-replay-e5f9257cef2d
    '''
    
    def __init__(self, max_size=500000, alpha=0.6, beta_start=0.4, beta_steps=100000):
        self._min_size = 10
        self.max_size = max_size
        self.replay_memory = []
        self.priorities = np.zeros((max_size,), dtype=np.float32)
        
        self.alpha = alpha
        self.beta = beta_start
        self.beta_incr = (1.0 - beta_start) / beta_steps
        
        self.index = 0
        self.priorities[0] = 1.0**alpha  # init the first max prob
        
    @property
    def min_size(self):
        return self._min_size
        
    def update_beta(self):
        self.beta = min(1.0, self.beta + self.beta_incr)
        
    def get_probabilities(self):
        '''
        turn current priorities in probabilities
        '''
        size = len(self.replay_memory)
        end_index = size if size < self.max_size else self.index
        
        prios = self.priorities[:end_index]
        probs = prios / prios.sum()
        return probs
        
    def insert(self, transition):
        
        # add the transition to the memory
        if len(self.replay_memory) < self.max_size:
            self.replay_memory.append(transition)
        else:
            self.replay_memory[self.index] = transition
            
        # update priorities and index
        self.priorities[self.index] = self.priorities.max()
        self.index = (self.index + 1) % self.max_size
        
    def sample(self, batch_size):
        '''
        sample a batch of transitions
        '''
        current_size = len(self.replay_memory)
        
        # samples
        probs = self.get_probabilities()
        indices = np.random.choice(current_size, batch_size, p=probs)
        samples = [self.replay_memory[i] for i in indices]
        #samples = random.sample(self.replay_memory, batch_size)
        
        # importance sampling weights
        prob_min = probs.min()
        max_weight = (prob_min * current_size)**(-self.beta)

        is_weights  = (current_size * probs[indices]) ** (-self.beta)
        is_weights /= max_weight  # to ensure it is not > 1
        
        self.update_beta()
        return samples, is_weights
    
    def __len__(self):
        return len(self.replay_memory)
        

In [48]:
def make_epsilon_greedy_policy(estimator, nb_actions):
    """
    estimator: q_values estimator for a given state
    nb_actions: number of possible actions in the environment

    Returns:
        A function that takes the (sess, state, epsilon) as an argument and returns
        the probabilities for each action in the form of a numpy array of length nb_actions.

    """
    def policy_fn(sess, observed_state, epsilon):
        action_probs = np.ones(nb_actions, dtype=float) * epsilon / nb_actions
        q_values = estimator.predict(sess, np.expand_dims(np.expand_dims(observed_state, 0), 3))[0]
        best_action = np.argmax(q_values)
        
        action_probs[best_action] += (1.0 - epsilon)
        return action_probs
    
    return policy_fn


def greedy_policy(sess, estimator, observed_state, nb_actions, epsilon):
    action_probs = np.ones(nb_actions, dtype=float) * epsilon / nb_actions
    q_values = estimator.predict(sess, np.expand_dims(np.expand_dims(observed_state, 0), 3))[0]
    best_action = np.argmax(q_values)
        
    action_probs[best_action] += (1.0 - epsilon)
    return action_probs

## Estimator

In [7]:
class Estimator():
    '''
    Q-Value Estimator neural network.

    This network is used for both the Q-Network and the Target Network.
    '''

    def __init__(self, actions, scope="estimator"):
        self.valid_actions = actions
        self.scope = scope

        with tf.variable_scope(scope):
            # Build the graph
            self._build_model()

    def _build_model(self):
        '''
        TODO at like wn
        '''

        # Placeholders for our input
        self.X_pl = tf.placeholder(shape=[None, 1, 54, 1], dtype=tf.uint8, name="X")
        # The TD target value
        self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
        # Integer id of which action was selected
        self.actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

        batch_size = tf.shape(self.X_pl)[0]
        X = tf.to_float(self.X_pl)
        
        # conv
        conv1 = layers.conv2d(X, 32, [1,3] , activation_fn=tf.nn.relu)
        conv2 = layers.conv2d(conv1, 64, [1,3] , activation_fn=tf.nn.relu)
        conv3 = layers.conv2d(conv2, 64, [1,3] , activation_fn=tf.nn.relu)

        # fc
        flattened = tf.contrib.layers.flatten(conv3)
        fc1 = tf.contrib.layers.fully_connected(flattened, 512)
        self.predictions = tf.contrib.layers.fully_connected(fc1, len(self.valid_actions))

        # Get the predictions for the chosen actions only
        gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl
        self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)

        # Calculate the loss
        self.losses = tf.squared_difference(self.y_pl, self.action_predictions)
        self.loss = tf.reduce_mean(self.losses)

        # Optimizer Parameters from original paper
        self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.train_op = self.optimizer.minimize(self.loss, global_step=tf.train.get_global_step())
        
    def predict(self, sess, s):
        """
        Predicts action values.

        Args:
          sess: Tensorflow session
          s: State input of shape [batch_size, 4, 160, 160, 3]

        Returns:
          Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated 
          action values.
        """
        return sess.run(self.predictions, { self.X_pl: s })

    def update(self, sess, s, a, y):
        """
        Updates the estimator towards the given targets.

        Args:
          sess: Tensorflow session object
          s: State input of shape [batch_size, 4, 160, 160, 3]
          a: Chosen actions of shape [batch_size]
          y: Targets of shape [batch_size]

        Returns:
          The calculated loss on the batch.
        """
        feed_dict = { self.X_pl: s, self.y_pl: y, self.actions_pl: a }
        global_step, _, loss = sess.run(
            [tf.train.get_global_step(), self.train_op, self.loss],
            feed_dict)
        return loss

## Main DQN function

In [58]:
def learning(sess,
             env,
             epsilon_start=1.0,
             epsilon_end=0.1,
             epsilon_decay_steps=500000):
    
    # saving things (from docker)
    exp_dir = "/workspace/experiments/exp%s" % datetime.datetime.now().strftime("%Y%m%d%H%M")
    checkpoint_dir = os.path.join(exp_dir, "checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    valid_actions = env.get_action_space()
    print("VALID ACTIONS: %s" % str(valid_actions))
    
    # estimators
    q_estimator = Estimator(valid_actions, scope="q_estimator")
    target_estimator = Estimator(valid_actions, scope="target_q")
    sess.run(tf.global_variables_initializer())
    
    saver = tf.train.Saver()
    
    # loading checkpoint
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint %s ..." % latest_checkpoint)
        saver.restore(sess, latest_checkpoint)
        
    # the policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(valid_actions))
    
    # epsilon decay schedule for epsilon-greedy policy
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
    
    # replay memory
    replay = PrioritizedExperienceReplay()
    global_step = sess.run(tf.train.get_global_step())
    epsilon = epsilons[min(global_step, epsilon_decay_steps-1)]
    populate_replay(replay, sess, env, q_estimator, epsilon, len(valid_actions))
    
    # main loop for episodes
    for i_episode in range(num_episodes):
        print("Starting episode %s" % i_episode)
        
        # save checkpoint for each episode
        saver.save(tf.get_default_session(), checkpoint_path)
        
        state = env.init_cube()
        
        # one step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

In [67]:
def populate_replay(replay, sess, env, q_estimator, epsilon, nb_actions):
    '''
    populate replay memory with initial experience 
    with more and more distance from the solved cube
    '''
    print("Populating experience replay...")
    state = env.init_cube()

    for i in range(replay.min_size):
        action_probs = greedy_policy(sess, q_estimator, state, nb_actions, epsilon)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, solved = env.take_action(action)

        replay.insert(Transition(state, action, reward, next_state, solved))
        
        # we start again if the cube was solved
        state = env.init_cube() if solved else next_state
        
    print("Experience replay populated with %s transitions." % len(replay))
    
    # testing
    print(replay.sample(10))

In [54]:
def copy_model_parameters(sess, estimator1, estimator2):
    '''
    Copy the parameters from estimator 1 to estimator 2
    '''
    e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]
    e1_params = sorted(e1_params, key=lambda v: v.name)
    e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]
    e2_params = sorted(e2_params, key=lambda v: v.name)

    update_ops = []
    for e1_v, e2_v in zip(e1_params, e2_params):
        op = e2_v.assign(e1_v)
        update_ops.append(op)
        
    sess.run(update_ops)

## Main entrypoint

In [76]:
tf.reset_default_graph()

env = CubeEnvironment()

# global step variable
global_step = tf.Variable(0, name='global_step', trainable=False)

# main run
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for t, stats in learning(sess,
                             env):
        print(stats)

VALID ACTIONS: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Populating experience replay...
Experience replay populated with 10 transitions.
([Transition(state=matrix([[4, 5, 1, 0, 4, 1, 2, 2, 3, 2, 3, 0, 1, 0, 3, 2, 3, 4, 5, 0, 5,
         2, 5, 5, 4, 1, 0, 5, 5, 4, 2, 2, 4, 5, 2, 2, 0, 4, 1, 5, 1, 3,
         0, 0, 3, 1, 4, 1, 0, 3, 1, 3, 4, 3]]), action=2, reward=-1, next_state=matrix([[2, 5, 1, 1, 4, 1, 2, 2, 3, 0, 3, 0, 5, 0, 3, 5, 3, 4, 5, 0, 5,
         2, 5, 2, 4, 1, 5, 4, 5, 4, 0, 2, 4, 2, 2, 2, 0, 4, 1, 5, 1, 3,
         0, 0, 3, 3, 0, 1, 4, 3, 4, 3, 1, 1]]), done=False), Transition(state=matrix([[5, 5, 1, 2, 3, 1, 2, 2, 0, 4, 2, 1, 0, 0, 3, 3, 3, 4, 5, 3, 2,
         0, 1, 1, 2, 4, 4, 5, 5, 1, 2, 2, 1, 0, 0, 3, 0, 4, 2, 0, 4, 3,
         5, 5, 4, 1, 5, 0, 4, 5, 1, 3, 4, 3]]), action=8, reward=-1, next_state=matrix([[5, 5, 1, 2, 3, 1, 3, 4, 3, 4, 2, 1, 0, 0, 3, 3, 3, 4, 5, 3, 2,
         0, 1, 1, 5, 5, 4, 0, 2, 5, 0, 2, 5, 3, 1, 1, 0, 4, 2, 0, 4, 3,
         2, 2, 0, 1, 5, 0, 4, 5, 1, 2, 4, 4]]), done=False), Transition(state=matrix([

NameError: name 'num_episodes' is not defined

## Debug

In [None]:
import sys
sys.path

In [66]:
replay.sample()

NameError: name 'replay' is not defined