In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import tensorflow as tf
import numpy as np
import tensorflow.contrib.layers as layers

In [3]:
import datetime
import os
import random

from collections import deque, namedtuple

In [36]:
from environment import CubeEnvironment
from replay import PrioritizedExperienceReplay

In [5]:
Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

## Policy

In [7]:
def make_epsilon_greedy_policy(estimator, nb_actions):
    """
    estimator: q_values estimator for a given state
    nb_actions: number of possible actions in the environment

    Returns:
        A function that takes the (sess, state, epsilon) as an argument and returns
        the probabilities for each action in the form of a numpy array of length nb_actions.

    """
    def policy_fn(sess, observed_state, epsilon):
        action_probs = np.ones(nb_actions, dtype=float) * epsilon / nb_actions
        q_values = estimator.predict(sess, np.expand_dims(np.expand_dims(observed_state, 0), 3))[0]
        best_action = np.argmax(q_values)
        
        action_probs[best_action] += (1.0 - epsilon)
        return action_probs
    
    return policy_fn


def greedy_policy(sess, estimator, observed_state, nb_actions, epsilon):
    action_probs = np.ones(nb_actions, dtype=float) * epsilon / nb_actions
    q_values = estimator.predict(sess, np.expand_dims(np.expand_dims(observed_state, 0), 3))[0]
    best_action = np.argmax(q_values)
        
    action_probs[best_action] += (1.0 - epsilon)
    return action_probs

## Estimator

In [8]:
class Estimator():
    '''
    Q-Value Estimator neural network.

    This network is used for both the Q-Network and the Target Network.
    '''

    def __init__(self, actions, scope="estimator"):
        self.valid_actions = actions
        self.scope = scope

        with tf.variable_scope(scope):
            # Build the graph
            self._build_model()

    def _build_model(self):
        '''
        TODO at like wn
        '''

        # Placeholders for our input
        self.X_pl = tf.placeholder(shape=[None, 1, 54, 1], dtype=tf.uint8, name="X")
        # The TD target value
        self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
        # Integer id of which action was selected
        self.actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

        batch_size = tf.shape(self.X_pl)[0]
        X = tf.to_float(self.X_pl)
        
        # conv
        conv1 = layers.conv2d(X, 32, [1,3] , activation_fn=tf.nn.relu)
        conv2 = layers.conv2d(conv1, 64, [1,3] , activation_fn=tf.nn.relu)
        conv3 = layers.conv2d(conv2, 64, [1,3] , activation_fn=tf.nn.relu)

        # fc
        flattened = tf.contrib.layers.flatten(conv3)
        fc1 = tf.contrib.layers.fully_connected(flattened, 512)
        self.predictions = tf.contrib.layers.fully_connected(fc1, len(self.valid_actions))

        # Get the predictions for the chosen actions only
        gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl
        self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)

        # Calculate the loss
        self.losses = tf.squared_difference(self.y_pl, self.action_predictions)
        self.loss = tf.reduce_mean(self.losses)

        # Optimizer Parameters from original paper
        self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.train_op = self.optimizer.minimize(self.loss, global_step=tf.train.get_global_step())
        
    def predict(self, sess, s):
        """
        Predicts action values.

        Args:
          sess: Tensorflow session
          s: State input of shape [batch_size, 4, 160, 160, 3]

        Returns:
          Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated 
          action values.
        """
        return sess.run(self.predictions, { self.X_pl: s })

    def update(self, sess, s, a, y):
        """
        Updates the estimator towards the given targets.

        Args:
          sess: Tensorflow session object
          s: State input of shape [batch_size, 4, 160, 160, 3]
          a: Chosen actions of shape [batch_size]
          y: Targets of shape [batch_size]

        Returns:
          The calculated loss on the batch.
        """
        feed_dict = { self.X_pl: s, self.y_pl: y, self.actions_pl: a }
        global_step, _, loss = sess.run(
            [tf.train.get_global_step(), self.train_op, self.loss],
            feed_dict)
        return loss

## Main DQN function

In [37]:
def learning(sess,
             env,
             nb_episodes,
             epsilon_start=1.0,
             epsilon_end=0.1,
             epsilon_decay_steps=500000):
    
    # saving things (from docker)
    exp_dir = "/workspace/experiments/exp%s" % datetime.datetime.now().strftime("%Y%m%d%H%M")
    checkpoint_dir = os.path.join(exp_dir, "checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    valid_actions = env.get_action_space()
    print("VALID ACTIONS: %s" % str(valid_actions))
    
    # estimators
    q_estimator = Estimator(valid_actions, scope="q_estimator")
    target_estimator = Estimator(valid_actions, scope="target_q")
    sess.run(tf.global_variables_initializer())
    
    saver = tf.train.Saver()
    
    # loading checkpoint
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint %s ..." % latest_checkpoint)
        saver.restore(sess, latest_checkpoint)
        
    # the policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(valid_actions))
    
    # epsilon decay schedule for epsilon-greedy policy
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
    
    # replay memory
    replay = PrioritizedExperienceReplay()
    global_step = sess.run(tf.train.get_global_step())
    epsilon = epsilons[min(global_step, epsilon_decay_steps-1)]
    populate_replay(replay, sess, env, q_estimator, epsilon, len(valid_actions))
    
    # main loop for episodes
    for i_episode in range(num_episodes):
        print("Starting episode %s" % i_episode)
        
        # save checkpoint for each episode
        saver.save(tf.get_default_session(), checkpoint_path)
        
        state = env.init_cube()
        
        # one step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

In [11]:
def copy_model_parameters(sess, estimator1, estimator2):
    '''
    Copy the parameters from estimator 1 to estimator 2
    '''
    e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]
    e1_params = sorted(e1_params, key=lambda v: v.name)
    e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]
    e2_params = sorted(e2_params, key=lambda v: v.name)

    update_ops = []
    for e1_v, e2_v in zip(e1_params, e2_params):
        op = e2_v.assign(e1_v)
        update_ops.append(op)
        
    sess.run(update_ops)

In [38]:
def populate_replay(replay, sess, env, q_estimator, epsilon, nb_actions):
    '''
    populate replay memory with initial experience 
    with more and more distance from the solved cube
    TODO: 
    '''
    print("Populating experience replay...")
    
    # we populate the replay with transitions from trajectories made with differently scrambled cubes
    nb_max_scrambling_moves = 10  # thanks Loic
    transitions_per_nb = int(replay.min_size / nb_max_scrambling_moves)
    current_scrambling_moves = 1
    total = 0
    state = env.init_cube(distance=current_scrambling_moves)
    trajectory = list()  # list of transitions for current trajectory
    
    for i in range(50000):  # TODO find something less arbitrary

        action_probs = greedy_policy(sess, q_estimator, state, nb_actions, epsilon)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, solved = env.take_action(action)

        trajectory.append(Transition(state, action, reward, next_state, solved))

        # cube solved: put all the transitions leading to a success in the replay_memory
        if solved:
            print("Solved!")
            for t in trajectory:
                replay.insert(t)
            trajectory = list()
            current_scrambling_moves += 1
            state = env.init_cube(distance=current_scrambling_moves)
        elif len(trajectory) > current_scrambling_moves:  # dead end
            trajectory = list()
            state = env.init_cube(distance=current_scrambling_moves)
        else:
            state = next_state

        # for monitoring purposes
        total += 1
        if total % 1000 == 0:
            print("[%s] populated %s transitions (%s done)" % (total, len(replay), replay.nb_done_transitions))
        
    print("Experience replay populated with %s transitions." % len(replay))
    
    # testing
    print(replay.sample(10))

## Main entrypoint

In [39]:
tf.reset_default_graph()

nb_episodes = 500
env = CubeEnvironment()

# global step variable
global_step = tf.Variable(0, name='global_step', trainable=False)

# main run
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for t, stats in learning(sess,
                             env,
                             nb_episodes):
        print(stats)

VALID ACTIONS: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


epsilons 1.0 0.1
selected epsilon 1.0
Populating experience replay...
Solved!
Solved!
[1000] populated 3 transitions (2 done)
Solved!
[2000] populated 4 transitions (3 done)
[3000] populated 4 transitions (3 done)
Solved!
[4000] populated 8 transitions (4 done)
[5000] populated 8 transitions (4 done)
[6000] populated 8 transitions (4 done)
[7000] populated 8 transitions (4 done)
Solved!
[8000] populated 9 transitions (5 done)
[9000] populated 9 transitions (5 done)
[10000] populated 9 transitions (5 done)
[11000] populated 9 transitions (5 done)
[12000] populated 9 transitions (5 done)
[13000] populated 9 transitions (5 done)
[14000] populated 9 transitions (5 done)
[15000] populated 9 transitions (5 done)
[16000] populated 9 transitions (5 done)
Solved!
[17000] populated 11 transitions (6 done)
[18000] populated 11 transitions (6 done)
[19000] populated 11 transitions (6 done)
[20000] populated 11 transitions (6 done)
[21000] populated 11 transitions (6 done)
Solved!
[22000] populated

NameError: name 'num_episodes' is not defined

## Debug

In [None]:
import sys
sys.path

In [None]:
replay.sample()