In [None]:
import tensorflow as tf
import numpy as np
import gym as gym
import matplotlib.pyplot as plt
import sys
from time import time

In [None]:
# Check versions
# Python >= 3.6 is required because the rest of the notebook uses f-strings.
# This cell deliberately uses str.format instead: an f-string here would make
# the whole cell fail to parse on an older interpreter, so the friendly
# message below could never be printed.
versions_ok = True
if tf.__version__.split('.')[0] != '1':
    print('Requires tensorflow version 1, but found {}'.format(tf.__version__))
    versions_ok = False
# Compare the structured version tuple rather than parsing the free-form
# sys.version string.
if sys.version_info[0] != 3 or sys.version_info[1] < 6:
    print('Requires python 3.6 or above, but found version {}'.format(sys.version))
    versions_ok = False
if versions_ok:
    print('Found versions:\nTensorflow: {}\nPython: {}\nThese should be fine.'.format(
        tf.__version__, sys.version))

In [None]:
class VPGModel:
    """
    Softmax policy network over a discrete action space, plus the pseudo-loss
    used for vanilla policy gradient (VPG) training.

    The network is a stack of fully connected layers: one per entry of
    `hidden_layer_sizes`, followed by an output layer with
    `env.action_space.n` units that produces the action logits.

    Attributes:
        input_obs: float32 placeholder of shape (None, env.observation_space.shape[0]).
        input_actions: int32 placeholder holding one action index per sample.
        input_discounted_rewards: float32 placeholder holding one scalar weight
            per sample (the caller decides whether that is a return or an advantage).
        pseudo_loss: scalar tensor to hand to an optimizer's `minimize`.
    """

    def __init__(self, hidden_layer_sizes, env, activation, rng_seed=1234):
        """
        Build the policy graph in the current default TensorFlow graph.

        Args:
            hidden_layer_sizes: list of ints, the width of each hidden layer.
                Must be a list because it is concatenated with the output size.
            env: environment; only `env.observation_space.shape[0]` and
                `env.action_space.n` are read.
            activation: callable invoked as `activation(x, name=...)` after each
                hidden layer, or None for a purely linear network. It is never
                applied to the output layer.
            rng_seed: seed for the numpy RandomState used to sample actions.
        """
        # Define the computation graph
        self.input_obs = tf.placeholder(shape=(None, env.observation_space.shape[0]), dtype=tf.float32)

        # Save references to these in case we want to inspect them later
        self._weights = []
        self._biases = []

        x = self.input_obs
        prev_layer_size = self.input_obs.shape[1]
        for l, this_layer_size in enumerate(hidden_layer_sizes + [env.action_space.n]):
            W = tf.get_variable(name=f'layer_{l}_weight', shape=(prev_layer_size, this_layer_size),
                                dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
            # Note the trailing comma: the bias shape is a 1-tuple, not a bare int.
            b = tf.get_variable(name=f'layer_{l}_bias', shape=(this_layer_size,),
                                dtype=tf.float32, initializer=tf.initializers.constant(0.0))
            self._weights.append(W)
            self._biases.append(b)

            # Replace x with a reference to the output of this layer
            x = tf.add(tf.matmul(x, W, name=f'hidden_layer_{l}_matmul'), b, name=f'hidden_layer_{l}_add')
            # The last iteration is the output layer, which stays linear (raw logits).
            if l < len(hidden_layer_sizes) and activation is not None:
                x = activation(x, name=f'hidden_layer_{l}_activation')

            # Update so we get the input size of the next layer correct
            prev_layer_size = this_layer_size

        self._action_logits = x
        self._action_ps = tf.nn.softmax(self._action_logits)

        self._rng = np.random.RandomState(rng_seed) # Used to sample for the policy.

        # Some additional placeholders that are needed for training, but not for policy sampling.
        # Both are declared rank-1 (one entry per sample) so that a mis-shaped feed is
        # rejected when it is fed instead of being silently accepted.
        self.input_actions = tf.placeholder(shape=(None,), dtype=tf.int32)
        self.input_discounted_rewards = tf.placeholder(shape=(None,), dtype=tf.float32)
        # This isn't really a loss function. But (for a fixed sample) its derivative is VPG update.
        # This allows us to take advantage of TF knowing the derivative of this from the raw logits.
        # The reshape is to make clear that the weights should be broadcast in the label (not batch) dimension.
        self.pseudo_loss = tf.losses.sparse_softmax_cross_entropy(
            labels=self.input_actions,
            logits=self._action_logits,
            weights=tf.reshape(self.input_discounted_rewards, shape=(-1, 1))
        )

    def select_action(self, obs: np.ndarray, sess: tf.Session) -> int:
        """
        Given an observation, selects an action from the current policy.

        Args:
            obs: a single observation (1-D array); a batch axis is added here.
            sess: session in which the model's variables are initialised.

        Returns:
            The sampled action index as a plain Python int.
        """
        # Get probability distribution for this action
        action_ps = sess.run(self._action_ps, feed_dict={self.input_obs: obs[None, :]})
        # Sample from that distribution; convert the numpy scalar so the
        # return value matches the annotated type.
        return int(self._rng.choice(action_ps.shape[1], p=action_ps[0]))
    

In [None]:
class ValueModel:
    """
    Model for the state-value function V(s) under the present policy.

    Same fully connected architecture as VPGModel, but with a single output
    unit and its own variable/op names (prefixed with `V_`).

    Attributes:
        input_obs: float32 placeholder of shape (None, env.observation_space.shape[0]).
        value_prediction: rank-1 tensor with one predicted value per input row.
        input_value_gt: float32 placeholder with one regression target per row.
        loss: mean squared error between `input_value_gt` and `value_prediction`.
    """

    def __init__(self, hidden_layer_sizes, env, activation):
        """
        Build the value network in the current default TensorFlow graph.

        Args:
            hidden_layer_sizes: list of ints, the width of each hidden layer.
                Must be a list because it is concatenated with the output size.
            env: environment; only `env.observation_space.shape[0]` is read.
            activation: callable invoked as `activation(x, name=...)` after each
                hidden layer, or None for a purely linear network. It is never
                applied to the output layer.
        """
        # Define our computation graph - similar to VPGModel (except names and output size)
        self.input_obs = tf.placeholder(shape=(None, env.observation_space.shape[0]),
                                        dtype=tf.float32, name="V_input_s")

        # Save references to these in case we want to inspect them later
        self._weights = []
        self._biases = []

        x = self.input_obs
        prev_layer_size = self.input_obs.shape[1]
        for l, this_layer_size in enumerate(hidden_layer_sizes + [1]):
            W = tf.get_variable(name=f'V_layer_{l}_weight', shape=(prev_layer_size, this_layer_size),
                                dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
            # Note the trailing comma: the bias shape is a 1-tuple, not a bare int.
            b = tf.get_variable(name=f'V_layer_{l}_bias', shape=(this_layer_size,),
                                dtype=tf.float32, initializer=tf.initializers.constant(0.0))
            self._weights.append(W)
            self._biases.append(b)

            # Replace x with a reference to the output of this layer
            x = tf.add(tf.matmul(x, W, name=f'V_hidden_layer_{l}_matmul'), b, name=f'V_hidden_layer_{l}_add')
            # The last iteration is the output layer, which stays linear.
            if l < len(hidden_layer_sizes) and activation is not None:
                x = activation(x, name=f'V_hidden_layer_{l}_activation')

            # Update so we get the input size of the next layer correct
            prev_layer_size = this_layer_size

        # Flatten the (batch, 1) output so it lines up with the rank-1 targets.
        self.value_prediction = tf.reshape(x, shape=(-1, ))

        self.input_value_gt = tf.placeholder(shape=[None], dtype=tf.float32, name='V_input_gt')
        self.loss = tf.losses.mean_squared_error(labels=self.input_value_gt,
                                                 predictions=self.value_prediction)

In [None]:
def generate_episode(model, env, sess, max_steps=200):
    """
    Roll out a single episode by repeatedly asking `model` for an action.

    Args:
        model: object exposing `select_action(obs, sess)`.
        env: object exposing `reset()` (returns the first observation) and
            `step(action)` (returns a 4-tuple: observation, reward, done, info).
        sess: passed through unchanged to `model.select_action`.
        max_steps: upper bound on the number of steps taken.

    Returns:
        Three lists of equal length: the observation each action was chosen
        from, the actions taken, and the rewards received.
    """
    # One (observation, action, reward) triple per step actually taken.
    transitions = []

    current_obs = env.reset()
    for _ in range(max_steps):
        chosen_action = model.select_action(current_obs, sess)
        next_obs, step_reward, finished, _info = env.step(chosen_action)

        # Record the observation the action was based on, not the one it led to.
        transitions.append((current_obs, chosen_action, step_reward))
        current_obs = next_obs

        if finished:
            break

    observations = [triple[0] for triple in transitions]
    actions = [triple[1] for triple in transitions]
    rewards = [triple[2] for triple in transitions]
    return observations, actions, rewards

In [None]:
def _discounted_returns(rewards, gamma):
    """Return, for each step of one episode, the discounted sum of rewards from that step on."""
    returns = [0.0] * len(rewards)
    r_sum = 0
    # Walk backwards so each step reuses the running total of the steps after it.
    for i in reversed(range(len(rewards))):
        r_sum *= gamma
        r_sum += rewards[i]
        returns[i] = r_sum
    return returns


def train_on_batch(model, value_model, env, sess, train_step, train_step_value_model, eps_per_batch=10, gamma=1.0):
    """
    Sample a batch of episodes with the current policy and take one training step.

    Args:
        model: policy model exposing `select_action`, `input_obs`,
            `input_actions` and `input_discounted_rewards`.
        value_model: optional baseline model exposing `input_obs`,
            `value_prediction` and `input_value_gt`; None for no baseline.
        env: environment passed to `generate_episode`.
        sess: session used both for sampling and for the training steps.
        train_step: op that updates the policy model.
        train_step_value_model: op that updates the baseline model; must be
            None exactly when `value_model` is None.
        eps_per_batch: number of episodes sampled for this batch.
        gamma: discount factor applied to future rewards.

    Returns:
        (ep_total_rewards, ep_lengths): two lists with one entry per episode.

    Raises:
        ValueError: if exactly one of `value_model` and
            `train_step_value_model` is None.
    """
    # Check consistency of whether we're using a baseline model or not
    if (value_model is None) != (train_step_value_model is None):
        raise ValueError('Either none or both of value_model and train_step_value_model should be None')

    observations = []
    actions = []
    advantages = []
    discounted_rewards = []

    ep_total_rewards = []
    ep_lengths = [] # Same as total reward for CartPole, but not universally true

    # Assemble samples of trajectories
    for _ in range(eps_per_batch):
        o_this_ep, a_this_ep, r_this_ep = generate_episode(model, env, sess)

        if value_model is not None:
            baseline_values = sess.run(value_model.value_prediction,
                                       feed_dict={value_model.input_obs: np.array(o_this_ep)})
        else:
            # No baseline: one scalar zero per step. (Using zeros shaped like the
            # observations would turn every advantage into a vector and make the
            # weights fed to the policy loss the wrong shape.)
            baseline_values = np.zeros(len(r_this_ep))

        # Compute the future reward for each step in the episode (possibly with discounting)
        disc_r_this_ep = _discounted_returns(r_this_ep, gamma)
        adv_this_ep = [ret - base for ret, base in zip(disc_r_this_ep, baseline_values)]

        observations += o_this_ep
        actions += a_this_ep
        discounted_rewards += disc_r_this_ep
        advantages += adv_this_ep

        # Will be used to evaluate performance
        ep_total_rewards.append(sum(r_this_ep))
        ep_lengths.append(len(r_this_ep))

    # Train the model; the advantages are what weight each sample in the pseudo-loss.
    sess.run(train_step, feed_dict={model.input_obs: np.array(observations),
                                    model.input_actions: np.array(actions),
                                    model.input_discounted_rewards: np.array(advantages)})

    # If supplied, train the model for the baseline, regressing onto the raw
    # discounted returns (not the advantages).
    if value_model is not None:
        sess.run(train_step_value_model,
                 feed_dict={value_model.input_obs: np.array(observations),
                            value_model.input_value_gt: np.array(discounted_rewards)})

    # Return information about this batch
    return ep_total_rewards, ep_lengths

In [None]:
def run_training_and_report(model, value_model, env, sess, train_step,
                            train_step_value_model, num_batches=1000, output_freq=10,
                            eps_per_batch=10, gamma=1.0):
    """
    Train for `num_batches` batches, periodically recording reward statistics.

    A log entry is written whenever the batch index is a multiple of
    `output_freq`. Because that includes index 0, the first entry summarises a
    single batch, while later entries summarise `output_freq` batches each.

    Args:
        model, value_model, env, sess, train_step, train_step_value_model:
            forwarded unchanged to `train_on_batch`.
        num_batches: number of batches (training steps) to run.
        output_freq: write a log entry and print a line every this many batches.
        eps_per_batch: episodes sampled per batch, forwarded to `train_on_batch`.
        gamma: discount factor, forwarded to `train_on_batch`.

    Returns:
        A 9-tuple of equally long lists, one element per log entry:
        mean reward per episode, mean episode length, min episode reward,
        max episode reward, episodes trained on so far, steps trained on so
        far, elapsed wall-clock seconds, and the rewards found at the 10% and
        90% positions of the sorted episode rewards.
    """
    # Running state, used to index training in different ways
    num_episodes = 0
    num_steps = 0
    start_time_s = time()

    # Statistics collected between outputs, will be periodically reset
    total_reward_this_log = 0.0
    total_steps_this_log = 0
    num_episodes_this_log = 0
    min_ep_reward_this_log = None
    max_ep_reward_this_log = None
    rewards_this_log = []

    # Logs that we will write values to periodically, to plot later
    log_mean_reward_per_ep = []
    log_mean_ep_length = []
    log_min_reward = []
    log_max_reward = []
    log_running_num_eps = []
    log_running_num_steps = []
    log_running_wall_clock_time_s = []
    log_10th_percentile = []
    log_90th_percentile = []

    for b in range(num_batches):

        ep_total_rewards, ep_lengths = train_on_batch(model, value_model, env, sess,
                                                      train_step, train_step_value_model,
                                                      eps_per_batch=eps_per_batch, gamma=gamma)

        # Update state to log
        total_reward_this_log += sum(ep_total_rewards)
        total_steps_this_log += sum(ep_lengths)
        num_episodes_this_log += len(ep_lengths)
        batch_min = min(ep_total_rewards)
        batch_max = max(ep_total_rewards)
        if min_ep_reward_this_log is None or batch_min < min_ep_reward_this_log:
            min_ep_reward_this_log = batch_min
        if max_ep_reward_this_log is None or batch_max > max_ep_reward_this_log:
            max_ep_reward_this_log = batch_max
        rewards_this_log += ep_total_rewards

        if b % output_freq == 0:
            # Update the logs
            log_mean_reward_per_ep.append(total_reward_this_log / num_episodes_this_log)
            log_mean_ep_length.append(total_steps_this_log / num_episodes_this_log)
            log_min_reward.append(min_ep_reward_this_log)
            log_max_reward.append(max_ep_reward_this_log)
            log_running_num_eps.append(num_episodes)
            log_running_num_steps.append(num_steps)
            log_running_wall_clock_time_s.append(time() - start_time_s)

            # Percentiles are read straight off the sorted rewards (no interpolation).
            rewards_this_log.sort()
            log_10th_percentile.append(rewards_this_log[int(0.1*len(rewards_this_log))])
            log_90th_percentile.append(rewards_this_log[int(0.9*len(rewards_this_log))])

            # Reset the state
            total_reward_this_log = 0.0
            total_steps_this_log = 0
            num_episodes_this_log = 0
            min_ep_reward_this_log = None
            max_ep_reward_this_log = None
            rewards_this_log = []

            # Print some brief info to the screen
            print(f'Trained on {num_episodes} episodes: Mean reward {log_mean_reward_per_ep[-1]}')

        # Why am I updating this *after* updating the logs?
        # Because these are used as a count of how many episodes / steps we have trained on.
        # And the episodes from this batch had a training step *after* the batch was evaluated.
        num_episodes += len(ep_lengths)
        num_steps += sum(ep_lengths)

    return log_mean_reward_per_ep, log_mean_ep_length, log_min_reward, log_max_reward, log_running_num_eps,\
        log_running_num_steps, log_running_wall_clock_time_s, log_10th_percentile, log_90th_percentile

In [None]:
def make_plots(training_results):
    """
    Draw four figures from the 9-tuple returned by `run_training_and_report`.

    The first three show per-episode total reward (mean line, shaded band
    between the two percentile logs, dashed min/max) against episodes trained
    on, steps trained on, and training time in minutes. The fourth shows the
    number of episodes and the number of steps against training time on twin
    y-axes.
    """
    (mean_reward, _mean_ep_length, min_reward, max_reward, num_eps,
     num_steps, wall_clock_s, pct_low, pct_high) = training_results

    def reward_figure(x_values, x_label):
        # One reward-vs-progress figure; only the x data and its label vary.
        _, axis = plt.subplots(figsize=(7, 4.5))
        for extreme in (min_reward, max_reward):
            axis.plot(x_values, extreme, '--', color='tab:red', linewidth=1.0)
        axis.fill_between(x_values, pct_low, pct_high, facecolor='tab:red', alpha=0.4)
        axis.plot(x_values, mean_reward, '-', color='tab:red')
        axis.set_ybound(0, 205)
        axis.set_xbound(lower=0.0)
        axis.set_xlabel(x_label)
        axis.set_ylabel('Per-episode total reward')

    reward_figure(num_eps, 'Num episodes trained on')
    reward_figure(num_steps, 'Num steps trained on')

    # Wall-clock time is logged in seconds; the plots show minutes.
    elapsed_minutes = np.array(wall_clock_s) / 60.0
    reward_figure(elapsed_minutes, 'Training time [minutes]')

    # Throughput figure: episodes on the left axis, steps on the right axis.
    _, eps_axis = plt.subplots(figsize=(7, 4.5))
    eps_axis.plot(elapsed_minutes, num_eps, '-', color='tab:blue')
    eps_axis.set_ybound(lower=0)
    eps_axis.set_xbound(lower=0.0)
    eps_axis.set_xlabel('Training time [minutes]')
    eps_axis.set_ylabel('Num episodes', color='tab:blue')
    eps_axis.tick_params(axis='y', labelcolor='tab:blue')

    steps_axis = eps_axis.twinx()
    steps_axis.plot(elapsed_minutes, num_steps, '--', color='tab:red')
    steps_axis.set_ybound(lower=0)
    steps_axis.set_ylabel('Num steps', color='tab:red')
    steps_axis.tick_params(axis='y', labelcolor='tab:red')

In [None]:
# Create our model, environment, session, and optimizer
# One seed for every source of randomness, so that Restart & Run All gives
# repeatable results (as far as the libraries themselves are deterministic).
RNG_SEED = 1234

env = gym.make('CartPole-v0')
env.seed(RNG_SEED) # Seeds the environment (legacy gym API, matching the reset()/step() usage above)

tf.reset_default_graph() # Necessary because I'm naming the layers explicitly
tf.set_random_seed(RNG_SEED) # Graph-level seed for the weight initializers; must follow the reset

model = VPGModel(hidden_layer_sizes=[16], env=env, activation=tf.nn.relu, rng_seed=RNG_SEED)
train_step = tf.train.GradientDescentOptimizer(learning_rate=1e-2).minimize(model.pseudo_loss)

value_model = ValueModel(hidden_layer_sizes=[16], env=env, activation=tf.nn.relu)
train_step_value_model = tf.train.GradientDescentOptimizer(learning_rate=1e-2).minimize(value_model.loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [None]:
# Run the training and keep the logged statistics so we can plot them afterwards
training_results = run_training_and_report(
    model=model,
    value_model=value_model,
    env=env,
    sess=sess,
    train_step=train_step,
    train_step_value_model=train_step_value_model,
    num_batches=1000,
)

In [None]:
# Visualise the statistics logged during training
make_plots(training_results=training_results)