In [2]:
import gym
from datetime import datetime
import numpy as np

# DR TRPO related files
from train_helper import *
from value import NNValueFunction
from utils import Logger
from dr_policy import DRPolicyKL, DRPolicyWass, DRPolicySinkhorn

import tensorflow as tf
import tensorflow.contrib.distributions as dist
import tensorflow.contrib.layers as layers

In [3]:
class Discriminator:
    """Binary classifier over (observation, action) rows used as a reward signal.

    The network is a two-hidden-layer tanh MLP that emits one logit per row.
    Rows labelled as policy samples are pushed towards label 1 and rows
    labelled as expert samples towards label 0; the reward exposed to callers
    is ``-log(sigmoid(logit))``, so it grows as the discriminator judges a row
    to be less policy-like.
    """

    def __init__(self, sess, hidden_size, lr, name):
        """Build the discriminator graph inside variable scope ``name``.

        Args:
            sess: TensorFlow session used by ``get_reward`` and ``update``.
            hidden_size: width of each of the two hidden layers.
            lr: learning rate handed to the Adam optimizer.
            name: variable-scope name; also used by ``params`` to find the
                network's variables.
        """
        self.sess = sess
        self.hidden_size = hidden_size
        self.lr = lr
        self.name = name

        # One row per sample, two float columns (observation, action).
        self.ob_ac = tf.placeholder(dtype=tf.float32, shape=[None, 2])

        with tf.variable_scope(name):
            self._build_network()

    def _build_network(self):
        """Create the MLP, the reward tensor, the loss and the training op."""
        with tf.variable_scope('discriminator'):
            d_h1 = layers.fully_connected(self.ob_ac, self.hidden_size, activation_fn=tf.tanh)
            d_h2 = layers.fully_connected(d_h1, self.hidden_size, activation_fn=tf.tanh)
            d_out = layers.fully_connected(d_h2, 1, activation_fn=None)

        # -log(sigmoid(x)) == softplus(-x). The softplus form stays finite for
        # strongly negative logits, where sigmoid underflows to 0 and the
        # explicit log would yield inf.
        # NOTE: squeeze without an axis drops every size-1 dimension, so a
        # single-row batch produces a scalar rather than a length-1 vector.
        self.reward = tf.squeeze(tf.nn.softplus(-d_out))

        # The fed batch is [expert rows; policy rows] stacked along axis 0, so
        # it must contain an even number of rows.
        expert_out, policy_out = tf.split(d_out, num_or_size_splits=2, axis=0)

        self.loss = (tf.losses.sigmoid_cross_entropy(tf.ones_like(policy_out), policy_out)
                     + tf.losses.sigmoid_cross_entropy(tf.zeros_like(expert_out), expert_out))

        with tf.name_scope('train_op'):
            # Snapshot the variable list once so the gradients and the
            # (grad, var) pairs are guaranteed to line up.
            params = self.params()
            grads = tf.gradients(self.loss, params)
            self.grads = list(zip(grads, params))
            self.train_op = tf.train.AdamOptimizer(self.lr).apply_gradients(self.grads)

    def params(self):
        """Return a fresh list of the trainable variables under ``self.name``.

        Only trainable variables are returned, so any optimizer bookkeeping
        registered under the same name prefix is excluded. The trailing ``/``
        keeps the prefix filter from also matching unrelated scopes whose name
        merely starts with ``self.name``.
        """
        return tf.trainable_variables(self.name + '/')

    def get_reward(self, expert_ob_ac):
        """Evaluate ``-log(sigmoid(logit))`` for every row of ``expert_ob_ac``.

        Args:
            expert_ob_ac: array-like with two columns, fed to the placeholder.

        Returns:
            The evaluated reward tensor as returned by ``sess.run``.
        """
        feed_dict = {self.ob_ac: expert_ob_ac}

        return self.sess.run(self.reward, feed_dict=feed_dict)

    def update(self, all_ob_ac):
        """Run one Adam step on the discriminator loss.

        Args:
            all_ob_ac: array-like with two columns whose first half is treated
                as expert rows and whose second half as policy rows.
        """
        feed_dict = {self.ob_ac: all_ob_ac}

        self.sess.run(self.train_op, feed_dict=feed_dict)

# Discrete State Space - KL DRPO 
### Candidate environments (values for `env_name`): 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [None]:
# --- KL DRPO: environment, policy and value function ---
env_name = 'Taxi-v3'
env = gym.make(env_name)
sta_num = env.observation_space.n  # number of discrete states
act_num = env.action_space.n       # number of discrete actions
policy = DRPolicyKL(sta_num, act_num)
val_func = NNValueFunction(1, 10)  # NOTE(review): argument meaning lives in value.py - confirm
gamma = 0.9       # discount passed to add_disc_sum_rew, add_gae and find_disc_freqs
lam = 1           # lambda passed to add_gae
total_eps = 1000  # stop once at least this many episodes have been collected
batch_eps = 60    # episodes requested from run_policy per update
logger = Logger(logname=env_name + '_DR-KL_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))


eps = 0
# try/finally guarantees the logger is closed even when training is
# interrupted (e.g. kernel interrupt) or one of the helpers raises.
try:
    while eps < total_eps:
        # NOTE(review): the Sinkhorn cell calls run_policy with two extra
        # arguments (discriminator, lamb); confirm this 4-argument call still
        # matches the signature in train_helper.
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        policy.update(observes, actions, advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    logger.close()

# Discrete State Space - Sinkhorn DRPO 
### Candidate environments (values for `env_name`): 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [6]:
# --- Sinkhorn DRPO: TensorFlow session and discriminator ---
config = tf.ConfigProto(
    device_count={'GPU': 1},
    intra_op_parallelism_threads=1,
    allow_soft_placement=True
)
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
tf.keras.backend.set_session(sess)
discriminator = Discriminator(sess, 10, 0.01, 'D')
sess.run(tf.global_variables_initializer())
# NOTE(review): the default graph is reset while `sess` is still open.
# `discriminator` keeps using the graph owned by `sess`; anything built after
# this line lands in a fresh default graph. Confirm this separation is intended.
tf.reset_default_graph()

# --- environment, policy and value function ---
env_name = 'Taxi-v3'
env = gym.make(env_name)
sta_num = env.observation_space.n  # number of discrete states
act_num = env.action_space.n       # number of discrete actions
policy = DRPolicySinkhorn(sta_num, act_num)
val_func = NNValueFunction(1, 10)  # NOTE(review): argument meaning lives in value.py - confirm
gamma = 0.9       # discount passed to add_disc_sum_rew, add_gae and find_disc_freqs
lam = 1           # lambda passed to add_gae
total_eps = 5000  # stop once at least this many episodes have been collected
batch_eps = 60    # episodes requested from run_policy per update
logger = Logger(logname=env_name + '_DR-Sinkhorn_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

# weight for RL (constant across iterations, so set once outside the loop)
lamb = 0.9

eps = 0
# try/finally guarantees the logger is closed even when training is
# interrupted (e.g. kernel interrupt) or one of the helpers raises.
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, discriminator, lamb, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        policy.update(observes, actions, advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)

        # Pair each observation with its action: one (ob, ac) row per step.
        policy_ob_ac = np.stack((observes, actions), 1)
        # NOTE(review): the "expert" batch is the policy batch itself, so the
        # discriminator sees identical rows under both labels. Substitute real
        # expert demonstrations here if a discriminative reward is intended.
        expert_ob_ac = policy_ob_ac
        # Discriminator.update expects expert rows first, policy rows second.
        discriminator.update(np.concatenate([expert_ob_ac, policy_ob_ac], axis=0))
finally:
    logger.close()



Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 60, Mean Return = -684.2, Mean Discounted Return = -37.2 *****
ExplainedVarNew: -4.24e-10
ExplainedVarOld: -4.44e-06
ValFuncLoss: 537


***** Episode 120, Mean Return = -613.5, Mean Discounted Return = -32.3 *****
ExplainedVarNew: -4.38e-12
ExplainedVarOld: -5.26e-11
ValFuncLoss: 138


***** Episode 180, Mean Return = -593.1, Mean Discounted Return = -31.7 *****
ExplainedVarNew: -4.44e-11
ExplainedVarOld: -7.15e-11
ValFuncLoss: 92.2


***** Episode 240, Mean Return = -516.8, Mean Discounted Return = -27.7 *****
ExplainedVarNew: -2.39e-11
ExplainedVarOld: -1.86e-11
ValFuncLoss: 99.9


***** Episode 300, Mean Return = -475.4, Mean Discounted Return = -27.2 *****
ExplainedVarNew: -1.79e-11
ExplainedVarOld: -8.34e-12
ValFuncLoss: 105


***** Episode 360, Mean Return = -424.4, Mean Discounted Return = -26.1 *****
ExplainedVarNew: -7.65e-10
ExplainedVarOld: -4.33e-11
ValFuncLoss: 101


***** Episode 420, Mean Return = -334.1, Me

***** Episode 3360, Mean Return = -88.2, Mean Discounted Return = -16.5 *****
ExplainedVarNew: -0.00771
ExplainedVarOld: -0.00834
ValFuncLoss: 180


***** Episode 3420, Mean Return = -43.9, Mean Discounted Return = -11.6 *****
ExplainedVarNew: -0.00606
ExplainedVarOld: -0.00571
ValFuncLoss: 191


***** Episode 3480, Mean Return = -58.7, Mean Discounted Return = -15.2 *****
ExplainedVarNew: -0.0151
ExplainedVarOld: -0.00853
ValFuncLoss: 175


***** Episode 3540, Mean Return = -60.6, Mean Discounted Return = -14.7 *****
ExplainedVarNew: -0.0107
ExplainedVarOld: -0.00726
ValFuncLoss: 189


***** Episode 3600, Mean Return = -73.0, Mean Discounted Return = -14.2 *****
ExplainedVarNew: -0.01
ExplainedVarOld: -0.01
ValFuncLoss: 206


***** Episode 3660, Mean Return = -90.5, Mean Discounted Return = -17.3 *****
ExplainedVarNew: -0.0187
ExplainedVarOld: -0.0209
ValFuncLoss: 191


***** Episode 3720, Mean Return = -87.5, Mean Discounted Return = -15.0 *****
ExplainedVarNew: -0.00737
ExplainedVar

# Discrete State Space - Wasserstein DRPO 
### Candidate environments (values for `env_name`): 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [None]:
# --- Wasserstein DRPO: environment, policy and value function ---
env_name = 'Taxi-v3'
env = gym.make(env_name)
sta_num = env.observation_space.n  # number of discrete states
act_num = env.action_space.n       # number of discrete actions
policy = DRPolicyWass(sta_num, act_num)
val_func = NNValueFunction(1, 10)  # NOTE(review): argument meaning lives in value.py - confirm
gamma = 0.9       # discount passed to add_disc_sum_rew, add_gae and find_disc_freqs
lam = 1           # lambda passed to add_gae
total_eps = 5000  # stop once at least this many episodes have been collected
batch_eps = 60    # episodes requested from run_policy per update
logger = Logger(logname=env_name + '_DR-Wass_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))


eps = 0
# try/finally guarantees the logger is closed even when training is
# interrupted (e.g. kernel interrupt) or one of the helpers raises.
try:
    while eps < total_eps:
        # NOTE(review): the Sinkhorn cell calls run_policy with two extra
        # arguments (discriminator, lamb); confirm this 4-argument call still
        # matches the signature in train_helper.
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        policy.update(observes, actions, advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    logger.close()

### Policy Statistics

In [None]:
# Average per-episode statistics for `policy` on `env`.
# NOTE: both names come from whichever training cell was executed last, so
# this cell must be run after one of them.
n_eval_eps = 1000  # single source of truth for the loop bound and the divisors

total_length = 0
total_success_dropoff = 0
total_illegal_action = 0
for i in range(n_eval_eps):
    illegal_action, success_dropoff, eps_length = episode_stats(env, policy)
    total_illegal_action += illegal_action
    total_success_dropoff += success_dropoff
    total_length += eps_length
    print('------------------------')
# Means, in order: illegal actions, successful drop-offs, episode length.
print(total_illegal_action/n_eval_eps)
print(total_success_dropoff/n_eval_eps)
print(total_length/n_eval_eps)

In [None]:
from scipy.stats import wasserstein_distance
# Sanity check: both samples contain the same values {0, 1}, so the distance
# between their empirical distributions is 0. The former third positional
# argument `[]` was interpreted as `u_weights`; its length did not match
# `u_values`, which makes SciPy raise ValueError.
wasserstein_distance([0, 1], [1, 0])