In [2]:
import gym
from datetime import datetime
import numpy as np
from scipy.stats import entropy

# DR TRPO related files
from train_helper import *
from value import NNValueFunction
from utils import Logger
from dr_policy import DRPolicyKL, DRPolicyWass, DRPolicySinkhorn

import tensorflow as tf
import tensorflow.contrib.distributions as dist
import tensorflow.contrib.layers as layers

## Discriminator Network Setup 

In [3]:
class Discriminator:
    """Binary classifier over (observation, action) rows, built in TF1 graph mode.

    The network is trained to output a high logit for policy samples
    (label 1) and a low logit for expert samples (label 0). The reward it
    exposes is ``-log(sigmoid(logit))``, which grows as a sample looks less
    like a policy sample.

    Args:
        sess: TensorFlow session used for every ``run`` call.
        hidden_size: Width of each of the two tanh hidden layers.
        lr: Learning rate passed to the Adam optimizer.
        name: Outer variable scope under which the network is created.
    """

    def __init__(self, sess, hidden_size, lr, name):
        self.sess = sess
        self.hidden_size = hidden_size
        self.lr = lr
        self.name = name

        # One row per sample, two columns (callers in this notebook stack
        # observation and action side by side).
        self.ob_ac = tf.placeholder(dtype=tf.float32, shape=[None, 2])

        with tf.variable_scope(name):
            self._build_network()

    def _build_network(self):
        """Create the logit network, the reward tensor, the loss and the train op."""
        with tf.variable_scope('discriminator'):
            d_h1 = layers.fully_connected(self.ob_ac, self.hidden_size, activation_fn=tf.tanh)
            d_h2 = layers.fully_connected(d_h1, self.hidden_size, activation_fn=tf.tanh)
            d_out = layers.fully_connected(d_h2, 1, activation_fn=None)

        # log_sigmoid is the numerically stable form of log(sigmoid(x)); the
        # naive composition returns -inf (an infinite reward) once the sigmoid
        # underflows to zero for strongly negative logits.
        self.reward = - tf.squeeze(tf.log_sigmoid(d_out))

        # The training batch is laid out as [expert rows; policy rows] with
        # both halves the same length, so an even split along axis 0
        # separates them.
        expert_out, policy_out = tf.split(d_out, num_or_size_splits=2, axis=0)

        # Policy samples are labelled 1, expert samples 0.
        self.loss = (tf.losses.sigmoid_cross_entropy(tf.ones_like(policy_out), policy_out)
                     + tf.losses.sigmoid_cross_entropy(tf.zeros_like(expert_out), expert_out))

        with tf.name_scope('train_op'):
            # Snapshot the variable list once so gradients and variables are
            # paired from the same list.
            variables = self.params()
            grads = tf.gradients(self.loss, variables)
            self.grads = list(zip(grads, variables))
            self.train_op = tf.train.AdamOptimizer(self.lr).apply_gradients(self.grads)

    def params(self):
        """Return a fresh list of the global variables whose name matches ``self.name``.

        NOTE(review): the scope filter is a name-prefix match, so calling this
        after the train op exists presumably also returns optimizer state
        variables - confirm before using it for anything other than building
        the train op.
        """
        return tf.global_variables(self.name).copy()

    def get_reward(self, expert_ob_ac):
        """Evaluate the reward tensor for the rows fed into the placeholder.

        Args:
            expert_ob_ac: Array-like with two columns, fed to ``self.ob_ac``.

        Returns:
            The evaluated ``self.reward`` (size-1 dimensions are squeezed away).
        """
        feed_dict = {self.ob_ac: expert_ob_ac}

        return self.sess.run(self.reward, feed_dict=feed_dict)

    def update(self, all_ob_ac):
        """Run one Adam step on a batch laid out as [expert rows; policy rows].

        Args:
            all_ob_ac: Array-like with two columns; the first half of the rows
                is treated as expert data and the second half as policy data,
                so the row count must be even.
        """
        feed_dict = {self.ob_ac: all_ob_ac}

        self.sess.run(self.train_op, feed_dict=feed_dict)

# Discrete State Space - SPO + GAIL
### 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [8]:
# Session that hosts the discriminator graph.
config = tf.ConfigProto(
    device_count={'GPU': 1},
    intra_op_parallelism_threads=1,
    allow_soft_placement=True
)
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
logger = None
try:
    tf.keras.backend.set_session(sess)
    discriminator = Discriminator(sess, 10, 0.01, 'D')
    sess.run(tf.global_variables_initializer())
    # NOTE(review): TensorFlow documents resetting the default graph while a
    # session is active as undefined behaviour. The call is kept where it was
    # because the objects created below may rely on a fresh default graph -
    # confirm and restructure if possible.
    tf.reset_default_graph()

    env_name = 'Taxi-v3'
    env = gym.make(env_name)
    sta_num = env.observation_space.n
    act_num = env.action_space.n
    policy = DRPolicySinkhorn(sta_num, act_num)
    val_func = NNValueFunction(1, 10)
    gamma = 0.9
    gae_weight = 1
    total_eps = 2000
    batch_eps = 10
    # weight for RL (constant for the whole run, so set once here)
    lamb = 0.9

    # The expert data never changes during training: read it once and close
    # the archive instead of re-opening the file on every iteration.
    with np.load('expert_traj/expert_taxi.npz') as data:
        expert_obs = data['obs'].T[0]
        expert_actions = data['actions'].T[0]
    expert_ob_ac = np.stack((expert_obs, expert_actions), 1)

    logger = Logger(logname=env_name + '_DR-Sinkhorn_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

    eps = 0
    while eps < total_eps:
        # the randomness of the actions an agent takes can be quantified by the entropy
        # (not consumed later in this cell; kept available for inspection)
        entro = entropy(policy.distributions)
        trajectories = run_policy(env, policy, batch_eps, discriminator, lamb, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculated discounted sum of Rs
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, gae_weight)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        policy.update(observes, actions, advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)

        # Discriminator step on equally sized expert and policy halves
        # (expert rows first, which is the layout Discriminator.update splits on).
        policy_ob_ac = np.stack((observes, actions), 1)
        min_len = min(len(expert_ob_ac), len(policy_ob_ac))
        discriminator.update(np.concatenate([expert_ob_ac[:min_len], policy_ob_ac[:min_len]], axis=0))
finally:
    # Release the log file and the session even when training fails part-way.
    try:
        if logger is not None:
            logger.close()
    finally:
        sess.close()



Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 10, Mean Return = -694.6, Mean Discounted Return = -29.4 *****
ExplainedVarNew: -3.72e-07
ExplainedVarOld: -1.58e-08
ValFuncLoss: 891


***** Episode 20, Mean Return = -605.9, Mean Discounted Return = -31.8 *****
ExplainedVarNew: -1.38e-08
ExplainedVarOld: -4.65e-07
ValFuncLoss: 632


***** Episode 30, Mean Return = -634.7, Mean Discounted Return = -29.2 *****
ExplainedVarNew: -8.1e-10
ExplainedVarOld: -3.06e-09
ValFuncLoss: 536


***** Episode 40, Mean Return = -645.8, Mean Discounted Return = -32.5 *****
ExplainedVarNew: -8.61e-09
ExplainedVarOld: -2.02e-08
ValFuncLoss: 369


***** Episode 50, Mean Return = -572.3, Mean Discounted Return = -29.5 *****
ExplainedVarNew: -1.5e-09
ExplainedVarOld: -2.95e-09
ValFuncLoss: 300


***** Episode 60, Mean Return = -512.0, Mean Discounted Return = -26.1 *****
ExplainedVarNew: -4.13e-09
ExplainedVarOld: -6.31e-09
ValFuncLoss: 198


***** Episode 70, Mean Return = -522.9, Mean Discoun

***** Episode 560, Mean Return = -166.0, Mean Discounted Return = -20.3 *****
ExplainedVarNew: -0.000263
ExplainedVarOld: -0.000985
ValFuncLoss: 215


***** Episode 570, Mean Return = -218.7, Mean Discounted Return = -19.3 *****
ExplainedVarNew: -1.75e-05
ExplainedVarOld: -0.00047
ValFuncLoss: 166


***** Episode 580, Mean Return = -188.7, Mean Discounted Return = -23.3 *****
ExplainedVarNew: -5.9e-05
ExplainedVarOld: -4.32e-05
ValFuncLoss: 164


***** Episode 590, Mean Return = -91.8, Mean Discounted Return = -17.5 *****
ExplainedVarNew: -0.000338
ExplainedVarOld: -6.65e-05
ValFuncLoss: 198


***** Episode 600, Mean Return = -255.7, Mean Discounted Return = -25.5 *****
ExplainedVarNew: -1.43e-05
ExplainedVarOld: -0.000143
ValFuncLoss: 145


***** Episode 610, Mean Return = -59.6, Mean Discounted Return = -21.3 *****
ExplainedVarNew: -5.21e-06
ExplainedVarOld: -6.66e-06
ValFuncLoss: 281


***** Episode 620, Mean Return = -318.5, Mean Discounted Return = -25.6 *****
ExplainedVarNew: -8.

***** Episode 1120, Mean Return = -206.9, Mean Discounted Return = -20.6 *****
ExplainedVarNew: -0.0096
ExplainedVarOld: -0.00741
ValFuncLoss: 243


***** Episode 1130, Mean Return = -119.8, Mean Discounted Return = -19.3 *****
ExplainedVarNew: -0.0118
ExplainedVarOld: -0.00928
ValFuncLoss: 240


***** Episode 1140, Mean Return = -152.7, Mean Discounted Return = -25.7 *****
ExplainedVarNew: -0.0175
ExplainedVarOld: -0.0496
ValFuncLoss: 213


***** Episode 1150, Mean Return = -132.5, Mean Discounted Return = -25.3 *****
ExplainedVarNew: -0.00353
ExplainedVarOld: -0.00238
ValFuncLoss: 223


***** Episode 1160, Mean Return = -105.5, Mean Discounted Return = -20.7 *****
ExplainedVarNew: -0.00222
ExplainedVarOld: -0.002
ValFuncLoss: 265


***** Episode 1170, Mean Return = -202.8, Mean Discounted Return = -19.3 *****
ExplainedVarNew: -0.0154
ExplainedVarOld: -0.0212
ValFuncLoss: 195


***** Episode 1180, Mean Return = -331.8, Mean Discounted Return = -24.9 *****
ExplainedVarNew: -0.0032
Expl

***** Episode 1680, Mean Return = -73.2, Mean Discounted Return = -22.0 *****
ExplainedVarNew: -0.0179
ExplainedVarOld: -0.0375
ValFuncLoss: 358


***** Episode 1690, Mean Return = -268.5, Mean Discounted Return = -30.5 *****
ExplainedVarNew: -1.05e-10
ExplainedVarOld: -9.54e-11
ValFuncLoss: 240


***** Episode 1700, Mean Return = -255.0, Mean Discounted Return = -20.9 *****
ExplainedVarNew: -6.27e-11
ExplainedVarOld: -7.3e-11
ValFuncLoss: 163


***** Episode 1710, Mean Return = -157.0, Mean Discounted Return = -26.5 *****
ExplainedVarNew: -0.0147
ExplainedVarOld: -0.027
ValFuncLoss: 242


***** Episode 1720, Mean Return = -88.4, Mean Discounted Return = -18.5 *****
ExplainedVarNew: -0.00788
ExplainedVarOld: -0.0066
ValFuncLoss: 270


***** Episode 1730, Mean Return = -192.4, Mean Discounted Return = -22.7 *****
ExplainedVarNew: -0.0118
ExplainedVarOld: -0.00666
ValFuncLoss: 251


***** Episode 1740, Mean Return = -256.5, Mean Discounted Return = -28.7 *****
ExplainedVarNew: -0.00511
E

# Discrete State Space - WPO + GAIL
### 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [9]:
# Session that hosts the discriminator graph.
config = tf.ConfigProto(
    device_count={'GPU': 1},
    intra_op_parallelism_threads=1,
    allow_soft_placement=True
)
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
logger = None
try:
    tf.keras.backend.set_session(sess)
    discriminator = Discriminator(sess, 10, 0.01, 'D')
    sess.run(tf.global_variables_initializer())
    # NOTE(review): TensorFlow documents resetting the default graph while a
    # session is active as undefined behaviour. The call is kept where it was
    # because the objects created below may rely on a fresh default graph -
    # confirm and restructure if possible.
    tf.reset_default_graph()

    env_name = 'Taxi-v3'
    env = gym.make(env_name)
    sta_num = env.observation_space.n
    act_num = env.action_space.n
    policy = DRPolicyWass(sta_num, act_num)
    val_func = NNValueFunction(1, 10)
    gamma = 0.9
    gae_weight = 1
    total_eps = 2000
    batch_eps = 10
    # weight for RL (constant for the whole run, so set once here)
    lamb = 0.9

    # The expert data never changes during training: read it once and close
    # the archive instead of re-opening the file on every iteration.
    with np.load('expert_traj/expert_taxi.npz') as data:
        expert_obs = data['obs'].T[0]
        expert_actions = data['actions'].T[0]
    expert_ob_ac = np.stack((expert_obs, expert_actions), 1)

    logger = Logger(logname=env_name + '_DR-Wass_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

    eps = 0
    while eps < total_eps:
        # the randomness of the actions an agent takes can be quantified by the entropy
        # (not consumed later in this cell; kept available for inspection)
        entro = entropy(policy.distributions)
        trajectories = run_policy(env, policy, batch_eps, discriminator, lamb, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculated discounted sum of Rs
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, gae_weight)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        policy.update(observes, actions, advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)

        # Discriminator step on equally sized expert and policy halves
        # (expert rows first, which is the layout Discriminator.update splits on).
        policy_ob_ac = np.stack((observes, actions), 1)
        min_len = min(len(expert_ob_ac), len(policy_ob_ac))
        discriminator.update(np.concatenate([expert_ob_ac[:min_len], policy_ob_ac[:min_len]], axis=0))
finally:
    # Release the log file and the session even when training fails part-way.
    try:
        if logger is not None:
            logger.close()
    finally:
        sess.close()



Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 10, Mean Return = -692.2, Mean Discounted Return = -35.5 *****
ExplainedVarNew: -6.86e-08
ExplainedVarOld: -2.76e-05
ValFuncLoss: 931


***** Episode 20, Mean Return = -686.0, Mean Discounted Return = -34.7 *****
ExplainedVarNew: -6.72e-11
ExplainedVarOld: -1.05e-09
ValFuncLoss: 746


***** Episode 30, Mean Return = -616.6, Mean Discounted Return = -25.1 *****
ExplainedVarNew: -9.79e-12
ExplainedVarOld: -2.71e-11
ValFuncLoss: 496


***** Episode 40, Mean Return = -574.6, Mean Discounted Return = -30.2 *****
ExplainedVarNew: -4.8e-10
ExplainedVarOld: -1.01e-09
ValFuncLoss: 294


***** Episode 50, Mean Return = -575.3, Mean Discounted Return = -34.6 *****
ExplainedVarNew: -4.58e-11
ExplainedVarOld: -7.23e-11
ValFuncLoss: 305


***** Episode 60, Mean Return = -501.0, Mean Discounted Return = -30.1 *****
ExplainedVarNew: -1.02e-12
ExplainedVarOld: -1.28e-12
ValFuncLoss: 204


***** Episode 70, Mean Return = -593.6, Mean Discou

***** Episode 560, Mean Return = -135.7, Mean Discounted Return = -7.9 *****
ExplainedVarNew: -9.62e-06
ExplainedVarOld: -1.16e-05
ValFuncLoss: 90


***** Episode 570, Mean Return = -193.6, Mean Discounted Return = -12.4 *****
ExplainedVarNew: -1.13e-07
ExplainedVarOld: -0.000888
ValFuncLoss: 38.1


***** Episode 580, Mean Return = -259.0, Mean Discounted Return = -13.0 *****
ExplainedVarNew: -0.00155
ExplainedVarOld: -0.00249
ValFuncLoss: 79.2


***** Episode 590, Mean Return = -220.7, Mean Discounted Return = -15.6 *****
ExplainedVarNew: -0.00324
ExplainedVarOld: -0.00152
ValFuncLoss: 42.9


***** Episode 600, Mean Return = -218.9, Mean Discounted Return = -14.1 *****
ExplainedVarNew: -0.00189
ExplainedVarOld: -0.00113
ValFuncLoss: 37


***** Episode 610, Mean Return = -261.9, Mean Discounted Return = -17.4 *****
ExplainedVarNew: -0.0155
ExplainedVarOld: -0.00808
ValFuncLoss: 56.5


***** Episode 620, Mean Return = -156.7, Mean Discounted Return = -9.3 *****
ExplainedVarNew: -0.0028


***** Episode 1110, Mean Return = -167.3, Mean Discounted Return = -9.0 *****
ExplainedVarNew: -5.74e-05
ExplainedVarOld: -0.00172
ValFuncLoss: 1.42


***** Episode 1120, Mean Return = -149.1, Mean Discounted Return = -7.4 *****
ExplainedVarNew: -0.00965
ExplainedVarOld: -0.00945
ValFuncLoss: 3.15


***** Episode 1130, Mean Return = -132.3, Mean Discounted Return = -7.3 *****
ExplainedVarNew: -9.19e-09
ExplainedVarOld: -7.7e-09
ValFuncLoss: 5.44


***** Episode 1140, Mean Return = -149.4, Mean Discounted Return = -7.4 *****
ExplainedVarNew: -0.00133
ExplainedVarOld: -0.000131
ValFuncLoss: 3.12


***** Episode 1150, Mean Return = -96.8, Mean Discounted Return = -4.6 *****
ExplainedVarNew: -0.00333
ExplainedVarOld: -0.000196
ValFuncLoss: 11.6


***** Episode 1160, Mean Return = -149.7, Mean Discounted Return = -7.5 *****
ExplainedVarNew: -0.0168
ExplainedVarOld: -0.0206
ValFuncLoss: 3.24


***** Episode 1170, Mean Return = -131.0, Mean Discounted Return = -6.3 *****
ExplainedVarNew: -0.0

***** Episode 1660, Mean Return = -130.7, Mean Discounted Return = -6.5 *****
ExplainedVarNew: -0.000235
ExplainedVarOld: -6.26e-06
ValFuncLoss: 5.21


***** Episode 1670, Mean Return = -148.6, Mean Discounted Return = -7.6 *****
ExplainedVarNew: -0.00436
ExplainedVarOld: -0.00839
ValFuncLoss: 3.15


***** Episode 1680, Mean Return = -113.2, Mean Discounted Return = -5.5 *****
ExplainedVarNew: -0.0161
ExplainedVarOld: -0.00113
ValFuncLoss: 7.92


***** Episode 1690, Mean Return = -149.4, Mean Discounted Return = -8.2 *****
ExplainedVarNew: -0.00321
ExplainedVarOld: -0.0692
ValFuncLoss: 3.18


***** Episode 1700, Mean Return = -166.2, Mean Discounted Return = -8.3 *****
ExplainedVarNew: -9.19e-11
ExplainedVarOld: -1.79e-10
ValFuncLoss: 1.33


***** Episode 1710, Mean Return = -149.6, Mean Discounted Return = -7.3 *****
ExplainedVarNew: -0.0115
ExplainedVarOld: -6.34
ValFuncLoss: 3.13


***** Episode 1720, Mean Return = -114.2, Mean Discounted Return = -5.9 *****
ExplainedVarNew: -0.0229

# Discrete State Space - KL PO
### 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [None]:
# KL-constrained policy optimisation on a discrete environment.
env_name = 'Taxi-v3'
env = gym.make(env_name)
sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyKL(sta_num, act_num)
val_func = NNValueFunction(1, 10)
gamma = 0.9
lam = 1
total_eps = 1000
batch_eps = 60
logger = Logger(logname=env_name + '_DR-KL_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

try:
    eps = 0
    while eps < total_eps:
        # NOTE(review): the GAIL cells in this notebook call
        # run_policy(env, policy, batch_eps, discriminator, lamb, logger);
        # this call passes only four arguments. Confirm that run_policy
        # accepts this form before running the cell.
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculated discounted sum of Rs
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        policy.update(observes, actions, advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the log file even when training fails part-way.
    logger.close()

## Generate Expert Trajectories

In [None]:
import os

from stable_baselines import PPO2
from stable_baselines.gail import generate_expert_traj

# Generate expert trajectories (train expert)
model = PPO2('MlpPolicy', 'Taxi-v3', verbose=1)
# Train for 600000 timesteps and record 10 trajectories.
# The data is saved to 'expert_traj/expert_taxi.npz', which is the path the
# GAIL training cells in this notebook load the expert data from.
os.makedirs('expert_traj', exist_ok=True)
generate_expert_traj(model, 'expert_traj/expert_taxi', n_timesteps=600000, n_episodes=10)