In [8]:
import gym
from datetime import datetime
import numpy as np
from scipy.stats import entropy

# DR TRPO related files
from train_helper import *
from value import NNValueFunction
from utils import Logger
from dr_policy import DRPolicyKL, DRPolicyWass, DRPolicySinkhorn

import tensorflow as tf
import tensorflow.contrib.distributions as dist
import tensorflow.contrib.layers as layers

## Discriminator Network Setup 

In [6]:
class Discriminator:
    """GAIL-style discriminator over (observation, action) pairs.

    A two-hidden-layer tanh MLP maps each 2-feature input row to one logit.
    Training labels the policy half of a batch 1 and the expert half 0; the
    imitation reward is ``-log(sigmoid(logit))``, so rows the network scores
    as expert-like (low sigmoid) receive a high reward.
    """

    def __init__(self, sess, hidden_size, lr, name):
        """Create the input placeholder and build the network under `name`.

        Args:
            sess: TensorFlow session used by `get_reward` and `update`.
            hidden_size: number of units in each of the two hidden layers.
            lr: learning rate passed to the Adam optimizer.
            name: variable scope the network is built in; `params` also uses
                it to look the variables up again.
        """
        self.sess = sess
        self.hidden_size = hidden_size
        self.lr = lr
        self.name = name

        # One row per sample, two features per row (fed as [observation, action]).
        self.ob_ac = tf.placeholder(dtype=tf.float32, shape=[None, 2])

        with tf.variable_scope(name):
            self._build_network()

    def _build_network(self):
        """Define the logit network, the reward tensor, the loss and the train op."""
        with tf.variable_scope('discriminator'):
            d_h1 = layers.fully_connected(self.ob_ac, self.hidden_size, activation_fn=tf.tanh)
            d_h2 = layers.fully_connected(d_h1, self.hidden_size, activation_fn=tf.tanh)
            d_out = layers.fully_connected(d_h2, 1, activation_fn=None)

        # reward = -log(sigmoid(logit)). log_sigmoid evaluates this in one
        # numerically stable step; the two-step log(sigmoid(x)) form underflows
        # to log(0) for strongly negative logits and yields an infinite reward.
        self.reward = - tf.squeeze(tf.log_sigmoid(d_out))

        # `update` feeds expert rows followed by policy rows, so the batch is
        # cut into two equal halves; both halves must therefore be equally long.
        expert_out, policy_out = tf.split(d_out, num_or_size_splits=2, axis=0)

        # Binary cross-entropy with label 1 for policy rows and 0 for expert rows.
        self.loss = (tf.losses.sigmoid_cross_entropy(tf.ones_like(policy_out), policy_out)
                     + tf.losses.sigmoid_cross_entropy(tf.zeros_like(expert_out), expert_out))

        with tf.name_scope('train_op'):
            # Look the variables up once, before the optimizer adds its own,
            # and reuse the same list for the gradients and the (grad, var) pairs.
            params = self.params()
            grads = tf.gradients(self.loss, params)
            self.grads = list(zip(grads, params))
            self.train_op = tf.train.AdamOptimizer(self.lr).apply_gradients(self.grads)

    def params(self):
        """Return a list of the global variables found under scope `self.name`."""
        return tf.global_variables(self.name).copy()

    def get_reward(self, expert_ob_ac):
        """Evaluate the imitation reward for a batch of (observation, action) rows.

        Args:
            expert_ob_ac: array fed to the `[None, 2]` placeholder. Despite the
                parameter name, any (observation, action) rows may be passed.

        Returns:
            The evaluated `reward` tensor; because of the squeeze, a single-row
            batch yields a scalar.
        """
        feed_dict = {self.ob_ac: expert_ob_ac}

        return self.sess.run(self.reward, feed_dict=feed_dict)

    def update(self, all_ob_ac):
        """Run one optimizer step on a combined batch.

        Args:
            all_ob_ac: expert rows followed by the same number of policy rows,
                stacked along axis 0.
        """
        feed_dict = {self.ob_ac: all_ob_ac}
        self.sess.run(self.train_op, feed_dict=feed_dict)

## Customize Environments

In [36]:
# Raw Gym environment; later cells wrap it in CustomEnv and also step it directly.
env_name = 'Taxi-v3'
env = gym.make(env_name)

class CustomEnv(gym.Env):
    """Gym environment wrapper that blends task reward with imitation reward.

    Each step is delegated to the wrapped environment; the returned reward is
    ``lamb * rl_reward + (1 - lamb) * il_reward`` where ``il_reward`` comes
    from the discriminator evaluated on the (observation, action) pair.
    """

    def __init__(self, rl_env, discriminator, lamb):
        """Store the wrapped environment, the discriminator and the mixing weight.

        Args:
            rl_env: environment to wrap; its action and observation spaces
                are exposed unchanged.
            discriminator: object with a `get_reward(ob_ac)` method.
            lamb: weight on the wrapped environment's reward; the
                discriminator reward is weighted by ``1 - lamb``.
        """
        super(CustomEnv, self).__init__()
        self.action_space = rl_env.action_space
        self.observation_space = rl_env.observation_space
        self.rl_env = rl_env
        self.discriminator = discriminator
        self.lamb = lamb

    def step(self, action):
        """Step the wrapped environment and return the blended reward.

        Returns:
            (observation, reward, done, info), with observation, done and info
            passed through from the wrapped environment.
        """
        observation, rl_reward, done, info = self.rl_env.step(action)
        # The discriminator takes a batch of [observation, action] rows; build
        # a single-row batch for this transition.
        ob_ac = np.asarray([observation, action]).reshape(1, 2)
        il_reward = self.discriminator.get_reward(ob_ac)
        # Use the weight given to this instance rather than a module-level
        # `lamb`, so the constructor argument takes effect and the class does
        # not depend on a global defined in another cell.
        reward = self.lamb * rl_reward + (1 - self.lamb) * il_reward
        return observation, reward, done, info

    def reset(self):
        """Reset the wrapped environment and return its initial observation."""
        return self.rl_env.reset()

## Baseline + GAIL

In [68]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import TRPO
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

# --- TensorFlow session shared with the discriminator ---
config = tf.ConfigProto(
device_count={'GPU': 1},
intra_op_parallelism_threads=1,
allow_soft_placement=True
)
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
tf.keras.backend.set_session(sess)
# Discriminator(sess, hidden_size=10, lr=0.01, name='D')
discriminator = Discriminator(sess, 10, 0.01, 'D')
sess.run(tf.global_variables_initializer())
# NOTE(review): the default graph is reset right after the discriminator is
# built and initialized; presumably so that graph construction further down
# starts from a clean default graph — confirm this is intended.
tf.reset_default_graph()

# --- Hyperparameters ---
ts = 0              # timesteps trained so far
total_ts = 60000    # total training budget in timesteps
batch_ts = 1000     # timesteps requested per model.learn call
lamb = 0.9          # weight on the environment reward; (1 - lamb) weights the discriminator reward
# NOTE(review): gamma is not referenced anywhere else in this cell; TRPO is
# constructed below without it.
gamma = 0.9
custom_env = CustomEnv(env, discriminator, lamb)
# Expert demonstrations: 'obs' and 'actions' arrays, reduced to one column each
# and paired row-wise into (observation, action) samples.
# NOTE(review): `.T[0]` assumes each stored array is 2-D with the values in its
# first column — confirm against the file.
data = np.load('expert_traj/expert_taxi.npz')
expert_obs = data['obs'].T[0]
expert_actions = data['actions'].T[0]
expert_ob_ac = np.stack((expert_obs, expert_actions),1)
# `custom_env` is rebound from the CustomEnv instance to the vectorized wrapper.
# NOTE(review): the lambda looks `custom_env` up by name when it is called;
# presumably DummyVecEnv calls it during construction, before the rebinding — confirm.
custom_env = DummyVecEnv(([lambda: custom_env]))

model = TRPO(MlpPolicy, custom_env, verbose=0)

# Alternate between a policy-training phase and one discriminator update.
while ts < total_ts: 
    model.learn(total_timesteps = batch_ts)
    ts += batch_ts
    
    # Roll out the current policy for as many steps as there are expert samples,
    # so the expert and policy halves fed to the discriminator are equally long.
    obs = custom_env.reset()[0]
    observes, actions, rewards, rew_sum = [],[],[],[]
    i = 0
    while i < len(expert_ob_ac):
        observes.append(obs)
        action, _states = model.predict(obs)
        # Steps the raw `env` (not the wrapper), so `reward` is the environment's
        # own reward rather than the blended one.
        obs, reward, done, info = env.step(action)
        actions.append(action)
        rewards.append(reward)
        if done:
            obs = custom_env.reset()[0]
            # Record the finished episode's return; an episode still in progress
            # when the loop ends is not recorded.
            rew_sum.append(np.sum(rewards))
            rewards = []
        i += 1
    policy_ob_ac = np.stack((observes, actions),1)
    # Expert rows first, policy rows second.
    discriminator.update(np.concatenate([expert_ob_ac, policy_ob_ac], axis=0))
    # Re-wrap the environment and hand the new wrapper to the model.
    custom_env = CustomEnv(env, discriminator, lamb)
    custom_env = DummyVecEnv(([lambda: custom_env]))
    model.set_env(custom_env)
    # The printed value is the mean of the per-episode returns collected above.
    # NOTE(review): rew_sum is empty if no episode finished during the rollout.
    print('Timesteps: ' + str(ts) + ' Rewards Sum: ' + str(np.mean(rew_sum)))



Timesteps: 1000 Rewards Sum: -760.7
Timesteps: 2000 Rewards Sum: -714.8
Timesteps: 3000 Rewards Sum: -654.3
Timesteps: 4000 Rewards Sum: -647.3
Timesteps: 5000 Rewards Sum: -609.5
Timesteps: 6000 Rewards Sum: -541.1
Timesteps: 7000 Rewards Sum: -560.3
Timesteps: 8000 Rewards Sum: -531.2
Timesteps: 9000 Rewards Sum: -478.1
Timesteps: 10000 Rewards Sum: -468.2
Timesteps: 11000 Rewards Sum: -433.1
Timesteps: 12000 Rewards Sum: -414.2
Timesteps: 13000 Rewards Sum: -387.2
Timesteps: 14000 Rewards Sum: -369.2
Timesteps: 15000 Rewards Sum: -316.1
Timesteps: 16000 Rewards Sum: -318.8
Timesteps: 17000 Rewards Sum: -297.2
Timesteps: 18000 Rewards Sum: -269.3
Timesteps: 19000 Rewards Sum: -264.8
Timesteps: 20000 Rewards Sum: -253.1
Timesteps: 21000 Rewards Sum: -243.2
Timesteps: 22000 Rewards Sum: -237.8
Timesteps: 23000 Rewards Sum: -236.0
Timesteps: 24000 Rewards Sum: -220.7
Timesteps: 25000 Rewards Sum: -216.2
Timesteps: 26000 Rewards Sum: -206.3
Timesteps: 27000 Rewards Sum: -211.7
Timesteps: