In [1]:
import warnings, os
warnings.filterwarnings("ignore")
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import tensorflow as tf

import numpy as np
from sklearn.utils import shuffle
import scipy.signal
import pickle

import multigoal

from matplotlib import pyplot as plt
%matplotlib inline

# Print a banner so the cell output identifies the experiment.
if __name__ == "__main__":
    print("Multi Goal Imitation Learning")

Multi Goal Imitation Learning


In [2]:
class PolicyLatent(object):
    """Gaussian policy conditioned on an observation and a latent code (TF 1.x graph API).

    The observation goes through a tanh hidden layer, is concatenated with the
    latent code, goes through a second tanh hidden layer and is mapped to the
    action mean.  The standard deviation is a single learned scalar, squashed
    by a sigmoid and scaled by ``max_std``; it is shared by every action
    dimension and every input.

    The training loss is an importance-weighted surrogate with a KL penalty
    (weight ``beta``), a squared hinge on the KL above ``2 * kl_targ``
    (weight ``eta``) and an entropy bonus (weight ``entcoeff``).  The latent
    log-posterior, weighted by ``logpostcoeff``, is added to the advantages.

    Args:
        obs_dim: size of an observation vector.
        act_dim: size of an action vector.
        latent_dim: size of the latent code fed to the second hidden layer.
        clip_range: accepted for backward compatibility; unused because the
            clipped PPO objective is disabled.
        epochs: passes over the data per ``update`` call.
        lr: Adam learning rate.
        hdim: width of both hidden layers.
        max_std: upper bound of the action standard deviation.
        beta, eta, kl_targ: KL penalty weight, hinge weight and KL target.
        entcoeff: weight of the entropy bonus.
        logpostcoeff: weight of the latent log-posterior term.
        seed: seed for weight initialisation, action sampling and shuffling.
    """

    def __init__(self, obs_dim, act_dim, latent_dim=4, clip_range=0.2,
                 epochs=10, lr=3e-5, hdim=64, max_std=1.0,
                 beta=1.0, eta=100, kl_targ=0.003,entcoeff=1e-3, logpostcoeff=1e-2,
                 seed=0):

        # Honour the caller's seed (it used to be hard-coded to 0, so the
        # ``seed`` argument was silently ignored).
        self.seed = seed

        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.latent_dim = latent_dim

        self.beta = beta
        self.eta = eta
        self.kl_targ = kl_targ
        self.entcoeff = entcoeff
        self.logpostcoeff = logpostcoeff

        self.epochs = epochs
        self.lr = lr
        self.hdim = hdim
        self.max_std = max_std

        self._build_graph()
        self._init_session()

    def _build_graph(self):
        """Build the whole computation graph in a private ``tf.Graph``."""
        self.g = tf.Graph()
        with self.g.as_default():
            self._placeholders()
            self._policy_nn()
            self._logprob()
            self._kl_entropy()
            self._loss_train_op()
            self.init = tf.global_variables_initializer()
            self.variables = tf.global_variables()

    def _placeholders(self):
        """Create the input placeholders used by the loss and the network."""
        # observations, actions, latent codes, log-posteriors and advantages:
        self.obs_ph = tf.placeholder(tf.float32, (None, self.obs_dim), 'obs')
        self.act_ph = tf.placeholder(tf.float32, (None, self.act_dim), 'act')
        self.latent_ph = tf.placeholder(tf.float32, (None, self.latent_dim), 'latent')
        self.logpost_ph = tf.placeholder(tf.float32, (None, ), 'logpost')
        self.advantages_ph = tf.placeholder(tf.float32, (None,), 'advantages')

        # strength of D_KL loss terms:
        self.beta_ph = tf.placeholder(tf.float32, (), 'beta')
        self.eta_ph = tf.placeholder(tf.float32, (), 'eta')

        # learning rate:
        self.lr_ph = tf.placeholder(tf.float32, (), 'lr')

        # mean / std of the policy before the update (fed as constants)
        self.old_std_ph = tf.placeholder(tf.float32, (None, self.act_dim), 'old_std')
        self.old_mean_ph = tf.placeholder(tf.float32, (None, self.act_dim), 'old_means')

    def _policy_nn(self):
        """Build the mean network, the shared std and the sampling op."""
        hid1_size = self.hdim
        hid2_size = self.hdim

        # Two hidden layers; the latent code enters at the second one.
        out = tf.layers.dense(self.obs_ph, hid1_size, tf.tanh,
                              kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed= self.seed), name="h1")
        out = tf.layers.dense(tf.concat([out, self.latent_ph],axis=1), hid2_size, tf.tanh,
                              kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed= self.seed), name="h2")

        # Action mean
        self.mean = tf.layers.dense(out, self.act_dim,
                                kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed= self.seed),
                                name="mean")
        # Standard deviation: one learned scalar in (0, max_std), broadcast to
        # the shape of the mean.
        self.logits_std = tf.get_variable("logits_std",shape=(1,),initializer=tf.random_normal_initializer(stddev=0.01,seed= self.seed))
        self.std = self.max_std*tf.ones_like(self.mean)*tf.sigmoid(self.logits_std)

        # Sampling op: mean + N(0, 1) * std
        self.sample_action = self.mean + tf.random_normal(tf.shape(self.mean),seed= self.seed)*self.std

    def _logprob(self):
        """Log-density of ``act_ph`` under the current and the old Gaussian."""
        y = self.act_ph
        mu = self.mean
        sigma = self.std

        # log-probability under the parameters being trained
        self.logp = tf.reduce_sum(-0.5*tf.square((y-mu)/sigma)-tf.log(sigma)- 0.5*np.log(2.*np.pi),axis=1)

        # log-probability under the old (pre-update) parameters
        old_mu_ph = self.old_mean_ph
        old_sigma_ph = self.old_std_ph

        self.logp_old = tf.reduce_sum(-0.5*tf.square((y-old_mu_ph)/old_sigma_ph)-tf.log(old_sigma_ph)- 0.5*np.log(2.*np.pi),axis=1)

    def _kl_entropy(self):
        """Batch-mean KL(old || new) and batch-mean entropy of the new policy."""
        mean, std = self.mean, self.std
        old_mean, old_std = self.old_mean_ph, self.old_std_ph

        log_std_old = tf.log(old_std)
        log_std_new = tf.log(std)
        frac_std_old_new = old_std/std

        # KL divergence between two diagonal Gaussians, summed over actions
        kl = tf.reduce_sum(log_std_new - log_std_old + 0.5*tf.square(frac_std_old_new) + 0.5*tf.square((mean - old_mean)/std)- 0.5,axis=1)
        self.kl = tf.reduce_mean(kl)

        # Entropy of a diagonal Gaussian, summed over actions
        entropy = tf.reduce_sum(log_std_new + 0.5 + 0.5*np.log(2*np.pi),axis=1)
        self.entropy = tf.reduce_mean(entropy)

    def _loss_train_op(self):
        """Assemble the penalised surrogate loss and the Adam train op."""
        # The clipped PPO objective is disabled; a KL-penalised surrogate is
        # used instead.
        loss1 = -tf.reduce_mean((self.advantages_ph + self.logpostcoeff*self.logpost_ph) * tf.exp(self.logp - self.logp_old))
        loss2 = tf.reduce_mean(self.beta_ph * self.kl)
        loss3 = self.eta_ph * tf.square(tf.maximum(0.0, self.kl - 2.0 * self.kl_targ))
        self.loss = loss1 - self.entcoeff*self.entropy + loss2 + loss3

        optimizer = tf.train.AdamOptimizer(self.lr_ph)
        self.train_op = optimizer.minimize(self.loss)

    def _init_session(self):
        """Open a session on the private graph and initialise the variables."""
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config,graph=self.g)
        self.sess.run(self.init)

    def sample(self, obs, latent):
        """Return actions sampled from the policy for ``obs`` and ``latent``."""
        feed_dict = {self.obs_ph: obs, self.latent_ph:latent}
        sampled_action = self.sess.run(self.sample_action,feed_dict=feed_dict)
        return sampled_action

    def control(self, obs, latent):
        """Return the policy mean (noise-free action) for ``obs`` and ``latent``."""
        feed_dict = {self.obs_ph: obs, self.latent_ph:latent}
        best_action = self.sess.run(self.mean,feed_dict=feed_dict)
        return best_action

    def update(self, observes, actions, latents, logposts, advantages, batch_size = 128):
        """Train the policy for ``self.epochs`` passes over the given data.

        ``batch_size`` is rebalanced so the batches have equal size.  The old
        mean/std are evaluated once, before any gradient step.

        Returns:
            (loss, kl, entropy) evaluated on the full data set after training.
        """
        num_batches = max(observes.shape[0] // batch_size, 1)
        batch_size = observes.shape[0] // num_batches

        # Snapshot of the pre-update policy parameters
        old_means_np, old_std_np = self.sess.run([self.mean, self.std],{self.obs_ph: observes,self.latent_ph:latents})
        for e in range(self.epochs):
            observes, actions, latents, logposts, advantages, old_means_np, old_std_np = shuffle(observes, actions, latents, logposts, advantages, old_means_np, old_std_np, random_state=self.seed)
            for j in range(num_batches):
                start = j * batch_size
                end = (j + 1) * batch_size
                feed_dict = {self.obs_ph: observes[start:end,:],
                     self.act_ph: actions[start:end,:],
                     self.latent_ph: latents[start:end,:],
                     self.logpost_ph: logposts[start:end],
                     self.advantages_ph: advantages[start:end],
                     self.old_std_ph: old_std_np[start:end,:],
                     self.old_mean_ph: old_means_np[start:end,:],
                     self.beta_ph: self.beta,
                     self.eta_ph: self.eta,
                     self.lr_ph: self.lr}
                self.sess.run(self.train_op, feed_dict)

        feed_dict = {self.obs_ph: observes,
                 self.act_ph: actions,
                 self.latent_ph: latents,
                 self.logpost_ph: logposts,
                 self.advantages_ph: advantages,
                 self.old_std_ph: old_std_np,
                 self.old_mean_ph: old_means_np,
                 self.beta_ph: self.beta,
                 self.eta_ph: self.eta,
                 self.lr_ph: self.lr}
        loss, kl, entropy = self.sess.run([self.loss, self.kl, self.entropy], feed_dict)
        return loss, kl, entropy

    def close_sess(self):
        """Close the TensorFlow session."""
        self.sess.close()

In [3]:
class LatentPosterior(object):
    """Classifier that predicts the latent code from an (observation, action) pair.

    A two-hidden-layer tanh network outputs ``latent_dim`` logits;
    ``gen_prob`` is their softmax.

    Args:
        obs_dim: size of an observation vector.
        act_dim: size of an action vector.
        latent_dim: number of latent classes.
        epochs: passes over the data per ``fit`` call.
        lr: Adam learning rate.
        hdim: width of both hidden layers.
        seed: seed for weight initialisation and shuffling.
    """

    def __init__(self, obs_dim, act_dim, latent_dim=4, epochs=20, lr=1e-3, hdim=64, seed=0):
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.latent_dim = latent_dim
        self.epochs = epochs
        self.lr = lr
        self.hdim = hdim
        self.seed = seed
        self._build_graph()

    def _build_graph(self):
        """ Construct TensorFlow graph, including loss function, init op and train op """
        self.g = tf.Graph()
        with self.g.as_default():
            # Inputs: concatenated (observation, action) rows and their latent codes
            self.obs_act_gen_ph = tf.placeholder(tf.float32, (None, self.obs_dim + self.act_dim), 'obs_act_gen_rewfunc')
            self.latent_ph = tf.placeholder(tf.float32, (None, self.latent_dim), 'latent_gen_rewfunc')

            hid1_size = self.hdim
            hid2_size = self.hdim

            # Classifier network over the generated (observation, action) pairs
            out = tf.layers.dense(self.obs_act_gen_ph, hid1_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name="h1")
            out = tf.layers.dense(out, hid2_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name="h2")
            gen_logits = tf.layers.dense(out, self.latent_dim,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name='output')
            self.gen_logits = gen_logits
            self.gen_prob = tf.nn.softmax(gen_logits)

            # Loss for classification
            # NOTE(review): training uses a per-class sigmoid cross-entropy while
            # predict()/logpost() read a softmax over the same logits -- confirm
            # whether a softmax cross-entropy was intended.
            generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.gen_logits, labels=self.latent_ph)
            generator_loss = tf.reduce_mean(generator_loss)

            # Total loss
            self.loss = generator_loss

            optimizer = tf.train.AdamOptimizer(self.lr)
            self.train_op = optimizer.minimize(self.loss)
            self.init = tf.global_variables_initializer()
            self.variables = tf.global_variables()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config,graph=self.g)
        self.sess.run(self.init)

    def fit(self, latent, obs_act_gen, batch_size=128):
        """Train for ``self.epochs`` passes; return the loss on the full data set.

        ``batch_size`` is rebalanced so that the batches have equal size.
        """
        data_size = obs_act_gen.shape[0]
        num_batches = max(data_size // batch_size, 1)
        batch_size = data_size // num_batches

        for e in range(self.epochs):
            obs_act_gen, latent = shuffle(obs_act_gen, latent, random_state=self.seed)
            for j in range(num_batches):
                start = j * batch_size
                end = (j + 1) * batch_size
                feed_dict = {self.obs_act_gen_ph:obs_act_gen[start:end,:], self.latent_ph:latent[start:end,:]}
                self.sess.run(self.train_op, feed_dict=feed_dict)
        feed_dict = {self.obs_act_gen_ph:obs_act_gen, self.latent_ph:latent}
        loss = self.sess.run(self.loss, feed_dict=feed_dict)
        return loss

    def predict(self, x):
        """Sample a one-hot latent code for every row of ``x``.

        Returns:
            Array of shape ``(len(x), latent_dim)``; each row is one-hot with
            the index drawn from the softmax posterior of that row.
        """
        feed_dict = {self.obs_act_gen_ph: x}
        gen_prob = self.sess.run(self.gen_prob, feed_dict=feed_dict)
        # np.random.choice needs a 1-D p that sums to 1 within a tight
        # tolerance, so work in float64 and renormalise row by row.
        gen_prob = np.asarray(gen_prob, dtype=np.float64).reshape(-1, self.latent_dim)
        gen_prob = gen_prob / gen_prob.sum(axis=1, keepdims=True)
        latent_code = np.zeros(gen_prob.shape)
        for row, prob in enumerate(gen_prob):
            latent_code[row, np.random.choice(self.latent_dim, p=prob)] = 1
        return latent_code

    def logpost(self, x, latent):
        """Return ``sum(log softmax(x) * latent, axis=1)`` for every row.

        With one-hot ``latent`` this is the log-posterior of the given code.
        """
        feed_dict = {self.obs_act_gen_ph: x}
        gen_prob = self.sess.run(self.gen_prob, feed_dict=feed_dict)
        # Floor exact zeros so that log(0) * 0 cannot produce NaN.
        gen_prob = np.maximum(gen_prob, np.finfo(np.float32).tiny)
        logpost = np.sum(np.log(gen_prob)*latent,axis=1)
        return logpost

    def close_sess(self):
        """Close the TensorFlow session."""
        self.sess.close()

In [4]:
class Value(object):
    """State-value regressor: two tanh hidden layers and a scalar output (TF 1.x).

    Args:
        obs_dim: size of an observation vector.
        epochs: passes over the data per ``fit`` call.
        lr: Adam learning rate.
        hdim: width of both hidden layers.
        seed: seed for weight initialisation and shuffling.
    """

    def __init__(self, obs_dim, epochs=20, lr=1e-4, hdim=64, seed=0):
        self.seed = seed

        self.obs_dim = obs_dim
        self.epochs = epochs
        self.lr = lr
        self.hdim = hdim

        self._build_graph()
        self._init_session()

    def _build_graph(self):
        """Build the network, the squared-error loss and the Adam train op."""
        self.g = tf.Graph()
        with self.g.as_default():
            self.obs_ph = tf.placeholder(tf.float32, (None, self.obs_dim), 'obs_valfunc')
            self.val_ph = tf.placeholder(tf.float32, (None,), 'val_valfunc')

            hid1_size = self.hdim
            hid2_size = self.hdim

            out = tf.layers.dense(self.obs_ph, hid1_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name="h1")
            out = tf.layers.dense(out, hid2_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name="h2")
            out = tf.layers.dense(out, 1,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name='output')
            # NOTE(review): squeeze without an axis also drops the batch axis
            # when a single row is fed -- confirm callers never rely on a 1-D
            # result for single-step inputs.
            self.out = tf.squeeze(out)

            # Mean squared error against the target values
            self.loss = tf.reduce_mean(tf.square(self.out - self.val_ph))

            optimizer = tf.train.AdamOptimizer(self.lr)
            self.train_op = optimizer.minimize(self.loss)

            self.init = tf.global_variables_initializer()
            self.variables = tf.global_variables()

    def _init_session(self):
        """Open a session on the private graph and initialise the variables."""
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config,graph=self.g)
        self.sess.run(self.init)

    def fit(self, x, y, batch_size=32):
        """Regress ``y`` on ``x`` for ``self.epochs`` passes; return the final loss.

        ``batch_size`` is rebalanced the same way as in the other networks'
        training methods, so at most ``num_batches - 1`` samples are left out
        of an epoch instead of up to ``batch_size - 1``.
        """
        num_batches = max(x.shape[0] // batch_size, 1)
        batch_size = x.shape[0] // num_batches
        x_train, y_train = x, y
        for e in range(self.epochs):
            x_train, y_train = shuffle(x_train, y_train, random_state=self.seed)
            for j in range(num_batches):
                start = j * batch_size
                end = (j + 1) * batch_size
                feed_dict = {self.obs_ph: x_train[start:end, :],
                             self.val_ph: y_train[start:end]}
                self.sess.run([self.train_op], feed_dict=feed_dict)
        feed_dict = {self.obs_ph: x_train,
                     self.val_ph: y_train}
        loss, = self.sess.run([self.loss], feed_dict=feed_dict)
        return loss

    def predict(self, x):
        """Return the predicted value of every observation in ``x`` (squeezed)."""
        feed_dict = {self.obs_ph: x}
        y_hat = self.sess.run(self.out, feed_dict=feed_dict)
        return np.squeeze(y_hat)

    def close_sess(self):
        """Close the TensorFlow session."""
        self.sess.close()

In [5]:
class Reward(object):
    """Learned reward network that scores (observation, action) pairs.

    One set of weights is applied to two inputs: expert pairs and generated
    (policy) pairs.  The loss lowers the generated logits, raises the expert
    logits and penalises the mean squared logit; the logit of a generated pair
    is what ``predict`` returns as its reward.
    """
    def __init__(self, obs_dim, act_dim, epochs=10, hdim=32, lr=1e-3, entcoeff=1e-1, seed=0):
        # Generated data passed to the previous fit() call (None before the first call)
        self.replay_buffer_obs_act_gen = None
        self.seed = seed
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.hdim = hdim
        self.epochs = epochs
        self.entcoeff = entcoeff # Weight of the squared-logit penalty (heuristic)
        self.lr = lr
        self._build_graph()

    def _build_graph(self):
        """ Construct TensorFlow graph, including loss function, init op and train op; also opens the session """
        self.g = tf.Graph()
        with self.g.as_default():
            # Placeholders for the expert and the generated (observation, action) rows
            self.obs_act_exp_ph = tf.placeholder(tf.float32, (None, self.obs_dim + self.act_dim), 'obs_act_exp_rewfunc')
            self.obs_act_gen_ph = tf.placeholder(tf.float32, (None, self.obs_dim + self.act_dim), 'obs_act_gen_rewfunc')

            hid1_size = self.hdim
            hid2_size = self.hdim

            # Network for expert demonstrations (creates the variables)
            out = tf.layers.dense(self.obs_act_exp_ph, hid1_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name="h1")
            out = tf.layers.dense(out, hid2_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name="h2")
            exp_logits = tf.layers.dense(out, 1,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name='output')
            self.exp_logits = tf.squeeze(exp_logits)

            # Network for the learner's demonstrations. Uses the same parameters as defined above
            out = tf.layers.dense(self.obs_act_gen_ph, hid1_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name="h1", reuse=True) # reuse=True -> share the parameters of "h1"
            out = tf.layers.dense(out, hid2_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name="h2", reuse=True) # reuse=True -> share the parameters of "h2"
            gen_logits = tf.layers.dense(out, 1,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name='output', reuse=True) # reuse=True -> share the parameters of "output"
            self.gen_logits = tf.squeeze(gen_logits)

            # Monitoring statistics: fraction of generated logits > 0 and of expert logits < 0.
            # NOTE(review): the loss below pushes generated logits down and expert
            # logits up, so these count the opposite sign -- confirm the intended convention.
            self.generator_acc = tf.reduce_mean(tf.to_float(gen_logits > 0.0))
            self.expert_acc = tf.reduce_mean(tf.to_float(exp_logits < 0.0))

            # Loss for classification (a sigmoid cross-entropy variant is kept below, disabled)
#             generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=gen_logits, labels=tf.ones_like(gen_logits))

            generator_loss = tf.reduce_mean(gen_logits)
            expert_loss = -tf.reduce_mean(exp_logits)

            # Regularization: despite the name, this is the mean squared logit
            # over both batches, not an entropy.
            logits = tf.concat([gen_logits, exp_logits], 0)
            entropy = tf.reduce_mean(logits**2) 
            entropy_loss = self.entcoeff*entropy

            # Total loss
            self.loss = generator_loss + expert_loss + entropy_loss

            # Reward handed to the policy: the (unsqueezed) logit of the generated pair
            self.reward = gen_logits

            # Adam with every gradient clipped element-wise to [-1, 1]
            optimizer = tf.train.AdamOptimizer(self.lr)
            gvs = optimizer.compute_gradients(self.loss)
            clipped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
            self.train_op = optimizer.apply_gradients(clipped_gvs)
            self.init = tf.global_variables_initializer()
            self.variables = tf.global_variables()
        self.sess = tf.Session(graph=self.g)
        self.sess.run(self.init)

    def fit(self, obs_act_exp, obs_act_gen, batch_size=128):
        """Train on expert vs. generated pairs; return (loss, generator_acc, expert_acc).

        The generated data of the previous call is appended to the current
        generated data before training.  The returned statistics are
        evaluated on the full (shuffled) training arrays.
        """
        # Batch count is derived from the smaller of the two input sets
        data_size = min(obs_act_exp.shape[0],obs_act_gen.shape[0])
        num_batches = max(data_size // batch_size, 1)
        batch_size = data_size // num_batches

        obs_act_exp_train = obs_act_exp
        if self.replay_buffer_obs_act_gen is None:
            obs_act_gen_train = obs_act_gen
        else:
            obs_act_gen_train = np.concatenate([obs_act_gen, self.replay_buffer_obs_act_gen])
        # Only the current batch of generated data is remembered for the next call
        self.replay_buffer_obs_act_gen = obs_act_gen

        for e in range(self.epochs):
            obs_act_exp_train = shuffle(obs_act_exp_train, random_state=self.seed)
            obs_act_gen_train = shuffle(obs_act_gen_train, random_state=self.seed)
            for j in range(num_batches):
                start = j * batch_size
                end = (j + 1) * batch_size
                feed_dict = {self.obs_act_gen_ph:obs_act_gen_train[start:end,:], self.obs_act_exp_ph:obs_act_exp_train[start:end,:]}
                self.sess.run(self.train_op, feed_dict=feed_dict)
        feed_dict = {self.obs_act_gen_ph:obs_act_gen_train, self.obs_act_exp_ph:obs_act_exp_train}
        loss, gen_acc, exp_acc = self.sess.run([self.loss,self.generator_acc,self.expert_acc], feed_dict=feed_dict)
        return loss, gen_acc, exp_acc

    def predict(self, x): # Predict the rewards
        """Return the squeezed reward (logit) of every (observation, action) row in ``x``."""
        feed_dict = {self.obs_act_gen_ph: x}
        rew_hat = self.sess.run(self.reward, feed_dict=feed_dict)
        return np.squeeze(rew_hat)

    def close_sess(self):
        """Close the TensorFlow session."""
        self.sess.close()

In [6]:
def discount(x, gamma):
    """Return the discounted cumulative sum of ``x``: out[t] = sum_k gamma**k * x[t+k]."""
    # Run the first-order IIR filter y[n] = x[n] + gamma * y[n-1] over the
    # time-reversed sequence, then flip the result back.
    numerator = [1.0]
    denominator = [1.0, -gamma]
    backwards = x[::-1]
    accumulated = scipy.signal.lfilter(numerator, denominator, backwards)
    return accumulated[::-1]


def add_disc_sum_rew(trajectories, gamma):
    """Store the discounted reward-to-go of each trajectory under 'disc_sum_rew'.

    Rewards are scaled by ``1 - gamma`` first, unless gamma is ~1.
    """
    for traj in trajectories:
        step_rewards = traj['rewards']
        if gamma < 0.999:  # don't scale for gamma ~= 1
            step_rewards = step_rewards * (1 - gamma)
        traj['disc_sum_rew'] = discount(step_rewards, gamma)

def add_rew(trajectories, rew_func):
    """Store ``rew_func.predict([observes, actions])`` under 'rewards' for each trajectory.

    Returns the same (mutated) list of trajectories.
    """
    for traj in trajectories:
        state_action = np.concatenate([traj['observes'], traj['actions']], axis=1)
        traj['rewards'] = rew_func.predict(state_action)
    return trajectories

def add_value(trajectories, val_func):
    """Store ``val_func.predict(observes)`` under 'values' for each trajectory."""
    for traj in trajectories:
        traj['values'] = val_func.predict(traj['observes'])

def add_gae(trajectories, gamma, lam):
    """Store generalized advantage estimates under 'advantages' for each trajectory.

    Uses the trajectory's 'rewards' (scaled by ``1 - gamma`` unless gamma is
    ~1) and 'values'; the value after the last step is taken to be 0.
    """
    for traj in trajectories:
        step_rewards = traj['rewards']
        if gamma < 0.999:  # don't scale for gamma ~= 1
            step_rewards = step_rewards * (1 - gamma)
        values = traj['values']
        # discounted value of the next state, 0 after the final step
        next_values = np.append(values[1:] * gamma, 0)
        # temporal differences
        tds = step_rewards - values + next_values
        traj['advantages'] = discount(tds, gamma * lam)

def build_train_set(trajectories):
    """Concatenate per-trajectory arrays into flat training arrays.

    Returns:
        (observes, actions, latents, advantages, disc_sum_rew), where the
        advantages are standardised to zero mean and (about) unit std.
    """
    def _stack(key):
        return np.concatenate([traj[key] for traj in trajectories])

    observes = _stack('observes')
    actions = _stack('actions')
    latents = _stack('latents')
    disc_sum_rew = _stack('disc_sum_rew')
    raw_adv = _stack('advantages')
    # normalize advantages (the epsilon guards against a zero std)
    advantages = (raw_adv - raw_adv.mean()) / (raw_adv.std() + 1e-6)

    return observes, actions, latents, advantages, disc_sum_rew

def build_train_set_for_rew(trajectories,demonstrations):
    """Build the reward network's inputs.

    Returns:
        (obs_act_exp, obs_act_gen): row-wise [observation, action] matrices
        for the expert demonstrations and for the policy trajectories.
    """
    def _obs_act_matrix(episodes):
        stacked_obs = np.concatenate([ep['observes'] for ep in episodes])
        stacked_act = np.concatenate([ep['actions'] for ep in episodes])
        return np.concatenate([stacked_obs, stacked_act], axis=1)

    obs_act_exp = _obs_act_matrix(demonstrations)
    obs_act_gen = _obs_act_matrix(trajectories)
    return obs_act_exp, obs_act_gen

def run_episode(env, policy, animate=False):
    """Roll out one episode with a randomly chosen one-hot latent code.

    The latent code is drawn once, uniformly over ``policy.latent_dim``
    classes, and held fixed for the whole episode.

    Args:
        env: environment with ``reset()``, ``step(action)`` and ``render()``.
        policy: object with ``latent_dim`` and ``sample(obs, latent)``.
        animate: if True, call ``env.render()`` before every step.

    Returns:
        (observes, actions, latents, rewards, infos): the first three are
        arrays with one row per step, ``rewards`` is a float32 vector and
        ``infos`` is the list of ``info['goal_id']`` values of each step.
    """
    obs = env.reset()
    observes, actions, latents, rewards, infos = [], [], [], [], []
    done = False
    latent = np.zeros([1,policy.latent_dim])
    latent[0][np.random.choice(policy.latent_dim)] = 1
    while not done:
        if animate:
            env.render()
        obs = obs.astype(np.float32).reshape((1, -1))
        observes.append(obs)
        action = policy.sample(obs,latent).reshape((1, -1)).astype(np.float32)
        actions.append(action)
        latents.append(latent)
        obs, reward, done, info = env.step(action)
        if not isinstance(reward, float):
            # np.asscalar was removed from NumPy; .item() is its documented
            # replacement and also accepts plain Python numbers via asarray.
            reward = np.asarray(reward).item()
        rewards.append(reward)
        infos.append(info['goal_id'])

    return (np.concatenate(observes), np.concatenate(actions), np.concatenate(latents),
            np.array(rewards, dtype=np.float32), infos)

def run_policy(env, policy, episodes):
    """Collect ``episodes`` rollouts and return them as a list of trajectory dicts.

    Each dict has the keys 'observes', 'actions', 'latents', 'true_rewards'
    (the environment's rewards) and 'infos'.
    """
    total_steps = 0  # running step count; not returned
    collected = []
    for _ in range(episodes):
        observes, actions, latents, rewards, infos = run_episode(env, policy)
        total_steps += observes.shape[0]
        collected.append({
            'observes': observes,
            'actions': actions,
            'latents': latents,
            'true_rewards': rewards,
            'infos': infos,
        })
    return collected

def evaluation(env, policy, max_eval_epi=100, seed=0):
    """Run ``max_eval_epi`` episodes and collect returns and per-step infos.

    Args:
        env: environment; it is seeded with ``seed`` before the first episode.
        policy: policy passed on to ``run_episode``.
        max_eval_epi: number of evaluation episodes.
        seed: environment seed.

    Returns:
        (return_list, info_list): ``return_list[i]`` is the summed reward of
        episode ``i``; ``info_list[i]`` is ``{"env_infos": {"pos": ..., "goal_id": ...}}``
        with one array entry per step.  "pos" is empty when the per-step infos
        carry no position.
    """
    return_list = np.zeros((max_eval_epi,))
    info_list = []

    env.seed(seed)
    for epi in range(max_eval_epi):
        # run_episode resets the environment itself and returns five values
        # (the previous four-way unpacking always raised ValueError).
        observes, actions, latents, rewards, infos = run_episode(env, policy)

        pos_list = []
        goal_id_list = []
        for info in infos:
            if isinstance(info, dict):
                # full info dict: take whatever fields are present
                if "pos" in info:
                    pos_list.append(info["pos"])
                if "goal_id" in info:
                    goal_id_list.append(info["goal_id"])
            else:
                # run_episode records only the goal id of every step
                goal_id_list.append(info)
        env_infos_epi_list = {"pos": np.asarray(pos_list),
                              "goal_id": np.asarray(goal_id_list)}

        info_list.append({"env_infos":env_infos_epi_list})
        return_list[epi] = np.sum(rewards)
    return return_list, info_list

In [8]:
def train_info_imitation_learning(seed,entcoeff,logpostcoeff,n_mixture,demo_size,GPU_ID=0,kl_targ=0.003,
                             gamma = 0.995,lam = 0.98,max_std=0.5,episode_size = 500,batch_size = 512,
                             nupdates = 300,save_iter=100,min_save_iter=200,verbose=False):
    """Train a latent-conditioned policy by adversarial imitation and save the learning curve.

    Every update: roll out the policy, fit the reward network on expert vs.
    generated pairs, relabel the rollouts with the learned reward, compute
    advantages, then update the policy, the value function and the latent
    posterior.  At the end the per-update mean true return and the per-step
    infos are pickled to ``./results_info_gail/<hyper-parameters>.pickle``.

    Args:
        seed: seed for NumPy, TensorFlow, the environment and the networks.
        entcoeff: policy entropy-bonus weight.
        logpostcoeff: weight of the latent log-posterior term in the policy loss.
        n_mixture: latent dimension (number of latent classes).
        demo_size: number of expert demonstrations kept after shuffling.
        GPU_ID, save_iter, min_save_iter: accepted for backward compatibility; unused.
        kl_targ: KL target of the policy update (also part of the file name).
        gamma, lam: discount factor and GAE lambda.
        max_std: upper bound of the policy's action standard deviation.
        episode_size: number of episodes rolled out per update.
        batch_size: mini-batch size passed to every network.
        nupdates: index of the last update (``nupdates + 1`` updates are run).
        verbose: print a progress line every 20 updates.
    """
    # Call the seeding function (it used to be overwritten by assignment,
    # which left NumPy unseeded and broke np.random.seed for later callers).
    np.random.seed(seed)
    tf.set_random_seed(seed)

    # NOTE: pickle can execute arbitrary code -- only load trusted demo files.
    # Binary mode is required for pickles on Python 3.
    with open('./multigoal_expert_demo.pkl', 'rb') as demo_file:
        demonstrations, = pickle.load(demo_file)
    demonstrations = shuffle(demonstrations,random_state=seed)[:demo_size]

    saver_prefix="./results_info_gail/seed:{},kl:{:.2e},entcoeff:{:.2e},logpostcoeff:{:.2e},mixture:{:d},epi_size:{}".format(seed,kl_targ,entcoeff,logpostcoeff,n_mixture,episode_size)
    # Make sure the output directory exists before spending time on training.
    result_dir = os.path.dirname(saver_prefix)
    if not os.path.isdir(result_dir):
        os.makedirs(result_dir)

    env = multigoal.MultiGoalEnv(nr_goal=4)
    env.seed(seed)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Define the four networks
    posterior = LatentPosterior(obs_dim, act_dim, latent_dim=n_mixture, epochs=20, lr=1e-3, seed=seed)
    policy = PolicyLatent(obs_dim, act_dim, latent_dim=n_mixture, max_std=max_std, kl_targ=kl_targ, entcoeff=entcoeff,logpostcoeff=logpostcoeff, epochs=20, hdim=64, lr=1e-3, clip_range=0.2, seed=seed)
    val_func = Value(obs_dim, epochs=20, hdim=64, lr=1e-3, seed=seed)
    rew_func = Reward(obs_dim, act_dim, epochs=20, hdim=32, lr=3e-4, seed=seed)
    mean_ret_list = []
    info_list = []

    try:
        for update in range(nupdates+1):

            # Generate data
            trajectories = run_policy(env, policy, episodes=episode_size)

            # Build data set for training rewards function
            obs_act_exp, obs_act_gen = build_train_set_for_rew(trajectories,demonstrations)
            rew_loss, gen_acc, exp_acc = rew_func.fit(obs_act_exp, obs_act_gen, batch_size=batch_size)

            add_rew(trajectories,rew_func)
            add_value(trajectories, val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # Build data set for training policy and value function
            observes, actions, latents, advantages, returns = build_train_set(trajectories)
            logposts = posterior.logpost(obs_act_gen,latents)

            # Train policy, value function and latent posterior
            pol_loss, pol_kl, pol_entropy = policy.update(observes, actions, latents, logposts, advantages, batch_size=batch_size)  # update policy
            vf_loss = val_func.fit(observes, returns, batch_size=batch_size)  # update value function
            post_loss = posterior.fit(latents, obs_act_gen,batch_size=batch_size)

            mean_ret = np.mean([np.sum(t['true_rewards']) for t in trajectories])
            mean_ret_list.append(mean_ret)
            info_list.append([t['infos'] for t in trajectories])

            train_ret = np.mean([np.sum(t['rewards']) for t in trajectories])
            if (update%20) == 0 and verbose:
                print('[{}/{}] True Mean Ret : {:.3f}, Train Mean Ret : {:.3f}, Reward Loss : {:.3f}, Reward Acc : [{:.3f},{:.3f}], Posterior : {:.3f}, Value Loss : {:.3f}, Policy loss : {:.5f}, Policy KL : {:.5f}, Policy Entropy : {:.3f} ***'.
                      format(update, nupdates, mean_ret, train_ret, rew_loss, gen_acc, exp_acc, post_loss, vf_loss, pol_loss, pol_kl, pol_entropy))
    finally:
        # Release the TensorFlow sessions even if training is interrupted.
        for network in (posterior, policy, val_func, rew_func):
            network.close_sess()

    evaluation_results = { "mean_ret_list": mean_ret_list, "info_list": info_list }
    with open( saver_prefix+".pickle", "wb" ) as result_file:
        pickle.dump( evaluation_results, result_file )

# Script entry point: launch one training run with the hyper-parameters below.
if __name__=="__main__":
    seed=0            # seed passed to the training function
    entcoeff=1e-3     # policy entropy-bonus weight
    logpostcoeff=1e-3 # weight of the latent log-posterior term
    demo_size=300     # number of expert demonstrations to keep
    n_mixture=4       # latent dimension
    train_info_imitation_learning(seed,entcoeff,logpostcoeff,n_mixture,demo_size,verbose=True)

[0/300] True Mean Ret : -48.349, Train Mean Ret : -25.790, Reward Loss : -0.935, Reward Acc : [0.058,0.527], Posterior : 0.560, Value Loss : 0.002, Policy loss : -0.01474, Policy KL : 0.00901, Policy Entropy : 0.075 ***
[20/300] True Mean Ret : 145.551, Train Mean Ret : -342.115, Reward Loss : -63.339, Reward Acc : [0.295,0.000], Posterior : 0.557, Value Loss : 0.440, Policy loss : -0.02188, Policy KL : 0.00992, Policy Entropy : -0.181 ***
[40/300] True Mean Ret : -102.539, Train Mean Ret : 729.745, Reward Loss : -30.356, Reward Acc : [0.704,0.069], Posterior : 0.541, Value Loss : 1.500, Policy loss : -0.00298, Policy KL : 0.00623, Policy Entropy : -0.909 ***
[60/300] True Mean Ret : -112.415, Train Mean Ret : 1642.067, Reward Loss : 0.900, Reward Acc : [0.960,0.002], Posterior : 0.520, Value Loss : 1.096, Policy loss : -0.00078, Policy KL : 0.00492, Policy Entropy : -1.938 ***
[80/300] True Mean Ret : 118.229, Train Mean Ret : 853.759, Reward Loss : 1.208, Reward Acc : [0.994,0.000], 

Exception TypeError: "render() got an unexpected keyword argument 'close'" in <bound method MultiGoalEnv.__del__ of <multigoal.MultiGoalEnv object at 0x7fcfe2607390>> ignored
