In [1]:
import warnings, os
warnings.filterwarnings("ignore")
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import tensorflow as tf

import numpy as np
from sklearn.utils import shuffle
import scipy.signal
import pickle

import multigoal

from matplotlib import pyplot as plt
%matplotlib inline

if __name__ == "__main__":
    print("Multi Goal Imitation Learning")

Multi Goal Imitation Learning


In [2]:
class Policy(object):
    def __init__(self, obs_dim, act_dim, 
                 beta=1.0, eta=100, kl_targ=0.003, entcoeff=1e-3,
                 epochs=20, lr=1e-3,
                 mdn_weight="softmax", n_mixture=4, max_std=0.1, eps=0.09,
                 seed=0,GPU_ID=0):
        
        self.seed=0
        self.GPU_ID=GPU_ID
        
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        
        self.beta = beta
        self.eta = eta
        self.kl_targ = kl_targ
        self.entcoeff = entcoeff
        
        self.epochs = epochs
        self.lr = lr
        self.lr_multiplier = 1.0
                
        self.mdn_weight = mdn_weight
        self.n_mixture = n_mixture
        self.max_std = max_std
        self.eps = eps
        
        with tf.device('/device:GPU:%d'%(self.GPU_ID)):
            self._build_graph()
            self._init_session()

    def _build_graph(self):
        self.g = tf.Graph()
        with self.g.as_default():
            self._placeholders()
            self._policy_nn()
            self._logprob()
            self._kl_entropy()
            self._loss_train_op()
            self.init = tf.global_variables_initializer()
            self.variables = tf.global_variables()
            
    def _placeholders(self):
        # observations, actions and advantages:
        self.obs_ph = tf.placeholder(tf.float32, (None, self.obs_dim), 'obs')
        self.act_ph = tf.placeholder(tf.float32, (None, self.act_dim), 'act')
        self.advantages_ph = tf.placeholder(tf.float32, (None,), 'advantages')
        self.eps_ph = tf.placeholder(tf.float32, (), 'epsilon')
        
        # strength of D_KL loss terms:
        self.beta_ph = tf.placeholder(tf.float32, (), 'beta')
        self.eta_ph = tf.placeholder(tf.float32, (), 'eta')
        
        # learning rate:
        self.lr_ph = tf.placeholder(tf.float32, (), 'lr')
        
        self.old_std_ph = tf.placeholder(tf.float32, (None, self.act_dim, self.n_mixture), 'old_std')
        self.old_means_ph = tf.placeholder(tf.float32, (None, self.act_dim, self.n_mixture), 'old_means')
        self.old_pi_ph = tf.placeholder(tf.float32, (None, self.n_mixture), 'old_pi')
    
    def _policy_nn(self):
        
        hid1_size = 64  
        hid2_size = 64
        
        out = tf.layers.dense(self.obs_ph, hid1_size, tf.tanh,
                              kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed= self.seed), name="h1")
        out = tf.layers.dense(out, hid2_size, tf.tanh,
                              kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed= self.seed), name="h2")
        means = tf.layers.dense(out, self.act_dim*self.n_mixture,
                                kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed= self.seed), 
                                name="flat_means")
        self.means = tf.reshape(means,shape=[-1,self.act_dim,self.n_mixture], name="means")
        logits_std = tf.layers.dense(out, self.act_dim*self.n_mixture,
                                kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed= self.seed), 
                                name="flat_logits_std")
        self.std = tf.reshape(self.max_std*tf.sigmoid(logits_std),shape=[-1,self.act_dim,self.n_mixture], name="std")
        self.std = self.std + self.eps_ph
        if self.mdn_weight=="softmax":
            self.pi = tf.nn.softmax(tf.layers.dense(out, self.n_mixture,
                                                kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed= self.seed), name="pi"))
        elif self.mdn_weight=="sparsemax":
            self.pi = tf.contrib.sparsemax.sparsemax(tf.layers.dense(out, self.n_mixture,
                                                kernel_initializer=tf.random_normal_initializer(stddev=0.01,seed= self.seed), name="pi"))
    def _logprob(self):
        y = self.act_ph 
        mu = self.means
        sigma = self.std + 1e-8
        pi = self.pi + 1e-8
        pi = pi/tf.reduce_sum(pi,axis=1,keep_dims=True)
        
        quadratics = -0.5*tf.reduce_sum(tf.square((tf.tile(y[:,:,tf.newaxis],[1,1,self.n_mixture])-mu)/sigma),axis=1)
        logdet = -0.5*tf.reduce_sum(tf.log(sigma),axis=1)
        logconstant = - 0.5*self.act_dim*np.log(2.*np.pi)
        logpi = tf.log(pi)
        
        exponents = quadratics + logdet + logconstant + logpi
        logprobs = tf.reduce_logsumexp(exponents,axis=1)
        
        self.logp = logprobs

        old_mu_ph = self.old_means_ph
        old_sigma_ph = self.old_std_ph + 1e-8
        old_pi_ph = self.old_pi_ph + 1e-8
        old_pi_ph = old_pi_ph/tf.reduce_sum(old_pi_ph,axis=1,keep_dims=True)
    
        quadratics = -0.5*tf.reduce_sum(tf.square((tf.tile(y[:,:,tf.newaxis],[1,1,self.n_mixture])-old_mu_ph)/old_sigma_ph),axis=1)
        logdet = -0.5*tf.reduce_sum(tf.log(old_sigma_ph),axis=1)
        logconstant = - 0.5*self.act_dim*np.log(2.*np.pi)
        logpi = tf.log(old_pi_ph)
        
        exponents = quadratics + logdet + logconstant + logpi
        old_logprobs = tf.reduce_logsumexp(exponents,axis=1)
        
        self.logp_old = old_logprobs
    
    def _kl_entropy(self):

        def energy(mu1,std1,pi1,mu2,std2,pi2):
            energy_components = []
            for i in range(self.n_mixture):
                for j in range(self.n_mixture):
                    mu1i = tf.squeeze(mu1[:,:,i]) 
                    mu2j = tf.squeeze(mu2[:,:,j])
                    std1i = tf.squeeze(std1[:,:,i])
                    std2j = tf.squeeze(std2[:,:,j])
                    pi1i = pi1[:,i]
                    pi2j = pi2[:,j]
                    energy_components.append(pi1i*pi2j * tf.exp(-0.5*tf.reduce_sum(((mu1i - mu2j)/(std1i**2+std2j**2))+2.*tf.log(std1i+std2j)+np.log(2*np.pi),axis=1)))
            return tf.reduce_sum(tf.stack(energy_components),axis=1) 
            
        mean, std, weight = self.means, (self.std + 1e-8), (self.pi + 1e-8)
        old_mean, old_std, old_weight = self.old_means_ph, (self.old_std_ph + 1e-8), (self.old_pi_ph + 1e-8)

        weight = weight/tf.reduce_sum(weight,axis=1,keep_dims=True)
        old_weight = old_weight/tf.reduce_sum(old_weight,axis=1,keep_dims=True)
        
        if self.mdn_weight=="softmax":
            self.entropy = tf.reduce_sum(self.pi*(-tf.log(self.pi) + 0.5 * (self.act_dim * (np.log(2 * np.pi) + 1) +
                                                                        tf.reduce_sum(tf.log(std),axis=1))),axis=1)
            self.entropy = tf.reduce_mean(self.entropy)
        elif self.mdn_weight=="sparsemax":
            self.entropy = tf.reduce_mean(0.5*(1-energy(mean, std, weight,mean, std, weight)))
            
        log_det_cov_old = tf.reduce_sum(tf.log(old_std),axis=1)
        log_det_cov_new = tf.reduce_sum(tf.log(std),axis=1)
        tr_old_new = tf.reduce_sum(old_std/std,axis=1)

        kl = tf.reduce_sum(old_weight*tf.log(old_weight/weight) + 0.5 * old_weight*(log_det_cov_new - log_det_cov_old + tr_old_new +
                         tf.reduce_sum(tf.square((mean - old_mean)/std),axis=1) - self.act_dim),axis=1)
        self.kl = tf.reduce_mean(kl)
        
    def _loss_train_op(self):
        
        loss1 = -tf.reduce_mean(self.advantages_ph * tf.exp(self.logp - self.logp_old))
        loss2 = tf.reduce_mean(self.beta_ph * self.kl)
        loss3 = self.eta_ph * tf.square(tf.maximum(0.0, self.kl - 2.0 * self.kl_targ))
        self.loss = loss1 - self.entcoeff*self.entropy + loss2 + loss3
        self.loss_p = -tf.reduce_mean(self.logp)
        
        optimizer = tf.train.AdamOptimizer(self.lr_ph)
        self.train_op = optimizer.minimize(self.loss)
        self.train_p_op = optimizer.minimize(self.loss_p)

    def _init_session(self):
        """Launch TensorFlow session and initialize variables"""
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config,graph=self.g)
        self.sess.run(self.init)

    def sample(self, obs):
        """Draw sample from policy distribution"""
        feed_dict = {self.obs_ph: obs, self.eps_ph:self.eps}
        pi, mu, sigma = self.sess.run([self.pi, self.means, self.std],feed_dict=feed_dict)
        pi = (pi+1e-10)/np.sum(pi+1e-10,axis=1,keepdims=True)
        sigma = sigma+1e-10
        n_points = np.shape(obs)[0]
        
        _y_sampled = np.zeros([n_points,self.act_dim])
        for i in range(n_points):
            k = np.random.choice(self.n_mixture,p=pi[i,:])
            _y_sampled[i,:] = mu[i,:,k] + np.random.randn(1,self.act_dim)*sigma[i,:,k]
        return _y_sampled
    
    def update(self, observes, actions, advantages, batch_size = 128):
        
        num_batches = max(observes.shape[0] // batch_size, 1)
        batch_size = observes.shape[0] // num_batches
        
        old_means_np, old_std_np, old_pi_np = self.sess.run([self.means, self.std, self.pi],{self.obs_ph: observes, self.eps_ph:self.eps})
        loss, kl, entropy = 0, 0, 0
        for e in range(self.epochs):
            # TODO: need to improve data pipeline - re-feeding data every epoch
            observes, actions, advantages, old_means_np, old_std_np, old_pi_np = shuffle(observes, actions, advantages, old_means_np, old_std_np, old_pi_np, random_state=self.seed)
            for j in range(num_batches):
                start = j * batch_size
                end = (j + 1) * batch_size
                feed_dict = {self.obs_ph: observes[start:end,:],
                     self.act_ph: actions[start:end,:],
                     self.advantages_ph: advantages[start:end],
                     self.old_std_ph: old_std_np[start:end,:,:],
                     self.old_means_ph: old_means_np[start:end,:,:],
                     self.old_pi_ph: old_pi_np[start:end,:],
                     self.beta_ph: self.beta,
                     self.eta_ph: self.eta,
                     self.lr_ph: self.lr * self.lr_multiplier,
                     self.eps_ph: self.eps}        
                self.sess.run(self.train_op, feed_dict)
            
            feed_dict = {self.obs_ph: observes,
                 self.act_ph: actions,
                 self.advantages_ph: advantages,
                 self.old_std_ph: old_std_np,
                 self.old_means_ph: old_means_np,
                 self.old_pi_ph: old_pi_np,
                 self.beta_ph: self.beta,
                 self.eta_ph: self.eta,
                 self.lr_ph: self.lr * self.lr_multiplier,
                 self.eps_ph: self.eps}        
            loss, kl, entropy = self.sess.run([self.loss, self.kl, self.entropy], feed_dict)
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
                
        # TODO: too many "magic numbers" in next 8 lines of code, need to clean up
        if kl > self.kl_targ * 2:  # servo beta to reach D_KL target
            self.beta = np.minimum(35, 1.5 * self.beta)  # max clip beta
            if self.beta > 30 and self.lr_multiplier > 0.1:
                self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2:
            self.beta = np.maximum(1 / 35, self.beta / 1.5)  # min clip beta
            if self.beta < (1 / 30) and self.lr_multiplier < 10:
                self.lr_multiplier *= 1.5
        
        loss, kl, entropy = self.sess.run([self.loss, self.kl, self.entropy], feed_dict)
        return loss, kl, entropy
    
    def update_eps(self, decaying_rate = 0.99):
        self.eps *= decaying_rate
    
    def fit(self, x, y, batch_size=128):
        """ Fit model to current data batch + previous data batch
        Args:
            x: features
            y: target
            logger: logger to save training loss and % explained variance
        """
        num_batches = max(x.shape[0] // batch_size, 1)
        batch_size = x.shape[0] // num_batches
        
        x_train = x
        y_train = y
        for e in range(self.epochs):
            x_train, y_train = shuffle(x_train, y_train, random_state=self.seed)
            for j in range(num_batches):
                start = j * batch_size
                end = (j + 1) * batch_size
                feed_dict = {self.obs_ph: x_train[start:end, :],
                             self.act_ph: y_train[start:end, :],
                             self.lr_ph: self.lr,
                             self.eps_ph: 0.0}
                _, l = self.sess.run([self.train_p_op, self.loss_p], feed_dict=feed_dict)
        feed_dict = {self.obs_ph: x_train, self.act_ph: y_train, self.lr_ph: self.lr, self.eps_ph: 0.0}
        loss_p = self.sess.run(self.loss_p, feed_dict=feed_dict)
        return loss_p
    
    def save_policy(self, path):
        saver = tf.train.Saver(self.variables)
        saver.save(self.sess, path)

    def restore_session(self, path):
        with self.g.as_default():
            saver = tf.train.import_meta_graph(path+".meta")
            saver.restore(self.sess, path)
        
    def close_sess(self):
        self.sess.close()

if __name__ == "__main__":
    policy_func = Policy(10,5)
    policy_func.close_sess()

In [3]:
class Value(object):
    def __init__(self, obs_dim, epochs=20, lr=1e-3,seed=0,GPU_ID=0):
        self.seed = seed
        self.GPU_ID=GPU_ID
        
        self.replay_buffer_x = None
        self.replay_buffer_y = None
    
        self.obs_dim = obs_dim
        self.epochs = epochs
        self.lr = lr
        
        with tf.device('/device:GPU:%d'%(self.GPU_ID)):
            self._build_graph()
        
    def _build_graph(self):
        self.g = tf.Graph()
        with self.g.as_default():
            self.obs_ph = tf.placeholder(tf.float32, (None, self.obs_dim), 'obs_valfunc')
            self.val_ph = tf.placeholder(tf.float32, (None,), 'val_valfunc')
            
            hid1_size = 64
            hid2_size = 64
            
            out = tf.layers.dense(self.obs_ph, hid1_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name="h1")
            out = tf.layers.dense(out, hid2_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name="h2")
            out = tf.layers.dense(out, 1,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name='output')
            self.out = tf.squeeze(out)
            self.loss = tf.reduce_mean(tf.square(self.out - self.val_ph))
            optimizer = tf.train.AdamOptimizer(self.lr)
            self.train_op = optimizer.minimize(self.loss)
            self.init = tf.global_variables_initializer()
            self.variables = tf.global_variables()
        self.sess = tf.Session(graph=self.g)
        self.sess.run(self.init)

    def fit(self, x, y, batch_size=32):
        num_batches = max(x.shape[0] // batch_size, 1)
        y_hat = self.predict(x)  
        
        if self.replay_buffer_x is None:
            x_train, y_train = x, y
        else:
            x_train = np.concatenate([x, self.replay_buffer_x])
            y_train = np.concatenate([y, self.replay_buffer_y])
        self.replay_buffer_x = x
        self.replay_buffer_y = y
        for e in range(self.epochs):
            x_train, y_train = shuffle(x_train, y_train, random_state=self.seed)
            for j in range(num_batches):
                start = j * batch_size
                end = (j + 1) * batch_size
                feed_dict = {self.obs_ph: x_train[start:end, :],
                             self.val_ph: y_train[start:end]}
                _, l = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
        y_hat = self.predict(x)
        loss = np.mean(np.square(y_hat - y))
        return loss

    def predict(self, x):
        feed_dict = {self.obs_ph: x}
        y_hat = self.sess.run(self.out, feed_dict=feed_dict)
        return np.squeeze(y_hat)

    def save_value(self, path):
        saver = tf.train.Saver(self.variables)
        saver.save(self.sess, path)
        
    def close_sess(self):
        self.sess.close()

if __name__ == "__main__":
    value_func = Value(10)
    value_func.close_sess()

In [4]:
class Reward(object):
    def __init__(self, obs_dim, act_dim, epochs=20, lr=3e-4, entcoeff=1e-3, seed=0, GPU_ID=0):
        self.replay_buffer_obs_act_gen = None
        self.seed = seed
        self.GPU_ID = 0
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.epochs = epochs
        self.entcoeff = entcoeff
        self.lr = lr  # learning rate set in _build_graph()
        
        with tf.device('/device:GPU:%d'%(self.GPU_ID)):
            self._build_graph()

    def _build_graph(self):
        """ Construct TensorFlow graph, including loss function, init op and train op """
        self.g = tf.Graph()
        with self.g.as_default():
            self.obs_act_exp_ph = tf.placeholder(tf.float32, (None, self.obs_dim + self.act_dim), 'obs_act_exp_rewfunc')
            self.obs_act_gen_ph = tf.placeholder(tf.float32, (None, self.obs_dim + self.act_dim), 'obs_act_gen_rewfunc')
            
            hid1_size = 32  # 10 chosen empirically on 'Hopper-v1'
            hid2_size = 32  # 10 chosen empirically on 'Hopper-v1'
            
            # 3 hidden layers with tanh activations
            out = tf.layers.dense(self.obs_act_exp_ph, hid1_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name="h1")
            out = tf.layers.dense(out, hid2_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name="h2")
            exp_logits = tf.layers.dense(out, 1,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name='output')
            self.exp_logits = tf.squeeze(exp_logits)
            
            out = tf.layers.dense(self.obs_act_gen_ph, hid1_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name="h1", reuse=True)
            out = tf.layers.dense(out, hid2_size, tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name="h2", reuse=True)
            gen_logits = tf.layers.dense(out, 1,
                                  kernel_initializer=tf.random_normal_initializer(
                                      stddev=0.01,seed=self.seed), name='output', reuse=True)
            self.gen_logits = tf.squeeze(gen_logits)

            self.generator_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(gen_logits) > 0.5))
            self.expert_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(exp_logits) < 0.5))
            
            generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=gen_logits, labels=tf.ones_like(gen_logits))
            generator_loss = tf.reduce_mean(generator_loss)
            expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=exp_logits, labels=tf.zeros_like(exp_logits))
            expert_loss = tf.reduce_mean(expert_loss)
            
            logits = tf.concat([gen_logits, exp_logits], 0)
            prob = tf.nn.sigmoid(logits)
            entropy = tf.reduce_mean(-prob*tf.log(tf.maximum(prob,1e-9))-(1-prob)*tf.log(tf.maximum(1-prob,1e-9))) 
            entropy_loss = -self.entcoeff*entropy
            # Loss + Accuracy terms
            
            self.loss = generator_loss + expert_loss + entropy_loss
            # Build Reward for policy
            self.reward = -tf.log(tf.maximum(tf.nn.sigmoid(gen_logits), 1e-9))
            
            optimizer = tf.train.AdamOptimizer(self.lr)
#             gvs = optimizer.compute_gradients(self.loss)
#             capped_gvs = [(tf.clip_by_value(grad, -10., 10.), var) for grad, var in gvs]
#             self.train_op = optimizer.apply_gradients(capped_gvs)
            self.train_op = optimizer.minimize(self.loss)
            self.init = tf.global_variables_initializer()
            self.variables = tf.global_variables()
        self.sess = tf.Session(graph=self.g)
        self.sess.run(self.init)

    def fit(self, obs_act_exp, obs_act_gen, batch_size=128):
        data_size = min(obs_act_exp.shape[0],obs_act_gen.shape[0])
        num_batches = max(data_size // batch_size, 1)
        batch_size = data_size // num_batches
        
        obs_act_exp_train = obs_act_exp
        if self.replay_buffer_obs_act_gen is None:
            obs_act_gen_train = obs_act_gen
        else:
            obs_act_gen_train = np.concatenate([obs_act_gen, self.replay_buffer_obs_act_gen])
        self.replay_buffer_obs_act_gen = obs_act_gen
        
        for e in range(self.epochs):
            obs_act_exp_train = shuffle(obs_act_exp_train, random_state=self.seed)
            obs_act_gen_train = shuffle(obs_act_gen_train, random_state=self.seed)
            for j in range(num_batches):
                start = j * batch_size
                end = (j + 1) * batch_size
                feed_dict = {self.obs_act_gen_ph:obs_act_gen_train[start:end,:], self.obs_act_exp_ph:obs_act_exp_train[start:end,:]}
                self.sess.run(self.train_op, feed_dict=feed_dict)
        feed_dict = {self.obs_act_gen_ph:obs_act_gen_train, self.obs_act_exp_ph:obs_act_exp_train}
        loss, gen_acc, exp_acc = self.sess.run([self.loss,self.generator_acc,self.expert_acc], feed_dict=feed_dict)
        return loss, gen_acc, exp_acc
    
    def predict(self, x):
        """ Predict method """
        feed_dict = {self.obs_act_gen_ph: x}
        rew_hat = self.sess.run(self.reward, feed_dict=feed_dict)

        return np.squeeze(rew_hat)

    def save_reward(self, path):
        saver = tf.train.Saver(self.variables)
        saver.save(self.sess, path)
        
    def close_sess(self):
        """ Close TensorFlow session """
        self.sess.close()

In [5]:
def discount(x, gamma):
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]


def add_disc_sum_rew(trajectories, gamma):
    for trajectory in trajectories:
        if gamma < 0.999:  # don't scale for gamma ~= 1
            rewards = trajectory['rewards'] * (1 - gamma)
        else:
            rewards = trajectory['rewards']
        disc_sum_rew = discount(rewards, gamma)
        trajectory['disc_sum_rew'] = disc_sum_rew

def add_rew(trajectories, rew_func):
    for trajectory in trajectories:
        observes = trajectory['observes']
        actions = trajectory['actions']
        observes_actions = np.concatenate([observes,actions],axis=1)
        trajectory['rewards'] = rew_func.predict(observes_actions)
    return trajectories

def add_value(trajectories, val_func):
    for trajectory in trajectories:
        observes = trajectory['observes']
        values = val_func.predict(observes)
        trajectory['values'] = values

def add_gae(trajectories, gamma, lam):
    for trajectory in trajectories:
        if gamma < 0.999:  # don't scale for gamma ~= 1
            rewards = trajectory['rewards'] * (1 - gamma)
        else:
            rewards = trajectory['rewards']
        values = trajectory['values']
        # temporal differences
        tds = rewards - values + np.append(values[1:] * gamma, 0)
        advantages = discount(tds, gamma * lam)
        trajectory['advantages'] = advantages

def build_train_set(trajectories):
    observes = np.concatenate([t['observes'] for t in trajectories])
    actions = np.concatenate([t['actions'] for t in trajectories])
    disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories])
    advantages = np.concatenate([t['advantages'] for t in trajectories])
    # normalize advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

    return observes, actions, advantages, disc_sum_rew

def build_train_set_for_rew(trajectories,demonstrations):
    
    real_observes = np.concatenate([d['observes'] for d in demonstrations])
    real_actions = np.concatenate([d['actions'] for d in demonstrations])
    obs_act_exp = np.concatenate([real_observes,real_actions],axis=1)
    
    fake_observes = np.concatenate([t['observes'] for t in trajectories])
    fake_actions = np.concatenate([t['actions'] for t in trajectories])
    obs_act_gen = np.concatenate([fake_observes,fake_actions],axis=1)
    return obs_act_exp, obs_act_gen

def run_episode(env, policy, animate=False):
    obs = env.reset()
    observes, actions, rewards, infos = [], [], [], []
    done = False
    while not done:
        if animate:
            env.render()
        obs = obs.astype(np.float32).reshape((1, -1))
        observes.append(obs)
        action = policy.sample(obs).reshape((1, -1)).astype(np.float32)
        actions.append(action)
        obs, reward, done, info = env.step(action)
        if not isinstance(reward, float):
            reward = np.asscalar(reward)
        rewards.append(reward)
        infos.append(info['goal_id'])
        
    return (np.concatenate(observes), np.concatenate(actions),
            np.array(rewards, dtype=np.float32), infos)

def run_policy(env, policy, episodes):
    total_steps = 0
    trajectories = []
    for e in range(episodes):
        observes, actions, rewards, infos = run_episode(env, policy)
        total_steps += observes.shape[0]
        trajectory = {'observes': observes,
                      'actions': actions,
                      'true_rewards': rewards,
                      'infos': infos}
        trajectories.append(trajectory)
    return trajectories

def evaluation(env, policy, max_eval_epi=100, seed=0):
    return_list = np.zeros((max_eval_epi,))
    info_list = []

    env.seed(seed)
    for epi in range(max_eval_epi):
        obs = env.reset()
        env_infos_epi_list = {"pos": [],"goal_id": []}
        observes, actions, rewards, infos = run_episode(env, policy)

        pos_list = []
        goal_id_list = []
        for info in infos:
            pos_list.append(info["pos"])
            goal_id_list.append(info["goal_id"])
        env_infos_epi_list["pos"] = np.asarray(pos_list)
        env_infos_epi_list["goal_id"] = np.asarray(goal_id_list)

        info_list.append({"env_infos":env_infos_epi_list})
        return_list[epi] = np.sum(rewards)
#     print("Evaluation Result: {}".format(np.mean(return_list)))
    return return_list, info_list

In [None]:
def train_imitation_learning_timechecker(seed,kl_targ,entcoeff,n_mixture,demo_size,mdn_weight,GPU_ID=0,verbose=True,
                             gamma = 0.995,lam = 0.98,max_std=0.1,episode_size = 500,batch_size = 1024,
                             nupdates = 20,save_iter=10,min_save_iter=20):
        
        import time
        start=time.time()
        demo_file = open('./multigoal_expert_demo.pkl', 'r')
        demonstrations, = pickle.load(demo_file)
        demonstrations = shuffle(demonstrations,random_state=seed)[:demo_size]
        
        demo_observes = []
        demo_actions = []
        for demonstration in demonstrations:
            for obs in demonstration['observes']:
                demo_observes.append(obs)
            for act in demonstration['actions']:
                demo_actions.append(act)
        demo_observes=np.asarray(demo_observes)
        demo_actions=np.asarray(demo_actions)
        exp_ret = np.mean([np.sum(t['rewards']) for t in demonstrations])
        print("Load Demo : {}".format(time.time()-start))
        
        start = time.time()
        env = multigoal.MultiGoalEnv(nr_goal=4)
        obs_dim = env.observation_space.shape[0]
        act_dim = env.action_space.shape[0]
        print("Set Env : {}".format(time.time()-start))
        
        start = time.time()
        saver_prefix="./results/seed:{},kl:{:.2e},entcoeff:{:.2e},weight:{},mixture:{:d},epi_size:{}".format(seed,kl_targ,entcoeff,mdn_weight,n_mixture,episode_size)

        policy = Policy(obs_dim, act_dim,kl_targ=kl_targ,entcoeff=entcoeff,
                        mdn_weight=mdn_weight,n_mixture=n_mixture,max_std=max_std,eps=0.0,
                        seed=seed,GPU_ID=GPU_ID)
        val_func = Value(obs_dim,seed=seed,GPU_ID=GPU_ID)
        rew_func = Reward(obs_dim,act_dim,seed=seed,GPU_ID=GPU_ID)
        print("Set Network func : {}".format(time.time()-start))

        
        for update in range(nupdates+1):
            
            start = time.time()
            trajectories = run_policy(env, policy, episodes=episode_size)
            print("Run policy : {}".format(time.time()-start))
            
            start = time.time()
            add_rew(trajectories,rew_func)
            add_value(trajectories, val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
            obs_act_exp, obs_act_gen = build_train_set_for_rew(trajectories,demonstrations)
            print("Prepare demo : {}".format(time.time()-start))

            start = time.time()
            pol_loss, pol_kl, pol_entropy = policy.update(observes, actions, advantages,batch_size=batch_size)
            print("Update policy : {}".format(time.time()-start))  # update policy
            
            start = time.time()
            vf_loss = val_func.fit(observes, disc_sum_rew,batch_size=batch_size)  # update value function
            print("Update value : {}".format(time.time()-start))  # update policy
            
            start = time.time()
            rew_loss, gen_acc, exp_acc = rew_func.fit(obs_act_exp, obs_act_gen, batch_size=batch_size)
            print("Update reward : {}".format(time.time()-start))  # update policy
            
            if np.isnan(rew_loss) or np.isnan(pol_loss):
                policy.restore_session(saver_prefix+update_prefix+"_policy.ckpt")
                break

            if ((update%save_iter)==0) and (update>=min_save_iter):
                update_prefix = ",iter:{}".format(update)
                policy.save_policy(saver_prefix+update_prefix+"_policy.ckpt")
                val_func.save_value(saver_prefix+update_prefix+"_value.ckpt")
                rew_func.save_reward(saver_prefix+update_prefix+"_reward.ckpt")

                mean_ret = np.mean([np.sum(t['true_rewards']) for t in trajectories])
                train_ret = np.mean([np.sum(t['rewards']) for t in trajectories])
                if verbose:
                    print('[{}/{}] Mean Ret : {:.3f}/{:.3f}, Train Ret : {:.3f}, Rew Acc : [{:.3f},{:.3f}], Rew Loss : {:.3f}, Value Loss : {:.3f}, Policy loss : {:.5f}, Policy KL : {:.3f}, Policy Entropy : {:.3f}, Eps : {:.3f} ***'.format(
                        update, nupdates, mean_ret, exp_ret, train_ret, gen_acc, exp_acc, rew_loss, vf_loss, pol_loss, pol_kl, pol_entropy, policy.eps))
        
        return_list, info_list = evaluation(env, policy, max_eval_epi=100)
        evaluation_results = { "return_list": return_list, "info_list": info_list }
        pickle.dump( evaluation_results, open( saver_prefix+".pickle", "wb" ) )
        if verbose:
                print("Task is done with return: {:.3f}/{:.3f}".format(mean_ret,exp_ret))

if __name__ == "__main__":
    seed = 0
    kl_targ = 0.003
    entcoeff = 1e-3
    mdn_weight = "sparsemax"
    n_mixture = 4
    demo_size=400
#     train_imitation_learning_timechecker(seed,kl_targ,entcoeff,n_mixture,demo_size,mdn_weight,GPU_ID=0,verbose=True)

In [None]:
def train_imitation_learning(seed,kl_targ,entcoeff,n_mixture,demo_size,mdn_weight,GPU_ID=0,verbose=True,
                             gamma = 0.995,lam = 0.98,max_std=0.5,episode_size = 500,batch_size = 512,
                             nupdates = 300,save_iter=100,min_save_iter=200):
        np.random.seed = seed
        tf.set_random_seed(seed)
        demo_file = open('./multigoal_expert_demo.pkl', 'r')
        demonstrations, = pickle.load(demo_file)
        demonstrations = shuffle(demonstrations,random_state=seed)[:demo_size]
        
        demo_observes = []
        demo_actions = []
        for demonstration in demonstrations:
            for obs in demonstration['observes']:
                demo_observes.append(obs)
            for act in demonstration['actions']:
                demo_actions.append(act)
        demo_observes=np.asarray(demo_observes)
        demo_actions=np.asarray(demo_actions)
        exp_ret = np.mean([np.sum(t['rewards']) for t in demonstrations])
        
        env = multigoal.MultiGoalEnv(nr_goal=4)
        env.seed(seed)
        obs_dim = env.observation_space.shape[0]
        act_dim = env.action_space.shape[0]
        
        saver_prefix="./results/seed:{},kl:{:.2e},entcoeff:{:.2e},weight:{},mixture:{:d},epi_size:{}".format(seed,kl_targ,entcoeff,mdn_weight,n_mixture,episode_size)

        policy = Policy(obs_dim, act_dim,kl_targ=kl_targ,entcoeff=entcoeff,
                        mdn_weight=mdn_weight,n_mixture=n_mixture,max_std=max_std,
                        seed=seed,GPU_ID=GPU_ID)
        val_func = Value(obs_dim,seed=seed,GPU_ID=GPU_ID)
        rew_func = Reward(obs_dim,act_dim,seed=seed,GPU_ID=GPU_ID)
        mean_ret_list = []
        info_list = []
        train_rewards = True
        counter = 0
        for update in range(nupdates+1):
            
            trajectories = run_policy(env, policy, episodes=episode_size)
            
            add_rew(trajectories,rew_func)
            add_value(trajectories, val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
            obs_act_exp, obs_act_gen = build_train_set_for_rew(trajectories,demonstrations)

            pol_loss, pol_kl, pol_entropy = policy.update(observes, actions, advantages,batch_size=batch_size)  # update policy
            vf_loss = val_func.fit(observes, disc_sum_rew,batch_size=batch_size)  # update value function
            
            mean_ret = np.mean([np.sum(t['true_rewards']) for t in trajectories])
            mean_ret_list.append(mean_ret)
            train_ret = np.mean([np.sum(t['rewards']) for t in trajectories])
            info_list.append([t['infos'] for t in trajectories])
            if train_rewards:
                rew_loss, gen_acc, exp_acc = rew_func.fit(obs_act_exp, obs_act_gen, batch_size=batch_size)
                if gen_acc > 0.95 and exp_acc > 0.95:
                    counter+=1
                    if counter > 2:
                        train_rewards = False
                    if verbose:
                        print('[{}/{}] Mean Ret : {:.3f}/{:.3f}, Train Ret : {:.3f}, Rew Acc : [{:.3f},{:.3f}], Rew Loss : {:.3f}, Value Loss : {:.3f}, Policy loss : {:.5f}, Policy KL : {:.3f}, Policy Entropy : {:.3f}, Eps : {:.3f} ***'.format(
                            update, nupdates, mean_ret, exp_ret, train_ret, gen_acc, exp_acc, rew_loss, vf_loss, pol_loss, pol_kl, pol_entropy, policy.eps))
                        print(train_rewards)
            
#             if np.isnan(rew_loss) or np.isnan(pol_loss):
#                 policy.restore_session(saver_prefix+update_prefix+"_policy.ckpt")
#                 break

#             if ((update%save_iter)==0) and (update>=min_save_iter):
#                 update_prefix = ",iter:{}".format(update)
#                 policy.save_policy(saver_prefix+update_prefix+"_policy.ckpt")
#                 val_func.save_value(saver_prefix+update_prefix+"_value.ckpt")
#                 rew_func.save_reward(saver_prefix+update_prefix+"_reward.ckpt")
            
            if verbose and ((update%20)==0):
                print('[{}/{}] Mean Ret : {:.3f}/{:.3f}, Train Ret : {:.3f}, Rew Acc : [{:.3f},{:.3f}], Rew Loss : {:.3f}, Value Loss : {:.3f}, Policy loss : {:.5f}, Policy KL : {:.3f}, Policy Entropy : {:.3f}, Eps : {:.3f} ***'.format(
                    update, nupdates, mean_ret, exp_ret, train_ret, gen_acc, exp_acc, rew_loss, vf_loss, pol_loss, pol_kl, pol_entropy, policy.eps))
        
#         return_list, info_list = evaluation(env, policy, max_eval_epi=100)
#         env.close()
        evaluation_results = { "mean_ret_list": mean_ret_list, "info_list": info_list }
        pickle.dump( evaluation_results, open( saver_prefix+".pickle", "wb" ) )
        if verbose:
                print("Task is done with return: {:.3f}/{:.3f}".format(mean_ret,exp_ret))

if __name__ == "__main__":
    seed = 1
    kl_targ = 0.003
    entcoeff = 1e-3
    mdn_weight = "sparsemax"
    n_mixture = 4
    demo_size=300
    train_imitation_learning(seed,kl_targ,entcoeff,n_mixture,demo_size,mdn_weight,episode_size=500,nupdates=300,GPU_ID=0,verbose=True)

[0/300] Mean Ret : -54.280/1347.323, Train Ret : 14.556, Rew Acc : [0.778,0.622], Rew Loss : 1.267, Value Loss : 0.000, Policy loss : 0.11083, Policy KL : 0.000, Policy Entropy : -110.921, Eps : 0.090 ***
[20/300] Mean Ret : -69.337/1347.323, Train Ret : 8.387, Rew Acc : [0.780,0.932], Rew Loss : 0.648, Value Loss : 0.000, Policy loss : 0.27406, Policy KL : 0.001, Policy Entropy : -277.474, Eps : 0.090 ***
[40/300] Mean Ret : 194.873/1347.323, Train Ret : 5.549, Rew Acc : [0.902,0.993], Rew Loss : 0.357, Value Loss : 0.000, Policy loss : 0.61043, Policy KL : 0.003, Policy Entropy : -622.312, Eps : 0.090 ***
[60/300] Mean Ret : 329.457/1347.323, Train Ret : 3.869, Rew Acc : [0.929,0.986], Rew Loss : 0.276, Value Loss : 0.000, Policy loss : 0.71231, Policy KL : 0.001, Policy Entropy : -716.706, Eps : 0.090 ***
[80/300] Mean Ret : 366.441/1347.323, Train Ret : 5.578, Rew Acc : [0.890,0.935], Rew Loss : 0.425, Value Loss : 0.000, Policy loss : 1.06067, Policy KL : 0.002, Policy Entropy : -