#Initialization

In [66]:
%%capture
! pip install stable-baselines3

In [67]:
import math
import random
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn.preprocessing
from sklearn.kernel_approximation import RBFSampler

In [68]:
use_cuda = torch.cuda.is_available()
device   = torch.device("cuda" if use_cuda else "cpu")

In [69]:
samples = [[-0.6810, -0.7323, -0.3519],
        [ 0.9754, -0.2205, -0.9598],
        [ 0.8309, -0.5564, -0.8232],
        [ 0.4757,  0.8796,  0.8216],
        [-0.9221, -0.3869,  0.8180],
        [-0.9108, -0.4129,  0.2524],
        [-0.1120, -0.9937, -0.8391],
        [ 0.5760,  0.8175,  0.6597],
        [ 0.4632, -0.8863, -0.1590],
        [ 0.5347,  0.8450, -0.4712],
        [ 0.5242,  0.8516, -0.9269],
        [-0.8494, -0.5277,  0.9623],
        [ 0.8243, -0.5661, -0.8888],
        [-0.9457, -0.3252, -0.5394],
        [ 0.9998, -0.0188, -0.7901],
        [-0.9918, -0.1282,  0.9201]]

num_envs = 16

env1_name = "Pendulum-v1"
env2_name = "MountainCarContinuous-v0"

env1 = gym.make(env1_name)
env2 = gym.make(env2_name)

envs1 = make_vec_env(env1_name, n_envs=num_envs)
envs2 = make_vec_env(env2_name, n_envs=num_envs)

p_envs_ids = [env1_name, env2_name ]
p_envs = {env1_name: envs1,
          env2_name: envs2
          }

#model_ac = ActorCritic(envs=p_envs, env_ids=p_envs_ids, hidden_size=256).to(device)
#model_input = model_ac.create_model_input(samples, env1_name)

#dist, value = model_ac(model_input)
#print(f"dist: {dist}, val: {value}")

#Auxiliar function

In [70]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        nn.init.constant_(m.bias, 0.1)

def plot(name, frame_idx, rewards):
    clear_output(True)
    plt.figure(figsize=(20,5))
    plt.subplot(131)
    plt.title(name + ': ' + 'frame %s. reward: %s' % (frame_idx, rewards[-1]))
    plt.plot(rewards)
    plt.show()

def test_env(model, env, id_env, max_step, vis=False):

    state = torch.FloatTensor(env.reset()[0]).unsqueeze(0)
    if vis: env.render()
    done = False
    totalreward = 0
    step = 0
    while not done:
        #state = torch.FloatTensor(state).unsqueeze(0).to(device)
        #print(state)
        #state = torch.FloatTensor(state).to(device)
        #dist,  = model(model_input)
        state = np.array(state)
        model_input = model.create_model_input(state, idenv)
        dist,  = model(model_input)
        nextstate, reward, terminated, truncated,  = env.step(dist.sample().cpu().numpy())
        done = terminated or truncated


        #state = torch.FloatTensor([next_state.squeeze(-1)])#.to(device)
        #state = np.expand_dims( np.array(next_state.squeeze(-1)), 0 )
        #print(f"sq: {[next_state.squeeze(-1)]}")
        #state = torch.FloatTensor([next_state.squeeze(-1)])#.to(device)
        if vis: env.render()
        total_reward += reward
        step += 1
        if step >= max_step:
            break

    return total_reward

def test_env(model, env, id_env, vis=False):

    state = torch.FloatTensor(env.reset()[0]).unsqueeze(0)
    if vis: env.render()
    done = False
    total_reward = 0
    while not done:
        #state = torch.FloatTensor(state).unsqueeze(0).to(device)
        #print(state)
        #state = torch.FloatTensor(state).to(device)
        #dist, _ = model(model_input)
        state = np.array(state)
        model_input = model.create_model_input(state, id_env)
        dist, _ = model(model_input)
        next_state, reward, terminated, truncated, _ = env.step(dist.sample().cpu().numpy())
        done = terminated or truncated


        #state = torch.FloatTensor([next_state.squeeze(-1)])#.to(device)
        state = np.expand_dims( np.array(next_state.squeeze(-1)), 0 )
        #print(f"sq: {[next_state.squeeze(-1)]}")
        #state = torch.FloatTensor([next_state.squeeze(-1)])#.to(device)
        if vis: env.render()
        total_reward += reward
    return total_reward

p_env1_name = "Pendulum-v1"
p_env2_name = "MountainCarContinuous-v0"

p_env1 = gym.make(p_env1_name)
p_env2 = gym.make(p_env2_name)

#test_env(model_PC, env1, env1_name )
#test_env(model_PC, env2, env2_name)

# **General Advantage Estimator**

In [71]:
def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns

# Neural Network

In [72]:
class RBFFeatureEncoder:
    def __init__(self, env1, env2, env3, n_component=100):

        self.env1 = env1
        self.env2 = env2
        self.env3 = env3

        data1 = np.array([env1.observation_space.sample() for x in range(10000)]); data1 = np.clip(data1, -1e38, 1e38)
        data2 = np.array([env2.observation_space.sample() for x in range(10000)]); data2 = np.clip(data2, -1e38, 1e38)
        data3 = np.array([env3.observation_space.sample() for x in range(10000)]); data3 = np.clip(data3, -1e38, 1e38)

        self.rbf_sampler1 = RBFSampler(gamma=0.999, n_components=n_component)
        self.rbf_sampler2 = RBFSampler(gamma=0.999, n_components=n_component)
        self.rbf_sampler3 = RBFSampler(gamma=0.999, n_components=n_component)

        self.standard_scaler1 = sklearn.preprocessing.StandardScaler()
        self.standard_scaler2 = sklearn.preprocessing.StandardScaler()
        self.standard_scaler3 = sklearn.preprocessing.StandardScaler()

        self.standard_scaler1.fit(data1)
        self.standard_scaler2.fit(data2)
        self.standard_scaler3.fit(data3)

        transformed_data1 = self.standard_scaler1.transform(data1)
        transformed_data2 = self.standard_scaler2.transform(data2)
        transformed_data3 = self.standard_scaler3.transform(data3)

        self.rbf_sampler1.fit( transformed_data1 )
        self.rbf_sampler2.fit( transformed_data2 )
        self.rbf_sampler3.fit( transformed_data3 )

    def wrapper_encode(self, model, model_input):

        state = model_input['state']
        env_id = model_input['env_id']

        if model.env1_id == env_id:
            transformed_state = self.standard_scaler1.transform([state])
            encoded_state = self.rbf_sampler1.transform(transformed_state).flatten()

        elif model.env2_id == env_id:
            transformed_state = self.standard_scaler2.transform([state])
            encoded_state = self.rbf_sampler2.transform(transformed_state).flatten()

        elif model.env3_id == env_id:
            transformed_state = self.standard_scaler3.transform([state])
            encoded_state = self.rbf_sampler3.transform(transformed_state).flatten()

        return encoded_state

    def encode(self, model, model_input):

        states = model_input['state']
        env_id = model_input['env_id']
        encoded_states = []

        for s in states:
            new_input = model.create_model_input(s,env_id )
            encoded_states.append( self.wrapper_encode(model, new_input) )

        return torch.tensor(encoded_states, dtype=torch.float32)


    @property
    def size(self):
        return self.rbf_sampler1.n_components

In [73]:
class Critic(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Critic, self).__init__()

        self.relu = nn.ReLU()
        self.input_layer = nn.Linear(input_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, 1)

    def forward(self, x):
        state  = x['state']

        state = self.input_layer(state)
        state = self.relu(state)
        state = self.output_layer(state)
        return state


In [74]:
class Actor(nn.Module):
    def __init__(self, input_size, env_outputs, hidden_size, env_ids):
        super(Actor, self).__init__()

        self.env1_id = env_ids[0]
        self.env2_id = env_ids[1]
        self.env1_actions = env_outputs[0]
        self.env2_actions = env_outputs[1]

        self.relu = nn.ReLU()

        self.input_layer = nn.Linear(input_size, hidden_size)

        self.env1_output = nn.Linear(hidden_size, self.env1_actions)
        self.env2_output = nn.Linear(hidden_size, self.env2_actions)

    def forward(self, x):
        state  = x['state']
        env_id = x['env_id']

        state = self.input_layer(state)
        state = self.relu(state)

        if env_id == self.env1_id:   mu = self.env1_output(state)
        elif env_id == self.env2_id: mu = self.env2_output(state)

        return mu


In [75]:
class ActorCritic(nn.Module):
    def __init__(self, envs, env_ids, hidden_size, std=0.0, lr=0.0001):
        super(ActorCritic, self).__init__()

        self.env1_id = env_ids[0]
        self.env2_id = env_ids[1]

        self.envs1   = envs[self.env1_id]
        self.envs2   = envs[self.env2_id]

        self.env1_output = self.envs1.action_space.shape[0]
        self.env2_output = self.envs2.action_space.shape[0]

        self.encoder    = RBFFeatureEncoder(self.envs1, self.envs2, self.envs2)
        self.input_size = self.encoder.size

        #print(f"num_inputs1: {self.input_size}, num_outputs_1: {num_outputs_1}, env_id: {env1_id}")
        #print(f"num_inputs2: {self.input_size}, num_outputs_2: {num_outputs_2}, env_id: {env2_id}")


        self.critic = Critic(input_size=self.input_size,
                             hidden_size=hidden_size
                             )

        self.actor = Actor(input_size=self.input_size,
                           env_ids=env_ids,
                           env_outputs=[self.env1_output, self.env2_output ],
                           hidden_size=hidden_size
                           )

        self.log_std_env1 = nn.Parameter(torch.ones(1, self.env1_output) * std)
        self.log_std_env2 = nn.Parameter(torch.ones(1, self.env2_output) * std)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)

        self.apply(init_weights)

    def forward(self, x):

        state  = x['state']
        env_id = x['env_id']

        state = self.encoder.encode(self, x).to(device)
        model_input = self.create_model_input(state, env_id)

        value = self.critic(model_input)
        mu    = self.actor(model_input)

        if env_id == self.env1_id:   std = self.log_std_env1.exp().expand_as(mu)
        elif env_id == self.env2_id: std = self.log_std_env2.exp().expand_as(mu)

        dist  = Normal(mu, std)
        return dist, value

    def create_model_input(self, state, env_id):
         return {'state':state, 'env_id': env_id}

# **Losses**

In [76]:
kldiv_loss = nn.KLDivLoss(reduction="batchmean")

#Loss function

In [84]:
def ppo_iter(hidden_network, mini_batch_size, states, actions, log_probs, returns, advantage):
    batch_size = states['network_PC'].size(0)
    mini_batch_states    = {}
    mini_batch_actions   = {}
    mini_batch_log_probs = {}
    mini_batch_returns   = {}
    mini_batch_advantage = {}
    for net in hidden_network.keys():
            mini_batch_states[net]    = []
            mini_batch_actions[net]   = []
            mini_batch_log_probs[net] = []
            mini_batch_returns[net]   = []
            mini_batch_advantage[net] = []

    for _ in range(batch_size // mini_batch_size):
        rand_ids = np.random.randint(0, batch_size, mini_batch_size)
        for net in hidden_network.keys():
            mini_batch_states[net].append(states[net][rand_ids, :])
            mini_batch_actions[net].append(actions[net][rand_ids, :])
            mini_batch_log_probs[net].append(log_probs[net][rand_ids, :])
            mini_batch_returns[net].append(returns[net][rand_ids, :])
            mini_batch_advantage[net].append(advantage[net][rand_ids, :])

    ret_dict = {'states': mini_batch_states,
                'actions': mini_batch_actions,
                'log_probs':mini_batch_log_probs,
                'returns': mini_batch_returns,
                'advantage': mini_batch_advantage
                }

    return ret_dict, batch_size

def ppo_update(hidden_network, env_id, ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=0.2, beta=0.5, omega=4, omega12=1):
    for _ in range(ppo_epochs):
        ret_dict, batch_size = ppo_iter(hidden_network, mini_batch_size, states, actions, log_probs, returns, advantages)
        mini_batch_states = ret_dict['states']
        mini_batch_actions = ret_dict['actions']
        mini_batch_old_log_probs = ret_dict['log_probs']
        mini_batch_returns = ret_dict['returns']
        mini_batch_advantage = ret_dict['advantage']
        for idx in range(batch_size // mini_batch_size):
            new_log_probs = {}
            model_input   = {}
            dist          = {}
            value         = {}
            ppo_loss = 0
            casc_loss = 0
            net_id = 'network'
            for i in range(cascade_nets):
                if i == 0: NET = net_id + '_PC'
                else: NET = net_id + '_' + str(i+1)

                model_input[NET] = hidden_network[NET].create_model_input(np.array(mini_batch_states['network_PC'][idx]), env_id)
                dist[NET], value[NET] = hidden_network[NET](model_input[NET])
                new_log_probs[NET] = dist[NET].log_prob(mini_batch_actions[NET][idx])
                entropy[NET] += dist[NET].entropy().mean()

                ppo_loss += -beta*(omega**(i))*kldiv_loss(mini_batch_old_log_probs[NET][idx], new_log_probs[NET].exp())

                if i == 0:
                    casc_loss += -(omega12*kldiv_loss(mini_batch_old_log_probs[net_id + '_' + str(i+2)], new_log_probs[NET].exp()))
                elif i == cascade_nets-1:
                    casc_loss += -(omega*kldiv_loss(mini_batch_old_log_probs[net_id + '_' + str(i)][idx], new_log_probs[NET].exp())+kldiv_loss(new_log_probs[NET], new_log_probs[NET].exp()) )
                else:
                    casc_loss += -(omega*kldiv_loss(mini_batch_old_log_probs[net_id + '_' + str(i)][idx], new_log_probs[NET].exp())+kldiv_loss(mini_batch_old_log_probs[net_id + '_' + str(i+2)][idx], new_log_probs[NET].exp()) )

            #model_input = model.create_model_input(state, env_id)
            #dist, value = model(model_input)
            #entropy = dist.entropy().mean()
            #new_log_probs = dist.log_prob(action)


            ratio = (new_log_probs['network_PC'] - mini_batch_old_log_probs['network_PC'][idx]).exp()
            surr1 = ratio * mini_batch_advantage['network_PC'][idx]
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * mini_batch_advantage['network_PC'][idx]

            actor_loss  = - torch.min(surr1, surr2).mean()
            critic_loss = (mini_batch_returns['network_PC'][idx] - value['network_PC']).pow(2).mean()

            pg_loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy['network_PC']

            total_loss = ppo_loss.mean() + casc_loss.mean() + pg_loss

            for net in hidden_network.keys():
                hidden_network[net].optimizer.zero_grad()

            total_loss.backward()

            for net in hidden_network.keys():
                hidden_network[net].optimizer.step()




#Main program

##Initialization for training

In [78]:
num_envs = 16

env1_id = "Pendulum-v1"
env2_id = "MountainCarContinuous-v0"
env_ids = [env1_id, env2_id ]

env1 = gym.make(env1_id)
env2 = gym.make(env2_id)
single_envs = { env1_id: env1,
                env2_id: env2
               }

num_inputs_1  = env1.observation_space.shape[0];    num_outputs_1 = env1.action_space.shape[0]
num_inputs_2  = env2.observation_space.shape[0];    num_outputs_2 = env2.action_space.shape[0]

print(f"num_inputs1: {num_inputs_1}, num_outputs_1: {num_outputs_1}, env_id: {env1_id}")
print(f"num_inputs2: {num_inputs_2}, num_outputs_2: {num_outputs_2}, env_id: {env2_id}")

envs1 = make_vec_env(env1_id, n_envs=num_envs)
envs2 = make_vec_env(env2_id, n_envs=num_envs)

vectorized_envs = { env1_id: envs1,
                    env2_id: envs2
                    }



num_inputs1: 3, num_outputs_1: 1, env_id: Pendulum-v1
num_inputs2: 2, num_outputs_2: 1, env_id: MountainCarContinuous-v0


# Training

In [79]:
def swtich_enviroment( vectorized_envs, env_ids, env_index):
    env_index += 1
    new_id_env = env_ids[ env_index % 2 ]   # -------- the switch is only for the first two environments
    new_env = vectorized_envs[ new_id_env ]
    return new_env, new_id_env, env_index

In [80]:
def train(hidden_network, main_env, vect_envs, train_id_env, state, max_frames, test_rewards, f_idx):
    frame_idx = f_idx
    while frame_idx < max_frames and not early_stop:
        print(f"\rit: {frame_idx}", end="")
        log_probs = []
        values    = []
        states    = []
        actions   = []
        rewards   = []
        masks     = []
        entropy = 0

        for _ in range(num_steps):

            model_input = main_model.create_model_input(state, train_id_env)
            dist, value = main_model(model_input)

            action = dist.sample()
            next_state, reward, done, _ = vect_envs.step(action.cpu().numpy())

            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1-done).unsqueeze(1).to(device))

            states.append(torch.FloatTensor(model_input['state'] ).to(device) )
            actions.append(action)

            state = next_state
            frame_idx += 1

            if frame_idx % plot_frequency == 0:
                test_reward = np.mean([test_env(main_model, main_env, train_id_env) for _ in range(10)])
                test_rewards.append(test_reward)
                plot(train_id_env,frame_idx, test_rewards) # we cant use earlystop with env Mountaincar


        model_input = main_model.create_model_input(state, train_id_env)
        _, next_value = main_model(model_input)

        returns = compute_gae(next_value, rewards, masks, values)

        returns   = torch.cat(returns).detach()
        log_probs = torch.cat(log_probs).detach()
        values    = torch.cat(values).detach()
        states    = torch.cat(states)
        actions   = torch.cat(actions)
        advantage = returns - values


        ppo_update(main_model, optimizer, train_id_env, ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)

# **Settings an dHyperparameters**

In [81]:
#Hyper params:
hidden_size      = 256
lr               = 3e-4
num_steps        = 20
mini_batch_size  = 5
ppo_epochs       = 4
threshold_reward = 0

epoch = 18
train_per_epoch = 30000

swtich_counter = 0
switch_env_frequency = 3

cascade_nets = 4
hidden_network = {}
base_id = 'network'

for i in range(cascade_nets):
    if i == 0: hidden_id = base_id + '_PC'
    else: hidden_id = base_id + '_' + str(i+1)
    hidden_network[hidden_id] = ActorCritic(envs=vectorized_envs, env_ids=env_ids, hidden_size=256, lr=lr).to(device)
print(hidden_network.keys())

dict_keys(['network_PC', 'network_2', 'network_3', 'network_4'])


# **Inizio Training**

In [85]:
#early_stop = False
test_rewards = []

env_states = {}     # contain state to pass when callicng act
env_steps  = {}     # counter for steps
env_reward = {}     # counter for reward
env_index  = 1

# Reset of environments and initializing states
for id_env in vectorized_envs.keys():
    env_states[id_env] = vectorized_envs[id_env].reset()
    env_reward[id_env] = []
    env_steps[id_env]   = 0

actual_id_env = env_ids[env_index]
actual_vectorized_env = vectorized_envs[actual_id_env]
actual_single_env = single_envs[actual_id_env]

#optimizer = optim.Adam(model_PC.parameters(), lr=lr)  #: ----> Optimizer declared inside class ActorCritic()

for e in range(epoch):
    if e > 1 and ( e %  switch_env_frequency == 0):
        #print("entered")
        actual_vectorized_env, actual_id_env, env_index = swtich_enviroment(vectorized_envs, env_ids, env_index)
        actual_single_env=single_envs[actual_id_env]
        swtich_counter += 1

    print(f"e: {e}, actual_id_env: {actual_id_env}, env_index: {env_index}, switch: {swtich_counter}")

    actual_state = env_states[actual_id_env]
    frame_idx = 0
    while frame_idx < train_per_epoch:
        log_probs   = {}
        values      = {}
        states      = {}
        actions     = {}
        rewards     = {}
        masks       = {}
        model_input = {}
        dist        = {}
        value       = {}
        action      = {}
        next_state  = {}
        reward      = {}
        done        = {}
        log_prob    = {}
        entropy     = {}
        next_value  = {}
        returns     = {}
        advantage   = {}

        for net in hidden_network.keys():
            entropy[net]   = 0
            log_probs[net] = []
            values[net]    = []
            states[net]    = []
            actions[net]   = []
            rewards[net]   = []
            masks[net]     = []

        for _ in range(num_steps):

            for net in hidden_network.keys():
              state = env_states[actual_id_env]
              model_input[net] = hidden_network[net].create_model_input(state, actual_id_env)
              dist[net], value[net] = hidden_network[net](model_input[net])

              action[net] = dist[net].sample()
              next_state[net], reward[net], done[net], _ = actual_vectorized_env.step(action[net].cpu().numpy())
              #print(f"\nterminated: {terminated}, truncated: {truncated}")
              #done[net] = [terminated[i] or truncated[i]['TimeLimit.truncated'] for i in range(num_envs)]
              #print(f"\ndone[net]: {done[net]}")

              log_prob[net] = dist[net].log_prob(action[net])
              entropy[net] += dist[net].entropy().mean()

              log_probs[net].append(log_prob[net])
              values[net].append(value[net])
              rewards[net].append(torch.FloatTensor(reward[net]).unsqueeze(1).to(device))
              masks[net].append(torch.FloatTensor(1-done[net]).unsqueeze(1).to(device))

              states[net].append(torch.FloatTensor(model_input[net]['state'] ).to(device) )
              actions[net].append(action[net])

            #model_input = main_model.create_model_input(state, train_id_env)
            #dist, value = main_model(model_input)

            #action = dist.sample()
            #next_state, reward, done, _ = vect_envs.step(action.cpu().numpy())

            #log_prob = dist.log_prob(action)
            #entropy += dist.entropy().mean()

            #log_probs.append(log_prob)
            #values.append(value)
            #rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            #masks.append(torch.FloatTensor(1-done).unsqueeze(1).to(device))

            #states.append(torch.FloatTensor(model_input['state'] ).to(device) )
            #actions.append(action)

            env_states[actual_id_env] = next_state['network_PC']
            frame_idx += 1

            if frame_idx % 1000 == 0:
                test_reward = np.mean([test_env(hidden_network['network_PC'], actual_single_env, actual_id_env) for _ in range(10)])
                test_rewards.append(test_reward)
                plot(actual_id_env,frame_idx + e*train_per_epoch, test_rewards)
                # we cant use earlystop with env Mountaincar

        for net in hidden_network.keys():
          model_input[net] = hidden_network[net].create_model_input(state, actual_id_env)
          _, next_value[net] = hidden_network[net](model_input[net])

          returns[net] = compute_gae(next_value[net], rewards[net], masks[net], values[net])

          returns[net]   = torch.cat(returns[net]).detach()
          log_probs[net] = torch.cat(log_probs[net]).detach()
          values[net]    = torch.cat(values[net]).detach()
          states[net]    = torch.cat(states[net])
          actions[net]   = torch.cat(actions[net])
          advantage[net] = returns[net] - values[net]

        #model_input = main_model.create_model_input(state, train_id_env)
        #_, next_value = main_model(model_input)

        #returns = compute_gae(next_value, rewards, masks, values)

        #returns   = torch.cat(returns).detach()
        #log_probs = torch.cat(log_probs).detach()
        #values    = torch.cat(values).detach()
        #states    = torch.cat(states)
        #actions   = torch.cat(actions)
        #advantage = returns - values


        ppo_update(hidden_network, actual_id_env, ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)

e: 0, actual_id_env: MountainCarContinuous-v0, env_index: 1, switch: 0


TypeError: kl_div(): argument 'input' (position 1) must be Tensor, not list

#Saving trajectories for GAIL - delete ?

In [None]:
from itertools import count

max_expert_num = 50000
num_steps = 0
expert_traj = []

for i_episode in count():
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        action = dist.sample().cpu().numpy()[0]
        next_state, reward, done, _ = env.step(action)
        state = next_state
        total_reward += reward
        expert_traj.append(np.hstack([state, action]))
        num_steps += 1

    print("episode:", i_episode, "reward:", total_reward)

    if num_steps >= max_expert_num:
        break

expert_traj = np.stack(expert_traj)
print()
print(expert_traj.shape)
print()
np.save("expert_traj.npy", expert_traj)