In [1]:
import math
import random 

import gym
import numpy as np
        
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal,Beta
from sklearn import preprocessing

from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

seed_number = 10

<h2>Use CUDA</h2>

In [2]:
np.random.seed(seed_number)
torch.backends.cudnn.deterministic = True
torch.manual_seed(seed_number)

use_cuda = torch.cuda.is_available()

if use_cuda:
    torch.cuda.manual_seed_all(seed_number)
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

<h2>Neural Network</h2>

In [3]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        nn.init.constant_(m.bias, 0.1)
        

class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()
        
        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )
        
        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs)
        )
        self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std)
        
        self.apply(init_weights)
        
    def forward(self, x):
        value = self.critic(x)
        mu    = self.actor(x)
        std   = self.log_std.exp().expand_as(mu)
        dist  = Normal(mu, std)
        return dist, value

class ModelBasedGoalNetwork(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size):
        super(ModelBasedGoalNetwork, self).__init__()
        
        self.goalnetwork = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs)
        )
    
    def forward(self, x):
        subgoal = self.goalnetwork(x)
        return subgoal

In [4]:
def plot(frame_idx, rewards):
    clear_output(True)
    plt.figure(figsize=(20,5))
    plt.subplot(131)
    plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
    plt.plot(rewards)
    plt.show()
    
def test_env(model, goal, vis=False):
    state = env.reset()
    if vis: env.render()
    done = False
    total_reward = 0
    while not done:
        state_goal = np.concatenate((state,goal),0)
        state_goal = torch.FloatTensor(state_goal).unsqueeze(0).to(device)
        dist, _ = model(state_goal)
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state
        if vis: env.render()
        total_reward += reward
    return total_reward

<h2>Hindsight GAE</h2>

In [5]:
def compute_gae(next_value, rewards, masks, values, gamma=0.99, lamda=0.95):
    values = values + [next_value]
    gae = 0
    returns = []
    
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * lamda * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns

<h2> Importance Hindsight GAE </h2>

In [6]:
def hindsight_gae(rewards, current_logprobs, desired_logprobs, masks, values, gamma = 0.995, lamda = 0):
    lambda_ret = 1
    hindsight_gae = 0
    returns = []
    
    for step in range(len(rewards)):
        temp = 0
        is_weight_ratio = 1
        for step_ in range(step, len(rewards)):
            ratio = (current_logprobs[step_] - desired_logprobs[step_]).exp() 
            clipped_ratio = lambda_ret * torch.clamp(ratio, max = 1)
            is_weight_ratio = is_weight_ratio * clipped_ratio
        for step_ in range(step, len(rewards)):
            temp = temp + ((gamma ** (step_+1)) * rewards[step_] - (gamma ** (step_)) * rewards[step_])  
        temp = temp - (gamma ** (step + 1)) * rewards[step]
        
        delta = rewards[step] + is_weight_ratio * temp
        hindsight_gae = delta + gamma * lamda * masks[step] * hindsight_gae
        returns.insert(0, hindsight_gae + values[step])
        
    return returns

<h2> Compute Return </h2>

In [7]:
def compute_returns(rewards, gamma = 0.995):
    returns = 0
    returns_list = []
    for step in range(len(rewards)):
        returns = returns + (gamma ** i) * rewards[step] 
        returns_list.insert(0,returns)
    return returns_list

<h1> Proximal Policy Optimization Algorithm</h1>
<h2><a href="https://arxiv.org/abs/1707.06347">Arxiv</a></h2>

In [8]:
def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
    batch_size = states.size(0)

    for _ in range(batch_size // mini_batch_size):
        rand_ids = np.random.randint(0, batch_size, mini_batch_size)
        yield states[rand_ids, :],actions[rand_ids,:],log_probs[rand_ids,:],returns[rand_ids,:],advantage[rand_ids,:]
        
        

def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, 
               advantages, episode_count, test_reward, best_return, clip_param=0.2):
    actor_loss_list = []
    critic_loss_list = []
    clip = 5
    for ppo_epoch in range(ppo_epochs):
        for state, action, old_log_probs, return_, advantage in ppo_iter(mini_batch_size, states, 
                                                                         actions, log_probs, returns, advantages):
            model.zero_grad()
            dist, value = model(state)

            entropy = dist.entropy().mean()
            new_log_probs = dist.log_prob(action)

            ratio = (new_log_probs - old_log_probs).exp()

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

            actor_loss  = - torch.min(surr1, surr2).mean()
            # MSE Loss
            critic_loss = (return_ - value).pow(2).mean() 
            
            # Huber Loss
            # critic_loss = nn.functional.smooth_l1_loss(value, return_)
            
            actor_loss_list.append(actor_loss.data.cpu().numpy().item(0))
            critic_loss_list.append(critic_loss.data.cpu().numpy().item(0))
            
            loss = 0.5 * critic_loss + actor_loss - 0.0001 * entropy
            
            # optimizer.zero_grad()
            loss.backward()
            
            # clip gradient to prevent gradient exploding
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            
            optimizer.step()
    
    mean_actor_loss = np.mean(actor_loss_list)
    mean_critic_loss = np.mean(critic_loss_list)
    
    mean_actor_loss_list.append(mean_actor_loss)
    mean_critic_loss_list.append(mean_critic_loss)
    
    assert ~np.isnan(mean_critic_loss), "Assert error: critic loss has nan value." 
    assert ~np.isinf(mean_critic_loss), "Assert error: critic loss has inf value."

    print ('\nEpisode: {0}, actor_loss: {1:.3f}, critic_loss: {2:.3f}, mean_reward: {3:.3f}, best_return: {4:.3f}'
           .format(episode_count, mean_actor_loss, mean_critic_loss, test_reward, best_return))

# Create Environment

In [9]:
from multiprocessing_env import SubprocVecEnv

num_envs = 16
env_name = "Pendulum-v0"

def make_env(i):
    def _thunk():
        
        env = gym.make(env_name)
        env.seed(i+seed_number)
        return env

    return _thunk

envs = [make_env(i) for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)
env.seed(seed_number)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box au

[10]

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


# Initial Goal Distribution Generation

In [10]:
class RandomAgent(object):
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        return self.action_space.sample()
    
agent = RandomAgent(env.action_space)

episode_count = 50
reward = 0
done = False
initial_subgoals = []
        
for i in range(episode_count):
    state = env.reset()
#     print (state)
    done_count = 0
    while True:
        action = agent.act(state, reward, done)
        next_state, reward, done, _ = env.step(action)
        state = next_state
        if done:
            break
    initial_subgoals.append(state)

In [11]:
random.seed(seed_number)
initial_subgoal = initial_subgoals[random.randint(0, len(initial_subgoals)-1)]
print ('Initial subgoal sampled is: ', initial_subgoal)

Initial subgoal sampled is:  [ 0.11320659 -0.99357147 -0.14532486]


<h2> Model Based Goal Generator </h2>

In [12]:
def generate_random_trajectories(num_traj):
    print ('generating random trajectories...')
    dataset_random = []

    game_rewards = []
    for n in range(num_traj):

        obs = env.reset()
        while True:
            sampled_action = env.action_space.sample()
            new_obs, reward, done, _ = env.step(sampled_action)

            dataset_random.append([obs, new_obs, reward, done, sampled_action])

            obs = new_obs
            game_rewards.append(reward)

            if done:
                break

    # print some stats
    print('mean rand_dataset reward:',np.round(np.sum(game_rewards)/num_traj,2), 
          'max rand_dataset reward:', np.round(np.max(game_rewards),2), np.round(len(game_rewards)/num_traj))

    return dataset_random

def flatten_rl_dataset(temp, rl_dataset):
    for i in range(num_steps):
        for j in range(num_envs):
            rl_dataset.append([temp[0][0][i][j],temp[0][1][i][j],temp[0][2][i][j].cpu().numpy(),
                               temp[0][3][i][j].astype('bool'),temp[0][4][i][j].cpu().numpy()])
    return rl_dataset

def filtered_prep(dataset, min_dist):
    new_dataset = []
    for i in range(len(dataset)):
        curr_dist = np.linalg.norm(dataset[i][0]-desired_goal)
        if (curr_dist > (min_dist - 0.2)) and (curr_dist < (min_dist + 0.2)):
            new_dataset.append(dataset[i])
    return np.array(new_dataset)
                    
def MSELoss(y_truth, y_pred):
    y_truth = torch.FloatTensor(np.array(y_truth)).to(device)
    return F.mse_loss(y_pred.view(-1).float(), y_truth.view(-1))


def goal_network_update(dataset, train_iter, goal_model, goal_optimizer):
    # split dataset to 80% training and 20% validation
    len_data = len(dataset)
    
    d_train  = dataset[:int(len_data * 0.8)]
    d_valid  = dataset[int(len_data * 0.8):-1]
    
    state_action_input = np.concatenate((dataset[-1][0],dataset[-1][4]),0)
    sff      = np.arange(len(d_train))
    np.random.shuffle(sff)
    d_train  = d_train[sff]
    
    # training dataset
    x_train  = np.array([np.concatenate([s,a]) for s,_,_,_,a in d_train]) 
    y_train  = np.array([ns for _,ns,_,_,_ in d_train])
    
    # validation dataset
    x_valid  = np.array([np.concatenate([s,a]) for s,_,_,_,a in d_valid])
    y_valid  = np.array([ns for _,ns,_,_,_ in d_valid])

    losses_goal = []
    mse_valid = []
    # go through max_model_iter supervised iterations
    for i in range(train_iter):
        x_train += np.random.normal(loc = 0, scale=0.001, size = x_train.shape)

        goal_optimizer.zero_grad()
        pred_goal = goal_model((torch.tensor(x_train)).type(torch.FloatTensor).to(device))
        goal_loss = MSELoss(y_train, pred_goal)
        # print ('goal_loss: ', goal_loss.cpu().detach().numpy())
        losses_goal.append(goal_loss.cpu().detach().numpy())
        goal_loss.backward()
        goal_optimizer.step()

        # iteratively evaluate
        if i % 5 == 0:
            goal_model.eval()
            pred_goal = goal_model((torch.tensor(x_valid)).type(torch.FloatTensor).to(device))
            goal_model.train(True)
            valid_goal_loss = MSELoss(y_valid, pred_goal)
            mse_valid.append(valid_goal_loss)
            # print ('evaluation iteration: ',i, ' ,valid_goal_loss: ', valid_goal_loss.cpu().detach().numpy())
    # final evaluation
    goal_model.eval()
    pred_goal = goal_model((torch.tensor(x_valid)).type(torch.FloatTensor).to(device))
    goal_model.train(True)
    valid_goal_loss = MSELoss(y_valid, pred_goal)
    mse_valid.append(valid_goal_loss)
    # print ('end of training validation goal loss: ', valid_goal_loss.cpu().detach().numpy())
    return losses_goal, mse_valid, state_action_input

def model_based_goal_generator(rand_dataset, rl_dataset, rand_num_traj, mb_train_iter, min_dist, 
                               goal_model, goal_optimizer):
    print ('generate goal using supervised model based...')

    
    if len(rl_dataset) > 0:
        concat_dataset    = np.concatenate([rand_dataset, rl_dataset], axis=0)
    else:
        concat_dataset    = np.array(rl_dataset)
    print ('len after concat dataset: ', len(concat_dataset))

    filtered_dataset  = filtered_prep (concat_dataset, min_dist)
    print ('len filtered concat dataset: ', len(filtered_dataset))
    # if filtered dataset empty
    if len(filtered_dataset) > 0:
        mb_loss, mse_eval, state_action_input = goal_network_update(filtered_dataset, mb_train_iter, 
                                                                goal_model, goal_optimizer)
        all_mb_loss.append(mb_loss)
        all_goal_mse_eval.append(mse_eval)

        model_based_goal = goal_model(torch.FloatTensor(state_action_input).to(device))

        return model_based_goal
    else:
        return [0]
def evaluate_goals(state, num_envs, goal, desired_goal):
    sf_subgoal = state[random.randint(0, num_envs - 1)]
    mb_distance = np.linalg.norm(goal - desired_goal)
    sf_distance = np.linalg.norm(sf_subgoal - desired_goal) 
    print ('mb subgoal: ', goal, ' distance: ', mb_distance)
    print ('sample final subgoal: ', sf_subgoal, 'distance: ', sf_distance)
    mb_distance_list.append(mb_distance)
    sf_distance_list.append(sf_distance)
    

# Training Agent

In [13]:
num_inputs  = envs.observation_space.shape[0]
num_outputs = envs.action_space.shape[0]

# training hindsight ppo model hyperparameter
hidden_size      = 256
lr               = 3e-4
num_steps        = 20 # 20
mini_batch_size  = 5
ppo_epochs       = 4
threshold_reward = -200
max_frames       = 24000# 50000
frame_idx        = 0
episode_count    = 0
best_return      = -9999
desired_goal     = np.asarray([0,0,0])

# options hyperparameter
use_modelbased   = True
early_stop       = False
multi_goal       = True

# statistical variable
test_rewards          = []
mean_actor_loss_list  = []
mean_critic_loss_list = []

# model based parameter
rand_num_traj     = 1
mb_train_iter     = 60
init_goal_dist    = 7.0
end_goal_dist     = 0.01
decay_rate        = 1000 / math.log(700)
goal_network_lr   = 3e-4
all_mb_loss       = []
all_goal_mse_eval = []
mb_distance_list  = []
sf_distance_list  = []

# network model
model            = ActorCritic(2*num_inputs, num_outputs, hidden_size).to(device)
optimizer        = optim.Adam(model.parameters(), lr = lr)
goal_model       = ModelBasedGoalNetwork(num_inputs+num_outputs, num_inputs, hidden_size).to(device)
goal_optimizer   = optim.Adam(goal_model.parameters(), lr = goal_network_lr)

random_dataset   = generate_random_trajectories(rand_num_traj)
temp_rl_dataset  = []
rl_dataset       = []
state = envs.reset()

while frame_idx < max_frames:
    
    # sample state from previous episode
    if multi_goal:
        if frame_idx == 0: 
            goal = initial_subgoal
        else: 
            if use_modelbased:
                min_dist = init_goal_dist * math.exp((-1)*episode_count/decay_rate)
                goal = model_based_goal_generator(random_dataset, rl_dataset, rand_num_traj, mb_train_iter, 
                                                    min_dist,goal_model, goal_optimizer)
                if len(goal) > 1:
                    goal = goal.cpu().detach().numpy()
                    temp = goal
                else: 
                    goal = temp
                    
                if episode_count != 0:
                    evaluate_goals(state, num_envs, goal, desired_goal)
            else: 
                if len(state) > 1:
                    goal = state[random.randint(0, num_envs - 1)]
                else:
                    goal = state[0]
    else:
        goal = desired_goal
        
    log_probs         = []
    log_probs_desired = []
    values            = []
    states            = []
    states_goals      = []
    next_states       = []
    actions           = []
    actions_desired   = []
    rewards           = []
    dones             = []
    masks             = []
    entropy = 0

    for i in range(num_steps):
        state_goals = []
        state_desired_goals = []
        next_state_goals = []
        
        # append state with subgoal and desired goal
        for s in state: 
            state_goal = np.concatenate((s,goal),0)
            state_goals.append((state_goal))
            state_desired_goal = np.concatenate((s, desired_goal), 0)
            state_desired_goals.append((state_desired_goal))
            
        state_goals = np.array(state_goals)
        state_goals = torch.FloatTensor(state_goals).to(device)
        state_desired_goals = np.array(state_desired_goals)
        state_desired_goals = torch.FloatTensor(state_desired_goals).to(device)
        
        # for subgoal
        dist, value = model(state_goals)
        action = dist.sample() 
        next_state, reward, done, _ = envs.step(action.cpu().numpy())
        
        # for desired goal
        dist_desired, value_desired = model(state_desired_goals)
        action_desired = dist_desired.sample()
        
        # append next state with sub goal
        for n_s in next_state: 
            next_state_goal = np.concatenate((n_s, goal), 0)
            next_state_desired_goal = np.concatenate((n_s, desired_goal), 0)
            next_state_goals.append((next_state_goal)) 
        next_state_goals = np.array(next_state_goals)
        
        # for subgoal
        log_prob = dist.log_prob(action)
        # for desired goal
        log_prob_desired = dist_desired.log_prob(action_desired)
        
        entropy += dist.entropy().mean()
        
        # normalized reward
        reward = (reward - np.mean(reward))/(np.std(reward) + 1e-5)
        
        states.append(state)
        next_states.append(next_state)
        states_goals.append(state_goals)
        actions.append(action)
        actions_desired.append(action_desired)
        log_probs.append(log_prob)
        log_probs_desired.append(log_prob_desired)
        dones.append(done)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        values.append(value)
        
        state = next_state
        frame_idx += 1
        
        if frame_idx % num_steps == 0:
            test_reward = np.mean([test_env(model, desired_goal) for _ in range(5)])
            test_rewards.append(test_reward)
            if test_reward >= best_return:
                best_return = test_reward
            # plot(frame_idx, test_rewards)
            if test_reward > threshold_reward: early_stop = True
                
    temp_rl_dataset.append([states, next_states, rewards, dones, actions])
    
    # forget older data (reduce bias)
    if episode_count % 20 == 0:
        rl_dataset = []
    rl_dataset = flatten_rl_dataset(temp_rl_dataset, rl_dataset)
    
    next_state_goals = torch.FloatTensor(next_state_goals).to(device)
    _, next_value = model(next_state_goals)

    old_logprobs     = log_probs 
    current_logprobs = log_probs_desired
    
    # print ('old_logprobs: ', log_probs)
    # print ('current_logprobs: ', current_logprobs)
#     returns        = hindsight_gae(rewards, old_logprobs, current_logprobs, masks, values)
    returns        = compute_gae (next_value, rewards, masks, values)
                          
    returns        = torch.cat(returns).detach()
    log_probs      = torch.cat(log_probs).detach()
    values         = torch.cat(values).detach()
    states_goals   = torch.cat(states_goals)
    actions        = torch.cat(actions)
    advantage      = returns - values

    ppo_update(ppo_epochs, mini_batch_size, states_goals, actions, log_probs, returns, advantage, episode_count, 
               test_reward, best_return)
    
    if frame_idx % (num_steps * 50) == 0:
        lower_bound = int((frame_idx - (num_steps * 50)) / num_steps)
        upper_bound = int(frame_idx / num_steps)
        last_fifty_episode_mean_reward = np.mean(test_rewards[lower_bound:upper_bound])
        print ('last 50 episode mean reward: ', last_fifty_episode_mean_reward)
        print ('\n')
    
    episode_count += 1

generating random trajectories...
mean rand_dataset reward: -1275.48 max rand_dataset reward: -1.63 200.0

Episode: 0, actor_loss: 0.052, critic_loss: 25.468, mean_reward: -1204.931, best_return: -1204.931
generate goal using supervised model based...
len after concat dataset:  520
len filtered concat dataset:  13
mb subgoal:  [-0.8257032   0.66481185 -4.9530663 ]  distance:  5.065237095518225
sample final subgoal:  [-0.56493398 -0.82513611 -6.43949962] distance:  6.516682844938181

Episode: 1, actor_loss: 0.087, critic_loss: 25.867, mean_reward: -1228.206, best_return: -1204.931
generate goal using supervised model based...
len after concat dataset:  840
len filtered concat dataset:  26
mb subgoal:  [-0.86883104  0.58116186 -6.817758  ]  distance:  6.897422834656414
sample final subgoal:  [-0.99366727  0.11236257  8.        ] distance:  8.06225774829855

Episode: 2, actor_loss: 1.165, critic_loss: 17.923, mean_reward: -1258.006, best_return: -1204.931
generate goal using supervised mo


Episode: 22, actor_loss: 1.786, critic_loss: 11.913, mean_reward: -1288.585, best_return: -1090.544
generate goal using supervised model based...
len after concat dataset:  1160
len filtered concat dataset:  44
mb subgoal:  [ 0.1135449 -0.9914758  5.331623 ]  distance:  5.424216200268805
sample final subgoal:  [-0.6893503   0.72442816  6.01977121] distance:  6.10226559536636

Episode: 23, actor_loss: 0.033, critic_loss: 8.348, mean_reward: -1259.072, best_return: -1090.544
generate goal using supervised model based...
len after concat dataset:  1480
len filtered concat dataset:  59
mb subgoal:  [ 0.11280993 -0.98376054  5.3346844 ]  distance:  5.4258057669888435
sample final subgoal:  [-0.06028885  0.99818097 -0.58711908] distance:  1.1596158024353178

Episode: 24, actor_loss: 2.128, critic_loss: 8.342, mean_reward: -1184.732, best_return: -1090.544
generate goal using supervised model based...
len after concat dataset:  1800
len filtered concat dataset:  70
mb subgoal:  [ 0.10788915 


Episode: 44, actor_loss: -1.010, critic_loss: 10.911, mean_reward: -1365.886, best_return: -977.640
generate goal using supervised model based...
len after concat dataset:  1800
len filtered concat dataset:  60
mb subgoal:  [-0.0715403   0.99378455 -4.0853906 ]  distance:  4.205132795861147
sample final subgoal:  [ 0.99557625 -0.09395711 -5.44570957] distance:  5.536763739327341

Episode: 45, actor_loss: 0.302, critic_loss: 15.176, mean_reward: -1356.826, best_return: -977.640
generate goal using supervised model based...
len after concat dataset:  2120
len filtered concat dataset:  70
mb subgoal:  [-0.07618105  0.99702823 -4.084574  ]  distance:  4.205189107197796
sample final subgoal:  [ 0.99878502  0.04927967 -5.39059558] distance:  5.482565153649531

Episode: 46, actor_loss: -0.236, critic_loss: 16.717, mean_reward: -1463.941, best_return: -977.640
generate goal using supervised model based...
len after concat dataset:  2440
len filtered concat dataset:  80
mb subgoal:  [-0.072365


Episode: 66, actor_loss: -0.538, critic_loss: 1.380, mean_reward: -1449.716, best_return: -977.640
generate goal using supervised model based...
len after concat dataset:  2440
len filtered concat dataset:  107
mb subgoal:  [-2.4756230e-03  1.0070016e+00  4.9938874e+00]  distance:  5.094405758942295
sample final subgoal:  [ 0.91416941 -0.40533232 -6.32425721] distance:  6.402829782927688

Episode: 67, actor_loss: -0.062, critic_loss: 1.258, mean_reward: -1355.011, best_return: -977.640
generate goal using supervised model based...
len after concat dataset:  2760
len filtered concat dataset:  125
mb subgoal:  [-3.5356209e-03  1.0025853e+00  4.9869771e+00]  distance:  5.086760301774772
sample final subgoal:  [-0.03862695 -0.9992537  -8.        ] distance:  8.06225774829855

Episode: 68, actor_loss: -0.122, critic_loss: 1.989, mean_reward: -1407.965, best_return: -977.640
generate goal using supervised model based...
len after concat dataset:  3080
len filtered concat dataset:  130
mb su


Episode: 88, actor_loss: 0.219, critic_loss: 3.633, mean_reward: -865.391, best_return: -865.391
generate goal using supervised model based...
len after concat dataset:  3080
len filtered concat dataset:  168
mb subgoal:  [ 0.52831274 -0.85099924  3.2008579 ]  distance:  3.3539238523554817
sample final subgoal:  [ 0.85123042 -0.52479213 -2.45446458] distance:  2.6503577777542127

Episode: 89, actor_loss: 5.013, critic_loss: 10.402, mean_reward: -953.510, best_return: -865.391
generate goal using supervised model based...
len after concat dataset:  3400
len filtered concat dataset:  215
mb subgoal:  [ 0.52242893 -0.84778124  3.2021146 ]  distance:  3.3533867680064167
sample final subgoal:  [-0.74572596 -0.6662528   0.50418355] distance:  1.1199111817489713

Episode: 90, actor_loss: -2.145, critic_loss: 5.173, mean_reward: -963.042, best_return: -865.391
generate goal using supervised model based...
len after concat dataset:  3720
len filtered concat dataset:  214
mb subgoal:  [ 0.51709


Episode: 110, actor_loss: -2.521, critic_loss: 11.923, mean_reward: -952.375, best_return: -865.391
generate goal using supervised model based...
len after concat dataset:  3720
len filtered concat dataset:  139
mb subgoal:  [ 0.43908697 -0.9028258  -3.5766177 ]  distance:  3.7148467054887457
sample final subgoal:  [ 0.12222425  0.99250251 -0.89378601] distance:  1.3412134169137233

Episode: 111, actor_loss: -1.575, critic_loss: 7.951, mean_reward: -911.577, best_return: -865.391
generate goal using supervised model based...
len after concat dataset:  4040
len filtered concat dataset:  160
mb subgoal:  [ 0.7548156  -0.65367186  2.6644325 ]  distance:  2.845388223531496
sample final subgoal:  [-0.99510534 -0.09881984  4.4816046 ] distance:  4.591816607616492

Episode: 112, actor_loss: -1.446, critic_loss: 3.201, mean_reward: -876.324, best_return: -865.391
generate goal using supervised model based...
len after concat dataset:  4360
len filtered concat dataset:  185
mb subgoal:  [ 0.75


Episode: 132, actor_loss: 0.539, critic_loss: 5.489, mean_reward: -794.139, best_return: -794.139
generate goal using supervised model based...
len after concat dataset:  4360
len filtered concat dataset:  250
mb subgoal:  [ 0.8310774  -0.55416656  2.4759328 ]  distance:  2.669837751949929
sample final subgoal:  [ 0.46660874 -0.88446384  4.9848022 ] distance:  5.084117717494321

Episode: 133, actor_loss: 1.330, critic_loss: 5.890, mean_reward: -874.101, best_return: -794.139
generate goal using supervised model based...
len after concat dataset:  4680
len filtered concat dataset:  282
mb subgoal:  [ 0.8260539  -0.55771184  2.4741147 ]  distance:  2.667330295226366
sample final subgoal:  [ 0.69487641 -0.71912917 -0.96842854] distance:  1.3920681866512714

Episode: 134, actor_loss: -0.859, critic_loss: 6.651, mean_reward: -918.820, best_return: -794.139
generate goal using supervised model based...
len after concat dataset:  5000
len filtered concat dataset:  300
mb subgoal:  [ 0.829680


Episode: 154, actor_loss: -0.944, critic_loss: 4.699, mean_reward: -998.367, best_return: -786.180
generate goal using supervised model based...
len after concat dataset:  5000
len filtered concat dataset:  268
mb subgoal:  [ 0.6113753  -0.79521686  1.7383062 ]  distance:  2.0069523905138773
sample final subgoal:  [ 0.92852757 -0.37126344 -0.34801564] distance:  1.058827126698428

Episode: 155, actor_loss: -0.784, critic_loss: 3.372, mean_reward: -904.978, best_return: -786.180
generate goal using supervised model based...
len after concat dataset:  5320
len filtered concat dataset:  332
mb subgoal:  [ 0.61372375 -0.78674096  1.7361351 ]  distance:  2.002444345344962
sample final subgoal:  [-0.89920153  0.43753469  8.        ] distance:  8.06225774829855

Episode: 156, actor_loss: 1.440, critic_loss: 5.691, mean_reward: -892.330, best_return: -786.180
generate goal using supervised model based...
len after concat dataset:  5640
len filtered concat dataset:  368
mb subgoal:  [ 0.609332


Episode: 176, actor_loss: -0.182, critic_loss: 3.933, mean_reward: -801.868, best_return: -786.180
generate goal using supervised model based...
len after concat dataset:  5640
len filtered concat dataset:  441
mb subgoal:  [ 0.66548586 -0.74537724  1.4241335 ]  distance:  1.7397169308903897
sample final subgoal:  [ 0.93504282 -0.35453479 -0.78736061] distance:  1.2727673483962916

Episode: 177, actor_loss: 0.949, critic_loss: 11.861, mean_reward: -842.639, best_return: -786.180
generate goal using supervised model based...
len after concat dataset:  5960
len filtered concat dataset:  429
mb subgoal:  [ 0.6661652 -0.746818   1.4253025]  distance:  1.7415511488608024
sample final subgoal:  [ 0.99998918 -0.00465155  0.34860083] distance:  1.0590196123190436

Episode: 178, actor_loss: 0.357, critic_loss: 10.400, mean_reward: -813.884, best_return: -786.180
generate goal using supervised model based...
len after concat dataset:  6280
len filtered concat dataset:  453
mb subgoal:  [ 0.6622


Episode: 198, actor_loss: -1.808, critic_loss: 6.620, mean_reward: -749.423, best_return: -714.282
generate goal using supervised model based...
len after concat dataset:  6280
len filtered concat dataset:  393
mb subgoal:  [ 0.6623396  -0.74620074  1.4280369 ]  distance:  1.7420673912324198
sample final subgoal:  [ 0.99759884 -0.06925723  0.16212699] distance:  1.0130573337023656

Episode: 199, actor_loss: 4.763, critic_loss: 12.292, mean_reward: -718.361, best_return: -714.282
last 50 episode mean reward:  -847.9874919609651


generate goal using supervised model based...
len after concat dataset:  6600
len filtered concat dataset:  453
mb subgoal:  [-0.96338063  0.27244335  1.935683  ]  distance:  2.179265094399166
sample final subgoal:  [0.94116154 0.33795704 0.4664    ] distance:  1.1034169474689457

Episode: 200, actor_loss: -4.696, critic_loss: 8.455, mean_reward: -753.160, best_return: -714.282
generate goal using supervised model based...
len after concat dataset:  520
len fi


Episode: 220, actor_loss: -1.136, critic_loss: 9.317, mean_reward: -713.562, best_return: -684.739
generate goal using supervised model based...
len after concat dataset:  520
len filtered concat dataset:  42
mb subgoal:  [-0.69575053  0.70578784  1.9351892 ]  distance:  2.1742039232988786
sample final subgoal:  [ 0.46885084 -0.88327736 -0.29241395] distance:  1.0418761528622724

Episode: 221, actor_loss: -2.343, critic_loss: 7.248, mean_reward: -864.851, best_return: -684.739
generate goal using supervised model based...
len after concat dataset:  840
len filtered concat dataset:  66
mb subgoal:  [-0.70480096  0.709592    1.9345459 ]  distance:  2.1777816519906015
sample final subgoal:  [ 0.22886349 -0.97345853  6.65537082] distance:  6.730078805128931

Episode: 222, actor_loss: 0.505, critic_loss: 6.729, mean_reward: -606.615, best_return: -606.615
generate goal using supervised model based...
len after concat dataset:  1160
len filtered concat dataset:  89
mb subgoal:  [-0.6972405 


Episode: 242, actor_loss: -3.060, critic_loss: 6.138, mean_reward: -698.781, best_return: -606.615
generate goal using supervised model based...
len after concat dataset:  1160
len filtered concat dataset:  133
mb subgoal:  [-0.4126229  -0.90792924 -1.4239378 ]  distance:  1.7384452869635478
sample final subgoal:  [ 0.79718997 -0.60372854  0.01527056] distance:  1.0001165881836815

Episode: 243, actor_loss: -0.476, critic_loss: 6.709, mean_reward: -779.851, best_return: -606.615
generate goal using supervised model based...
len after concat dataset:  1480
len filtered concat dataset:  171
mb subgoal:  [-0.41256174 -0.90526104 -1.4213277 ]  distance:  1.7348997673323834
sample final subgoal:  [ 0.19472749  0.98085738 -3.17261172] distance:  3.326479391689477

Episode: 244, actor_loss: -0.753, critic_loss: 3.645, mean_reward: -788.330, best_return: -606.615
generate goal using supervised model based...
len after concat dataset:  1800
len filtered concat dataset:  219
mb subgoal:  [-0.41


Episode: 264, actor_loss: -0.991, critic_loss: 9.168, mean_reward: -717.828, best_return: -514.409
generate goal using supervised model based...
len after concat dataset:  1800
len filtered concat dataset:  398
mb subgoal:  [-0.9904399   0.13318413 -0.47044358]  distance:  1.1045480305735962
sample final subgoal:  [-0.29478844  0.95556254  5.93824075] distance:  6.021852143325613

Episode: 265, actor_loss: 1.251, critic_loss: 31.764, mean_reward: -685.541, best_return: -514.409
generate goal using supervised model based...
len after concat dataset:  2120
len filtered concat dataset:  492
mb subgoal:  [-0.99013484  0.13257796 -0.4698252 ]  distance:  1.103938237653639
sample final subgoal:  [0.64477374 0.76437348 2.84850958] distance:  3.0189413400431637

Episode: 266, actor_loss: 1.815, critic_loss: 14.920, mean_reward: -645.131, best_return: -514.409
generate goal using supervised model based...
len after concat dataset:  2440
len filtered concat dataset:  579
mb subgoal:  [-0.989312


Episode: 286, actor_loss: 2.490, critic_loss: 56.655, mean_reward: -327.051, best_return: -292.944
generate goal using supervised model based...
len after concat dataset:  2440
len filtered concat dataset:  548
mb subgoal:  [-0.5597958   0.8261192   0.54078084]  distance:  1.1350278237060887
sample final subgoal:  [ 0.99094851  0.13424247 -0.1138396 ] distance:  1.0064588682766877

Episode: 287, actor_loss: -0.805, critic_loss: 51.512, mean_reward: -366.827, best_return: -292.944
generate goal using supervised model based...
len after concat dataset:  2760
len filtered concat dataset:  622
mb subgoal:  [-0.56160873  0.82731855  0.5398896 ]  distance:  1.1363719028209358
sample final subgoal:  [ 0.9671292  -0.25428548  0.0094672 ] distance:  1.0000448128913721

Episode: 288, actor_loss: 2.110, critic_loss: 69.358, mean_reward: -394.727, best_return: -292.944
generate goal using supervised model based...
len after concat dataset:  3080
len filtered concat dataset:  687
mb subgoal:  [-0.


Episode: 308, actor_loss: -0.244, critic_loss: 6.101, mean_reward: -316.609, best_return: -217.985
generate goal using supervised model based...
len after concat dataset:  3080
len filtered concat dataset:  457
mb subgoal:  [-0.5634953  0.8248345  0.5395611]  distance:  1.1353436027260253
sample final subgoal:  [ 0.99022034  0.1395123  -0.20579279] distance:  1.0209557639723872

Episode: 309, actor_loss: -3.858, critic_loss: 161.024, mean_reward: -341.233, best_return: -217.985
generate goal using supervised model based...
len after concat dataset:  3400
len filtered concat dataset:  496
mb subgoal:  [-0.5641854  0.8253979  0.5371381]  distance:  1.134946780676285
sample final subgoal:  [ 0.76379549 -0.64545832 -0.30398534] distance:  1.0451828011137303

Episode: 310, actor_loss: -5.855, critic_loss: 32.707, mean_reward: -306.235, best_return: -217.985
generate goal using supervised model based...
len after concat dataset:  3720
len filtered concat dataset:  522
mb subgoal:  [-0.56181


Episode: 330, actor_loss: -3.239, critic_loss: 28.932, mean_reward: -723.548, best_return: -128.803
generate goal using supervised model based...
len after concat dataset:  3720
len filtered concat dataset:  23
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.25228968  0.96765175 -6.00835197] distance:  6.091001020316049

Episode: 331, actor_loss: -2.243, critic_loss: 26.739, mean_reward: -511.978, best_return: -128.803
generate goal using supervised model based...
len after concat dataset:  4040
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.42286793  0.90619132  1.41948347] distance:  1.736356335410217

Episode: 332, actor_loss: -2.075, critic_loss: 13.232, mean_reward: -459.958, best_return: -128.803
generate goal using supervised model based...
len after concat dataset:  4360
len filtered concat dataset:  0
mb subgoal:  [ 0.85775


Episode: 352, actor_loss: -2.244, critic_loss: 15.562, mean_reward: -381.422, best_return: -128.803
generate goal using supervised model based...
len after concat dataset:  4360
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [0.99782607 0.06590244 0.28288312] distance:  1.0392414829430918

Episode: 353, actor_loss: -0.986, critic_loss: 26.143, mean_reward: -232.515, best_return: -128.803
generate goal using supervised model based...
len after concat dataset:  4680
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.83270327  0.55371949  6.6613394 ] distance:  6.7359811885391565

Episode: 354, actor_loss: -0.270, critic_loss: 19.372, mean_reward: -174.099, best_return: -128.803
generate goal using supervised model based...
len after concat dataset:  5000
len filtered concat dataset:  0
mb subgoal:  [ 0.8577527


Episode: 374, actor_loss: 1.313, critic_loss: 28.514, mean_reward: -639.867, best_return: -104.639
generate goal using supervised model based...
len after concat dataset:  5000
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [0.9999401  0.01094489 0.18410303] distance:  1.0168057460760256

Episode: 375, actor_loss: 4.444, critic_loss: 54.378, mean_reward: -259.665, best_return: -104.639
generate goal using supervised model based...
len after concat dataset:  5320
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99974794 -0.02245111  0.15229634] distance:  1.0115306101833097

Episode: 376, actor_loss: -1.102, critic_loss: 21.473, mean_reward: -279.967, best_return: -104.639
generate goal using supervised model based...
len after concat dataset:  5640
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 


Episode: 396, actor_loss: 1.712, critic_loss: 29.714, mean_reward: -151.235, best_return: -104.639
generate goal using supervised model based...
len after concat dataset:  5640
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99970512  0.02428338 -0.08685613] distance:  1.003764906449828

Episode: 397, actor_loss: 1.797, critic_loss: 36.105, mean_reward: -349.208, best_return: -104.639
generate goal using supervised model based...
len after concat dataset:  5960
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [0.99996672 0.00815897 0.07592426] distance:  1.0028781052223659

Episode: 398, actor_loss: 1.138, critic_loss: 15.843, mean_reward: -315.375, best_return: -104.639
generate goal using supervised model based...
len after concat dataset:  6280
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0


Episode: 418, actor_loss: 1.838, critic_loss: 36.827, mean_reward: -256.158, best_return: -99.222
generate goal using supervised model based...
len after concat dataset:  6280
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99997726  0.00674315 -0.12483438] distance:  1.0077616890368584

Episode: 419, actor_loss: 14.436, critic_loss: 53.962, mean_reward: -203.011, best_return: -99.222
generate goal using supervised model based...
len after concat dataset:  6600
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.9750548   0.22196429 -0.51245848] distance:  1.1236608425442254

Episode: 420, actor_loss: -7.333, critic_loss: 46.545, mean_reward: -124.331, best_return: -99.222
generate goal using supervised model based...
len after concat dataset:  520
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 


Episode: 440, actor_loss: -5.065, critic_loss: 26.545, mean_reward: -103.341, best_return: -99.222
generate goal using supervised model based...
len after concat dataset:  520
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.14154271  0.98993215 -2.27613607] distance:  2.4861205576381065

Episode: 441, actor_loss: -8.530, critic_loss: 44.639, mean_reward: -306.959, best_return: -99.222
generate goal using supervised model based...
len after concat dataset:  840
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.23937674 -0.97092676  5.51039025] distance:  5.600392907257322

Episode: 442, actor_loss: -2.397, critic_loss: 26.754, mean_reward: -259.091, best_return: -99.222
generate goal using supervised model based...
len after concat dataset:  1160
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -


Episode: 462, actor_loss: 0.318, critic_loss: 36.394, mean_reward: -209.253, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  1160
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.43110919 -0.90229976 -4.02497552] distance:  4.147339861108415

Episode: 463, actor_loss: -0.355, critic_loss: 35.675, mean_reward: -205.255, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  1480
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [0.99737695 0.07238244 0.05632888] distance:  1.0015852146512112

Episode: 464, actor_loss: 0.273, critic_loss: 5.837, mean_reward: -188.818, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  1800
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51


Episode: 484, actor_loss: 1.695, critic_loss: 21.549, mean_reward: -207.466, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  1800
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99751188 -0.07049854 -0.01795439] distance:  1.0001611670898325

Episode: 485, actor_loss: -1.323, critic_loss: 18.209, mean_reward: -154.251, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  2120
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99910873 -0.04221079  0.12588909] distance:  1.0078928821600932

Episode: 486, actor_loss: 2.848, critic_loss: 40.010, mean_reward: -283.725, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  2440
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 


Episode: 506, actor_loss: 1.010, critic_loss: 10.722, mean_reward: -129.372, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  2440
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99882895 -0.04838114 -0.0265343 ] distance:  1.0003519724973207

Episode: 507, actor_loss: 2.543, critic_loss: 35.037, mean_reward: -281.514, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  2760
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99980039 -0.01997938 -0.02525386] distance:  1.0003188279000033

Episode: 508, actor_loss: 2.657, critic_loss: 19.141, mean_reward: -137.866, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  3080
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -


Episode: 528, actor_loss: 2.283, critic_loss: 22.838, mean_reward: -126.524, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  3080
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99996902  0.00787158 -0.16374139] distance:  1.013316950268426

Episode: 529, actor_loss: 17.787, critic_loss: 48.561, mean_reward: -154.057, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  3400
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.03546763 -0.99937083  0.68785717] distance:  1.2137328722640852

Episode: 530, actor_loss: -4.229, critic_loss: 18.068, mean_reward: -158.461, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  3720
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 


Episode: 550, actor_loss: -6.378, critic_loss: 46.400, mean_reward: -255.937, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  3720
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.55987226 -0.82857893  1.07675267] distance:  1.469488452733508

Episode: 551, actor_loss: -5.493, critic_loss: 29.349, mean_reward: -249.912, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  4040
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.9704569   0.24127455 -5.1256571 ] distance:  5.222294583346672

Episode: 552, actor_loss: -3.805, critic_loss: 27.696, mean_reward: -132.053, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  4360
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 


Episode: 572, actor_loss: -3.237, critic_loss: 20.043, mean_reward: -299.487, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  4360
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [0.99892307 0.04639719 0.21565754] distance:  1.0229898214885214

Episode: 573, actor_loss: 0.335, critic_loss: 12.062, mean_reward: -159.045, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  4680
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [0.99601724 0.08916086 0.06653508] distance:  1.002211014065823

Episode: 574, actor_loss: -0.896, critic_loss: 5.760, mean_reward: -283.529, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  5000
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.5162


Episode: 594, actor_loss: 1.499, critic_loss: 17.102, mean_reward: -157.184, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  5000
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99944636  0.03327117 -0.03660483] distance:  1.000669732412513

Episode: 595, actor_loss: 1.787, critic_loss: 28.940, mean_reward: -411.248, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  5320
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99983432  0.01820272 -0.00645989] distance:  1.000020864868859

Episode: 596, actor_loss: 1.754, critic_loss: 16.085, mean_reward: -259.990, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  5640
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.


Episode: 616, actor_loss: 0.223, critic_loss: 8.101, mean_reward: -293.566, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  5640
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99950505 -0.03145878 -0.07161504] distance:  1.0025610772283708

Episode: 617, actor_loss: 2.175, critic_loss: 37.825, mean_reward: -320.851, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  5960
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99984924 -0.01736391  0.03200783] distance:  1.0005121194113333

Episode: 618, actor_loss: 1.538, critic_loss: 15.040, mean_reward: -256.794, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  6280
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0


Episode: 638, actor_loss: 2.624, critic_loss: 41.590, mean_reward: -456.623, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  6280
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99999606 -0.0028075  -0.09426392] distance:  1.0044330178367438

Episode: 639, actor_loss: 20.070, critic_loss: 67.231, mean_reward: -518.736, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  6600
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.66956163 -0.7427565  -0.21844816] distance:  1.0235817502424722

Episode: 640, actor_loss: -8.109, critic_loss: 31.967, mean_reward: -696.128, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  520
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 


Episode: 660, actor_loss: -7.168, critic_loss: 48.831, mean_reward: -840.317, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  520
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.28728988 -0.95784368 -4.12200317] distance:  4.24156929746423

Episode: 661, actor_loss: -2.765, critic_loss: 24.055, mean_reward: -371.594, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  840
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [0.99943644 0.03356784 0.02656448] distance:  1.0003527734688302

Episode: 662, actor_loss: -3.127, critic_loss: 21.265, mean_reward: -437.021, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  1160
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51


Episode: 682, actor_loss: -1.123, critic_loss: 11.987, mean_reward: -316.506, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  1160
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99951757 -0.03105834  0.36008246] distance:  1.0628543552439762

Episode: 683, actor_loss: -0.664, critic_loss: 5.850, mean_reward: -431.429, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  1480
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [0.99878149 0.04935105 0.07640872] distance:  1.0029148978078786

Episode: 684, actor_loss: -2.547, critic_loss: 3.542, mean_reward: -401.382, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  1800
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.


Episode: 704, actor_loss: -0.592, critic_loss: 5.477, mean_reward: -368.203, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  1800
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.9998055  -0.01972194  0.00423544] distance:  1.0000089694321375

Episode: 705, actor_loss: 1.806, critic_loss: 10.920, mean_reward: -439.858, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  2120
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99993906 -0.01103968 -0.02674559] distance:  1.0003575993531584

Episode: 706, actor_loss: 1.822, critic_loss: 5.045, mean_reward: -240.999, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  2440
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0


Episode: 726, actor_loss: 1.841, critic_loss: 37.543, mean_reward: -473.404, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  2440
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99984344 -0.01769458  0.10914588] distance:  1.0059387771045643

Episode: 727, actor_loss: 2.006, critic_loss: 10.326, mean_reward: -292.775, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  2760
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99986698 -0.01631045 -0.05950339] distance:  1.001768762384006

Episode: 728, actor_loss: 2.007, critic_loss: 15.751, mean_reward: -353.210, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  3080
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0


Episode: 748, actor_loss: 0.382, critic_loss: 5.357, mean_reward: -799.917, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  3080
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [0.99997119 0.00759048 0.04517138] distance:  1.0010197069007127

Episode: 749, actor_loss: 12.970, critic_loss: 95.678, mean_reward: -785.391, best_return: -53.935
last 50 episode mean reward:  -427.2629772467306


generate goal using supervised model based...
len after concat dataset:  3400
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.57119487 -0.82081448 -0.61328403] distance:  1.1730802605240633

Episode: 750, actor_loss: -11.661, critic_loss: 44.985, mean_reward: -1139.336, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  3720
len filt


Episode: 770, actor_loss: -9.960, critic_loss: 26.385, mean_reward: -191.246, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  3720
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.70172878 -0.71244418  2.45983164] distance:  2.655328920592932

Episode: 771, actor_loss: -4.352, critic_loss: 14.445, mean_reward: -317.711, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  4040
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99650107 -0.08357999  0.16604908] distance:  1.0136924069177025

Episode: 772, actor_loss: -3.350, critic_loss: 7.581, mean_reward: -126.861, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  4360
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 


Episode: 792, actor_loss: -2.511, critic_loss: 15.519, mean_reward: -187.102, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  4360
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99916049 -0.04096731  0.39990367] distance:  1.0769971878926796

Episode: 793, actor_loss: 1.759, critic_loss: 38.562, mean_reward: -285.614, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  4680
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99812308  0.0612398  -0.04924714] distance:  1.001211905995554

Episode: 794, actor_loss: 1.754, critic_loss: 26.429, mean_reward: -323.120, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  5000
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -


Episode: 814, actor_loss: 1.930, critic_loss: 24.410, mean_reward: -187.706, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  5000
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.9998612   0.01666091 -0.00208117] distance:  1.0000021656216447

Episode: 815, actor_loss: 1.760, critic_loss: 27.322, mean_reward: -183.268, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  5320
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99944964  0.03317251 -0.0174629 ] distance:  1.0001524648968922

Episode: 816, actor_loss: 2.188, critic_loss: 20.416, mean_reward: -184.703, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  5640
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -


Episode: 836, actor_loss: 2.603, critic_loss: 14.003, mean_reward: -188.310, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  5640
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [0.99629335 0.08602073 0.02601243] distance:  1.0003382661505327

Episode: 837, actor_loss: -1.171, critic_loss: 6.884, mean_reward: -474.341, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  5960
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [0.99767867 0.06809746 2.29919202] distance:  2.5072462833209093

Episode: 838, actor_loss: 2.797, critic_loss: 41.420, mean_reward: -640.739, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  6280
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.5162


Episode: 858, actor_loss: -0.328, critic_loss: 23.407, mean_reward: -315.832, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  6280
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99997829  0.00659007 -0.0471737 ] distance:  1.0011120605012032

Episode: 859, actor_loss: 17.086, critic_loss: 45.634, mean_reward: -154.938, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  6600
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.70075388  0.71340311 -0.99688059] distance:  1.4120095322552642

Episode: 860, actor_loss: -9.433, critic_loss: 97.021, mean_reward: -205.105, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  520
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274


Episode: 880, actor_loss: -8.615, critic_loss: 34.128, mean_reward: -156.482, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  520
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.97939394 -0.2019592  -5.30796638] distance:  5.401343081917432

Episode: 881, actor_loss: -3.268, critic_loss: 20.994, mean_reward: -129.226, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  840
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.999474   -0.03243032  0.09548934] distance:  1.0045487615318942

Episode: 882, actor_loss: -2.174, critic_loss: 3.713, mean_reward: -156.480, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  1160
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0


Episode: 902, actor_loss: -3.432, critic_loss: 9.610, mean_reward: -211.595, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  1160
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.97430869  0.22521674 -1.20955489] distance:  1.5694021261508855

Episode: 903, actor_loss: 1.005, critic_loss: 14.594, mean_reward: -129.802, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  1480
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99584052 -0.09111346  0.29504459] distance:  1.0426175294037023

Episode: 904, actor_loss: 1.892, critic_loss: 49.744, mean_reward: -229.146, best_return: -53.935
generate goal using supervised model based...
len after concat dataset:  1800
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -


Episode: 924, actor_loss: 3.333, critic_loss: 22.761, mean_reward: -292.455, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  1800
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99975218  0.02226169 -0.04794632] distance:  1.0011487651889994

Episode: 925, actor_loss: 1.322, critic_loss: 16.626, mean_reward: -160.652, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  2120
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [0.99946203 0.03279712 0.04257905] distance:  1.0009060774490999

Episode: 926, actor_loss: 1.064, critic_loss: 28.393, mean_reward: -307.717, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  2440
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.5


Episode: 946, actor_loss: 0.862, critic_loss: 23.947, mean_reward: -335.478, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  2440
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.98863416  0.15034129 -0.02311544] distance:  1.0002671260852212

Episode: 947, actor_loss: -0.163, critic_loss: 40.994, mean_reward: -420.777, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  2760
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.98786384  0.15532236 -0.0135208 ] distance:  1.0000914017828684

Episode: 948, actor_loss: 2.575, critic_loss: 111.058, mean_reward: -129.369, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  3080
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274


Episode: 968, actor_loss: 0.611, critic_loss: 3.840, mean_reward: -1085.970, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  3080
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99930313 -0.03732641 -0.04871599] distance:  1.0011859204175644

Episode: 969, actor_loss: 7.061, critic_loss: 213.702, mean_reward: -359.233, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  3400
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.36929621 -0.92931174  0.8511774 ] distance:  1.3132033204658224

Episode: 970, actor_loss: -4.765, critic_loss: 31.532, mean_reward: -1149.677, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  3720
len filtered concat dataset:  0
mb subgoal:  [ 0.8577527


Episode: 990, actor_loss: -0.365, critic_loss: 37.570, mean_reward: -1290.248, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  3720
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.68738886  0.72628958 -2.78645528] distance:  2.9604616243825856

Episode: 991, actor_loss: -0.793, critic_loss: 16.987, mean_reward: -1342.957, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  4040
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.70617491 -0.70803743 -1.3123541 ] distance:  1.6499312945285696

Episode: 992, actor_loss: 1.681, critic_loss: 11.080, mean_reward: -1120.454, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  4360
len filtered concat dataset:  0
mb subgoal:  [ 0.85775


Episode: 1012, actor_loss: 2.887, critic_loss: 12.706, mean_reward: -1493.220, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  4360
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.01116573  0.99993766 -6.93577239] distance:  7.007491609098637

Episode: 1013, actor_loss: 0.231, critic_loss: 3.147, mean_reward: -1192.853, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  4680
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.95272925  0.30382064 -8.        ] distance:  8.06225774829855

Episode: 1014, actor_loss: -1.001, critic_loss: 0.907, mean_reward: -872.131, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  5000
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 


Episode: 1034, actor_loss: -1.487, critic_loss: 1.195, mean_reward: -1211.374, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  5000
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.94275748 -0.33347914 -6.48523662] distance:  6.561881893111877

Episode: 1035, actor_loss: -1.417, critic_loss: 1.030, mean_reward: -1493.700, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  5320
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.30707838  0.95168423 -7.68025107] distance:  7.745079499010081

Episode: 1036, actor_loss: -0.773, critic_loss: 1.767, mean_reward: -1180.976, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  5640
len filtered concat dataset:  0
mb subgoal:  [ 0.857752


Episode: 1056, actor_loss: -0.952, critic_loss: 1.666, mean_reward: -1511.662, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  5640
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.81143205 -0.58444677 -8.        ] distance:  8.06225774829855

Episode: 1057, actor_loss: -1.381, critic_loss: 1.590, mean_reward: -1535.672, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  5960
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.74136551 -0.67110147 -8.        ] distance:  8.06225774829855

Episode: 1058, actor_loss: -0.869, critic_loss: 2.061, mean_reward: -1534.126, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  6280
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274


Episode: 1078, actor_loss: -0.756, critic_loss: 1.134, mean_reward: -754.375, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  6280
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.6236552   0.78169955 -6.48887224] distance:  6.5654750704176905

Episode: 1079, actor_loss: -1.881, critic_loss: 3.674, mean_reward: -958.723, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  6600
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.99930155 -0.03736869 -0.23167491] distance:  1.026485880143688

Episode: 1080, actor_loss: 1.468, critic_loss: 5.561, mean_reward: -711.317, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  520
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -


Episode: 1100, actor_loss: 0.271, critic_loss: 4.893, mean_reward: -1339.263, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  520
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.26120593 -0.9652831   3.59956862] distance:  3.7358926966459345

Episode: 1101, actor_loss: 3.384, critic_loss: 7.484, mean_reward: -1222.280, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  840
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.61774667 -0.78637717 -1.36463277] distance:  1.6918104521938073

Episode: 1102, actor_loss: 0.842, critic_loss: 1.425, mean_reward: -1471.650, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  1160
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 


Episode: 1122, actor_loss: 0.939, critic_loss: 1.389, mean_reward: -1336.500, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  1160
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.4362471   0.89982691 -7.80020423] distance:  7.864043875232031

Episode: 1123, actor_loss: -0.381, critic_loss: 0.951, mean_reward: -1241.024, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  1480
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [ 0.36512313  0.93095924 -6.84894176] distance:  6.921560753244717

Episode: 1124, actor_loss: -1.025, critic_loss: 0.850, mean_reward: -1230.696, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  1800
len filtered concat dataset:  0
mb subgoal:  [ 0.8577527


Episode: 1144, actor_loss: -0.125, critic_loss: 0.183, mean_reward: -1493.081, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  1800
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.89869969  0.43856456 -8.        ] distance:  8.06225774829855

Episode: 1145, actor_loss: -0.430, critic_loss: 0.283, mean_reward: -1558.057, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  2120
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.72940151  0.68408584 -8.        ] distance:  8.06225774829855

Episode: 1146, actor_loss: -0.547, critic_loss: 0.211, mean_reward: -1522.938, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  2440
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274


Episode: 1166, actor_loss: -0.364, critic_loss: 0.311, mean_reward: -1483.383, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  2440
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.52778018 -0.849381   -8.        ] distance:  8.06225774829855

Episode: 1167, actor_loss: -0.558, critic_loss: 0.285, mean_reward: -1501.199, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  2760
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.56511725  0.8250106  -7.89172449] distance:  7.954829689742157

Episode: 1168, actor_loss: -0.435, critic_loss: 0.338, mean_reward: -1549.359, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  3080
len filtered concat dataset:  0
mb subgoal:  [ 0.8577527


Episode: 1188, actor_loss: -0.013, critic_loss: 0.493, mean_reward: -1497.328, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  3080
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.48068218  0.87689489 -7.82964161] distance:  7.893243166040748

Episode: 1189, actor_loss: -2.050, critic_loss: 4.224, mean_reward: -1539.293, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  3400
len filtered concat dataset:  0
mb subgoal:  [ 0.85775274 -0.51626986 -0.15860233]  distance:  1.0136217405331878
sample final subgoal:  [-0.53427172 -0.8453128  -0.03340721] distance:  1.0005578653660556

Episode: 1190, actor_loss: 1.314, critic_loss: 3.540, mean_reward: -1522.936, best_return: -53.044
generate goal using supervised model based...
len after concat dataset:  3720
len filtered concat dataset:  0
mb subgoal:  [ 0.857752

# Saving and Loading Testing Reward

In [21]:
import pickle

with open('./Test Reward Plot/mb_test_rewards2', 'wb') as fp1:
    pickle.dump(test_rewards, fp1)
with open('./Loss Plot/mb_mean_actor_loss2', 'wb') as fp2:
    pickle.dump(mean_actor_loss_list, fp2)
with open('./Loss Plot/mb_mean_critic_loss2', 'wb') as fp3:
    pickle.dump(mean_critic_loss_list, fp3)

In [22]:
with open('./Goal Plot/mb_goal4', 'wb') as fp1:
    pickle.dump(mb_distance_list, fp1)

with open('./Goal Plot/sf_goal4', 'wb') as fp1:
    pickle.dump(sf_distance_list, fp1)

<h1> Save and Load Model </h1>

In [16]:
# model_name = 'model_14'

In [17]:
# torch.save(model, './Model/'+model_name )

In [18]:
# expert_model = torch.load('./Model/'+model_name)

In [19]:
# expert_test_rewards = []
# for i in range(5): 
# #     env = gym.wrappers.Monitor(env, 'test_video'+str(i), video_callable=lambda episode_id: True)
#     expert_test_reward = test_env(expert_model, [0, 0, 0], False)
#     expert_test_rewards.append(expert_test_reward)
#     print ('test {0}, total_reward from '+model_name+' load model: {1}'.format(i+1, expert_test_reward))

# # print ('mean expert test reward: ', np.mean(expert_test_rewards))