In [1]:
import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
import numpy as np
from mlagents_envs.environment import ActionTuple


In [2]:
# env.close()

In [3]:
# env =  UE(file_name="stage0_160523\stage0_copy",seed=1,side_channels=[])
import os

# Path to the Unity build that UnityEnvironment launches. The original
# machine-specific location stays the default; set UNITY_BUILD_PATH to point
# at a build elsewhere without editing the notebook.
file_name = os.environ.get(
    "UNITY_BUILD_PATH",
    "C:\\Users\\Palaash.HPZ\\Desktop\\RL-concept-learning_large_build_envs\\build_envs\\windows\\S2 180723\\build",
)
env = UE(file_name=file_name, seed=1, side_channels=[], worker_id=2, no_graphics=False)
# Reset once so behavior specs and the first observations are available to later cells.
env.reset()

In [4]:
# model
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

# --- Architecture sizes shared by the model classes defined below ---

# Flattened CNN feature size: 64 channels on a 7x7 map.
vision_output_dim = 64 * 7 * 7
# Number of unique words in the vocabulary (length of the one-hot language input).
num_words = 35
# Output dim of BERT.
bert_output_dim = 768
# The language branch is built with `embedding_dim` while the mixing layer is
# sized with `language_output_dim`, so the two are kept at the same value.
language_output_dim = embedding_dim = 128
# Width of the mixed vision+language vector and of the LSTM state.
mixing_dim = lstm_hidden_dim = 256
# Size of the discrete action head.
num_actions = 4

# (3,128,128) --> (64,7,7) = 3136 (3-layer CNN)
class VisualModule(nn.Module):
    """Three-layer convolutional encoder for the visual observation.

    Every convolution uses a 3x3 kernel and is followed by a ReLU. For a
    128x128 RGB input the spatial size goes 128 -> 64 -> 22 -> 7, giving
    64 * 7 * 7 = 3136 features per image.
    """

    def __init__(self):
        super().__init__()
        # (in_channels, out_channels, stride, padding) for each 3x3 convolution.
        conv_specs = [
            (3, 32, 2, 1),
            (32, 64, 3, 1),
            (64, 64, 3, 0),
        ]
        stack = []
        for c_in, c_out, stride, pad in conv_specs:
            stack.append(nn.Conv2d(c_in, c_out, kernel_size=3, stride=stride, padding=pad))
            stack.append(nn.ReLU())
        # Positional Sequential keeps the parameter names conv.0 / conv.2 / conv.4.
        self.conv = nn.Sequential(*stack)

    def forward(self, vt):
        """Encode ``vt`` (indexed as batch-first) into flat feature vectors.

        Returns a ``(batch, features)`` tensor with every size-1 dimension
        squeezed away, so a single image yields a 1-D vector.
        """
        batch = vt.size(0)
        feature_maps = self.conv(vt)
        flat = feature_maps.view(batch, -1)
        return flat.squeeze()

# one-hot encoding [0 0 1 0 0] --> 128 dimensional embedding (FF)
# S1:5 S2:5 S3:11 S4:9 --> 30 + 5 (noun) = 35 in total
class LanguageModule(nn.Module):
    """Single linear layer that embeds the word-indicator vector."""

    def __init__(self, num_words, embedding_dim):
        super().__init__()
        self.embedding = nn.Linear(in_features=num_words, out_features=embedding_dim)

    def forward(self, lt):
        """Project ``lt`` (last dimension ``num_words``) to ``embedding_dim``."""
        return self.embedding(lt)

# 3136(vision) + 128 (language) --> 256 dimensional embedding (FF)
class MixingModule(nn.Module):
    """Fuses the vision and language features with one linear layer.

    Args:
        vision_output_dim: size of the flattened vision feature vector.
        language_output_dim: size of the language embedding.
        mixing_dim: size of the fused output vector.
    """

    def __init__(self, vision_output_dim, language_output_dim, mixing_dim):
        super(MixingModule, self).__init__()
        self.linear = nn.Linear(vision_output_dim + language_output_dim, mixing_dim)

    def forward(self, vision_output, language_output):
        """Concatenate both feature tensors and project them to ``mixing_dim``.

        The concatenation runs along the last (feature) dimension. For the
        unbatched 1-D vectors used by the agents this is identical to the
        previous ``dim=0`` behaviour, and it additionally accepts batched
        ``(batch, features)`` inputs, which ``dim=0`` rejected.
        """
        combined_output = torch.cat((vision_output, language_output), dim=-1)
        mixed_output = self.linear(combined_output)
        return mixed_output

class LSTMModule(nn.Module):
    """Thin wrapper around a single ``nn.LSTMCell`` step."""

    def __init__(self,mixing_dim,lstm_hidden_dim):
        super().__init__()
        self.lstm = nn.LSTMCell(input_size=mixing_dim, hidden_size=lstm_hidden_dim)

    def forward(self,mixed_output,lstm_hidden_state):
        """Advance the cell by one step.

        ``lstm_hidden_state`` is the ``(hidden, cell)`` pair from the previous
        step; the updated ``(hidden, cell)`` pair is returned so the caller can
        both read the hidden state and feed the pair back in on the next step.
        """
        next_hidden, next_cell = self.lstm(mixed_output, lstm_hidden_state)
        return next_hidden, next_cell

class Agent1(nn.Module):
    """Recurrent actor-critic agent driven by a linearly embedded language vector.

    Pipeline: CNN vision features + linear language embedding -> mixing layer
    -> one LSTMCell step -> two linear heads (action scores and state value).
    """

    def __init__(self, num_words, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim,num_actions):
        super(Agent1, self).__init__()
        # Attribute names and creation order are kept stable: they determine
        # the state-dict keys that save()/load() read and write.
        self.language_module = LanguageModule(num_words, embedding_dim)
        self.visual_module = VisualModule()
        self.mixing_module = MixingModule(vision_output_dim, language_output_dim, mixing_dim)
        self.lstm_module = LSTMModule(mixing_dim, lstm_hidden_dim)
        self.action_predictor = nn.Linear(lstm_hidden_dim, num_actions)
        self.value_estimator = nn.Linear(lstm_hidden_dim, 1)

    def forward(self, vt, lt, lstm_hidden_state):
        """Run one agent step.

        Args:
            vt: visual observation passed to the visual module.
            lt: language input passed to the language module.
            lstm_hidden_state: ``(hidden, cell)`` pair from the previous step.

        Returns:
            ``(action_scores, value_estimate, lstm_state)``. ``action_scores``
            are the raw outputs of the action head (no softmax is applied
            here), and ``lstm_state`` is the updated ``(hidden, cell)`` pair.
        """
        vision_output = self.visual_module(vt)
        language_output = self.language_module(lt)
        # The mixing output is unbatched here; the LSTM cell gets a leading batch dim of 1.
        mixed_output = self.mixing_module(vision_output, language_output).unsqueeze(0)
        lstm_output = self.lstm_module(mixed_output,lstm_hidden_state)
        hidden = lstm_output[0]
        action_logits = self.action_predictor(hidden)
        value_estimate = self.value_estimator(hidden)
        return action_logits,value_estimate,lstm_output

    def save(self, episode, ALG_NAME, ENV_ID):
        """Write the state dict to ``model/<ALG_NAME>_<ENV_ID>/agent_<episode>.pt``."""
        path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(path, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(path, f'agent_{episode}.pt'))

    def load(self, episode, ALG_NAME, ENV_ID):
        """Load the state dict previously written by :meth:`save`.

        Tensors are mapped onto the device this module currently lives on, so
        a checkpoint saved from a GPU run can also be restored on a CPU-only
        machine.
        """
        path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
        first_param = next(self.parameters(), None)
        target_device = first_param.device if first_param is not None else torch.device('cpu')
        # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
        state = torch.load(os.path.join(path, f'agent_{episode}.pt'), map_location=target_device)
        self.load_state_dict(state)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# model w Bert 
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

# --- Architecture sizes for the BERT-based agent defined in this cell ---

# Flattened CNN feature size: 64 channels on a 7x7 map.
vision_output_dim = 64 * 7 * 7
# Number of unique words in the vocabulary.
num_words = 35
# Output dim of BERT, fed into the language projection layer.
bert_output_dim = 768
# The BERT projection is built with `embedding_dim` while the mixing layer is
# sized with `language_output_dim`, so the two are kept at the same value.
language_output_dim = embedding_dim = 128
# Width of the mixed vision+language vector and of the LSTM state.
mixing_dim = lstm_hidden_dim = 256
# Size of the discrete action head.
num_actions = 4

# (3,128,128) --> (64,7,7) = 3136 (3-layer CNN)
class VisualModule(nn.Module):
    """Three-layer convolutional encoder for the visual observation.

    All convolutions are 3x3 with a ReLU after each. A 128x128 RGB input is
    reduced 128 -> 64 -> 22 -> 7 spatially, i.e. 64 * 7 * 7 = 3136 features.
    """

    def __init__(self):
        super().__init__()
        # One entry per convolution: (in_channels, out_channels, stride, padding).
        layer_plan = (
            (3, 32, 2, 1),
            (32, 64, 3, 1),
            (64, 64, 3, 0),
        )
        modules = []
        for in_ch, out_ch, step, padding in layer_plan:
            modules.extend([
                nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=step, padding=padding),
                nn.ReLU(),
            ])
        # Positional Sequential keeps the parameter names conv.0 / conv.2 / conv.4.
        self.conv = nn.Sequential(*modules)

    def forward(self, vt):
        """Encode ``vt`` (indexed as batch-first) into flat feature vectors.

        The result is ``(batch, features)`` with all size-1 dimensions removed,
        so a batch of one comes back as a 1-D vector.
        """
        n_images = vt.size(0)
        encoded = self.conv(vt)
        return encoded.view(n_images, -1).squeeze()

# one-hot encoding [0 0 1 0 0] --> 128 dimensional embedding (FF)
# S1:5 S2:5 S3:11 S4:9 --> 30 + 5 (noun) = 35 in total
# class LanguageModule(nn.Module): 
#     def __init__(self, num_words, embedding_dim):
#         super(LanguageModule, self).__init__()
#         self.embedding = nn.Linear(num_words, embedding_dim)

#     def forward(self, lt):
#         embedded_lt = self.embedding(lt)
#         return embedded_lt

# bert encoder 
from transformers import BertTokenizer,BertModel

class BertEncoder(nn.Module):
    """Converts a caption string into BERT token ids and an attention mask.

    Only wraps the ``bert-base-uncased`` tokenizer; it owns no weights.
    """

    def __init__(self):
        super().__init__()
        self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    def forward(self, input_caption):
        """Tokenize one caption.

        The caption's tokens are wrapped in ``[CLS]`` ... ``[SEP]``; no padding
        is appended. The mask is 1 for every token other than ``[PAD]``.

        Returns:
            ``(token_ids, attention_mask)``, each a tensor with a leading
            batch dimension of 1.
        """
        pieces = ["[CLS]", *self.bert_tokenizer.tokenize(input_caption), "[SEP]"]
        mask = [int(piece != '[PAD]') for piece in pieces]
        ids = self.bert_tokenizer.convert_tokens_to_ids(pieces)
        batched_ids = torch.tensor(ids).unsqueeze(0)
        batched_mask = torch.tensor(mask).unsqueeze(0)
        return batched_ids, batched_mask
        
    
class BertModule(nn.Module):
    """Frozen ``bert-base-uncased`` encoder followed by a trainable linear projection."""

    def __init__(self,bert_output_dim,embedding_dim):
        super().__init__()
        self.bert_model = BertModel.from_pretrained("bert-base-uncased")
        self.embedding = nn.Linear(bert_output_dim, embedding_dim)

        # BERT stays frozen; only the projection layer receives gradients.
        for weight in self.bert_model.parameters():
            weight.requires_grad = False

    def forward(self,torch_token_ids,torch_attention_mask):
        """Embed one tokenized caption.

        Takes element 1 of the BERT output (presumably the pooled sentence
        representation -- confirm against the transformers BertModel docs),
        flattens it to 1-D and projects it to ``embedding_dim``.
        """
        bert_out = self.bert_model(torch_token_ids, torch_attention_mask)
        sentence_vec = bert_out[1].view(-1)
        return self.embedding(sentence_vec)
    

# 3136(vision) + 128 (language) --> 256 dimensional embedding (FF)
class MixingModule(nn.Module):
    """Fuses the vision and language features with one linear layer.

    Args:
        vision_output_dim: size of the flattened vision feature vector.
        language_output_dim: size of the language embedding.
        mixing_dim: size of the fused output vector.
    """

    def __init__(self, vision_output_dim, language_output_dim, mixing_dim):
        super(MixingModule, self).__init__()
        self.linear = nn.Linear(vision_output_dim + language_output_dim, mixing_dim)

    def forward(self, vision_output, language_output):
        """Concatenate both feature tensors and project them to ``mixing_dim``.

        Joining on the last (feature) dimension gives the same result as the
        former ``dim=0`` for the unbatched 1-D vectors the agents pass in, and
        also works for batched ``(batch, features)`` inputs, which ``dim=0``
        could not concatenate.
        """
        combined_output = torch.cat((vision_output, language_output), dim=-1)
        mixed_output = self.linear(combined_output)
        return mixed_output

class LSTMModule(nn.Module):
    """Single-step recurrent core built on ``nn.LSTMCell``."""

    def __init__(self,mixing_dim,lstm_hidden_dim):
        super().__init__()
        self.lstm = nn.LSTMCell(input_size=mixing_dim, hidden_size=lstm_hidden_dim)

    def forward(self,mixed_output,lstm_hidden_state):
        """Apply one LSTM step and return the new ``(hidden, cell)`` pair.

        The full pair is returned rather than just the hidden state, so the
        caller can carry it into the next step.
        """
        h_next, c_next = self.lstm(mixed_output, lstm_hidden_state)
        return h_next, c_next

class Agent2(nn.Module):
    """Recurrent actor-critic agent whose language branch is a frozen BERT encoder.

    Pipeline: CNN vision features + projected BERT caption embedding -> mixing
    layer -> one LSTMCell step -> two linear heads (action scores, state value).
    """

    def __init__(self, bert_output_dim, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim,num_actions):
        super(Agent2, self).__init__()
        # Attribute names and creation order are kept stable: they determine
        # the state-dict keys that save()/load() read and write.
        self.bert_language_module = BertModule(bert_output_dim, embedding_dim)
        self.visual_module = VisualModule()
        self.mixing_module = MixingModule(vision_output_dim, language_output_dim, mixing_dim)
        self.lstm_module = LSTMModule(mixing_dim, lstm_hidden_dim)
        self.action_predictor = nn.Linear(lstm_hidden_dim, num_actions)
        self.value_estimator = nn.Linear(lstm_hidden_dim, 1)

    def forward(self, vt, torch_token_ids, torch_attention_mask, lstm_hidden_state):
        """Run one agent step.

        Args:
            vt: visual observation passed to the visual module.
            torch_token_ids: caption token ids passed to the BERT module.
            torch_attention_mask: attention mask matching ``torch_token_ids``.
            lstm_hidden_state: ``(hidden, cell)`` pair from the previous step.

        Returns:
            ``(action_scores, value_estimate, lstm_state)``. ``action_scores``
            are the raw outputs of the action head (no softmax is applied
            here), and ``lstm_state`` is the updated ``(hidden, cell)`` pair.
        """
        vision_output = self.visual_module(vt)
        language_output = self.bert_language_module(torch_token_ids, torch_attention_mask)
        # The mixing output is unbatched here; the LSTM cell gets a leading batch dim of 1.
        mixed_output = self.mixing_module(vision_output, language_output).unsqueeze(0)
        lstm_output = self.lstm_module(mixed_output,lstm_hidden_state)
        hidden = lstm_output[0]
        action_logits = self.action_predictor(hidden)
        value_estimate = self.value_estimator(hidden)
        return action_logits,value_estimate,lstm_output

    def save(self, episode, ALG_NAME, ENV_ID):
        """Write the state dict to ``model/<ALG_NAME>_<ENV_ID>/agent_<episode>.pt``."""
        path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(path, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(path, f'agent_{episode}.pt'))

    def load(self, episode, ALG_NAME, ENV_ID):
        """Load the state dict previously written by :meth:`save`.

        Tensors are mapped onto the device this module currently lives on, so
        a checkpoint saved from a GPU run can also be restored on a CPU-only
        machine.
        """
        path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
        first_param = next(self.parameters(), None)
        target_device = first_param.device if first_param is not None else torch.device('cpu')
        # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
        state = torch.load(os.path.join(path, f'agent_{episode}.pt'), map_location=target_device)
        self.load_state_dict(state)

In [None]:
#

# import warnings
# # Disable all warnings
# warnings.filterwarnings("ignore")

from transformers import logging
logging.set_verbosity_error()

import argparse
import time
import json
# import matplotlib.pyplot as plt
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from collections import OrderedDict
# add arguments in command --train/test
# parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
# parser.add_argument('--train', dest='train', action='store_true', default=False)
# parser.add_argument('--test', dest='test', action='store_true', default=True)
# args = parser.parse_args()
# ---- Run mode and device ----
# The training loop in this cell only runs when `train` is truthy.
train = True
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device) 

# ---- Checkpoint to start from (algorithm tag, environment tag, episode) ----
S0_ALG_NAME, S0_ENV_ID, S0_episode = 'S2', '23', 24037

# ---- Tags under which this run's checkpoints and results are written ----
ALG_NAME, ENV_ID = 'S2', '26'

# ---- Training hyper-parameters ----
TRAIN_EPISODES = 175267   # number of overall episodes for training
MAX_STEPS = 500           # maximum time step in one episode
LAM = 0.95                # discount applied when accumulating rewards into returns
lr = 2.5e-5               # optimizer learning rate
speed = 3                 # multiplier applied to the one-hot action sent to the environment
num_steps = 250           # rollout length between network updates
max_step_reward = -10     # reward assigned when an episode reaches MAX_STEPS

if __name__ == '__main__':
    # Training driver: builds the BERT-conditioned agent, restores the starting
    # checkpoint named by the S0_* constants, then runs actor-critic updates
    # against the already-open Unity environment `env`.
    agent = Agent2(bert_output_dim, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim,num_actions)
    agent.load(S0_episode,S0_ALG_NAME,S0_ENV_ID)
    agent.to(device)

    optimizer = optim.RMSprop(agent.parameters(), lr=lr)
    best_score = float('-inf')
    # The tokenizer wrapper does not depend on the episode, so build it once
    # instead of reloading the tokenizer at the start of every episode.
    bert_encoder = BertEncoder()
    # Observation indices -> words. Shape indices are 0-4; colour indices are
    # offset by 5 before lookup.
    hashmap = {
        0: 'capsule',
        1: 'cube',
        2: 'cylinder',
        3: 'prism',
        4: 'sphere',
        5: 'red',
        6: 'green',
        7: 'blue',
        8: 'yellow',
        9: 'black',
    }
    if train:
        entropy_term = 0
        all_episode_reward = []
        all_average_reward = []
        all_steps = []
        all_actor_loss = []
        all_critic_loss = []
        all_entropy_loss = []
        all_total_loss = []
        tracked_agent = -1
        for episode in range(TRAIN_EPISODES):
            episode_reward = 0
            behavior_name=list(env.behavior_specs)[0]
            STEPS = 0
            decision_steps, terminal_steps = env.get_steps(behavior_name)

            # Episode state: current frame, tokenized instruction, LSTM state.
            vt = torch.tensor(decision_steps.obs[0]).reshape(1,3,128,128).to(device)
            index1 = int(decision_steps.obs[1][0][0])
            index2 = int(decision_steps.obs[1][0][1])+5
            input_caption = f"{hashmap[index2]} {hashmap[index1]}"
            print(f'---{input_caption}---')
            torch_token_ids,torch_attention_mask=bert_encoder(input_caption)
            torch_token_ids=torch_token_ids.to(device)
            torch_attention_mask=torch_attention_mask.to(device)

            lstm_hidden_state = (torch.zeros(1, lstm_hidden_dim).to(device), torch.zeros(1, lstm_hidden_dim).to(device))
            done = False
            while True:
                # Per-rollout buffers consumed by the loss below.
                log_probs = []
                values = []
                rewards = []

                for steps in range(num_steps):
                    # Detach the recurrent state so gradients do not flow into earlier steps.
                    lstm_hidden_state = tuple(tensor.detach() for tensor in lstm_hidden_state)
                    policy_dist, value, lstm_hidden_state = agent(vt,torch_token_ids,torch_attention_mask,lstm_hidden_state)
                    STEPS += 1

                    # Sample from the detached policy; the log-prob below keeps the graph.
                    action_dist = Categorical(F.softmax(policy_dist.detach(),dim=1))
                    action = action_dist.sample()
                    # `action` is already a tensor; re-wrapping it in torch.tensor()
                    # only produced a copy-construct warning.
                    action_onehot = F.one_hot(action,num_actions).cpu()

                    log_prob = torch.log(F.softmax(policy_dist,dim=1)[0][action])
                    # NOTE(review): this is the cross-entropy of the *detached* logits
                    # against the sampled action (i.e. -log pi(a)), not the policy
                    # entropy, and `entropy_term` is never reset between updates.
                    # Being detached, `entropy_loss` shifts the logged loss but adds
                    # no gradient. Left numerically unchanged so training dynamics
                    # stay as they were; confirm the intent before changing it.
                    entropy = F.cross_entropy(policy_dist.detach(), action)

                    discrete_actions = np.array(action_onehot).reshape(1,4)*speed
                    action_tuple = ActionTuple()
                    action_tuple.add_discrete(discrete_actions)
                    env.set_actions(behavior_name,action_tuple)

                    env.step()
                    decision_steps, terminal_steps = env.get_steps(behavior_name)

                    if tracked_agent == -1 and len(decision_steps) >= 1:
                        tracked_agent = decision_steps.agent_id[0]

                    # Default for steps where the tracked agent appears in neither
                    # set; previously `reward` was unbound or stale in that case.
                    reward = 0.0
                    if tracked_agent in terminal_steps: # roll over or hit the target
                        print('Agent in terminal steps')
                        done = True
                        reward = terminal_steps[tracked_agent].reward
                        if not reward > 0:
                            reward = -1 # roll over or other unseen conditions

                        print(f'Terminal Step reward: {reward}')

                    elif tracked_agent in decision_steps: # the agent which requires action
                        reward = decision_steps[tracked_agent].reward
                        if reward<0:
                            print(f'Decision Step reward: {reward}')
                    if STEPS >= MAX_STEPS:
                        # Time limit reached: override the reward and restart the environment.
                        reward = max_step_reward
                        print(f'Max Step Reward: {reward}')
                        env.reset()
                        done = True
                    if STEPS % num_steps == 0:
                        print (f'Step: {STEPS}')

                    episode_reward = episode_reward + reward

                    rewards.append(reward)
                    values.append(value)
                    log_probs.append(log_prob)
                    entropy_term = entropy_term + entropy
                    # Only refresh the frame when a decision step is available; on a
                    # terminal step the decision batch can be empty, which made the
                    # reshape fail.
                    if len(decision_steps) > 0:
                        vt = torch.tensor(decision_steps.obs[0]).reshape(1,3,128,128).to(device)

                    if done or steps == num_steps-1:
                        break

                # Stack the per-step value estimates once: shape (T, 1).
                values = torch.cat(values, dim=0)

                # Discounted Monte Carlo returns over the rollout. No bootstrap
                # value is added when the rollout stops before the episode ends.
                discounted_rewards = np.zeros((len(rewards), 1), dtype=np.float32)
                cumulative = 0
                for t in reversed(range(len(rewards))):
                    cumulative = rewards[t] + LAM * cumulative
                    discounted_rewards[t] = cumulative

                # Advantage actor-critic losses:
                #   advantage   = return_t - V_t
                #   actor_loss  = mean(-log pi(a_t|s_t) * advantage)
                #   critic_loss = 0.5 * mean(advantage^2)
                discounted_rewards = torch.from_numpy(discounted_rewards).to(device)
                log_probs = torch.stack(log_probs)
                advantage = discounted_rewards - values
                actor_loss = (-log_probs * advantage).mean()
                critic_loss = 0.5 * torch.square(advantage).mean()
                entropy_term /= num_steps
                entropy_loss = -0.1 * entropy_term
                ac_loss = actor_loss + critic_loss + entropy_loss

                optimizer.zero_grad()
                ac_loss.backward()
                optimizer.step()

                if done: break


            # Per-episode bookkeeping (losses are those of the episode's last update).
            all_episode_reward.append(float(episode_reward))
            all_steps.append(STEPS)
            all_actor_loss.append(float(actor_loss))
            all_critic_loss.append(float(critic_loss))
            all_entropy_loss.append(float(entropy_loss))
            all_total_loss.append(float(ac_loss))
            if episode >= 100:
                avg_score = np.mean(all_episode_reward[-100:])
                all_average_reward.append(avg_score)
                if avg_score > best_score:
                    best_score = avg_score
                    agent.save(episode, ALG_NAME, ENV_ID)
                    # The average covers the last 100 episodes; the message used to say 200.
                    print(f'-----The best score for averaging previous 100 episode reward is {best_score}. Model has been saved-----')
                print('Training  | Episode: {}/{}  | Episode Reward: {:.1f}  | Average Reward {:.2f}  | Actor loss: {:.2f} | Critic loss: {:.2f} | Entropy loss: {:.4f}  | Total Loss: {:.2f} | Total Steps: {}' \
                    .format(episode + 1, TRAIN_EPISODES, episode_reward, avg_score, actor_loss, critic_loss,entropy_loss,  ac_loss, STEPS))
            else:  print('Training  | Episode: {}/{}  | Episode Reward: {:.1f}  | Actor loss: {:.2f} | Critic loss: {:.2f} | Entropy loss: {:.4f}  | Total Loss: {:.2f} | Total Steps: {}' \
                    .format(episode + 1, TRAIN_EPISODES, episode_reward, actor_loss, critic_loss, entropy_loss,  ac_loss, STEPS))
            # Periodic checkpoint, independent of the best-score checkpoint above.
            if episode%5000 == 0:
                agent.save(episode, ALG_NAME, ENV_ID)
                print("Model has been saved")
        print(all_average_reward)
        agent.save(episode ,ALG_NAME, ENV_ID)
        print("Model has been saved")

        data = {
            'all_average_reward': all_average_reward,
            'all_episode_reward': all_episode_reward,
            'all_actor_loss': all_actor_loss,
            'all_critic_loss': all_critic_loss,
            'all_entropy_loss': all_entropy_loss,
            'all_total_loss': all_total_loss,
            'all_steps': all_steps,
        }
        # Make sure the output directory exists so the metrics are not lost to a
        # FileNotFoundError at the very end of training.
        os.makedirs('result', exist_ok=True)
        file_path = f'result/{ALG_NAME}_{ENV_ID}.txt'
        with open(file_path, 'w') as file:
            json.dump(data, file)



cuda
---black capsule---




Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1/175267  | Episode Reward: 5.0  | Actor loss: -0.15 | Critic loss: 8.13 | Entropy loss: -0.0032  | Total Loss: 7.98 | Total Steps: 44
Model has been saved
---green prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 2/175267  | Episode Reward: 10.0  | Actor loss: 0.02 | Critic loss: 6.68 | Entropy loss: -0.0000  | Total Loss: 6.69 | Total Steps: 6
---blue sphere---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 3/175267  | Episode Reward: 5.0  | Actor loss: 0.05 | Critic loss: 6.50 | Entropy loss: -0.0027  | Total Loss: 6.55 | Total Steps: 56
---yellow cylinder---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 4/175267  | Episode Reward: 7.5  | Actor loss: 0.30 | Critic loss: 4.33 | Entropy loss: -0.0025

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 33/175267  | Episode Reward: 10.0  | Actor loss: 0.01 | Critic loss: 1.27 | Entropy loss: -0.0000  | Total Loss: 1.27 | Total Steps: 6
---yellow capsule---
Decision Step reward: -2.5
Decision Step reward: -1.0
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 34/175267  | Episode Reward: 6.5  | Actor loss: -0.35 | Critic loss: 5.43 | Entropy loss: -0.0027  | Total Loss: 5.08 | Total Steps: 45
---green sphere---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 35/175267  | Episode Reward: 10.0  | Actor loss: -0.25 | Critic loss: 1.09 | Entropy loss: -0.0020  | Total Loss: 0.84 | Total Steps: 7
---black cylinder---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 36/175267  | Episode Reward: 7.5  | Actor loss: -0.23 | Critic loss: 2.49 | Entropy loss: -0.0042  | Total Loss: 2.26 | Total Steps: 75
---yellow cube---
Decision 

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 65/175267  | Episode Reward: 7.5  | Actor loss: 0.17 | Critic loss: 7.41 | Entropy loss: -0.0006  | Total Loss: 7.58 | Total Steps: 30
---black cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 66/175267  | Episode Reward: 10.0  | Actor loss: 0.11 | Critic loss: 2.96 | Entropy loss: -0.0005  | Total Loss: 3.07 | Total Steps: 30
---green cylinder---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 67/175267  | Episode Reward: 7.5  | Actor loss: 0.07 | Critic loss: 3.99 | Entropy loss: -0.0004  | Total Loss: 4.06 | Total Steps: 34
---black prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 68/175267  | Episode Reward: 10.0  | Actor loss: 0.01 | Critic loss: 0.86 | Entropy loss: -0.0000  | Total Loss: 0.87 | Total Steps: 6
---blue cube---
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step rewa

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 98/175267  | Episode Reward: 7.5  | Actor loss: 0.01 | Critic loss: 0.96 | Entropy loss: -0.0004  | Total Loss: 0.96 | Total Steps: 38
---red sphere---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 99/175267  | Episode Reward: 10.0  | Actor loss: 0.03 | Critic loss: 0.44 | Entropy loss: -0.0001  | Total Loss: 0.47 | Total Steps: 6
---blue cube---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 100/175267  | Episode Reward: 7.5  | Actor loss: -0.85 | Critic loss: 7.67 | Entropy loss: -0.0041  | Total Loss: 6.82 | Total Steps: 48
---yellow cube---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
-----The best score for averaging previous 200 episode reward is 7.495. Model has been saved-----
Training  | Episode: 101/175267  | Episode Reward: 7.5  | Average Reward 7.50  | Actor loss: -0.36 | Critic loss: 3.80 | En

Agent in terminal steps
Terminal Step reward: 10.0
-----The best score for averaging previous 200 episode reward is 7.705. Model has been saved-----
Training  | Episode: 128/175267  | Episode Reward: 10.0  | Average Reward 7.71  | Actor loss: 0.00 | Critic loss: 0.80 | Entropy loss: -0.0000  | Total Loss: 0.80 | Total Steps: 6
---black prism---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
-----The best score for averaging previous 200 episode reward is 7.73. Model has been saved-----
Training  | Episode: 129/175267  | Episode Reward: 7.5  | Average Reward 7.73  | Actor loss: 0.00 | Critic loss: 3.22 | Entropy loss: -0.0002  | Total Loss: 3.22 | Total Steps: 30
---black cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
-----The best score for averaging previous 200 episode reward is 7.78. Model has been saved-----
Training  | Episode: 130/175267  | Episode Reward: 10.0  | Average Reward 7.78  | Actor loss: 0.02 | Critic loss: 2.32 | Entropy 

Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 155/175267  | Episode Reward: 0.0  | Average Reward 7.75  | Actor loss: -0.61 | Critic loss: 11.61 | Entropy loss: -0.0031  | Total Loss: 11.00 | Total Steps: 55
---green sphere---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 156/175267  | Episode Reward: 7.5  | Average Reward 7.72  | Actor loss: 0.20 | Critic loss: 4.70 | Entropy loss: -0.0009  | Total Loss: 4.90 | Total Steps: 29
---green cylinder---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 157/175267  | Episode Reward: 7.5  | Average Reward 7.70  | Actor loss: 0.10 | Critic loss: 5.85 | Entropy loss: -0.0007  | Total Loss: 5.96 | Total Steps: 30
---blue prism---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 158/175267  | Episode Reward: 7.5  | Average Reward 7.75  | Actor l

Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 184/175267  | Episode Reward: 5.0  | Average Reward 7.43  | Actor loss: -0.23 | Critic loss: 6.98 | Entropy loss: -0.0046  | Total Loss: 6.74 | Total Steps: 64
---green capsule---
Decision Step reward: -1.0
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 185/175267  | Episode Reward: 9.0  | Average Reward 7.42  | Actor loss: -0.01 | Critic loss: 2.99 | Entropy loss: -0.0073  | Total Loss: 2.97 | Total Steps: 154
---green sphere---
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -1.0
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 186/175267  | Episode Reward: 1.5  | Average Reward 7.33  | Actor loss: -0.98 | Critic loss: 5.29 | Entropy loss: -0.0055  | Total Loss: 4.31 | Total Steps: 50
---green cylinder---
Decision Step reward: -2.5
Agent in terminal steps
Ter

Decision Step reward: -2.5
Decision Step reward: -1.0
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 213/175267  | Episode Reward: 6.5  | Average Reward 7.04  | Actor loss: -0.18 | Critic loss: 2.50 | Entropy loss: -0.0046  | Total Loss: 2.32 | Total Steps: 58
---blue cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 214/175267  | Episode Reward: 10.0  | Average Reward 7.08  | Actor loss: 0.04 | Critic loss: 7.91 | Entropy loss: -0.0000  | Total Loss: 7.95 | Total Steps: 6
---yellow sphere---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 215/175267  | Episode Reward: 5.0  | Average Reward 7.04  | Actor loss: 0.00 | Critic loss: 2.87 | Entropy loss: -0.0012  | Total Loss: 2.87 | Total Steps: 42
---green cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 216/175267  | Episode Reward: 10.0  | Average Reward 7.06  | Actor lo

Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 240/175267  | Episode Reward: 5.0  | Average Reward 6.32  | Actor loss: 0.03 | Critic loss: 3.93 | Entropy loss: -0.0027  | Total Loss: 3.96 | Total Steps: 42
---yellow capsule---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 241/175267  | Episode Reward: 10.0  | Average Reward 6.37  | Actor loss: 0.72 | Critic loss: 0.83 | Entropy loss: -0.0015  | Total Loss: 1.55 | Total Steps: 7
---black prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 242/175267  | Episode Reward: 10.0  | Average Reward 6.39  | Actor loss: 0.06 | Critic loss: 3.45 | Entropy loss: -0.0003  | Total Loss: 3.52 | Total Steps: 31
---yellow capsule---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 243/175267  | Episode Reward: 7.5  | Average Reward 6.37  | Actor loss: 0.00 | Critic loss: 4

Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 270/175267  | Episode Reward: 5.0  | Average Reward 6.68  | Actor loss: -0.07 | Critic loss: 4.68 | Entropy loss: -0.0011  | Total Loss: 4.61 | Total Steps: 43
---black prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 271/175267  | Episode Reward: 10.0  | Average Reward 6.70  | Actor loss: 0.00 | Critic loss: 0.72 | Entropy loss: -0.0000  | Total Loss: 0.73 | Total Steps: 6
---red prism---
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Step: 250
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 272/175267  | Episode Reward: -5.0  | Average Reward 6.67  | Actor loss: -0.78 | Critic loss: 9.77 | Entropy loss: -0.0107  | Total Loss: 8.99 | Total Steps: 399
---red cube---
Decision Step reward: -2.5
A

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 300/175267  | Episode Reward: 10.0  | Average Reward 6.92  | Actor loss: 0.01 | Critic loss: 0.39 | Entropy loss: -0.0000  | Total Loss: 0.39 | Total Steps: 6
---blue cube---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 301/175267  | Episode Reward: 5.0  | Average Reward 7.00  | Actor loss: -0.14 | Critic loss: 7.35 | Entropy loss: -0.0012  | Total Loss: 7.21 | Total Steps: 52
---red sphere---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 302/175267  | Episode Reward: 10.0  | Average Reward 7.03  | Actor loss: 0.06 | Critic loss: 3.76 | Entropy loss: -0.0005  | Total Loss: 3.83 | Total Steps: 30
---green cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 303/175267  | Episode Reward: 10.0  | Average Reward 7.05  | Actor loss: 0.02 | Critic loss: 3.73 | Entropy loss: -0.0011  | 

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 330/175267  | Episode Reward: 10.0  | Average Reward 7.54  | Actor loss: 0.02 | Critic loss: 2.26 | Entropy loss: -0.0000  | Total Loss: 2.28 | Total Steps: 6
---black cylinder---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 331/175267  | Episode Reward: 7.5  | Average Reward 7.54  | Actor loss: 0.04 | Critic loss: 2.20 | Entropy loss: -0.0061  | Total Loss: 2.23 | Total Steps: 69
---red sphere---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 332/175267  | Episode Reward: 7.5  | Average Reward 7.52  | Actor loss: 0.21 | Critic loss: 3.93 | Entropy loss: -0.0050  | Total Loss: 4.13 | Total Steps: 58
---black prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 333/175267  | Episode Reward: 10.0  | Average Reward 7.71  | Actor loss: 0.01 | Critic loss: 2.07 | Entropy loss: -0.0000  | 

Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 360/175267  | Episode Reward: 5.0  | Average Reward 7.68  | Actor loss: 0.04 | Critic loss: 5.80 | Entropy loss: -0.0016  | Total Loss: 5.84 | Total Steps: 44
---red cube---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 361/175267  | Episode Reward: 5.0  | Average Reward 7.66  | Actor loss: -0.02 | Critic loss: 8.50 | Entropy loss: -0.0003  | Total Loss: 8.48 | Total Steps: 42
---black capsule---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 362/175267  | Episode Reward: 10.0  | Average Reward 7.66  | Actor loss: 0.02 | Critic loss: 1.61 | Entropy loss: -0.0000  | Total Loss: 1.63 | Total Steps: 6
---yellow cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 363/175267  | Episode Reward: 10.0  | Average Reward 7.68  | Actor loss: 

Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 389/175267  | Episode Reward: 7.5  | Average Reward 7.50  | Actor loss: -0.09 | Critic loss: 2.93 | Entropy loss: -0.0033  | Total Loss: 2.84 | Total Steps: 33
---yellow cylinder---
Decision Step reward: -1.0
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 390/175267  | Episode Reward: 6.5  | Average Reward 7.54  | Actor loss: -0.18 | Critic loss: 2.89 | Entropy loss: -0.0041  | Total Loss: 2.71 | Total Steps: 109
---red cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 391/175267  | Episode Reward: 10.0  | Average Reward 7.54  | Actor loss: 0.08 | Critic loss: 4.30 | Entropy loss: -0.0024  | Total Loss: 4.38 | Total Steps: 37
---green prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 392/175267  | Episode Reward: 10.0  | Average Reward 7.58  | Actor loss: 0.01 | Critic loss

Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 417/175267  | Episode Reward: 5.0  | Average Reward 6.91  | Actor loss: -0.03 | Critic loss: 3.77 | Entropy loss: -0.0007  | Total Loss: 3.73 | Total Steps: 43
---red cylinder---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 418/175267  | Episode Reward: 7.5  | Average Reward 6.93  | Actor loss: 0.06 | Critic loss: 7.78 | Entropy loss: -0.0003  | Total Loss: 7.84 | Total Steps: 29
---blue cube---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 419/175267  | Episode Reward: 5.0  | Average Reward 6.99  | Actor loss: -0.45 | Critic loss: 5.10 | Entropy loss: -0.0035  | Total Loss: 4.64 | Total Steps: 53
---yellow capsule---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 420/175267  | Episode Reward: 10.0  | Average Reward 7.04  | Actor lo

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 445/175267  | Episode Reward: 7.5  | Average Reward 6.62  | Actor loss: 0.03 | Critic loss: 5.49 | Entropy loss: -0.0003  | Total Loss: 5.52 | Total Steps: 37
---blue prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 446/175267  | Episode Reward: 10.0  | Average Reward 6.62  | Actor loss: -0.31 | Critic loss: 5.21 | Entropy loss: -0.0069  | Total Loss: 4.89 | Total Steps: 143
---yellow capsule---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 447/175267  | Episode Reward: 10.0  | Average Reward 6.62  | Actor loss: 0.01 | Critic loss: 1.02 | Entropy loss: -0.0000  | Total Loss: 1.03 | Total Steps: 6
---blue sphere---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 448/175267  | Episode Reward: 10.0  | Average Reward 6.67  | Actor loss: 0.01 | Critic loss: 0.45 | Entropy loss: -0.0000  | Total Loss: 0.46 | Total Steps: 6
---blue capsule--

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 475/175267  | Episode Reward: 10.0  | Average Reward 6.62  | Actor loss: 0.02 | Critic loss: 4.28 | Entropy loss: -0.0000  | Total Loss: 4.30 | Total Steps: 6
---blue prism---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 476/175267  | Episode Reward: 5.0  | Average Reward 6.60  | Actor loss: -0.04 | Critic loss: 6.16 | Entropy loss: -0.0006  | Total Loss: 6.12 | Total Steps: 53
---black cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 477/175267  | Episode Reward: 10.0  | Average Reward 6.65  | Actor loss: 0.05 | Critic loss: 4.70 | Entropy loss: -0.0002  | Total Loss: 4.75 | Total Steps: 31
---black prism---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 478/175267  | Episode Reward: 7.5  | Average Reward 6.65  | Actor loss: -0.06 | Critic loss: 4.61 | 

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 504/175267  | Episode Reward: 10.0  | Average Reward 6.34  | Actor loss: 0.16 | Critic loss: 3.15 | Entropy loss: -0.0002  | Total Loss: 3.31 | Total Steps: 6
---green prism---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 505/175267  | Episode Reward: 7.5  | Average Reward 6.32  | Actor loss: 0.10 | Critic loss: 8.49 | Entropy loss: -0.0003  | Total Loss: 8.59 | Total Steps: 30
---blue prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 506/175267  | Episode Reward: 10.0  | Average Reward 6.32  | Actor loss: 0.03 | Critic loss: 4.37 | Entropy loss: -0.0000  | Total Loss: 4.40 | Total Steps: 6
---black cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 507/175267  | Episode Reward: 10.0  | Average Reward 6.67  | Actor loss: 0.02 | Critic loss: 3.44 | Entropy loss: -0.0002  | Total Loss: 3.46 | Total St

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 532/175267  | Episode Reward: 10.0  | Average Reward 6.51  | Actor loss: 0.02 | Critic loss: 4.20 | Entropy loss: -0.0000  | Total Loss: 4.23 | Total Steps: 6
---blue cube---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 533/175267  | Episode Reward: 5.0  | Average Reward 6.54  | Actor loss: -0.06 | Critic loss: 4.79 | Entropy loss: -0.0025  | Total Loss: 4.73 | Total Steps: 44
---black cylinder---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 534/175267  | Episode Reward: 7.5  | Average Reward 6.51  | Actor loss: 0.01 | Critic loss: 2.71 | Entropy loss: -0.0005  | Total Loss: 2.72 | Total Steps: 47
---green prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 535/175267  | Episode Reward: 10.0  | Average Reward 6.54  | Actor loss: 0.01 | Critic loss: 1.71 

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 561/175267  | Episode Reward: 10.0  | Average Reward 6.40  | Actor loss: 0.01 | Critic loss: 1.05 | Entropy loss: -0.0000  | Total Loss: 1.06 | Total Steps: 6
---blue prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 562/175267  | Episode Reward: 10.0  | Average Reward 6.42  | Actor loss: 0.01 | Critic loss: 2.24 | Entropy loss: -0.0000  | Total Loss: 2.25 | Total Steps: 6
---green sphere---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 563/175267  | Episode Reward: 7.5  | Average Reward 6.42  | Actor loss: 0.06 | Critic loss: 2.82 | Entropy loss: -0.0005  | Total Loss: 2.88 | Total Steps: 30
---black capsule---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 564/175267  | Episode Reward: 10.0  | Average Reward 6.42  | Actor loss: 0.01 | Critic loss: 0.90 | Entropy loss: -0.0000  | Total Loss: 0.91 | Total St

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 589/175267  | Episode Reward: 10.0  | Average Reward 6.62  | Actor loss: 0.01 | Critic loss: 1.72 | Entropy loss: -0.0000  | Total Loss: 1.73 | Total Steps: 6
---green cylinder---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 590/175267  | Episode Reward: 7.5  | Average Reward 6.64  | Actor loss: -0.32 | Critic loss: 2.64 | Entropy loss: -0.0021  | Total Loss: 2.32 | Total Steps: 45
---blue cube---
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 591/175267  | Episode Reward: 2.5  | Average Reward 6.59  | Actor loss: -0.16 | Critic loss: 10.93 | Entropy loss: -0.0009  | Total Loss: 10.76 | Total Steps: 53
---green cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 592/175267  | Episode Reward: 10.0  | Average Reward 6.62  | Acto

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 618/175267  | Episode Reward: 10.0  | Average Reward 6.58  | Actor loss: 0.03 | Critic loss: 6.97 | Entropy loss: -0.0000  | Total Loss: 7.00 | Total Steps: 6
---red prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 619/175267  | Episode Reward: 10.0  | Average Reward 6.58  | Actor loss: 0.01 | Critic loss: 0.78 | Entropy loss: -0.0000  | Total Loss: 0.78 | Total Steps: 6
---green prism---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 620/175267  | Episode Reward: 7.5  | Average Reward 6.58  | Actor loss: 0.11 | Critic loss: 6.12 | Entropy loss: -0.0008  | Total Loss: 6.23 | Total Steps: 34
---yellow cube---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 621/175267  | Episode Reward: 7.5  | Average Reward 6.56  | Actor loss: -0.15 | Critic loss: 3.85 | Entropy loss: -0.0013  | Tota

Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 648/175267  | Episode Reward: 7.5  | Average Reward 7.03  | Actor loss: 0.15 | Critic loss: 4.52 | Entropy loss: -0.0009  | Total Loss: 4.68 | Total Steps: 29
---blue cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 649/175267  | Episode Reward: 10.0  | Average Reward 7.03  | Actor loss: 0.30 | Critic loss: 5.57 | Entropy loss: -0.0012  | Total Loss: 5.86 | Total Steps: 30
---red sphere---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 650/175267  | Episode Reward: 7.5  | Average Reward 7.05  | Actor loss: -0.18 | Critic loss: 4.63 | Entropy loss: -0.0011  | Total Loss: 4.45 | Total Steps: 30
---yellow cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 651/175267  | Episode Reward: 10.0  | Average Reward 7.11  | Actor loss: 0.01 | Critic loss: 2.03 | Entropy loss: -0.0000  | Tot

Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 678/175267  | Episode Reward: 7.5  | Average Reward 7.24  | Actor loss: 0.13 | Critic loss: 3.52 | Entropy loss: -0.0031  | Total Loss: 3.65 | Total Steps: 31
---green prism---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 679/175267  | Episode Reward: 7.5  | Average Reward 7.24  | Actor loss: 0.04 | Critic loss: 3.87 | Entropy loss: -0.0005  | Total Loss: 3.91 | Total Steps: 30
---black capsule---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 680/175267  | Episode Reward: 7.5  | Average Reward 7.31  | Actor loss: 0.00 | Critic loss: 0.77 | Entropy loss: -0.0004  | Total Loss: 0.78 | Total Steps: 38
---blue prism---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 681/175267  | Episode Reward: 7.5  | Average Reward 7.29  | Actor loss: 

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 708/175267  | Episode Reward: 10.0  | Average Reward 7.39  | Actor loss: 0.04 | Critic loss: 2.31 | Entropy loss: -0.0023  | Total Loss: 2.34 | Total Steps: 90
---blue capsule---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 709/175267  | Episode Reward: 10.0  | Average Reward 7.39  | Actor loss: -0.17 | Critic loss: 6.90 | Entropy loss: -0.0050  | Total Loss: 6.73 | Total Steps: 168
---green cylinder---
Step: 250
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 710/175267  | Episode Reward: 7.5  | Average Reward 7.42  | Actor loss: 0.11 | Critic loss: 3.46 | Entropy loss: -0.0017  | Total Loss: 3.58 | Total Steps: 308
---black prism---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 711/175267  | Episode Reward: 5.0  | Average Reward 7.45  | Actor loss: -0.28 | C

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 738/175267  | Episode Reward: 1.5  | Average Reward 7.57  | Actor loss: -0.38 | Critic loss: 6.46 | Entropy loss: -0.0037  | Total Loss: 6.07 | Total Steps: 50
---yellow cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 739/175267  | Episode Reward: 10.0  | Average Reward 7.59  | Actor loss: 0.05 | Critic loss: 2.71 | Entropy loss: -0.0020  | Total Loss: 2.76 | Total Steps: 47
---black cylinder---
Step: 250
Decision Step reward: -1.0
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 740/175267  | Episode Reward: 6.5  | Average Reward 7.63  | Actor loss: -0.14 | Critic loss: 2.76 | Entropy loss: -0.0064  | Total Loss: 2.61 | Total Steps: 488
---yellow sphere---
Decision Step reward: -2.5
Decision Step reward: -1.0
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 741/175267  | Episode Reward: 6.5  | Average Reward 7.

Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 766/175267  | Episode Reward: 7.5  | Average Reward 6.98  | Actor loss: 0.20 | Critic loss: 6.01 | Entropy loss: -0.0016  | Total Loss: 6.21 | Total Steps: 44
---yellow cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 767/175267  | Episode Reward: 10.0  | Average Reward 7.04  | Actor loss: 0.01 | Critic loss: 1.19 | Entropy loss: -0.0000  | Total Loss: 1.20 | Total Steps: 6
---yellow cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 768/175267  | Episode Reward: 10.0  | Average Reward 7.04  | Actor loss: 0.01 | Critic loss: 0.96 | Entropy loss: -0.0000  | Total Loss: 0.97 | Total Steps: 6
---yellow cylinder---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 769/175267  | Episode Reward: 5.0  | Average Reward 6.99  | Actor loss: -0.01 | Critic loss:

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 797/175267  | Episode Reward: 10.0  | Average Reward 7.01  | Actor loss: 0.02 | Critic loss: 3.06 | Entropy loss: -0.0001  | Total Loss: 3.08 | Total Steps: 31
---yellow cylinder---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 798/175267  | Episode Reward: 5.0  | Average Reward 6.99  | Actor loss: -0.21 | Critic loss: 6.03 | Entropy loss: -0.0051  | Total Loss: 5.81 | Total Steps: 190
---green capsule---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 799/175267  | Episode Reward: 10.0  | Average Reward 7.01  | Actor loss: -0.09 | Critic loss: 6.49 | Entropy loss: -0.0026  | Total Loss: 6.40 | Total Steps: 112
---yellow sphere---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 800/175267  | Episode Reward: 10.0  | Average Reward 7.04  | Actor loss: 0.01 | Critic loss: 0.24 | Entropy loss:

Training  | Episode: 825/175267  | Episode Reward: 5.0  | Average Reward 6.50  | Actor loss: -0.02 | Critic loss: 2.88 | Entropy loss: -0.0024  | Total Loss: 2.86 | Total Steps: 42
---black capsule---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 826/175267  | Episode Reward: 10.0  | Average Reward 6.55  | Actor loss: 0.02 | Critic loss: 2.70 | Entropy loss: -0.0012  | Total Loss: 2.71 | Total Steps: 35
---green sphere---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 827/175267  | Episode Reward: 10.0  | Average Reward 6.58  | Actor loss: 0.42 | Critic loss: 4.23 | Entropy loss: -0.0019  | Total Loss: 4.65 | Total Steps: 30
---yellow capsule---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 828/175267  | Episode Reward: 10.0  | Average Reward 6.60  | Actor loss: 0.01 | Critic loss: 0.39 | Entropy loss: -0.0001  | Total Loss: 0.40 | Total Steps: 6
---blue capsule---
Agent in terminal steps
Terminal Step reward:

Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 855/175267  | Episode Reward: 0.0  | Average Reward 7.19  | Actor loss: -0.19 | Critic loss: 12.67 | Entropy loss: -0.0048  | Total Loss: 12.47 | Total Steps: 182
---red cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 856/175267  | Episode Reward: 10.0  | Average Reward 7.19  | Actor loss: 0.01 | Critic loss: 2.21 | Entropy loss: -0.0000  | Total Loss: 2.21 | Total Steps: 6
---yellow sphere---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 857/175267  | Episode Reward: 7.5  | Average Reward 7.17  | Actor loss: 0.06 | Critic loss: 3.87 | Entropy loss: -0.0006  | Total Loss: 3.92 | Total Steps: 32
---blue capsule---
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 858/175267  | E

Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 884/175267  | Episode Reward: 2.5  | Average Reward 7.02  | Actor loss: -0.14 | Critic loss: 7.68 | Entropy loss: -0.0010  | Total Loss: 7.54 | Total Steps: 52
---blue cube---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 885/175267  | Episode Reward: 7.5  | Average Reward 7.00  | Actor loss: -0.21 | Critic loss: 3.62 | Entropy loss: -0.0022  | Total Loss: 3.40 | Total Steps: 64
---green capsule---
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 886/175267  | Episode Reward: -2.5  | Average Reward 6.87  | Actor loss: -0.07 | Critic loss: 11.43 | Entropy loss: -0.0024  | Total Loss: 11.37 | Total Steps: 101
---blue sphere---
Age

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 913/175267  | Episode Reward: 10.0  | Average Reward 7.04  | Actor loss: 0.49 | Critic loss: 3.40 | Entropy loss: -0.0029  | Total Loss: 3.89 | Total Steps: 32
---green prism---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 914/175267  | Episode Reward: 7.5  | Average Reward 7.01  | Actor loss: -0.04 | Critic loss: 4.58 | Entropy loss: -0.0010  | Total Loss: 4.54 | Total Steps: 30
---green sphere---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 915/175267  | Episode Reward: 10.0  | Average Reward 7.08  | Actor loss: 0.25 | Critic loss: 3.40 | Entropy loss: -0.0034  | Total Loss: 3.64 | Total Steps: 31
---red prism---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 916/175267  | Episode Reward: 7.5  | Average Reward 7.06  | Actor loss: -0.23 | Critic loss: 2.91 | Entropy loss: -0.0027  | 

Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 943/175267  | Episode Reward: -5.0  | Average Reward 7.09  | Actor loss: -1.03 | Critic loss: 25.05 | Entropy loss: -0.0038  | Total Loss: 24.02 | Total Steps: 85
---blue prism---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 944/175267  | Episode Reward: 7.5  | Average Reward 7.07  | Actor loss: 0.10 | Critic loss: 6.23 | Entropy loss: -0.0014  | Total Loss: 6.32 | Total Steps: 43
---blue capsule---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 945/175267  | Episode Reward: 7.5  | Average Reward 7.07  | Actor loss: 0.29 | Critic loss: 5.79 | Entropy loss: -0.0009  | Total Loss: 6.08 | Total Steps: 29
---black cylinder---
Agent in terminal steps
Termin

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 971/175267  | Episode Reward: 10.0  | Average Reward 6.58  | Actor loss: 0.00 | Critic loss: 1.24 | Entropy loss: -0.0000  | Total Loss: 1.24 | Total Steps: 6
---red sphere---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 972/175267  | Episode Reward: 5.0  | Average Reward 6.54  | Actor loss: -0.32 | Critic loss: 6.12 | Entropy loss: -0.0049  | Total Loss: 5.80 | Total Steps: 112
---yellow cube---
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 973/175267  | Episode Reward: 2.5  | Average Reward 6.49  | Actor loss: -0.15 | Critic loss: 7.33 | Entropy loss: -0.0009  | Total Loss: 7.18 | Total Steps: 53
---black prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 974/175267  | Episode Reward: 10.0  | Averag

Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1001/175267  | Episode Reward: -2.5  | Average Reward 6.99  | Actor loss: -0.59 | Critic loss: 16.65 | Entropy loss: -0.0045  | Total Loss: 16.06 | Total Steps: 91
---yellow cylinder---
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1002/175267  | Episode Reward: 2.5  | Average Reward 6.92  | Actor loss: -0.58 | Critic loss: 6.47 | Entropy loss: -0.0035  | Total Loss: 5.89 | Total Steps: 44
---black cube---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1003/175267  | Episode Reward: 7.5  | Average Reward 6.89  | Actor loss: 0.09 | Critic loss: 4.66 | Entropy loss: -0.0004  | Total Loss: 4.75 | Total Steps: 30
---red prism---
Decision Step reward: -2.5
D

Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1030/175267  | Episode Reward: 3.0  | Average Reward 7.06  | Actor loss: -0.56 | Critic loss: 5.73 | Entropy loss: -0.0073  | Total Loss: 5.15 | Total Steps: 143
---yellow cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1031/175267  | Episode Reward: 10.0  | Average Reward 7.06  | Actor loss: 0.01 | Critic loss: 2.92 | Entropy loss: -0.0000  | Total Loss: 2.93 | Total Steps: 6
---yellow capsule---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1032/175267  | Episode Reward: 7.5  | Average Reward 7.06  | Actor loss: 0.08 | Critic loss: 3.66 | Entropy loss: -0.0004  | Total Loss: 3.74 | Total Steps: 30
---black cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1033/175267  | Episode Reward: 10.0  | Average Reward 7.11  | Actor loss: 0.01 | Critic loss: 1.26 | Entropy loss: -0.00

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1058/175267  | Episode Reward: 5.0  | Average Reward 6.96  | Actor loss: -0.18 | Critic loss: 9.62 | Entropy loss: -0.0039  | Total Loss: 9.44 | Total Steps: 87
---yellow cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1059/175267  | Episode Reward: 10.0  | Average Reward 7.01  | Actor loss: 0.00 | Critic loss: 0.83 | Entropy loss: -0.0000  | Total Loss: 0.84 | Total Steps: 6
---blue sphere---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1060/175267  | Episode Reward: 7.5  | Average Reward 7.06  | Actor loss: 0.14 | Critic loss: 3.11 | Entropy loss: -0.0009  | Total Loss: 3.25 | Total Steps: 31
---red prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1061/175267  | Episode Reward: 10.0  | Average Reward 7.06  | Actor loss: 0.01 | Critic loss: 1.37 | Entropy loss: -0.0000  | Total Loss: 1.38 | Total S

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1086/175267  | Episode Reward: 10.0  | Average Reward 6.60  | Actor loss: 0.01 | Critic loss: 3.66 | Entropy loss: -0.0000  | Total Loss: 3.68 | Total Steps: 6
---blue prism---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1087/175267  | Episode Reward: 7.5  | Average Reward 6.60  | Actor loss: -0.05 | Critic loss: 1.06 | Entropy loss: -0.0011  | Total Loss: 1.01 | Total Steps: 68
---blue sphere---
Decision Step reward: -2.5
Decision Step reward: -1.0
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1088/175267  | Episode Reward: 6.5  | Average Reward 6.57  | Actor loss: -0.32 | Critic loss: 7.33 | Entropy loss: -0.0020  | Total Loss: 7.01 | Total Steps: 45
---red prism---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1089/175267  | Episode Reward: 5.0  | Averag

Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1116/175267  | Episode Reward: 5.0  | Average Reward 6.80  | Actor loss: -0.08 | Critic loss: 3.28 | Entropy loss: -0.0013  | Total Loss: 3.20 | Total Steps: 42
---red cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1117/175267  | Episode Reward: 10.0  | Average Reward 6.80  | Actor loss: 0.01 | Critic loss: 2.68 | Entropy loss: -0.0000  | Total Loss: 2.69 | Total Steps: 6
---yellow cube---
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1118/175267  | Episode Reward: 2.5  | Average Reward 6.72  | Actor loss: -0.02 | Critic loss: 9.36 | Entropy loss: -0.0022  | Total Loss: 9.34 | Total Steps: 50
---yellow cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1119/175267  | Episode Reward: 10.0  | Avera

Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1145/175267  | Episode Reward: 7.5  | Average Reward 6.46  | Actor loss: 0.02 | Critic loss: 6.71 | Entropy loss: -0.0001  | Total Loss: 6.72 | Total Steps: 34
---blue prism---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1146/175267  | Episode Reward: 5.0  | Average Reward 6.41  | Actor loss: -0.05 | Critic loss: 8.06 | Entropy loss: -0.0019  | Total Loss: 8.01 | Total Steps: 90
---black prism---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1147/175267  | Episode Reward: 5.0  | Average Reward 6.38  | Actor loss: -0.17 | Critic loss: 4.51 | Entropy loss: -0.0013  | Total Loss: 4.34 | Total Steps: 52
---blue prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1148/175267  | Episode Reward: 10.0  | Aver

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1175/175267  | Episode Reward: 10.0  | Average Reward 7.29  | Actor loss: 0.09 | Critic loss: 3.24 | Entropy loss: -0.0009  | Total Loss: 3.32 | Total Steps: 7
---green cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1176/175267  | Episode Reward: 10.0  | Average Reward 7.31  | Actor loss: 0.00 | Critic loss: 0.66 | Entropy loss: -0.0000  | Total Loss: 0.67 | Total Steps: 6
---red prism---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1177/175267  | Episode Reward: 5.0  | Average Reward 7.26  | Actor loss: -0.03 | Critic loss: 7.98 | Entropy loss: -0.0003  | Total Loss: 7.95 | Total Steps: 43
---green sphere---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1178/175267  | Episode Reward: 10.0  | Average Reward 7.26  | Actor loss: 0.02 | Critic loss: 0.37 | Entropy loss: -0.000

Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1204/175267  | Episode Reward: 5.0  | Average Reward 7.10  | Actor loss: -0.13 | Critic loss: 9.37 | Entropy loss: -0.0048  | Total Loss: 9.23 | Total Steps: 55
---black prism---
Decision Step reward: -1.0
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1205/175267  | Episode Reward: 4.0  | Average Reward 7.07  | Actor loss: -0.20 | Critic loss: 2.27 | Entropy loss: -0.0090  | Total Loss: 2.06 | Total Steps: 225
---blue prism---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1206/175267  | Episode Reward: 5.0  | Average Reward 7.01  | Actor loss: 0.58 | Critic loss: 7.84 | Entropy loss: -0.0036  | Total Loss: 8.42 | Total Steps: 54
---yellow capsule---
Agent in terminal steps
Terminal Step reward: 10.0
Train

Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1235/175267  | Episode Reward: 7.5  | Average Reward 7.49  | Actor loss: -0.12 | Critic loss: 6.74 | Entropy loss: -0.0007  | Total Loss: 6.62 | Total Steps: 52
---black cube---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1236/175267  | Episode Reward: 7.5  | Average Reward 7.49  | Actor loss: -0.01 | Critic loss: 6.19 | Entropy loss: -0.0006  | Total Loss: 6.18 | Total Steps: 43
---yellow sphere---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1237/175267  | Episode Reward: 7.5  | Average Reward 7.49  | Actor loss: 0.30 | Critic loss: 3.10 | Entropy loss: -0.0035  | Total Loss: 3.40 | Total Steps: 32
---red cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1238/175267  | Episode Reward: 10.0  | Average Reward 7.51  | Actor loss: 0.01 | Critic loss:

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1263/175267  | Episode Reward: 10.0  | Average Reward 7.33  | Actor loss: 0.02 | Critic loss: 0.89 | Entropy loss: -0.0000  | Total Loss: 0.91 | Total Steps: 6
---black cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1264/175267  | Episode Reward: 10.0  | Average Reward 7.43  | Actor loss: 0.06 | Critic loss: 5.66 | Entropy loss: -0.0003  | Total Loss: 5.72 | Total Steps: 31
---black cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1265/175267  | Episode Reward: 10.0  | Average Reward 7.43  | Actor loss: 0.12 | Critic loss: 4.08 | Entropy loss: -0.0020  | Total Loss: 4.20 | Total Steps: 32
---red cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1266/175267  | Episode Reward: 10.0  | Average Reward 7.46  | Actor loss: 0.02 | Critic loss: 1.78 | Entropy loss: -0.0002  | Total Loss: 1.80 | Total Steps: 38
---black prism--

Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1294/175267  | Episode Reward: 2.5  | Average Reward 7.66  | Actor loss: -0.33 | Critic loss: 6.73 | Entropy loss: -0.0042  | Total Loss: 6.40 | Total Steps: 59
---yellow capsule---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1295/175267  | Episode Reward: 7.5  | Average Reward 7.63  | Actor loss: 0.14 | Critic loss: 3.35 | Entropy loss: -0.0009  | Total Loss: 3.49 | Total Steps: 29
---black capsule---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1296/175267  | Episode Reward: 5.0  | Average Reward 7.58  | Actor loss: 0.05 | Critic loss: 7.88 | Entropy loss: -0.0014  | Total Loss: 7.92 | Total Steps: 43
---yellow cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1297/17526

Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1323/175267  | Episode Reward: 7.5  | Average Reward 7.41  | Actor loss: 0.18 | Critic loss: 2.46 | Entropy loss: -0.0020  | Total Loss: 2.64 | Total Steps: 40
---red sphere---
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1324/175267  | Episode Reward: 2.5  | Average Reward 7.33  | Actor loss: -0.13 | Critic loss: 4.39 | Entropy loss: -0.0010  | Total Loss: 4.27 | Total Steps: 43
---green sphere---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1325/175267  | Episode Reward: 10.0  | Average Reward 7.33  | Actor loss: 0.23 | Critic loss: 3.20 | Entropy loss: -0.0015  | Total Loss: 3.43 | Total Steps: 31
---blue sphere---
Decision Step reward: -1.0
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1326/175267  |

Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1353/175267  | Episode Reward: 2.5  | Average Reward 7.51  | Actor loss: -0.32 | Critic loss: 6.88 | Entropy loss: -0.0024  | Total Loss: 6.56 | Total Steps: 44
---blue prism---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1354/175267  | Episode Reward: 7.5  | Average Reward 7.49  | Actor loss: 0.26 | Critic loss: 7.03 | Entropy loss: -0.0009  | Total Loss: 7.29 | Total Steps: 30
---green sphere---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1355/175267  | Episode Reward: 5.0  | Average Reward 7.44  | Actor loss: -0.00 | Critic loss: 4.55 | Entropy loss: -0.0011  | Total Loss: 4.54 | Total Steps: 43
---red cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1356/175267  

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1381/175267  | Episode Reward: 10.0  | Average Reward 7.11  | Actor loss: 0.02 | Critic loss: 3.87 | Entropy loss: -0.0001  | Total Loss: 3.89 | Total Steps: 31
---blue cube---
Decision Step reward: -2.5
Decision Step reward: -1.0
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1382/175267  | Episode Reward: 4.0  | Average Reward 7.05  | Actor loss: -0.47 | Critic loss: 10.92 | Entropy loss: -0.0058  | Total Loss: 10.44 | Total Steps: 57
---blue cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1383/175267  | Episode Reward: 10.0  | Average Reward 7.05  | Actor loss: 0.05 | Critic loss: 4.58 | Entropy loss: -0.0002  | Total Loss: 4.62 | Total Steps: 29
---black cylinder---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1384/175267  | Episode Reward: 5.0  | A

Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1411/175267  | Episode Reward: 7.5  | Average Reward 7.06  | Actor loss: 0.20 | Critic loss: 5.34 | Entropy loss: -0.0010  | Total Loss: 5.54 | Total Steps: 30
---blue prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1412/175267  | Episode Reward: 10.0  | Average Reward 7.06  | Actor loss: 0.01 | Critic loss: 1.88 | Entropy loss: -0.0000  | Total Loss: 1.89 | Total Steps: 6
---yellow capsule---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1413/175267  | Episode Reward: 7.5  | Average Reward 7.04  | Actor loss: 0.13 | Critic loss: 7.35 | Entropy loss: -0.0010  | Total Loss: 7.48 | Total Steps: 29
---yellow cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1414/175267  | Episode Reward: 10.0  | Average Reward 7.04  | Actor loss: 0.12 | Critic loss: 4.25 | Entropy loss: -0.

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1439/175267  | Episode Reward: 10.0  | Average Reward 6.86  | Actor loss: 0.05 | Critic loss: 3.93 | Entropy loss: -0.0002  | Total Loss: 3.98 | Total Steps: 29
---black cube---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1440/175267  | Episode Reward: 5.0  | Average Reward 6.83  | Actor loss: -0.15 | Critic loss: 7.65 | Entropy loss: -0.0018  | Total Loss: 7.50 | Total Steps: 52
---yellow sphere---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1441/175267  | Episode Reward: 10.0  | Average Reward 6.86  | Actor loss: 0.02 | Critic loss: 0.94 | Entropy loss: -0.0006  | Total Loss: 0.96 | Total Steps: 7
---black capsule---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1442/175267  | Episode Reward: 10.0  | Average Reward 6.86  | Actor loss: 0.09 | Critic loss: 4.02 | Entropy loss: -0.0

Decision Step reward: -1.0
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1468/175267  | Episode Reward: 6.5  | Average Reward 6.92  | Actor loss: -0.31 | Critic loss: 3.80 | Entropy loss: -0.0030  | Total Loss: 3.49 | Total Steps: 55
---black capsule---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1469/175267  | Episode Reward: 10.0  | Average Reward 6.92  | Actor loss: 0.03 | Critic loss: 3.29 | Entropy loss: -0.0003  | Total Loss: 3.33 | Total Steps: 29
---green capsule---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1470/175267  | Episode Reward: 7.5  | Average Reward 6.95  | Actor loss: -0.56 | Critic loss: 3.02 | Entropy loss: -0.0031  | Total Loss: 2.46 | Total Steps: 48
---green sphere---
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1471/175267  | Episode Reward: 2.5 

Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1498/175267  | Episode Reward: 5.0  | Average Reward 7.12  | Actor loss: -0.20 | Critic loss: 5.08 | Entropy loss: -0.0026  | Total Loss: 4.88 | Total Steps: 52
---yellow cube---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1499/175267  | Episode Reward: 7.5  | Average Reward 7.14  | Actor loss: -0.15 | Critic loss: 5.84 | Entropy loss: -0.0012  | Total Loss: 5.70 | Total Steps: 53
---red sphere---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1500/175267  | Episode Reward: 10.0  | Average Reward 7.14  | Actor loss: 0.04 | Critic loss: 1.73 | Entropy loss: -0.0001  | Total Loss: 1.77 | Total Steps: 6
---red prism---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1501/175267  | Episode Reward: 5.0  | Averag

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1527/175267  | Episode Reward: 10.0  | Average Reward 7.15  | Actor loss: 0.01 | Critic loss: 0.71 | Entropy loss: -0.0000  | Total Loss: 0.72 | Total Steps: 6
---blue sphere---
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1528/175267  | Episode Reward: -5.0  | Average Reward 7.00  | Actor loss: -0.81 | Critic loss: 16.15 | Entropy loss: -0.0063  | Total Loss: 15.33 | Total Steps: 130
---yellow capsule---
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -1.0
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1529/175267  | Episode Reward: 4.0  | Average Reward 7.01  | Actor loss: -0.45 | Critic loss: 5.47 | Entropy loss: -0.0047  | Total Loss: 5.02 | Total Steps: 83
---black prism--

Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Decision Step reward: -2.5
Step: 250
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1555/175267  | Episode Reward: -7.5  | Average Reward 6.83  | Actor loss: -0.33 | Critic loss: 5.71 | Entropy loss: -0.0045  | Total Loss: 5.37 | Total Steps: 304
---blue capsule---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1556/175267  | Episode Reward: 5.0  | Average Reward 6.80  | Actor loss: -0.19 | Critic loss: 5.68 | Entropy loss: -0.0055  | Total Loss: 5.48 | Total Steps: 121
---red prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1557/175267  | Episode Reward: 10.0  | Average Reward 6.83  | Actor loss: 0.01 | Critic loss: 1.37 | Entropy loss: -0.0000  | Total Loss: 1.38 | Total Steps: 6
---green sphere---
Decision Step rewar

Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1584/175267  | Episode Reward: 5.0  | Average Reward 6.88  | Actor loss: 0.14 | Critic loss: 6.75 | Entropy loss: -0.0018  | Total Loss: 6.88 | Total Steps: 53
---yellow capsule---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1585/175267  | Episode Reward: 5.0  | Average Reward 6.85  | Actor loss: -0.32 | Critic loss: 8.03 | Entropy loss: -0.0025  | Total Loss: 7.70 | Total Steps: 73
---red cube---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1586/175267  | Episode Reward: 7.5  | Average Reward 6.85  | Actor loss: 0.06 | Critic loss: 7.23 | Entropy loss: -0.0006  | Total Loss: 7.30 | Total Steps: 39
---blue sphere---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1587/175267  | 

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1613/175267  | Episode Reward: 10.0  | Average Reward 6.84  | Actor loss: 0.01 | Critic loss: 0.21 | Entropy loss: -0.0000  | Total Loss: 0.22 | Total Steps: 6
---black cube---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1614/175267  | Episode Reward: 10.0  | Average Reward 6.84  | Actor loss: 0.03 | Critic loss: 4.34 | Entropy loss: -0.0002  | Total Loss: 4.37 | Total Steps: 31
---green prism---
Decision Step reward: -2.5
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1615/175267  | Episode Reward: 5.0  | Average Reward 6.84  | Actor loss: -0.27 | Critic loss: 5.89 | Entropy loss: -0.0024  | Total Loss: 5.62 | Total Steps: 49
---green prism---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1616/175267  | Episode Reward: 10.0  | Average Reward 6.87  | Actor loss: 0.01 | Critic loss: 2.10 | Entropy loss: -0.0000 

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1643/175267  | Episode Reward: 10.0  | Average Reward 7.21  | Actor loss: 0.61 | Critic loss: 2.73 | Entropy loss: -0.0042  | Total Loss: 3.34 | Total Steps: 32
---green cylinder---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1644/175267  | Episode Reward: 7.5  | Average Reward 7.18  | Actor loss: -0.00 | Critic loss: 0.98 | Entropy loss: -0.0006  | Total Loss: 0.98 | Total Steps: 38
---yellow sphere---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1645/175267  | Episode Reward: 10.0  | Average Reward 7.18  | Actor loss: 0.01 | Critic loss: 0.36 | Entropy loss: -0.0000  | Total Loss: 0.37 | Total Steps: 6
---black capsule---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1646/175267  | Episode Reward: 7.5  | Average Reward 7.18  | Actor loss: 0.31 | Critic loss: 4.69 | Entropy loss: -

Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1672/175267  | Episode Reward: 10.0  | Average Reward 7.16  | Actor loss: 0.07 | Critic loss: 3.18 | Entropy loss: -0.0008  | Total Loss: 3.25 | Total Steps: 30
---yellow cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1673/175267  | Episode Reward: 10.0  | Average Reward 7.21  | Actor loss: 0.04 | Critic loss: 2.86 | Entropy loss: -0.0010  | Total Loss: 2.90 | Total Steps: 43
---green capsule---
Decision Step reward: -2.5
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1674/175267  | Episode Reward: 7.5  | Average Reward 7.25  | Actor loss: 0.06 | Critic loss: 7.86 | Entropy loss: -0.0004  | Total Loss: 7.92 | Total Steps: 32
---green capsule---
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 1675/175267  | Episode Reward: 10.0  | Average Reward 7.28  | Actor loss: 0.01 | Critic loss: 1.59 | Entropy loss: -0.0000  | Total Loss: 1.6

In [9]:

        # Persist the per-episode training metrics collected by the training loop
        # as one JSON document (the file keeps the .txt extension used so far).
        data = {
                    'all_average_reward': all_average_reward,
                    'all_episode_reward': all_episode_reward,
                    'all_actor_loss': all_actor_loss,
                    'all_critic_loss': all_critic_loss,
                    'all_entropy_loss': all_entropy_loss,
                    'all_total_loss': all_total_loss,
                    'all_steps': all_steps,
                } 
        file_path = f'result/{ALG_NAME}_{ENV_ID}.txt'
        # open(..., 'w') does not create missing parent folders; make sure
        # 'result/' exists so the metrics are not lost to a FileNotFoundError.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'w') as file:
            json.dump(data, file)


# Training loop: transfer the matching parts of the pretrained model into the BERT-based agent, then continue training

In [10]:
#

# import warnings
# # Disable all warnings
# warnings.filterwarnings("ignore")

from transformers import logging
logging.set_verbosity_error()

import argparse
import time
import json
# import matplotlib.pyplot as plt
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from collections import OrderedDict
# add arguments in command --train/test
# parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
# parser.add_argument('--train', dest='train', action='store_true', default=False)
# parser.add_argument('--test', dest='test', action='store_true', default=True)
# args = parser.parse_args()
# Run mode: the training branch of the script below only executes when this is True.
train = True

# Compute device: CUDA when torch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Checkpoint to start from; handed to agent.load(S0_episode, S0_ALG_NAME, S0_ENV_ID).
S0_ALG_NAME, S0_ENV_ID, S0_episode = 'S2', '23', 24037

# Tag of the current run; used by agent.save(...) and the result/<ALG_NAME>_<ENV_ID>.txt file.
ALG_NAME, ENV_ID = 'S2', '26'

# Training schedule.
TRAIN_EPISODES = 175267     # total number of training episodes
MAX_STEPS = 500             # hard cap on environment steps within one episode
num_steps = 250             # longest rollout collected before a network update
max_step_reward = -10       # reward recorded when an episode hits MAX_STEPS

# Optimisation and control.
LAM = 0.95                  # discount applied when accumulating rewards into returns
lr = 2.5e-5                 # learning rate given to the optimizer
speed = 3                   # multiplier applied to the one-hot action sent to the env

if __name__ == '__main__':
    # ------------------------------------------------------------------
    # 1) Weight transfer: pretrained agent -> BERT-based agent
    # ------------------------------------------------------------------
    # Load the pretrained checkpoint into an Agent1 and snapshot its weights.
    # NOTE(review): the output recorded for this cell shows this load raising
    # RuntimeError (missing "language_module.embedding.*", unexpected
    # "bert_language_module.*" keys), i.e. the checkpoint selected by S0_* looks
    # like it was saved from the BERT-based agent, not from Agent1. Confirm that
    # S0_ALG_NAME / S0_ENV_ID / S0_episode point at an Agent1 checkpoint.
    agent = Agent1(num_words, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim, num_actions)
    agent.load(S0_episode, S0_ALG_NAME, S0_ENV_ID)
    pretrained_dict = agent.state_dict()

    # Fresh BERT-based agent; its own state dict defines which keys must exist.
    agent = Agent2(bert_output_dim, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim, num_actions)
    bert_dict = agent.state_dict()

    # For every parameter of the new agent take the pretrained tensor when a
    # tensor of the same name exists, otherwise keep the fresh initialisation.
    new_dict = OrderedDict(
        (k, pretrained_dict[k] if k in pretrained_dict else bert_dict[k])
        for k in bert_dict
    )

    agent.load_state_dict(new_dict)
    agent.to(device)

    optimizer = optim.RMSprop(agent.parameters(), lr=lr)
    best_score = float('-inf')
    # Maps the indices read from the environment's second observation to words:
    # 0-4 are shapes, 5-9 are colours (the colour index is offset by +5 below).
    hashmap = {
        0: 'capsule',
        1: 'cube',
        2: 'cylinder',
        3: 'prism',
        4: 'sphere',
        5: 'red',
        6: 'green',
        7: 'blue',
        8: 'yellow',
        9: 'black'}

    # ------------------------------------------------------------------
    # 2) Training
    # ------------------------------------------------------------------
    if train:
        all_episode_reward = []
        all_average_reward = []
        all_steps = []
        all_actor_loss = []
        all_critic_loss = []
        all_entropy_loss = []
        all_total_loss = []
        tracked_agent = -1

        # Build the caption encoder once instead of once per episode.
        # assumes BertEncoder() keeps no per-episode state — TODO confirm
        bert_encoder = BertEncoder()

        # Create the output folder up front so the final metrics dump cannot
        # fail with FileNotFoundError after the whole training run.
        file_path = f'result/{ALG_NAME}_{ENV_ID}.txt'
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        for episode in range(TRAIN_EPISODES):
            episode_reward = 0
            behavior_name = list(env.behavior_specs)[0]
            STEPS = 0
            decision_steps, terminal_steps = env.get_steps(behavior_name)

            # Episode state: visual observation, caption tokens, LSTM state.
            vt = torch.tensor(decision_steps.obs[0]).reshape(1, 3, 128, 128).to(device)
            index1 = int(decision_steps.obs[1][0][0])
            index2 = int(decision_steps.obs[1][0][1]) + 5
            input_caption = f"{hashmap[index2]} {hashmap[index1]}"
            print(f'---{input_caption}---')
            torch_token_ids, torch_attention_mask = bert_encoder(input_caption)
            torch_token_ids = torch_token_ids.to(device)
            torch_attention_mask = torch_attention_mask.to(device)

            lstm_hidden_state = (torch.zeros(1, lstm_hidden_dim).to(device), torch.zeros(1, lstm_hidden_dim).to(device))
            done = False
            while True:
                # Buffers for one rollout (at most num_steps steps); the network
                # is updated once per rollout.
                log_probs = []
                values = torch.empty(0).to(device)
                rewards = []
                # Reset per rollout so leftovers from earlier rollouts/episodes
                # do not leak into this rollout's entropy figure.
                entropy_term = 0

                for steps in range(num_steps):
                    # Cut the graph through the recurrent state between steps.
                    lstm_hidden_state = tuple(tensor.detach() for tensor in lstm_hidden_state)
                    policy_dist, value, lstm_hidden_state = agent(vt, torch_token_ids, torch_attention_mask, lstm_hidden_state)
                    STEPS += 1

                    # Sample an action from the (detached) policy distribution.
                    action_dist = Categorical(F.softmax(policy_dist.detach(), dim=1))
                    action = action_dist.sample()
                    action_onehot = F.one_hot(action, num_actions).cpu()

                    # log pi(a_t | s_t); log_softmax avoids log(0) = -inf.
                    log_prob = F.log_softmax(policy_dist, dim=1)[0][action]
                    # NOTE(review): this is the cross-entropy of the sampled action
                    # on detached logits, so the "entropy" term carries no gradient
                    # and only shifts the reported loss value.
                    entropy = F.cross_entropy(policy_dist.detach(), action)

                    discrete_actions = action_onehot.numpy().reshape(1, num_actions) * speed
                    action_tuple = ActionTuple()
                    action_tuple.add_discrete(discrete_actions)
                    env.set_actions(behavior_name, action_tuple)

                    env.step()
                    decision_steps, terminal_steps = env.get_steps(behavior_name)

                    if tracked_agent == -1 and len(decision_steps) >= 1:
                        tracked_agent = decision_steps.agent_id[0]

                    # Default when the tracked agent shows up in neither step set,
                    # instead of silently reusing the previous step's reward.
                    reward = 0.0
                    if tracked_agent in terminal_steps:  # roll over or hit the target
                        print('Agent in terminal steps')
                        done = True
                        reward = terminal_steps[tracked_agent].reward
                        if not reward > 0:
                            reward = -1  # roll over or other unseen conditions
                        print(f'Terminal Step reward: {reward}')
                    elif tracked_agent in decision_steps:  # the agent which requires action
                        reward = decision_steps[tracked_agent].reward
                        if reward < 0:
                            print(f'Decision Step reward: {reward}')

                    if STEPS >= MAX_STEPS:
                        # Episode ran out of steps: fixed penalty and forced reset.
                        reward = max_step_reward
                        print(f'Max Step Reward: {reward}')
                        env.reset()
                        done = True
                    if STEPS % num_steps == 0:
                        print(f'Step: {STEPS}')

                    episode_reward = episode_reward + reward

                    rewards.append(reward)
                    values = torch.cat((values, value), dim=0)
                    log_probs.append(log_prob)
                    entropy_term = entropy_term + entropy
                    vt = torch.tensor(decision_steps.obs[0]).reshape(1, 3, 128, 128).to(device)

                    if done:
                        break

                # Monte Carlo discounted returns, computed backwards over the rollout:
                # G_t = r_t + LAM * G_(t+1)   (no bootstrap from the critic).
                returns = np.zeros(len(rewards), dtype=np.float32)
                cumulative = 0
                for t in reversed(range(len(rewards))):
                    cumulative = rewards[t] + LAM * cumulative
                    returns[t] = cumulative
                discounted_rewards = torch.from_numpy(returns).to(device)

                # Flatten to 1-D so the products below are element-wise; stacked
                # log-probs are (T, 1) and would otherwise broadcast against a
                # (T,) advantage into a (T, T) matrix.
                # assumes the critic emits one scalar per step — TODO confirm
                log_probs = torch.stack(log_probs).reshape(-1)
                values = values.reshape(-1)

                # A_t = G_t - V(s_t)
                advantage = discounted_rewards - values
                # The advantage is a constant weight for the policy gradient, so it is
                # detached here; the critic learns only through critic_loss.
                actor_loss = (-log_probs * advantage.detach()).mean()
                critic_loss = 0.5 * torch.square(advantage).mean()
                entropy_term /= num_steps
                entropy_loss = -0.1 * entropy_term
                ac_loss = actor_loss + critic_loss + entropy_loss

                optimizer.zero_grad()
                ac_loss.backward()
                optimizer.step()

                if done:
                    break

            # Per-episode bookkeeping (losses are those of the episode's last update).
            all_episode_reward.append(float(episode_reward))
            all_steps.append(STEPS)
            all_actor_loss.append(float(actor_loss))
            all_critic_loss.append(float(critic_loss))
            all_entropy_loss.append(float(entropy_loss))
            all_total_loss.append(float(ac_loss))
            if episode >= 100:
                # Running mean over the last 100 episodes drives best-model saving.
                avg_score = np.mean(all_episode_reward[-100:])
                all_average_reward.append(avg_score)
                if avg_score > best_score:
                    best_score = avg_score
                    agent.save(episode, ALG_NAME, ENV_ID)
                    print(f'-----The best score for averaging previous 100 episode reward is {best_score}. Model has been saved-----')
                print('Training  | Episode: {}/{}  | Episode Reward: {:.1f}  | Average Reward {:.2f}  | Actor loss: {:.2f} | Critic loss: {:.2f} | Entropy loss: {:.4f}  | Total Loss: {:.2f} | Total Steps: {}' \
                    .format(episode + 1, TRAIN_EPISODES, episode_reward, avg_score, actor_loss, critic_loss, entropy_loss, ac_loss, STEPS))
            else:
                print('Training  | Episode: {}/{}  | Episode Reward: {:.1f}  | Actor loss: {:.2f} | Critic loss: {:.2f} | Entropy loss: {:.4f}  | Total Loss: {:.2f} | Total Steps: {}' \
                    .format(episode + 1, TRAIN_EPISODES, episode_reward, actor_loss, critic_loss, entropy_loss, ac_loss, STEPS))
            # Periodic checkpoint, independent of the best-score checkpoint.
            if episode % 5000 == 0:
                agent.save(episode, ALG_NAME, ENV_ID)
                print("Model has been saved")

        print(all_average_reward)
        agent.save(episode, ALG_NAME, ENV_ID)
        print("Model has been saved")

        # Persist all collected metrics as one JSON document.
        data = {
            'all_average_reward': all_average_reward,
            'all_episode_reward': all_episode_reward,
            'all_actor_loss': all_actor_loss,
            'all_critic_loss': all_critic_loss,
            'all_entropy_loss': all_entropy_loss,
            'all_total_loss': all_total_loss,
            'all_steps': all_steps,
        }
        with open(file_path, 'w') as file:
            json.dump(data, file)



cuda


RuntimeError: Error(s) in loading state_dict for Agent1:
	Missing key(s) in state_dict: "language_module.embedding.weight", "language_module.embedding.bias". 
	Unexpected key(s) in state_dict: "bert_language_module.bert_model.embeddings.position_ids", "bert_language_module.bert_model.embeddings.word_embeddings.weight", "bert_language_module.bert_model.embeddings.position_embeddings.weight", "bert_language_module.bert_model.embeddings.token_type_embeddings.weight", "bert_language_module.bert_model.embeddings.LayerNorm.weight", "bert_language_module.bert_model.embeddings.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.0.attention.self.query.weight", "bert_language_module.bert_model.encoder.layer.0.attention.self.query.bias", "bert_language_module.bert_model.encoder.layer.0.attention.self.key.weight", "bert_language_module.bert_model.encoder.layer.0.attention.self.key.bias", "bert_language_module.bert_model.encoder.layer.0.attention.self.value.weight", "bert_language_module.bert_model.encoder.layer.0.attention.self.value.bias", "bert_language_module.bert_model.encoder.layer.0.attention.output.dense.weight", "bert_language_module.bert_model.encoder.layer.0.attention.output.dense.bias", "bert_language_module.bert_model.encoder.layer.0.attention.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.0.attention.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.0.intermediate.dense.weight", "bert_language_module.bert_model.encoder.layer.0.intermediate.dense.bias", "bert_language_module.bert_model.encoder.layer.0.output.dense.weight", "bert_language_module.bert_model.encoder.layer.0.output.dense.bias", "bert_language_module.bert_model.encoder.layer.0.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.0.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.1.attention.self.query.weight", "bert_language_module.bert_model.encoder.layer.1.attention.self.query.bias", "bert_language_module.bert_model.encoder.layer.1.attention.self.key.weight", "bert_language_module.bert_model.encoder.layer.1.attention.self.key.bias", 
"bert_language_module.bert_model.encoder.layer.1.attention.self.value.weight", "bert_language_module.bert_model.encoder.layer.1.attention.self.value.bias", "bert_language_module.bert_model.encoder.layer.1.attention.output.dense.weight", "bert_language_module.bert_model.encoder.layer.1.attention.output.dense.bias", "bert_language_module.bert_model.encoder.layer.1.attention.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.1.attention.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.1.intermediate.dense.weight", "bert_language_module.bert_model.encoder.layer.1.intermediate.dense.bias", "bert_language_module.bert_model.encoder.layer.1.output.dense.weight", "bert_language_module.bert_model.encoder.layer.1.output.dense.bias", "bert_language_module.bert_model.encoder.layer.1.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.1.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.2.attention.self.query.weight", "bert_language_module.bert_model.encoder.layer.2.attention.self.query.bias", "bert_language_module.bert_model.encoder.layer.2.attention.self.key.weight", "bert_language_module.bert_model.encoder.layer.2.attention.self.key.bias", "bert_language_module.bert_model.encoder.layer.2.attention.self.value.weight", "bert_language_module.bert_model.encoder.layer.2.attention.self.value.bias", "bert_language_module.bert_model.encoder.layer.2.attention.output.dense.weight", "bert_language_module.bert_model.encoder.layer.2.attention.output.dense.bias", "bert_language_module.bert_model.encoder.layer.2.attention.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.2.attention.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.2.intermediate.dense.weight", "bert_language_module.bert_model.encoder.layer.2.intermediate.dense.bias", "bert_language_module.bert_model.encoder.layer.2.output.dense.weight", 
"bert_language_module.bert_model.encoder.layer.2.output.dense.bias", "bert_language_module.bert_model.encoder.layer.2.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.2.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.3.attention.self.query.weight", "bert_language_module.bert_model.encoder.layer.3.attention.self.query.bias", "bert_language_module.bert_model.encoder.layer.3.attention.self.key.weight", "bert_language_module.bert_model.encoder.layer.3.attention.self.key.bias", "bert_language_module.bert_model.encoder.layer.3.attention.self.value.weight", "bert_language_module.bert_model.encoder.layer.3.attention.self.value.bias", "bert_language_module.bert_model.encoder.layer.3.attention.output.dense.weight", "bert_language_module.bert_model.encoder.layer.3.attention.output.dense.bias", "bert_language_module.bert_model.encoder.layer.3.attention.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.3.attention.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.3.intermediate.dense.weight", "bert_language_module.bert_model.encoder.layer.3.intermediate.dense.bias", "bert_language_module.bert_model.encoder.layer.3.output.dense.weight", "bert_language_module.bert_model.encoder.layer.3.output.dense.bias", "bert_language_module.bert_model.encoder.layer.3.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.3.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.4.attention.self.query.weight", "bert_language_module.bert_model.encoder.layer.4.attention.self.query.bias", "bert_language_module.bert_model.encoder.layer.4.attention.self.key.weight", "bert_language_module.bert_model.encoder.layer.4.attention.self.key.bias", "bert_language_module.bert_model.encoder.layer.4.attention.self.value.weight", "bert_language_module.bert_model.encoder.layer.4.attention.self.value.bias", "bert_language_module.bert_model.encoder.layer.4.attention.output.dense.weight", 
"bert_language_module.bert_model.encoder.layer.4.attention.output.dense.bias", "bert_language_module.bert_model.encoder.layer.4.attention.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.4.attention.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.4.intermediate.dense.weight", "bert_language_module.bert_model.encoder.layer.4.intermediate.dense.bias", "bert_language_module.bert_model.encoder.layer.4.output.dense.weight", "bert_language_module.bert_model.encoder.layer.4.output.dense.bias", "bert_language_module.bert_model.encoder.layer.4.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.4.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.5.attention.self.query.weight", "bert_language_module.bert_model.encoder.layer.5.attention.self.query.bias", "bert_language_module.bert_model.encoder.layer.5.attention.self.key.weight", "bert_language_module.bert_model.encoder.layer.5.attention.self.key.bias", "bert_language_module.bert_model.encoder.layer.5.attention.self.value.weight", "bert_language_module.bert_model.encoder.layer.5.attention.self.value.bias", "bert_language_module.bert_model.encoder.layer.5.attention.output.dense.weight", "bert_language_module.bert_model.encoder.layer.5.attention.output.dense.bias", "bert_language_module.bert_model.encoder.layer.5.attention.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.5.attention.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.5.intermediate.dense.weight", "bert_language_module.bert_model.encoder.layer.5.intermediate.dense.bias", "bert_language_module.bert_model.encoder.layer.5.output.dense.weight", "bert_language_module.bert_model.encoder.layer.5.output.dense.bias", "bert_language_module.bert_model.encoder.layer.5.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.5.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.6.attention.self.query.weight", 
"bert_language_module.bert_model.encoder.layer.6.attention.self.query.bias", "bert_language_module.bert_model.encoder.layer.6.attention.self.key.weight", "bert_language_module.bert_model.encoder.layer.6.attention.self.key.bias", "bert_language_module.bert_model.encoder.layer.6.attention.self.value.weight", "bert_language_module.bert_model.encoder.layer.6.attention.self.value.bias", "bert_language_module.bert_model.encoder.layer.6.attention.output.dense.weight", "bert_language_module.bert_model.encoder.layer.6.attention.output.dense.bias", "bert_language_module.bert_model.encoder.layer.6.attention.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.6.attention.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.6.intermediate.dense.weight", "bert_language_module.bert_model.encoder.layer.6.intermediate.dense.bias", "bert_language_module.bert_model.encoder.layer.6.output.dense.weight", "bert_language_module.bert_model.encoder.layer.6.output.dense.bias", "bert_language_module.bert_model.encoder.layer.6.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.6.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.7.attention.self.query.weight", "bert_language_module.bert_model.encoder.layer.7.attention.self.query.bias", "bert_language_module.bert_model.encoder.layer.7.attention.self.key.weight", "bert_language_module.bert_model.encoder.layer.7.attention.self.key.bias", "bert_language_module.bert_model.encoder.layer.7.attention.self.value.weight", "bert_language_module.bert_model.encoder.layer.7.attention.self.value.bias", "bert_language_module.bert_model.encoder.layer.7.attention.output.dense.weight", "bert_language_module.bert_model.encoder.layer.7.attention.output.dense.bias", "bert_language_module.bert_model.encoder.layer.7.attention.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.7.attention.output.LayerNorm.bias", 
"bert_language_module.bert_model.encoder.layer.7.intermediate.dense.weight", "bert_language_module.bert_model.encoder.layer.7.intermediate.dense.bias", "bert_language_module.bert_model.encoder.layer.7.output.dense.weight", "bert_language_module.bert_model.encoder.layer.7.output.dense.bias", "bert_language_module.bert_model.encoder.layer.7.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.7.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.8.attention.self.query.weight", "bert_language_module.bert_model.encoder.layer.8.attention.self.query.bias", "bert_language_module.bert_model.encoder.layer.8.attention.self.key.weight", "bert_language_module.bert_model.encoder.layer.8.attention.self.key.bias", "bert_language_module.bert_model.encoder.layer.8.attention.self.value.weight", "bert_language_module.bert_model.encoder.layer.8.attention.self.value.bias", "bert_language_module.bert_model.encoder.layer.8.attention.output.dense.weight", "bert_language_module.bert_model.encoder.layer.8.attention.output.dense.bias", "bert_language_module.bert_model.encoder.layer.8.attention.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.8.attention.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.8.intermediate.dense.weight", "bert_language_module.bert_model.encoder.layer.8.intermediate.dense.bias", "bert_language_module.bert_model.encoder.layer.8.output.dense.weight", "bert_language_module.bert_model.encoder.layer.8.output.dense.bias", "bert_language_module.bert_model.encoder.layer.8.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.8.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.9.attention.self.query.weight", "bert_language_module.bert_model.encoder.layer.9.attention.self.query.bias", "bert_language_module.bert_model.encoder.layer.9.attention.self.key.weight", "bert_language_module.bert_model.encoder.layer.9.attention.self.key.bias", 
"bert_language_module.bert_model.encoder.layer.9.attention.self.value.weight", "bert_language_module.bert_model.encoder.layer.9.attention.self.value.bias", "bert_language_module.bert_model.encoder.layer.9.attention.output.dense.weight", "bert_language_module.bert_model.encoder.layer.9.attention.output.dense.bias", "bert_language_module.bert_model.encoder.layer.9.attention.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.9.attention.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.9.intermediate.dense.weight", "bert_language_module.bert_model.encoder.layer.9.intermediate.dense.bias", "bert_language_module.bert_model.encoder.layer.9.output.dense.weight", "bert_language_module.bert_model.encoder.layer.9.output.dense.bias", "bert_language_module.bert_model.encoder.layer.9.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.9.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.10.attention.self.query.weight", "bert_language_module.bert_model.encoder.layer.10.attention.self.query.bias", "bert_language_module.bert_model.encoder.layer.10.attention.self.key.weight", "bert_language_module.bert_model.encoder.layer.10.attention.self.key.bias", "bert_language_module.bert_model.encoder.layer.10.attention.self.value.weight", "bert_language_module.bert_model.encoder.layer.10.attention.self.value.bias", "bert_language_module.bert_model.encoder.layer.10.attention.output.dense.weight", "bert_language_module.bert_model.encoder.layer.10.attention.output.dense.bias", "bert_language_module.bert_model.encoder.layer.10.attention.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.10.attention.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.10.intermediate.dense.weight", "bert_language_module.bert_model.encoder.layer.10.intermediate.dense.bias", "bert_language_module.bert_model.encoder.layer.10.output.dense.weight", 
"bert_language_module.bert_model.encoder.layer.10.output.dense.bias", "bert_language_module.bert_model.encoder.layer.10.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.10.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.11.attention.self.query.weight", "bert_language_module.bert_model.encoder.layer.11.attention.self.query.bias", "bert_language_module.bert_model.encoder.layer.11.attention.self.key.weight", "bert_language_module.bert_model.encoder.layer.11.attention.self.key.bias", "bert_language_module.bert_model.encoder.layer.11.attention.self.value.weight", "bert_language_module.bert_model.encoder.layer.11.attention.self.value.bias", "bert_language_module.bert_model.encoder.layer.11.attention.output.dense.weight", "bert_language_module.bert_model.encoder.layer.11.attention.output.dense.bias", "bert_language_module.bert_model.encoder.layer.11.attention.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.11.attention.output.LayerNorm.bias", "bert_language_module.bert_model.encoder.layer.11.intermediate.dense.weight", "bert_language_module.bert_model.encoder.layer.11.intermediate.dense.bias", "bert_language_module.bert_model.encoder.layer.11.output.dense.weight", "bert_language_module.bert_model.encoder.layer.11.output.dense.bias", "bert_language_module.bert_model.encoder.layer.11.output.LayerNorm.weight", "bert_language_module.bert_model.encoder.layer.11.output.LayerNorm.bias", "bert_language_module.bert_model.pooler.dense.weight", "bert_language_module.bert_model.pooler.dense.bias", "bert_language_module.embedding.weight", "bert_language_module.embedding.bias". 