In [3]:
import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
import numpy as np
from mlagents_envs.environment import ActionTuple


In [4]:
# env =  UE(file_name="stage0_160523\stage0_copy",seed=1,side_channels=[])
env =  UE(file_name="3\\build",seed=1,side_channels=[],no_graphics = False)
env.reset()
behavior_names = list(env.behavior_specs.keys())
behavior_value = list(env.behavior_specs.values())
# for i in range(len(behavior_names)):
#     print(behavior_names[i])
#     print("obs:",behavior_value[i].observation_specs, "   act:", behavior_value[0].action_spec)
DecisionSteps, TerminalSteps = env.get_steps(behavior_names[0])
agentsNum = len(DecisionSteps.agent_id)
# print("exist:",DecisionSteps.agent_id,"   Dead:",TerminalSteps.agent_id)
# print("reward:",DecisionSteps.reward,"reward_dead:",TerminalSteps.reward)
# print("obs:",DecisionSteps.obs,"DeadObs:",TerminalSteps.obs)
# print("interrupted:", TerminalSteps.interrupted)


In [5]:
import torch
import torch.nn as nn
#(3,128,128) --> (64,7,7)
image = torch.tensor(DecisionSteps.obs[0]).reshape(1,3,128,128)
image.size()

torch.Size([1, 3, 128, 128])

In [6]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

vision_output_dim = 3136
num_words = 35  # Number of unique words in the vocabulary
language_output_dim = 128
embedding_dim = 128
mixing_dim = 256
lstm_hidden_dim = 256
num_actions = 4

# (3,128,128) --> (64,7,7) = 3136 (3-layer CNN)
class VisualModule(nn.Module): 
    def __init__(self):
        super(VisualModule, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=3, padding=0),
            nn.ReLU()
        )
        # self.conv = nn.Sequential(
        #     nn.Conv2d(3, 32, kernel_size=5, stride=2, padding=2),
        #     nn.ReLU(),
        #     nn.Conv2d(32, 64, kernel_size=5, stride=2, padding=2),
        #     nn.ReLU(),
        #     nn.Conv2d(64, 128, kernel_size=5, stride=2, padding=2),
        #     nn.ReLU(),
        #     nn.Conv2d(128, 64, kernel_size=5, stride=2, padding=1),
        #     nn.ReLU(),
        #     nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
        #     nn.ReLU(),
        #     nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
        #     nn.ReLU(),
        # )

    def forward(self, vt):
        encoded_vt = self.conv(vt)
        return encoded_vt.view(vt.size(0), -1).squeeze()

# one-hot encoding [0 0 1 0 0] --> 128 dimensional embedding (FF)
# S1:5 S2:5 S3:11 S4:9 --> 30 + 5 (noun) = 35 in total
class LanguageModule(nn.Module): 
    def __init__(self, num_words, embedding_dim):
        super(LanguageModule, self).__init__()
        self.embedding = nn.Linear(num_words, embedding_dim)

    def forward(self, lt):
        embedded_lt = self.embedding(lt)
        return embedded_lt

# 3136(vision) + 128 (language) --> 256 dimensional embedding (FF)
class MixingModule(nn.Module):
    def __init__(self, vision_output_dim, language_output_dim, mixing_dim):
        super(MixingModule, self).__init__()
        self.linear = nn.Linear(vision_output_dim + language_output_dim, mixing_dim)

    def forward(self, vision_output, language_output):
        combined_output = torch.cat((vision_output, language_output), dim=0)
        mixed_output = self.linear(combined_output)
        return mixed_output

class LSTMModule(nn.Module):
    def __init__(self,mixing_dim,lstm_hidden_dim):
        super(LSTMModule, self).__init__()
        self.lstm = nn.LSTMCell(mixing_dim, lstm_hidden_dim)
    
    def forward(self,mixed_output,lstm_hidden_state):
        lstm_hidden_state = self.lstm(mixed_output, lstm_hidden_state) 
        # lstm_output = lstm_hidden_state[0] # output is (hidden_state,cell_state), we need hidden state, shape (1,256)
        return lstm_hidden_state

class Agent(nn.Module):
    def __init__(self, num_words, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim,num_actions):
        super(Agent, self).__init__()
        self.language_module = LanguageModule(num_words, embedding_dim)
        self.visual_module = VisualModule()
        self.mixing_module = MixingModule(vision_output_dim, language_output_dim, mixing_dim)
        self.lstm_module = LSTMModule(mixing_dim, lstm_hidden_dim)
        self.action_predictor = nn.Linear(lstm_hidden_dim, num_actions)
        self.value_estimator = nn.Linear(lstm_hidden_dim, 1)

    def forward(self, vt, lt, lstm_hidden_state):
        vision_output = self.visual_module(vt)
        language_output = self.language_module(lt)
        mixed_output = self.mixing_module(vision_output, language_output).unsqueeze(0)
        lstm_output = self.lstm_module(mixed_output,lstm_hidden_state)
        action_probs = self.action_predictor(lstm_output[0]) 
        value_estimate = self.value_estimator(lstm_output[0])
        return action_probs,value_estimate,lstm_output
        
        
    def save(self, episode):
        ALG_NAME = 'A2C'
        ENV_ID = 'S0'
        path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
        if not os.path.exists(path):
            os.makedirs(path)
        torch.save(self.state_dict(), os.path.join(path, f'agent_{episode}.pt'))

    def load(self, episode):
        ALG_NAME = 'A2C'
        ENV_ID = 'S0'
        path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
        self.load_state_dict(torch.load(os.path.join(path, f'agent_{episode}.pt')))    

        


In [7]:
visual_model = VisualModule()
vt = image
image_emb = visual_model(vt)
print(f'The shape of image embedding is {image_emb.size()}')

index = 5
language_model = LanguageModule(num_words,embedding_dim)
lt = torch.eye(num_words)[:, index]
language_emb = language_model(lt)
print(f'The shape of language embedding is {language_emb.size()}')

mixing_model = MixingModule(vision_output_dim,language_output_dim,mixing_dim)
mix_emb = mixing_model(image_emb,language_emb)
print(f'The shape of mix embedding is {mix_emb.size()}')

lstm = LSTMModule(mixing_dim,lstm_hidden_dim)
lstm_hidden_state = (torch.zeros(1, lstm_hidden_dim), torch.zeros(1, lstm_hidden_dim))
hidden_state = lstm(mix_emb.unsqueeze(0),lstm_hidden_state)
print(f'The shape of lstm hidden state is {hidden_state[0].size()}')

agent = Agent(num_words, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim,num_actions)
action, value, lstm_hidden_state= agent(vt,lt,lstm_hidden_state)


The shape of image embedding is torch.Size([3136])
The shape of language embedding is torch.Size([128])
The shape of mix embedding is torch.Size([256])
The shape of lstm hidden state is torch.Size([1, 256])


In [8]:
# import numpy as np
# from mlagents_envs.environment import ActionTuple

# behavior_name=list(env.behavior_specs)[0]
# spec=env.behavior_specs[behavior_name]
# env.reset()

# while True:
#     agent = Agent(num_words, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim,num_actions)
#     vt = torch.tensor(DecisionSteps.obs).reshape(1,3,128,128)
#     index = 5 #e.g
#     lt = torch.eye(num_words)[:, index]
#     lstm_hidden_state = (torch.zeros(1, lstm_hidden_dim), torch.zeros(1, lstm_hidden_dim))
#     action, value,_ = agent(vt,lt,lstm_hidden_state)
#     action = F.one_hot(torch.argmax(action),num_actions)
#     print(action)

#     continuous_actions = np.empty((1, 0))
#     discrete_actions = np.array(action).reshape(1,4) #[forward, backward, right, left]
#     action_tuple = ActionTuple(continuous_actions,discrete_actions)
#     env.set_actions(behavior_name,action_tuple)
#     env.step()
#     decision_steps, _ = env.get_steps(behavior_name)
#     tracked_agent = decision_steps.agent_id[0]
#     reward = decision_steps[tracked_agent].reward
#     print(reward)
  

In [9]:
# import numpy as np
# from mlagents_envs.environment import ActionTuple

# behavior_name=list(env.behavior_specs)[0]
# spec=env.behavior_specs[behavior_name]
# env.reset()


# while True:
#     agent = Agent(num_words, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim,num_actions)
#     vt = torch.tensor(DecisionSteps.obs).reshape(1,3,128,128)
#     index = 5 #e.g
#     lt = torch.eye(num_words)[:, index]
#     lstm_hidden_state = (torch.zeros(1, lstm_hidden_dim), torch.zeros(1, lstm_hidden_dim))
#     # action, value, lstm_output = agent(vt,lt,lstm_hidden_state)
#     # action = F.one_hot(torch.argmax(action),num_actions)
#     action = torch.tensor([1,0,1,0])
#     print(action)

#     continuous_actions = np.empty((1, 0))
#     discrete_actions = np.array(action).reshape(1,4) #[forward, backward, right, left]
#     action_tuple = ActionTuple(continuous_actions,discrete_actions)
#     env.set_actions(behavior_name,action_tuple)
#     env.step()

In [10]:
import argparse
import time
import matplotlib.pyplot as plt
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
# add arguments in command --train/test
# parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
# parser.add_argument('--train', dest='train', action='store_true', default=False)
# parser.add_argument('--test', dest='test', action='store_true', default=True)
# args = parser.parse_args()
train = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

ALG_NAME = 'A2C'
ENV_ID = 'S0'
TRAIN_EPISODES = 200000  # number of overall episodes for training
TEST_EPISODES = 10  # number of overall episodes for testing
MAX_STEPS = 500  # maximum time step in one episode
LAM = 0.95  # reward discount in TD error
lr = 0.001
speed = 3
num_steps = 100 # the step for updating the network

if __name__ == '__main__':
    agent = Agent(num_words, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim,num_actions)
    agent.to(device)
    optimizer = optim.RMSprop(agent.parameters(), lr=lr)
    best_score = 0
    if train:
        all_lengths = []
        average_lengths = []
        all_rewards = []
        entropy_term = 0
        all_episode_reward = []
        tracked_agent = -1
        for episode in range(TRAIN_EPISODES):
            t0 = time.time()
            episode_reward = 0
            # env.reset()
            behavior_name=list(env.behavior_specs)[0]
            spec=env.behavior_specs[behavior_name]
            # state = env.reset().astype(np.float32)
            STEPS = 0

            # state -- vt, lt, lstm
            vt = torch.tensor(DecisionSteps.obs[0]).reshape(1,3,128,128).to(device)
            index = 5 #e.g
            lt = torch.eye(num_words)[:, index].to(device)
            lstm_hidden_state = (torch.zeros(1, lstm_hidden_dim).to(device), torch.zeros(1, lstm_hidden_dim).to(device))
            done = False
            while True:

                # Need to use when calculating the loss
                log_probs = []
                values = []
                rewards = []

                for steps in range(num_steps):
                    lstm_hidden_state = tuple(tensor.detach() for tensor in lstm_hidden_state)
                    STEPS += 1
                    policy_dist, value, lstm_hidden_state = agent(vt,lt,lstm_hidden_state)
                    value = value.detach()
                    dist = F.softmax(policy_dist.detach(),dim=1).cpu().numpy()
                    

                    action_dist = Categorical(F.softmax(policy_dist.detach(),dim=1))
                    # action_dist = Categorical(F.softmax(policy_dist,dim=1))
                    action = action_dist.sample() # sample an action from action_dist
                    action_onehot = F.one_hot(torch.tensor(action),num_actions).cpu()
                    
                    log_prob = torch.log(F.softmax(policy_dist,dim=1)[0][action])
                    # log_prob = torch.log(F.softmax(policy_dist,dim=1)[0][action])
                    # entropy = -np.sum(np.mean(dist)* np.log(dist))
                    entropy = F.cross_entropy(policy_dist.detach(), action)

                    discrete_actions = np.array(action_onehot).reshape(1,4)*speed
                    action_tuple = ActionTuple()
                    action_tuple.add_discrete(discrete_actions)
                    env.set_actions(behavior_name,action_tuple)
                    env.step()
                    decision_steps, terminal_steps = env.get_steps(behavior_name)

                    if tracked_agent == -1 and len(decision_steps) >= 1:
                        tracked_agent = decision_steps.agent_id[0]
                        # print(tracked_agent)

                    if tracked_agent in terminal_steps: # roll over or hit the target
                        print('Agent in terminal steps')
                        done = True
                        reward = terminal_steps[tracked_agent].reward
                        if reward > 0:
                            pass
                        else: reward = -1 # roll over or other unseen conditions

                        print(f'Terminal Step reward: {reward}')

                    elif tracked_agent in decision_steps: # the agent which requires action
                        reward = decision_steps[tracked_agent].reward
                        # print(f'Decision Step reward: {reward}')
                        if reward<0:
                            print(f'Decision Step reward: {reward}')
                    if STEPS >= MAX_STEPS:
                        reward = -10
                        print(f'Max Step Reward: {reward}')
                        env.reset()
                        done = True
                    if STEPS % 100 == 0:
                        print (f'Step: {STEPS}')

                    episode_reward = episode_reward + reward

                    rewards.append(reward)
                    values.append(value)
                    log_probs.append(log_prob)
                    entropy_term = entropy_term + entropy
                    vt_new = torch.tensor(decision_steps.obs[0]).reshape(1,3,128,128).to(device)
                    vt = vt_new

                    if done or steps == num_steps-1:
                        _, Qval,_ = agent(vt_new,lt,lstm_hidden_state)
                        Qval = Qval.detach()
                        all_rewards.append(np.sum(rewards))
                        all_lengths.append(steps)
                        average_lengths.append(np.mean(all_lengths[-10:]))
                        break
                
                Qvals = np.zeros_like(values)

                for t in reversed(range(len(rewards))):
                    Qval = rewards[t] + LAM * Qval # Monte Carlo
                    Qvals[t] = Qval

                # Advantage Actor Critic
                # Qvals[-1] = rewards[t] + LAM * Qval      or       Qvals[-1] = rewards[t]                   
                # for t in range(len(rewards)-1):
                #         Qvals[t] = rewards[t] + LAM * values[t+1]
                
                # r_(t+1) = R(s_t|a_t)--> reward[t]        a_t, V_t = agent(s_t)
                # A_t = r_(t+1) + LAM * V_(t+1) - V_t 
                #     = Q_t - V_t
                
                # Actor_loss = -log(pai(s_t|a_t))*A_t
                # Critic_loss = A_t.pow(2) *0.5
                # Entropy_loss = -F.entropy(pai(St),index) * 0.001
                
                # entropy = -np.sum(np.mean(dist) * np.log(dist))
                
                #update actor critic
                values = torch.FloatTensor(values).to(device)
                Qvals = torch.FloatTensor(Qvals.astype(np.float32)).to(device)
                log_probs = torch.stack(log_probs)
                advantage = Qvals - values
                actor_loss = (-log_probs * advantage).mean()
                critic_loss = 0.5 * torch.square(advantage).mean()
                
                ac_loss = actor_loss + critic_loss - 0.001 * entropy_term
                optimizer.zero_grad()
                ac_loss.backward(retain_graph=True)
                optimizer.step()
                # for name, param in agent.named_parameters():
                #     if param.grad is not None:
                #         print(name, param.grad)
                #     else:
                #         print(name, "gradients not computed")
                
                
                if done: break

            print('Training  | Episode: {}/{}  | Episode Reward: {:.0f}  | Total Loss: {:.2f} | Running Time: {:.4f}' \
                    .format(episode + 1, TRAIN_EPISODES, episode_reward, ac_loss, time.time() - t0))

            all_episode_reward.append(episode_reward)
            if episode >= 100:
                avg_score = np.mean(all_episode_reward[-100:])
                if avg_score > best_score:
                    best_score = avg_score
                    agent.save(episode)
                    print(f'-----The best score for averaging previous 100 episode reward is {best_score}. Model has been saved-----')
            
            if episode%500 == 0:
                    agent.save(episode)
                    print("Model has been saved")
                
            



cuda




Decision Step reward: -1.0
Step: 100
Decision Step reward: -1.0
Step: 200
Decision Step reward: -1.0
Decision Step reward: -1.0
Decision Step reward: -1.0
Decision Step reward: -1.0
Decision Step reward: -1.0
Decision Step reward: -1.0
Step: 300
Decision Step reward: -1.0
Decision Step reward: -1.0
Decision Step reward: -1.0
Step: 400
Decision Step reward: -1.0
Decision Step reward: -1.0
Decision Step reward: -1.0
Decision Step reward: -1.0
Decision Step reward: -1.0
Decision Step reward: -1.0
Max Step Reward: -10
Step: 500
Training  | Episode: 1/200000  | Episode Reward: -26  | Total Loss: 5.07 | Running Time: 13.0733
Model has been saved
Agent in terminal steps
Terminal Step reward: 10.0
Training  | Episode: 2/200000  | Episode Reward: 10  | Total Loss: 30.78 | Running Time: 0.1677
Decision Step reward: -1.0
Decision Step reward: -1.0
Step: 100
Decision Step reward: -1.0
Decision Step reward: -1.0
Step: 200
Decision Step reward: -1.0
Decision Step reward: -1.0
Decision Step reward: -

In [None]:
import numpy as np
import torch
import torch.nn.functional as F
# dist = np.array((2,1,2,2))
# entropy = -np.sum(np.mean(dist)* np.log(dist))
# entropy

action = torch.tensor([[0.1,0.2,0.1,1]])
index = torch.tensor([0])
print(action,index)
entropy_loss = F.cross_entropy(action, index)
entropy_loss

tensor([[0.1000, 0.2000, 0.1000, 1.0000]]) tensor([0])


tensor(1.7165)

In [None]:
# Do I need env.reset() at the beginning of each episode or unity file will reset it
# The utility of workstation GPU is only 1 percent (Memory Leak?) in laptop GPU 8 seconds / episode on average
# Can we make the unity environment accept action from GPU?
#  Model is still hard to learn meaningful actions -- Make sure the algorithm is correct (Weight Initialization? Experience Replay?) value layer how to update?

In [None]:
smoothed_rewards = pd.Series.rolling(pd.Series(all_rewards), 10).mean()
smoothed_rewards = [elem for elem in smoothed_rewards]
plt.plot(all_rewards)
plt.plot(smoothed_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

plt.plot(all_lengths)
plt.plot(average_lengths)
plt.xlabel('Episode')
plt.ylabel('Episode length')
plt.show()

In [None]:
import argparse
import time
import matplotlib.pyplot as plt
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
# add arguments in command --train/test
# parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
# parser.add_argument('--train', dest='train', action='store_true', default=False)
# parser.add_argument('--test', dest='test', action='store_true', default=True)
# args = parser.parse_args()
train = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
#####################  hyper parameters  ####################

# ENV_ID = 'CartPole-v1'  # environment id
# RANDOM_SEED = 2  # random seed, can be either an int number or None
# RENDER = False  # render while training

# ALG_NAME = 'AC'
ALG_NAME = 'A2C'
ENV_ID = 'S0'
TRAIN_EPISODES = 200000  # number of overall episodes for training
TEST_EPISODES = 10  # number of overall episodes for testing
MAX_STEPS = 1200  # maximum time step in one episode
LAM = 0.95  # reward discount in TD error
lr = 0.001
speed = 2
# LR_A = 0.001  # learning rate for actor
# LR_C = 0.01  # learning rate for critic


###############################  Actor-Critic  ####################################


# class Actor(nn.Module):
#     def __init__(self, state_dim, action_num, lr=0.001):
#         super(Actor, self).__init__()
#         self.model = nn.Sequential(
#             nn.Linear(state_dim, 30),
#             nn.ReLU(),
#             nn.Linear(30, action_num)
#         )
#         self.optimizer = optim.Adam(self.parameters(), lr=lr)

#     def learn(self, state, action, td_error):
#         self.optimizer.zero_grad()
#         logits = self.model(torch.FloatTensor(state))
#         loss = td_error * torch.nn.functional.cross_entropy(logits, torch.LongTensor([action]))
#         loss.backward()
#         self.optimizer.step()
#         return loss.item()

#     def get_action(self, state, greedy=False):
#         logits = self.model(torch.FloatTensor(state))
#         probs = torch.nn.functional.softmax(logits, dim=-1).detach().numpy()
#         if greedy:
#             return np.argmax(probs)
#         return np.random.choice(len(probs[0]), p=probs[0])

#     def save(self):
#         path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
#         if not os.path.exists(path):
#             os.makedirs(path)
#         torch.save(self.state_dict(), os.path.join(path, 'model_actor.pt'))

#     def load(self):
#         path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
#         self.load_state_dict(torch.load(os.path.join(path, 'model_actor.pt')))


# class Critic(nn.Module):
#     def __init__(self, state_dim, lr=0.01):
#         super(Critic, self).__init__()
#         self.model = nn.Sequential(
#             nn.Linear(state_dim, 30),
#             nn.ReLU(),
#             nn.Linear(30, 1)
#         )
#         self.optimizer = optim.Adam(self.parameters(), lr=lr)

#     def learn(self, state, reward, state_, done):
#         self.optimizer.zero_grad()
#         d = 0 if done else 1
#         v_ = self.model(torch.FloatTensor(state_))
#         v = self.model(torch.FloatTensor(state))
#         td_error = reward + d * LAM * v_ - v
#         loss = td_error ** 2
#         loss.backward()
#         self.optimizer.step()
#         return td_error.item()

#     def save(self):
#         path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
#         if not os.path.exists(path):
#             os.makedirs(path)
#         torch.save(self.state_dict(), os.path.join(path, 'model_critic.pt'))

#     def load(self):
#         path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
#         self.load_state_dict(torch.load(os.path.join(path, 'model_critic.pt')))


if __name__ == '__main__':
    ''' 
    choose environment
    1. Openai gym:
    env = gym.make()
    2. DeepMind Control Suite:
    env = dm_control2gym.make()
    '''
    # env = gym.make(ENV_ID).unwrapped
    # # dm_control2gym.create_render_mode('example mode', show=True, return_pixel=False, height=240, width=320, camera_id=-1, overlays=(),
    # #              depth=False, scene_option=None)
    # # env = dm_control2gym.make(domain_name="cartpole", task_name="balance")

    # env.seed(RANDOM_SEED)  # reproducible
    # np.random.seed(RANDOM_SEED)
    # torch.manual_seed(RANDOM_SEED)  # reproducible

    # N_F = env.observation_space.shape[0]
    # N_A = env.action_space.n

    # print("observation dimension: %d" % N_F)  # 4
    # print("observation high: %s" % env.observation_space.high)  # [ 2.4 , inf , 0.41887902 , inf]
    # print("observation low : %s" % env.observation_space.low)  # [-2.4 , -inf , -0.41887902 , -inf]
    # print("num of actions: %d" % N_A)  # 2 : left or right

    # actor = Actor(state_dim=N_F, action_num=N_A, lr=LR_A)
    # # we need a good teacher, so the teacher should learn faster than the actor
    # critic = Critic(state_dim=N_F, lr=LR_C)

    agent = Agent(num_words, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim,num_actions)
    agent.to(device)
    optimizer = optim.RMSprop(agent.parameters(), lr=lr)
    t0 = time.time()
    best_score = 0
    if train:
        all_episode_reward = []
        tracked_agent = -1
        for episode in range(TRAIN_EPISODES):
            print(f'Episode: {episode}')
            # state = env.reset().astype(np.float32)
            env.reset()
            behavior_name=list(env.behavior_specs)[0]
            spec=env.behavior_specs[behavior_name]

            # state -- vt, lt, lstm
            vt = torch.tensor(DecisionSteps.obs[0]).reshape(1,3,128,128).to(device)
            index = 5 #e.g
            lt = torch.eye(num_words)[:, index].to(device)
            step = 0  # number of step in this episode
            episode_reward = 0  # rewards of all steps
            lstm_hidden_state = (torch.zeros(1, lstm_hidden_dim).to(device), torch.zeros(1, lstm_hidden_dim).to(device))
            done = False
            while True:
                # if RENDER:
                #     env.render()

                # action = actor.get_action(state) state --> vt,lt
                
                action, value, lstm_hidden_state = agent(vt,lt,lstm_hidden_state) #action is the direct linear output from the model
                action_dist = Categorical(F.softmax(action,dim=1))
                index = action_dist.sample() # sample an action from action_dist
                action_onehot = F.one_hot(torch.tensor(index),num_actions).cpu()
                
                
                # action_onehot = F.one_hot(torch.argmax(action),num_actions)
                # print(action_onehot)

                # state_new, reward, done, info = env.step(action)
                # state_new = state_new.astype(np.float32)
                # if episode<50:
                #     if episode%2:
                #         action_onehot = torch.tensor([1,0,1,0])
                #     else: action_onehot = torch.tensor([1,0,0,1])                 
                # continuous_actions = np.empty((1, 0))
                discrete_actions = np.array(action_onehot).reshape(1,4)*speed #[forward, backward, right, left]
                
                # action_tuple = ActionTuple(continuous_actions,discrete_actions)
                action_tuple = ActionTuple()
                action_tuple.add_discrete(discrete_actions)
                env.set_actions(behavior_name,action_tuple)
                env.step()
                step += 1

                decision_steps, terminal_steps = env.get_steps(behavior_name)

                # print((vt == vt_new).all())

                if tracked_agent == -1 and len(decision_steps) >= 1:
                    tracked_agent = decision_steps.agent_id[0]
                    # print(tracked_agent)

                if tracked_agent in terminal_steps: # roll over or hit the target
                    print('Agent in terminal steps')
                    done = True
                    reward = terminal_steps[tracked_agent].reward
                    if reward > 0:
                        pass
                    else: reward = -1 # roll over or other unseen conditions

                    print(f'Terminal Step reward: {reward}')

                elif tracked_agent in decision_steps: # the agent which requires action
                    reward = decision_steps[tracked_agent].reward
                    # print(f'Decision Step reward: {reward}')
                    if reward<0:
                        print(f'Decision Step reward: {reward}')
                    vt_new = torch.tensor(decision_steps.obs[0]).reshape(1,3,128,128).to(device)
                    vt = vt_new
                

                if step >= MAX_STEPS:
                    reward = -1
                    print(f'Max Step Reward: {reward}')
                    done = True
                if step % 100 == 0:
                    print (f'Step: {step}')

                
                    
                episode_reward = episode_reward + reward


                

                # try:
                #     td_error = critic.learn(
                #         state, reward, state_new, done
                #     )  # learn Value-function : gradient = grad[r + lambda * V(s_new) - V(s)]
                #     actor.learn(state, action, td_error)  # learn Policy : true_gradient = grad[logPi(s, a) * td_error]                   
                try:
                    # d = 0 if done else 1
                    # v_ = self.model(torch.FloatTensor(state_))
                    # v = self.model(torch.FloatTensor(state))
                    if not done:
                        action_new, value_new, lstm_hidden_state_new = agent(vt_new,lt,lstm_hidden_state)

                        # initialize the value first
                        td_error = reward + LAM * value_new - value
                    else: td_error = reward - value


                    critic_loss = td_error ** 2

                    log_prob = action_dist.log_prob(index)
                    actor_loss = -(log_prob * td_error)

                    # logits = action # the actual model output (without softmax)
                    # labels = index  # one hot vector of the sampled action
                    print(action,index)
                    entropy_loss = F.cross_entropy(action, index)
                    print(entropy_loss)
                    total_loss = actor_loss + 0.5* critic_loss - 0.001*entropy_loss
                    # entropy loss entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=tf.nn.softmax(logit))
                    # comb_loss = tf.reduce_mean((0.5 * value_loss + policy_loss - 0.01 * entropy))

                    total_loss = total_loss.to(device)

                    optimizer.zero_grad()
                    total_loss.backward(retain_graph=True)
                    print('hi')
                    optimizer.step()   
                    
              
                except KeyboardInterrupt:  # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn()
                    agent.save(episode)
                    print('model has been saved')

                # state = state_new
                # print(f'episode: {episode}, step:{step}')
                if done:
                    break
            
            if episode%500 == 0:
                agent.save(episode)
                print("Model has been saved")
                
            # all_episode_reward.append(episode_reward)
            # if episode >= 100:
            #     avg_score = np.mean(all_episode_reward[-100:])
            #     if avg_score > best_score:
            #         best_score = avg_score
            #         agent.save()
            #         print(f'The best score for averaging previous 100 episode reward is {best_score}. Model has been saved')



            # if episode == 0:
            #     all_episode_reward.append(episode_reward)
            # else:
            #     reference_value = all_episode_reward[-1] * 0.9 + episode_reward * 0.1
            #     if reference_value > all_episode_reward_best:
            #         agent.save()
            #         print(f'The best all_episode_reward is {reference_value}. Model has been saved')
            #         all_episode_reward_best = reference_value
            #     all_episode_reward.append(reference_value)

            print('Training  | Episode: {}/{}  | Episode Reward: {:.0f}  | Running Time: {:.4f}' \
                  .format(episode + 1, TRAIN_EPISODES, episode_reward, time.time() - t0))
            
            

            
            
        

        plt.plot(all_episode_reward)
        if not os.path.exists('image'):
            os.makedirs('image')
        plt.savefig(os.path.join('image', '_'.join([ALG_NAME, ENV_ID])))
        

    if not train:
        agent.load(episode)
        print("model has been loaded")
        tracked_agent = -1

        for episode in range(TEST_EPISODES):
            episode_time = time.time()
            # state = env.reset().astype(np.float32)
            vt = torch.tensor(DecisionSteps.obs[0]).reshape(1,3,128,128)
            index = 5 #e.g
            lt = torch.eye(num_words)[:, index]
            t = 0  # number of step in this episode
            episode_reward = 0
            while True:
                env.render()

                # action = actor.get_action(state, greedy=True)
                # state_new, reward, done, info = env.step(action)
                # state_new = state_new.astype(np.float32)
                # if done:
                #     reward = -20

                
                action, value, lstm_hidden_state = agent(vt,lt,lstm_hidden_state)
                action = F.one_hot(torch.argmax(action),num_actions)

                # state_new, reward, done, info = env.step(action)
                # state_new = state_new.astype(np.float32)
                continuous_actions = np.empty((1, 0))
                discrete_actions = np.array(action).reshape(1,4) #[forward, backward, right, left]
                action_tuple = ActionTuple(continuous_actions,discrete_actions)
                env.set_actions(behavior_name,action_tuple)
                env.step()
                decision_steps, terminal_steps = env.get_steps(behavior_name)
                vt_new = torch.tensor(decision_steps.obs).reshape(1,3,128,128)
                
                if tracked_agent == -1 and len(decision_steps) >= 1:
                    tracked_agent = decision_steps.agent_id[0]
                if tracked_agent in decision_steps:
                    # tracked_agent = decision_steps.agent_id[0]
                    reward = decision_steps[tracked_agent].reward 
                # if done:
                #     reward = -20  # reward shaping trick
                if tracked_agent in terminal_steps:
                    done = True
                    reward = -20

                episode_reward += reward
                vt = vt_new
                t += 1

                if done or t >= MAX_STEPS:
                    print('Testing  | Episode: {}/{}  | Episode Reward: {:.0f}  | Running Time: {:.4f}' \
                          .format(episode + 1, TEST_EPISODES, episode_reward, time.time() - t0))
                    break



cuda
Episode: 0




tensor([[ 0.0199, -0.0344,  0.0193, -0.0582]], device='cuda:0',
       grad_fn=<AddmmBackward0>) tensor([3], device='cuda:0')
tensor(1.4318, device='cuda:0', grad_fn=<NllLossBackward0>)
hi
tensor([[-0.2298, -0.1373,  0.1180, -0.0827]], device='cuda:0',
       grad_fn=<AddmmBackward0>) tensor([3], device='cuda:0')
tensor(1.3944, device='cuda:0', grad_fn=<NllLossBackward0>)


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [1024]] is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [None]:
# Do we need to update the network every step?

In [None]:
# Try to find the optimized MAX_STEP
# speed 1: 10000+ speed 2:1822 speed 3: 1918 speed 4: 900 speed 5: 951 speed 6:964 speed 7: 1181 
import random
import torch
import torch.nn.functional as F
# speed = 1
TRAIN_EPISODES = 20
tracked_agent = -1
num_actions = 4
average = 0
# for speed in range(30,70,5):
#     speed = speed/10
speed = 3
for episode in range(TRAIN_EPISODES):
    env.reset()
    behavior_name=list(env.behavior_specs)[0]
    step = 0
    while True:
        index = random.randint(0, 3) # sample an action from action_dist
        action_onehot = F.one_hot(torch.tensor(index),num_actions).cpu()
        discrete_actions = np.array(action_onehot).reshape(1,4)*speed #[forward, backward, right, left]
        action_tuple = ActionTuple()
        action_tuple.add_discrete(discrete_actions)
        env.set_actions(behavior_name,action_tuple)
        env.step()
        step += 1

        decision_steps, terminal_steps = env.get_steps(behavior_name)
        if tracked_agent == -1 and len(decision_steps) >= 1:
            tracked_agent = decision_steps.agent_id[0]
            
        if tracked_agent in terminal_steps: # roll over or hit the target
            reward = terminal_steps[tracked_agent].reward
            if reward > 0: # hit the target
                print(f'{episode}: {step} in total')
                average += step
                break
            else:           # roll over or other conditions
                env.reset()
                step = 0
                continue # roll over or other unseen conditions
        if tracked_agent in decision_steps: # the agent which requires action
            continue
average /= TRAIN_EPISODES
print(f'For speed {speed}, average random step for hitting the target is {average}')
      

In [None]:
# the unity could not open in work station
# The agent in terminal step will also in decison step

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Sample data
object_sizes = [1, 2, 3, 4, 5]  # Object sizes on the x-axis
max_steps = [[4721, 2468, 1587, 1249, 1084],   # Max steps for each object size
             [4027, 1550, 738, 640, 505],
             [3320, 1391, 464, 331, 307],
             [3289, 882, 679, 424, 211],
             [3057, 1203, 470, 378, 222]]
speeds = [1, 2, 3, 4, 5]  # Discrete speeds for color-coding

# Color mapping for each speed
speed_color_mapping = {
    1: 'red',
    2: 'blue',
    3: 'green',
    4: 'orange',
    5: 'purple'
}

# Generate scatter plot
for i, size in enumerate(object_sizes):
    for j, steps in enumerate(max_steps[i]):
        speed = speeds[j]
        color = speed_color_mapping[speed]
        plt.scatter(size, steps, c=color)

# Set labels and title
plt.xlabel('Object Size')
plt.ylabel('Max Step')
plt.title('Scatter Plot')

plt.xlim(0, 6)
plt.xticks(np.arange(0, 7, 1))
plt.ylim(0, 5000)
plt.yticks(np.arange(0, 5001, 200))

# Create legend
legend_labels = [f'Speed {speed}' for speed, _ in speed_color_mapping.items()]
plt.legend(legend_labels, loc='upper right')

# Display the plot
plt.show()




In [None]:
import torch
import torch.nn.functional as F
a = torch.tensor([[0.2460, 0.2503, 0.2647, 0.2389]])
b = F.softmax(a,dim=1)
b

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
import numpy as np

a = [1,2,3]
avg = np.mean(a[-10:])
avg