# TwoMission Hyper PPO

In [1]:
import os
from pathlib import Path
import numpy as np
import datetime
import platform
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from mlagents_envs.environment import UnityEnvironment, ActionTuple
from mlagents_envs.side_channel.engine_configuration_channel\
                             import EngineConfigurationChannel
from mlagents_envs.side_channel.environment_parameters_channel\
                             import EnvironmentParametersChannel


## Setting environments

In [2]:
# Global directories, all anchored at the current working directory.
cur_dir = os.getcwd()
# Unity builds live in a sibling folder of the working directory.
env_dir = os.path.normpath(os.path.join(cur_dir, os.pardir, "Unity6000_Envs"))
# Root folder for everything this notebook writes (checkpoints, logs).
test_dir = os.path.normpath(os.path.join(cur_dir, "temp", "pytorch_output"))


### Pytorch Device

In [3]:
# PyTorch device: prefer Apple MPS, then CUDA, and fall back to the CPU.
if torch.backends.mps.is_available():
    _device_name = "mps"
elif torch.cuda.is_available():
    _device_name = "cuda"
else:
    _device_name = "cpu"
g_device = torch.device(_device_name)

print(g_device)


mps


### Unity Environment

In [4]:
# Unity Environment
# Build the path of the Unity executable for the current platform. The builds
# are named "<game>_<platform.system()>.<extension>" inside env_dir.
game = "TwoMissions"
os_name = platform.system()

if os_name == 'Linux':
    env_name = os.path.join(env_dir, f"{game}_{os_name}.x86_64")
elif os_name == 'Darwin':
    env_name = os.path.join(env_dir, f"{game}_{os_name}.app")
else:
    # Fail here with a clear message instead of leaving env_name undefined,
    # which would only surface later as a NameError when the env is created.
    raise RuntimeError(
        f"No {game} Unity build is configured for platform '{os_name}' "
        "(supported: Linux, Darwin)."
    )

### Setting parameters

In [5]:
# Parameters

# --- Observation / action layout ---
# state: Ray(19 * 6 = 114) & position(3) & rotation(4) & velocity(3) & ball velocity(3)
state_size = 127
# action: Rotate(2) & Move(2)
action_size = 4
# goal: goal_signal
goal_size = 2

# Indices into the list of observations returned by the Unity environment.
GOAL_OBS, RAY_OBS, VECTOR_OBS = range(3)

# --- Run mode ---
load_model = False
train_mode = True

# --- PPO hyperparameters ---
discount_factor = 0.99   # reward discount (gamma)
learning_rate = 3e-4     # Adam learning rate
n_step = 2560            # environment steps collected between updates
batch_size = 256         # mini-batch size inside one update
n_epoch = 3              # passes over the collected data per update
_lambda = 0.95           # GAE lambda
epsilon = 0.2            # PPO ratio clipping range

# --- Run length (in environment steps) ---
if train_mode:
    run_step = 1_000_000
else:
    run_step = 0
test_step = 10_000

# --- Logging / checkpoint cadence (in episodes) ---
print_interval = 10
save_interval = 100

# --- Unity connection ---
unity_base_port = 1597

In [6]:
# NN model: where checkpoints are saved to / loaded from.
# Each run gets its own folder, named after the start timestamp.
date_time = f"{datetime.datetime.now():%Y%m%d%H%M%S}"
save_path = os.path.join(test_dir, f"saved_models/{game}/Hyper_PPO/{date_time}")
os.makedirs(save_path, exist_ok=True)
save_model_path = os.path.join(save_path, 'TwoMission_HyperPPO.ckpt')
# Checkpoint to restore when load_model is True; must be filled in by hand.
load_model_path = ""

## HyperNetwork

In [7]:
# class HyperNetwork(torch.nn.Module):
#     def __init__(self, input_unit_size, action_size, hyper_input_size, **kwargs):
#         super(HyperNetwork, self).__init__(**kwargs)
#         self.input_unit_size = input_unit_size
#         self.action_size = action_size
#         self.hyper_input_size = hyper_input_size

#         self.d1 = torch.nn.Linear(self.hyper_input_size, 256)
#         self.d2 = torch.nn.Linear(256, 256)
#         self.pi = torch.nn.Linear(256, self.input_unit_size * self.action_size)
#         self.v = torch.nn.Linear(256, self.input_unit_size)

#     def forward(self, x, h):
#         h = F.relu(self.d1(h))
#         h = F.relu(self.d2(h))
#         target_weights_pi = F.tanh(self.pi(h))
#         target_weights_v = F.tanh(self.v(h))

#         x = x.unsqueeze(dim=1)
#         target_weights_pi = target_weights_pi.view(-1, self.input_unit_size, self.action_size)
#         result_pi = torch.bmm(x, target_weights_pi)
#         result_pi = result_pi.squeeze(dim = 1)
#         target_weights_v = target_weights_v.view(-1, self.input_unit_size, 1)
#         result_v = torch.bmm(x, target_weights_v)

#         return F.softmax(result_pi, dim=1), result_v.squeeze(dim=1)

## ActorCritic Network


In [8]:
class ActorCritic(torch.nn.Module):
    """Goal-conditioned actor-critic with a shared two-layer MLP trunk.

    The state and the goal signal are concatenated and passed through two
    256-unit ReLU layers; a policy head and a value head branch off the
    shared features. Layer widths come from the module-level ``state_size``,
    ``goal_size`` and ``action_size``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Input width is state + goal signal; use goal_size rather than a
        # hard-coded 2 so the layer follows the declared goal dimension.
        self.d1 = torch.nn.Linear(state_size + goal_size, 256)
        self.d2 = torch.nn.Linear(256, 256)
        self.pi = torch.nn.Linear(256, action_size)
        self.v = torch.nn.Linear(256, 1)

    def forward(self, x, h):
        """Return ``(action probabilities, state value)`` for a batch.

        Args:
            x: state batch, concatenated with ``h`` along dim 1
               (so both must be 2-D with the same batch size).
            h: goal-signal batch.

        Returns:
            Tuple of the softmax over the ``action_size`` policy logits and
            the output of the value head (last dimension of size 1).
        """
        x = torch.cat([x, h], dim=1)
        x = F.relu(self.d1(x))
        x = F.relu(self.d2(x))
        return F.softmax(self.pi(x), dim=-1), self.v(x)

## Agent class


In [12]:
# Agent Class
class HyperPPOAgent:
    """PPO agent built around the goal-conditioned ``ActorCritic`` network.

    Transitions are buffered with ``append_sample`` and consumed in one go by
    ``train_model``, which clears the buffer. Hyperparameters, the device and
    the save/load paths are read from module-level globals.
    """

    def __init__(self):
        self.network = ActorCritic().to(g_device)
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr=learning_rate)
        self.memory = list()
        self.writer = SummaryWriter(save_path)

        if load_model:
            print(f"... Load Model from {load_model_path} ...")
            checkpoint = torch.load(load_model_path, map_location=g_device)
            self.network.load_state_dict(checkpoint["network"])
            self.optimizer.load_state_dict(checkpoint["optimizer"])

    # get action
    def get_action(self, state, goal_signal, training=True):
        """Sample one action per row of ``state`` from the current policy.

        Args:
            state: batch of states (array-like, one row per agent).
            goal_signal: batch of goal signals matching ``state`` row by row.
            training: forwarded to ``network.train`` to set train/eval mode.

        Returns:
            NumPy array holding one sampled action index per row
            (``torch.multinomial`` with ``num_samples=1``).
        """
        self.network.train(training)

        # Acting never back-propagates, so skip building the autograd graph.
        with torch.no_grad():
            pi, _ = self.network(torch.FloatTensor(state).to(g_device),
                                 torch.FloatTensor(goal_signal).to(g_device))
        action = torch.multinomial(pi, num_samples=1).cpu().numpy()
        return action

    # Add replay memory
    def append_sample(self, state, goal_signal, action, reward, next_state, next_goal_signal, done):
        """Store a single transition tuple in the rollout buffer."""
        self.memory.append((state, goal_signal, action, reward, next_state, next_goal_signal, done))

    # training
    def train_model(self):
        """Run one PPO update over the buffered rollout and clear the buffer.

        Computes GAE advantages and returns with the current network under
        ``no_grad``, then performs ``n_epoch`` passes of shuffled mini-batch
        updates with the clipped surrogate objective plus an MSE value loss.

        Returns:
            Tuple ``(mean actor loss, mean critic loss)`` over all
            mini-batches of this update.
        """
        self.network.train()

        state      = np.stack([m[0] for m in self.memory], axis=0)
        goal_signal= np.stack([m[1] for m in self.memory], axis=0)
        action     = np.stack([m[2] for m in self.memory], axis=0)
        reward     = np.stack([m[3] for m in self.memory], axis=0)
        next_state = np.stack([m[4] for m in self.memory], axis=0)
        next_goal_signal = np.stack([m[5] for m in self.memory], axis=0)
        done       = np.stack([m[6] for m in self.memory], axis=0)
        self.memory.clear()

        state, goal_signal, action, reward, next_state, next_goal_signal, done = map(lambda x: torch.FloatTensor(x).to(g_device),
                                                        [state, goal_signal, action, reward, next_state, next_goal_signal, done])
        # prob_old, adv, ret
        with torch.no_grad():
            pi_old, value = self.network(state, goal_signal)
            prob_old = pi_old.gather(1, action.long())

            _, next_value = self.network(next_state, next_goal_signal)
            # One-step TD error; (1 - done) removes the bootstrap at episode end.
            delta = reward + (1 - done) * discount_factor * next_value - value
            adv = delta.clone()
            # The buffer is filled step by step (all workers of a step, then the
            # next step), so view(n_step, -1) yields (step, worker); transpose to
            # (worker, step) to accumulate along each worker's own trajectory.
            adv, done = map(lambda x: x.view(n_step, -1).transpose(0,1).contiguous(), [adv, done])
            # Backward GAE recursion; the last step keeps its plain TD error.
            for t in reversed(range(n_step-1)):
                adv[:, t] += (1 - done[:, t]) * discount_factor * _lambda * adv[:, t+1]
            # Back to the flat buffer order so adv lines up with value/state rows.
            adv = adv.transpose(0,1).contiguous().view(-1, 1)

            ret = adv + value

        # training loop
        actor_losses, critic_losses = [], []
        idxs = np.arange(len(reward))
        for _ in range(n_epoch):
            np.random.shuffle(idxs)
            for offset in range(0, len(reward), batch_size):
                idx = idxs[offset : offset + batch_size]

                _state, _goal_signal, _action, _ret, _adv, _prob_old =\
                    map(lambda x: x[idx], [state, goal_signal, action, ret, adv, prob_old])

                pi, value = self.network(_state, _goal_signal)
                prob = pi.gather(1, _action.long())

                # loss function for policy function
                # The small constant keeps the ratio finite if an old probability is 0.
                ratio = prob / (_prob_old + 1e-7)
                surr1 = ratio * _adv
                surr2 = torch.clamp(ratio, min=1-epsilon, max=1+epsilon) * _adv
                actor_loss = -torch.min(surr1, surr2).mean()

                # loss function for value function
                # mse_loss already averages (default reduction), so no extra .mean().
                critic_loss = F.mse_loss(value, _ret)

                total_loss = actor_loss + critic_loss

                self.optimizer.zero_grad()
                total_loss.backward()
                self.optimizer.step()

                actor_losses.append(actor_loss.item())
                critic_losses.append(critic_loss.item())

        return np.mean(actor_losses), np.mean(critic_losses)

    # save model
    def save_model(self):
        """Write the network and optimizer state dicts to ``save_model_path``."""
        print(f"... Save Model to {save_model_path} ...")
        torch.save({
            "network" : self.network.state_dict(),
            "optimizer" : self.optimizer.state_dict(),
        }, save_model_path)

    # logging
    def write_summary(self, score, actor_loss, critic_loss, step):
        """Log the score and both losses as TensorBoard scalars at ``step``."""
        self.writer.add_scalar("HyperPPOAgent_run/score", score, step)
        self.writer.add_scalar("HyperPPOAgent_model/actor_loss", actor_loss, step)
        self.writer.add_scalar("HyperPPOAgent_model/critic_loss", critic_loss, step)

## Train Model

In [11]:
# Close a Unity environment left over from an earlier run of this notebook.
# Guarded so that a fresh top-to-bottom run, where `env` does not exist yet,
# does not stop here with a NameError.
if "env" in globals():
    env.close()

In [13]:
# Launch Unity, train for run_step steps, then evaluate for test_step steps.
engine_configuration_channel = EngineConfigurationChannel()
environment_parameters_channel = EnvironmentParametersChannel()
env = UnityEnvironment(file_name=env_name,
                       side_channels=[engine_configuration_channel, environment_parameters_channel],
                       base_port=unity_base_port)
try:
    env.reset()

    # Setup Unity Behavior
    behavior_name = list(env.behavior_specs.keys())[0]
    spec = env.behavior_specs[behavior_name]
    engine_configuration_channel.set_configuration_parameters(time_scale=6.0)
    dec, term = env.get_steps(behavior_name)
    num_worker = len(dec)

    # HyperPPOAgent
    agent = HyperPPOAgent()
    actor_losses, critic_losses, scores, episode, score = [], [], [], 0, 0
    for step in range(run_step + test_step):
        # Switch from training to evaluation once the training budget is used.
        if step == run_step:
            if train_mode:
                agent.save_model()
            print("TEST START")
            train_mode = False
            engine_configuration_channel.set_configuration_parameters(time_scale=1.0)

        goal_signal = dec.obs[GOAL_OBS]
        state = np.concatenate([dec.obs[RAY_OBS], dec.obs[VECTOR_OBS]], axis=-1)
        action = agent.get_action(state, goal_signal, train_mode)
        action_tuple = ActionTuple()
        action_tuple.add_discrete(action)
        env.set_actions(behavior_name, action_tuple)
        env.step()

        # info from env
        dec, term = env.get_steps(behavior_name)
        done = [False] * num_worker
        # Copy before patching in terminal data: these arrays belong to `dec`,
        # and `dec.obs[GOAL_OBS]` is read again as the next step's goal_signal.
        # Overwriting in place would feed the finished episode's goal into the
        # first step of the new episode.
        next_goal_signal = dec.obs[GOAL_OBS].copy()
        next_state = np.concatenate([dec.obs[RAY_OBS], dec.obs[VECTOR_OBS]], axis=-1)
        reward = dec.reward.copy()
        # NOTE(review): indexing by agent id assumes ids are 0..num_worker-1 and
        # that every agent appears in `dec` each step - confirm for this env.
        for term_idx, agent_id in enumerate(term.agent_id):
            done[agent_id] = True
            next_goal_signal[agent_id] = term.obs[GOAL_OBS][term_idx]
            next_state[agent_id] = np.concatenate(
                [term.obs[RAY_OBS][term_idx], term.obs[VECTOR_OBS][term_idx]], axis=-1)
            reward[agent_id] = term.reward[term_idx]
        # Score and episode bookkeeping follow worker 0 only.
        score += reward[0]

        if train_mode:
            for worker in range(num_worker):
                agent.append_sample(state[worker], goal_signal[worker], action[worker], [reward[worker]],
                                    next_state[worker], next_goal_signal[worker], [done[worker]])
            # training
            if (step+1) % n_step == 0:
                actor_loss, critic_loss = agent.train_model()
                actor_losses.append(actor_loss)
                critic_losses.append(critic_loss)

        if done[0]:
            episode +=1
            scores.append(score)
            score = 0

            # logging
            if episode % print_interval == 0:
                mean_score = np.mean(scores)
                mean_actor_loss = np.mean(actor_losses) if len(actor_losses) > 0 else 0
                mean_critic_loss = np.mean(critic_losses)  if len(critic_losses) > 0 else 0
                agent.write_summary(mean_score, mean_actor_loss, mean_critic_loss, step)
                actor_losses, critic_losses, scores = [], [], []

                print(f"{episode} Episode / Step: {step} / Score: {mean_score:.2f} / " +\
                        f"Actor loss: {mean_actor_loss:.2f} / Critic loss: {mean_critic_loss:.4f}" )

            # save model
            if train_mode and episode % save_interval == 0:
                agent.save_model()
finally:
    # Always shut the Unity process down, even if the loop raised, so the
    # port is free for the next run.
    env.close()



[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-allocator-temp-initial-block-size-main=262144"
    "memorysetup-allocator-temp-initial-block-size-worker=262144"
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler

## Test the pretrained model

In [14]:
# Evaluate the checkpoint written by the training run above.
load_model = True
train_mode = False

load_model_path = save_model_path

engine_configuration_channel = EngineConfigurationChannel()
environment_parameters_channel = EnvironmentParametersChannel()
env = UnityEnvironment(file_name=env_name,
                       side_channels=[engine_configuration_channel, environment_parameters_channel],
                       base_port=unity_base_port)
try:
    env.reset()

    # Setup Unity Behavior
    behavior_name = list(env.behavior_specs.keys())[0]
    spec = env.behavior_specs[behavior_name]
    engine_configuration_channel.set_configuration_parameters(time_scale=6.0)
    dec, term = env.get_steps(behavior_name)
    num_worker = len(dec)

    # HyperPPOAgent (restores weights because load_model is True)
    agent = HyperPPOAgent()
    # No training happens here, so the loss lists stay empty and are logged as 0.
    actor_losses, critic_losses, scores, episode, score = [], [], [], 0, 0
    for step in range(test_step):
        goal_signal = dec.obs[GOAL_OBS]
        state = np.concatenate([dec.obs[RAY_OBS], dec.obs[VECTOR_OBS]], axis=-1)
        action = agent.get_action(state, goal_signal, train_mode)
        action_tuple = ActionTuple()
        action_tuple.add_discrete(action)
        env.set_actions(behavior_name, action_tuple)
        env.step()

        # info from env
        dec, term = env.get_steps(behavior_name)
        done = [False] * num_worker
        # Copy before patching in terminal data: these arrays belong to `dec`,
        # and `dec.obs[GOAL_OBS]` is read again as the next step's goal_signal.
        next_goal_signal = dec.obs[GOAL_OBS].copy()
        next_state = np.concatenate([dec.obs[RAY_OBS], dec.obs[VECTOR_OBS]], axis=-1)
        reward = dec.reward.copy()
        # NOTE(review): indexing by agent id assumes ids are 0..num_worker-1 and
        # that every agent appears in `dec` each step - confirm for this env.
        for term_idx, agent_id in enumerate(term.agent_id):
            done[agent_id] = True
            next_goal_signal[agent_id] = term.obs[GOAL_OBS][term_idx]
            next_state[agent_id] = np.concatenate(
                [term.obs[RAY_OBS][term_idx], term.obs[VECTOR_OBS][term_idx]], axis=-1)
            reward[agent_id] = term.reward[term_idx]
        # Score and episode bookkeeping follow worker 0 only.
        score += reward[0]

        if done[0]:
            episode +=1
            scores.append(score)
            score = 0

            # logging
            if episode % print_interval == 0:
                mean_score = np.mean(scores)
                mean_actor_loss = np.mean(actor_losses) if len(actor_losses) > 0 else 0
                mean_critic_loss = np.mean(critic_losses)  if len(critic_losses) > 0 else 0
                agent.write_summary(mean_score, mean_actor_loss, mean_critic_loss, step)
                actor_losses, critic_losses, scores = [], [], []

                print(f"{episode} Episode / Step: {step} / Score: {mean_score:.2f} / " +\
                        f"Actor loss: {mean_actor_loss:.2f} / Critic loss: {mean_critic_loss:.4f}" )
finally:
    # Always shut the Unity process down, even if the loop raised.
    env.close()





[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-allocator-temp-initial-block-size-main=262144"
    "memorysetup-allocator-temp-initial-block-size-worker=262144"
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler