# Pong Adversarial

In [1]:
import os
from pathlib import Path
import numpy as np
import datetime
import platform
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from mlagents_envs.environment import UnityEnvironment, ActionTuple
from mlagents_envs.side_channel.engine_configuration_channel\
                             import EngineConfigurationChannel
from mlagents_envs.side_channel.environment_parameters_channel\
                             import EnvironmentParametersChannel


## Setting environments

In [2]:
# Global settings: every directory is resolved from the current working directory
cur_dir = os.getcwd()
# Unity builds live in a sibling folder of the working directory
env_dir = os.path.abspath(os.path.join(cur_dir, os.pardir, "Unity6000_Envs"))
# Root folder for everything this notebook writes (checkpoints, TensorBoard logs)
test_dir = os.path.abspath(os.path.join(cur_dir, "temp", "pytorch_output"))


### PyTorch Device

In [3]:
# PyTorch device: prefer Apple MPS, then CUDA, and fall back to the CPU
g_device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(g_device)


mps


### Unity Environment

In [4]:
# Unity Environment
# Select the platform-specific build of the game located in env_dir.
game = "Pong"
os_name = platform.system()

if os_name == 'Linux':
    env_name = os.path.join(env_dir, f"{game}_{os_name}.x86_64")
elif os_name == 'Darwin':
    env_name = os.path.join(env_dir, f"{game}_{os_name}.app")
else:
    # Without this branch env_name stays undefined and a later cell fails
    # with a NameError that does not mention the real cause.
    raise RuntimeError(f"Unsupported platform for the {game} build: {os_name!r}")

### Setting parameters for PPO

In [5]:
# Parameters
state_size = 6
action_size = 3

load_model = False
train_mode = True

discount_factor = 0.99
learning_rate = 5e-4
n_step = 512
batch_size = 128
n_epoch = 3
_lambda = 0.95
epsilon = 0.2

run_step = 1000000 if train_mode else 0
test_step = 50000

print_interval = 10
save_interval = 100

# Setting parameters for Dodge Environments
env_static_config = {"ballSpeed": 4, "ballRandom": 0.2, "agentSpeed": 3}
env_dynamic_config = {"boardRadius": {"min":6, "max": 8, "seed": 77},
                      "ballNums": {"min": 10, "max": 15, "seed": 77}}


unity_base_port = 1766

In [6]:
# NN model : Save and Load
# Every run writes into its own folder named after the start time.
date_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
save_path = os.path.join(test_dir, f"saved_models/{game}/Adversarial/{date_time}")
os.makedirs(save_path, exist_ok=True)
save_model_path = os.path.join(save_path, 'Pong_Adversarial.ckpt')
# Folder of an earlier run to restore from; PPOAgent appends "/<id>" to it.
load_path = ""  # Need to update

## Model for Actor-Critic

In [7]:
class ActorCritic(torch.nn.Module):
    """MLP with two shared hidden layers and separate policy and value heads.

    The input width and the number of policy outputs are read from the
    module-level ``state_size`` and ``action_size``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        width = 128
        self.d1 = torch.nn.Linear(state_size, width)
        self.d2 = torch.nn.Linear(width, width)
        self.pi = torch.nn.Linear(width, action_size)
        self.v = torch.nn.Linear(width, 1)

    def forward(self, x):
        """Return ``(action_probabilities, value)`` computed from ``x``.

        The probabilities are a softmax over the last dimension of the
        policy head; the value head has a single output.
        """
        hidden = torch.relu(self.d1(x))
        hidden = torch.relu(self.d2(hidden))
        logits = self.pi(hidden)
        return torch.softmax(logits, dim=-1), self.v(hidden)

## Agent class


In [17]:
# Agent Class
class PPOAgent:
    """PPO agent owning one ActorCritic network, its optimizer and a rollout buffer.

    The class reads its configuration from module-level names: ``g_device``,
    ``learning_rate``, ``save_path``, ``load_path``, ``load_model``, ``n_step``,
    ``batch_size``, ``n_epoch``, ``discount_factor``, ``_lambda`` and ``epsilon``.
    """

    def __init__(self, id):
        """Build the network and optimizer and optionally restore a checkpoint.

        Args:
            id: Agent name; used as the sub-folder below ``save_path`` and
                ``load_path`` for checkpoints and TensorBoard logs.
        """
        self.network = ActorCritic().to(g_device)
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr = learning_rate)
        self.memory = list()
        self.save_path = f"{save_path}/{id}"
        self.load_path = f"{load_path}/{id}"
        self.writer = SummaryWriter(self.save_path)
        self.save_model_path = os.path.join(self.save_path, 'Pong_Adversarial.ckpt')
        self.load_model_path = os.path.join(self.load_path, 'Pong_Adversarial.ckpt')

        if load_model:
            print(f"... Load Model from {self.load_model_path} ...")
            checkpoint = torch.load(self.load_model_path, map_location=g_device)
            self.network.load_state_dict(checkpoint["network"])
            self.optimizer.load_state_dict(checkpoint["optimizer"])

    def get_action(self, state, training = True):
        """Sample one action per row of ``state`` from the current policy.

        Args:
            state: Array-like observations accepted by ``torch.FloatTensor``.
            training: Passed to ``network.train`` to select train/eval mode.

        Returns:
            NumPy array with one sampled action index per input row.
        """
        self.network.train(training)
        # No gradients are needed for sampling: train_model re-evaluates the
        # stored states itself, so building an autograd graph here is wasted work.
        with torch.no_grad():
            pi, _ = self.network(torch.FloatTensor(state).to(g_device))
            action = torch.multinomial(pi, num_samples=1)
        return action.cpu().numpy()

    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition tuple in the rollout buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train_model(self):
        """Run ``n_epoch`` passes of minibatch PPO updates over the buffer.

        The buffer is emptied as soon as its content has been stacked.

        Returns:
            Tuple ``(mean_actor_loss, mean_critic_loss)`` over all minibatches.
        """
        self.network.train()

        state      = np.stack([m[0] for m in self.memory], axis=0)
        action     = np.stack([m[1] for m in self.memory], axis=0)
        reward     = np.stack([m[2] for m in self.memory], axis=0)
        next_state = np.stack([m[3] for m in self.memory], axis=0)
        done       = np.stack([m[4] for m in self.memory], axis=0)
        self.memory.clear()

        state, action, reward, next_state, done = map(lambda x: torch.FloatTensor(x).to(g_device),
                                                        [state, action, reward, next_state, done])
        # prob_old, adv, ret are fixed targets, so they are computed without gradients
        with torch.no_grad():
            pi_old, value = self.network(state)
            prob_old = pi_old.gather(1, action.long())

            _, next_value = self.network(next_state)
            # One-step TD error; the bootstrap term is dropped where done == 1
            delta = reward + (1 - done) * discount_factor * next_value - value
            adv = delta.clone()
            # Reshape so that column t holds time step t; the view requires the
            # buffer length to be a multiple of n_step.
            adv, done = map(lambda x: x.view(n_step, -1).transpose(0,1).contiguous(), [adv, done])
            # Backward recursion: add the discounted advantage of the next step,
            # except across steps flagged as done.
            for t in reversed(range(n_step-1)):
                adv[:, t] += (1 - done[:, t]) * discount_factor * _lambda * adv[:, t+1]
            adv = adv.transpose(0,1).contiguous().view(-1, 1)

            ret = adv + value

        # training
        actor_losses, critic_losses = [], []
        idxs = np.arange(len(reward))
        for _ in range(n_epoch):
            np.random.shuffle(idxs)
            for offset in range(0, len(reward), batch_size):
                idx = idxs[offset : offset + batch_size]

                _state, _action, _ret, _adv, _prob_old =\
                    map(lambda x: x[idx], [state, action, ret, adv, prob_old])

                pi, value = self.network(_state)
                prob = pi.gather(1, _action.long())

                # loss function for policy network (clipped surrogate objective)
                ratio = prob / (_prob_old + 1e-7)
                surr1 = ratio * _adv
                surr2 = torch.clamp(ratio, min=1-epsilon, max=1+epsilon) * _adv
                actor_loss = -torch.min(surr1, surr2).mean()

                # loss function for value network (mse_loss already averages)
                critic_loss = F.mse_loss(value, _ret)

                total_loss = actor_loss + critic_loss

                self.optimizer.zero_grad()
                total_loss.backward()
                self.optimizer.step()

                actor_losses.append(actor_loss.item())
                critic_losses.append(critic_loss.item())

        return np.mean(actor_losses), np.mean(critic_losses)


    def save_model(self):
        """Write the network and optimizer state dicts to ``save_model_path``."""
        # Report the file that is really written, not "<save_path>/ckpt"
        print(f"... Save Model to {self.save_model_path} ...")
        torch.save({
            "network" : self.network.state_dict(),
            "optimizer" : self.optimizer.state_dict(),
        }, self.save_model_path)

    def write_summary(self, score, actor_loss, critic_loss, step):
        """Log the score and both losses as TensorBoard scalars at ``step``."""
        self.writer.add_scalar("PPO_Random_run/score", score, step)
        self.writer.add_scalar("PPO_Random_model/actor_loss", actor_loss, step)
        self.writer.add_scalar("PPO_Random_model/critic_loss", critic_loss, step)
        # self.writer.add_scalar("PPO_Random_model/total_loss", total_loss, step)

## Train Model

In [9]:
# Launch the Unity build and connect to it.
engine_configuration_channel = EngineConfigurationChannel()
env = UnityEnvironment(file_name=env_name,
                       side_channels=[engine_configuration_channel],
                       base_port=unity_base_port)

# Everything after the launch runs inside try/finally so that env.close() is
# reached even when the loop raises or the cell is interrupted.
try:
    env.reset()

    # Setup Unity Behavior
    behavior_name_list = list(env.behavior_specs.keys())
    behavior_A = behavior_name_list[0]
    behavior_B = behavior_name_list[1]
    engine_configuration_channel.set_configuration_parameters(time_scale=12.0)
    dec_A, term_A = env.get_steps(behavior_A)
    dec_B, term_B = env.get_steps(behavior_B)

    # Setup objects of PPOAgent for agent_A and agent_B
    agent_A = PPOAgent("A")
    agent_B = PPOAgent("B")

    episode = 0
    actor_losses_A, critic_losses_A, scores_A, score_A = [], [], [], 0
    actor_losses_B, critic_losses_B, scores_B, score_B = [], [], [], 0
    for step in range(run_step + test_step):
        # Switch from the training phase to the test phase
        if step == run_step:
            if train_mode:
                agent_A.save_model()
                agent_B.save_model()
            print("TEST START")
            train_mode = False
            engine_configuration_channel.set_configuration_parameters(time_scale=1.0)

        # Both agents act on their latest decision-step observation
        state_A = dec_A.obs[0]
        state_B = dec_B.obs[0]
        action_A = agent_A.get_action(state_A, train_mode)
        action_B = agent_B.get_action(state_B, train_mode)
        action_tuple_A, action_tuple_B = map(lambda x: ActionTuple(discrete=x), [action_A, action_B])
        env.set_actions(behavior_A, action_tuple_A)
        env.set_actions(behavior_B, action_tuple_B)
        env.step()

        # A non-empty terminal step marks the end of an episode for that behavior;
        # in that case reward and next observation are taken from the terminal step.
        dec_A, term_A = env.get_steps(behavior_A)
        dec_B, term_B = env.get_steps(behavior_B)
        done_A = len(term_A.agent_id) > 0
        done_B = len(term_B.agent_id) > 0
        next_state_A = term_A.obs[0] if done_A else dec_A.obs[0]
        next_state_B = term_B.obs[0] if done_B else dec_B.obs[0]
        reward_A = term_A.reward if done_A else dec_A.reward
        reward_B = term_B.reward if done_B else dec_B.reward
        score_A += reward_A[0]
        score_B += reward_B[0]

        if train_mode:
            agent_A.append_sample(state_A[0], action_A[0], reward_A, next_state_A[0], [done_A])
            agent_B.append_sample(state_B[0], action_B[0], reward_B, next_state_B[0], [done_B])

            # One update every n_step collected transitions
            if (step+1) % n_step == 0:
                # Training
                actor_loss_A, critic_loss_A = agent_A.train_model()
                actor_loss_B, critic_loss_B = agent_B.train_model()
                actor_losses_A.append(actor_loss_A)
                critic_losses_A.append(critic_loss_A)
                actor_losses_B.append(actor_loss_B)
                critic_losses_B.append(critic_loss_B)

        if done_A or done_B:
            episode +=1
            scores_A.append(score_A)
            scores_B.append(score_B)
            score_A = score_B = 0

            # logging tensorboard
            if episode % print_interval == 0:
                mean_score_A = np.mean(scores_A)
                mean_score_B = np.mean(scores_B)
                mean_actor_loss_A = np.mean(actor_losses_A) if len(actor_losses_A) > 0 else 0
                mean_critic_loss_A = np.mean(critic_losses_A) if len(critic_losses_A) > 0 else 0
                mean_actor_loss_B = np.mean(actor_losses_B) if len(actor_losses_B) > 0 else 0
                mean_critic_loss_B = np.mean(critic_losses_B) if len(critic_losses_B) > 0 else 0
                agent_A.write_summary(mean_score_A, mean_actor_loss_A, mean_critic_loss_A, step)
                agent_B.write_summary(mean_score_B, mean_actor_loss_B, mean_critic_loss_B, step)
                actor_losses_A, critic_losses_A, scores_A = [], [], []
                actor_losses_B, critic_losses_B, scores_B = [], [], []

                print(f"{episode} Episode / Step: {step} / "  +\
                        f"A Score: {mean_score_A:.2f} / B Score: {mean_score_B:.2f} / " +\
                        f"A Actor Loss: {mean_actor_loss_A:.4f} / A Critic Loss: {mean_critic_loss_A:.4f} / " +\
                        f"B Actor Loss: {mean_actor_loss_B:.4f} / B Critic Loss: {mean_critic_loss_B:.4f}")

            # save network model
            if train_mode and episode % save_interval == 0:
                agent_A.save_model()
                agent_B.save_model()
finally:
    env.close()



[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-allocator-temp-initial-block-size-main=262144"
    "memorysetup-allocator-temp-initial-block-size-worker=262144"
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler

## Test the pretrained model

In [19]:
# Close the environment handle left over from the previous cell before a new
# UnityEnvironment is created below.
# NOTE(review): presumably needed when the training cell was interrupted before
# reaching its own env.close() - confirm.
env.close()

In [20]:
# Evaluation only: restore the checkpoints written by this run and do not train.
load_model = True
train_mode = False

load_path = save_path

engine_configuration_channel = EngineConfigurationChannel()
env = UnityEnvironment(file_name=env_name,
                       side_channels=[engine_configuration_channel],
                       base_port=unity_base_port)

# Everything after the launch runs inside try/finally so that env.close() is
# reached even when the loop raises or the cell is interrupted.
try:
    env.reset()

    # Setup Unity Behavior
    behavior_name_list = list(env.behavior_specs.keys())
    behavior_A = behavior_name_list[0]
    behavior_B = behavior_name_list[1]
    engine_configuration_channel.set_configuration_parameters(time_scale=1.0)
    dec_A, term_A = env.get_steps(behavior_A)
    dec_B, term_B = env.get_steps(behavior_B)

    # Setup objects of PPOAgent for agent_A and agent_B
    # (with load_model set, each constructor loads its checkpoint from load_path)
    agent_A = PPOAgent("A")
    agent_B = PPOAgent("B")


    episode = 0
    actor_losses_A, critic_losses_A, scores_A, score_A = [], [], [], 0
    actor_losses_B, critic_losses_B, scores_B, score_B = [], [], [], 0
    for step in range(test_step):
        state_A = dec_A.obs[0]
        state_B = dec_B.obs[0]
        action_A = agent_A.get_action(state_A, train_mode)
        action_B = agent_B.get_action(state_B, train_mode)
        action_tuple_A, action_tuple_B = map(lambda x: ActionTuple(discrete=x), [action_A, action_B])
        env.set_actions(behavior_A, action_tuple_A)
        env.set_actions(behavior_B, action_tuple_B)
        env.step()

        # A non-empty terminal step marks the end of an episode for that behavior
        dec_A, term_A = env.get_steps(behavior_A)
        dec_B, term_B = env.get_steps(behavior_B)
        done_A = len(term_A.agent_id) > 0
        done_B = len(term_B.agent_id) > 0
        next_state_A = term_A.obs[0] if done_A else dec_A.obs[0]
        next_state_B = term_B.obs[0] if done_B else dec_B.obs[0]
        reward_A = term_A.reward if done_A else dec_A.reward
        reward_B = term_B.reward if done_B else dec_B.reward
        score_A += reward_A[0]
        score_B += reward_B[0]

        if done_A or done_B:
            episode +=1
            scores_A.append(score_A)
            scores_B.append(score_B)
            score_A = score_B = 0

            # logging tensorboard
            # (the loss lists are never filled in this cell, so the logged losses are 0)
            if episode % print_interval == 0:
                mean_score_A = np.mean(scores_A)
                mean_score_B = np.mean(scores_B)
                mean_actor_loss_A = np.mean(actor_losses_A) if len(actor_losses_A) > 0 else 0
                mean_critic_loss_A = np.mean(critic_losses_A) if len(critic_losses_A) > 0 else 0
                mean_actor_loss_B = np.mean(actor_losses_B) if len(actor_losses_B) > 0 else 0
                mean_critic_loss_B = np.mean(critic_losses_B) if len(critic_losses_B) > 0 else 0
                agent_A.write_summary(mean_score_A, mean_actor_loss_A, mean_critic_loss_A, step)
                agent_B.write_summary(mean_score_B, mean_actor_loss_B, mean_critic_loss_B, step)
                actor_losses_A, critic_losses_A, scores_A = [], [], []
                actor_losses_B, critic_losses_B, scores_B = [], [], []

                print(f"{episode} Episode / Step: {step} / "  +\
                        f"A Score: {mean_score_A:.2f} / B Score: {mean_score_B:.2f} / " +\
                        f"A Actor Loss: {mean_actor_loss_A:.4f} / A Critic Loss: {mean_critic_loss_A:.4f} / " +\
                        f"B Actor Loss: {mean_actor_loss_B:.4f} / B Critic Loss: {mean_critic_loss_B:.4f}")
finally:
    env.close()



[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-allocator-temp-initial-block-size-main=262144"
    "memorysetup-allocator-temp-initial-block-size-worker=262144"
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler