# Dron_DDPG

In [1]:
import numpy as np
import random
import copy
import datetime
import platform
import torch
import os
import torch.nn.functional as F
from torchvision.utils import save_image
from torch.utils.tensorboard import SummaryWriter
from collections import deque
from pathlib import Path
from mlagents_envs.environment import UnityEnvironment, ActionTuple
from mlagents_envs.side_channel.engine_configuration_channel\
                             import EngineConfigurationChannel


## Setting environments

In [2]:
# Global Setting
cur_dir = os.getcwd()
env_dir = os.path.abspath(os.path.join(cur_dir, "..", "Unity6000_Envs"))
test_dir = os.path.abspath(os.path.join(cur_dir, "temp", "pytorch_output"))


### Pytorch Device

In [3]:
# Pytorch Device
if torch.backends.mps.is_available():
    g_device = torch.device("mps")
elif torch.cuda.is_available():
    g_device = torch.device("cuda")
else:
    g_device = torch.device("cpu")

print(g_device)


mps


### Unity Enviroment

In [4]:
# Unity Enviroment
game = "Drone"
os_name = platform.system()

if os_name == 'Linux':
    env_name = os.path.join(env_dir, f"{game}_{os_name}.x86_64")
elif os_name == 'Darwin':
    env_name = os.path.join(env_dir, f"{game}_{os_name}.app")

### Seting parameters for DDPG Network

In [5]:
# Seting parameters for DDPG Network
state_size = 9
action_size = 3

load_model = False
train_mode = True

batch_size = 128
mem_maxlen = 30000
discount_factor = 0.9
actor_lr = 1e-4
critic_lr = 5e-4
tau = 1e-3

# OU noise Parameters
mu = 0
theta = 1e-3
sigma = 2e-3

run_step = 50000 if train_mode else 0
test_step = 10000
train_start_step = 5000

print_interval = 10
save_interval = 100

unity_base_port = 1900

In [6]:
# NN model : Save and Load
date_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
save_path = os.path.join(test_dir, f"saved_models/{game}/DDPG/{date_time}")
Path(save_path).mkdir(parents=True, exist_ok=True)
save_model_path = os.path.join(save_path, 'Drone_DDPG.ckpt')
# print(f"save_path :{save_path}")
# print(f"save_model_path :{save_model_path}")
load_path = os.path.join(test_dir, f"saved_models/{game}/DQN/20210514201212") # Need to update

## OU_noise sampling

In [7]:
# OU_noise sampling
class OU_noise:
    def __init__(self):
        self.reset()

    def reset(self):
        self.X = np.ones((1, action_size), dtype=np.float32) * mu

    def sample(self):
        dx = theta * (mu - self.X) + sigma * np.random.randn(len(self.X))
        self.X += dx
        return self.X

## Actor Network : DDPG-Actor


In [8]:
# Actor Network : DDPG-Actor

class Actor(torch.nn.Module):
    """
    Input : State - 9
    Output : Action - 3
    """
    def __init__(self):
        super(Actor, self).__init__()
        self.fc1 = torch.nn.Linear(state_size, 128)
        self.fc2 = torch.nn.Linear(128, 128)
        self.mu = torch.nn.Linear(128, action_size)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        return torch.tanh(self.mu(x))

## Critic Network : DDPG -Critic

In [9]:
# Critic Network : DDPG -Critic
class Critic(torch.nn.Module):
    """
    Input : State + Action
    Output : Q-Value
    """
    def __init__(self):
        super(Critic, self).__init__()

        self.fc1 = torch.nn.Linear(state_size, 128)
        self.fc2 = torch.nn.Linear(128+action_size, 128)
        self.q = torch.nn.Linear(128, 1)

    def forward(self, state, action):
        x = torch.relu(self.fc1(state))
        x = torch.cat((x, action), dim=-1)
        x = torch.relu(self.fc2(x))
        return self.q(x)

## DDPG Agent class

In [10]:
# DDPGAgent
class DDPGAgent():
    def __init__(self):
        self.actor = Actor().to(g_device)
        self.target_actor = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic().to(g_device)
        self.target_critic = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)

        self.OU = OU_noise()
        self.memory = deque(maxlen=mem_maxlen)
        self.writer = SummaryWriter(save_path)

        if load_model == True:
            print(f"... Load Model from {load_path}/ckpt ...")
            checkpoint = torch.load(load_path, map_location=g_device)
            self.actor.load_state_dict(checkpoint["actor"])
            self.target_actor.load_state_dict(checkpoint["actor"])
            self.actor_optimizer.load_state_dict(checkpoint["actor_optimizer"])
            self.critic.load_state_dict(checkpoint["critic"])
            self.target_critic.load_state_dict(checkpoint["critic"])
            self.critic_optimizer.load_state_dict(checkpoint["critic_optimizer"])

    # Get Action with OU noise
    def get_action(self, state, training=True):
        self.actor.train(training)

        action = self.actor(torch.FloatTensor(state).to(g_device)).cpu().detach().numpy()
        return action + self.OU.sample() if training else action

    # Add sample data into memory
    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    # Training model
    def train_model(self):
        batch = random.sample(self.memory, batch_size)
        state      = np.stack([b[0] for b in batch], axis=0)
        action     = np.stack([b[1] for b in batch], axis=0)
        reward     = np.stack([b[2] for b in batch], axis=0)
        next_state = np.stack([b[3] for b in batch], axis=0)
        done       = np.stack([b[4] for b in batch], axis=0)

        state, action, reward, next_state, done = map(lambda x: torch.FloatTensor(x).to(g_device),
                                                        [state, action, reward, next_state, done])

        # Update Critic
        next_actions = self.target_actor(next_state)
        next_q = self.target_critic(next_state, next_actions)
        target_q = reward + (1 - done) * discount_factor * next_q
        q = self.critic(state, action)
        critic_loss = F.mse_loss(target_q, q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update Actor
        action_pred = self.actor(state)
        actor_loss = -self.critic(state, action_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item(), critic_loss.item()

    # Soft Update for target network
    def soft_update_target(self):
        for target_param, local_param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
        for target_param, local_param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    # Save Model
    def save_model(self):
        print(f"... Save Model to {save_model_path} ...")
        torch.save({
            "actor" : self.actor.state_dict(),
            "actor_optimizer" : self.actor_optimizer.state_dict(),
            "critic" : self.critic.state_dict(),
            "critic_optimizer" : self.critic_optimizer.state_dict(),
        }, save_model_path)

    # 학습 기록
    def write_summray(self, score, actor_loss, critic_loss, step):
        self.writer.add_scalar("ddpg_run/score", score, step)
        self.writer.add_scalar("ddpg_model/actor_loss", actor_loss, step)
        self.writer.add_scalar("ddpg_model/critic_loss", critic_loss, step)

## Train DQN Model

In [11]:
engine_configuration_channel = EngineConfigurationChannel()
env = UnityEnvironment(file_name=env_name,
                        side_channels=[engine_configuration_channel], base_port=unity_base_port)
env.reset()

# Setup Unitu MLAgent
behavior_name = list(env.behavior_specs.keys())[0]
spec = env.behavior_specs[behavior_name]
engine_configuration_channel.set_configuration_parameters(time_scale=12.0)
# engine_configuration_channel.set_configuration_parameters(time_scale=1.0)
dec, term = env.get_steps(behavior_name)

#
agent = DDPGAgent()

actor_losses, critic_losses, scores, episode, score = [], [], [], 0, 0
for step in range(run_step + test_step):
    if step == run_step:
        if train_mode:
            agent.save_model()
        print("TEST START")
        train_mode = False
        engine_configuration_channel.set_configuration_parameters(time_scale=1.0)

    state = dec.obs[0]
    action = agent.get_action(state, train_mode)
    action_tuple = ActionTuple()
    action_tuple.add_continuous(action)
    env.set_actions(behavior_name, action_tuple)
    env.step()

    dec, term = env.get_steps(behavior_name)
    done = len(term.agent_id) > 0
    reward = term.reward if done else dec.reward
    next_state = term.obs[0] if done else dec.obs[0]
    score += reward[0]

    if train_mode:
        agent.append_sample(state[0], action[0], reward, next_state[0], [done])

    if train_mode and step > max(batch_size, train_start_step):
        # Traing Agent's networks
        actor_loss, critic_loss = agent.train_model()
        actor_losses.append(actor_loss)
        critic_losses.append(critic_loss)

        # Update the target network
        agent.soft_update_target()

    if done:
        episode += 1
        scores.append(score)
        score = 0

        # logging tensorboard
        if episode % print_interval == 0:
            mean_score = np.mean(scores)
            mean_actor_loss = np.mean(actor_losses)
            mean_critic_loss = np.mean(critic_losses)
            agent.write_summray(mean_score, mean_actor_loss, mean_critic_loss, step)
            actor_losses, critic_losses, scores = [], [], []

            print(f"{episode} Episode / Step: {step} / Score: {mean_score:.2f} / " +\
                    f"Actor loss: {mean_actor_loss:.2f} / Critic loss: {mean_critic_loss:.4f}")

        # Saveing model
        if train_mode and episode % save_interval == 0:
            agent.save_model()

env.close()

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-allocator-temp-initial-block-size-main=262144"
    "memorysetup-allocator-temp-initial-block-size-worker=262144"
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


720 Episode / Step: 50493 / Score: 4.79 / Actor loss: nan / Critic loss: nan
730 Episode / Step: 50783 / Score: 4.30 / Actor loss: nan / Critic loss: nan
740 Episode / Step: 51178 / Score: 4.94 / Actor loss: nan / Critic loss: nan
750 Episode / Step: 51495 / Score: 4.31 / Actor loss: nan / Critic loss: nan
760 Episode / Step: 51854 / Score: 4.99 / Actor loss: nan / Critic loss: nan
770 Episode / Step: 52156 / Score: 4.31 / Actor loss: nan / Critic loss: nan
780 Episode / Step: 52521 / Score: 4.73 / Actor loss: nan / Critic loss: nan
790 Episode / Step: 52897 / Score: 4.68 / Actor loss: nan / Critic loss: nan
800 Episode / Step: 53346 / Score: 5.67 / Actor loss: nan / Critic loss: nan
810 Episode / Step: 53722 / Score: 5.01 / Actor loss: nan / Critic loss: nan
820 Episode / Step: 54062 / Score: 4.57 / Actor loss: nan / Critic loss: nan
830 Episode / Step: 54465 / Score: 5.25 / Actor loss: nan / Critic loss: nan
840 Episode / Step: 54869 / Score: 5.74 / Actor loss: nan / Critic loss: nan

## Test the pretrained DDPG Model

In [13]:
load_model = True
train_mode = False

load_path = save_model_path

engine_configuration_channel = EngineConfigurationChannel()
env = UnityEnvironment(file_name=env_name,
                        side_channels=[engine_configuration_channel], base_port=unity_base_port)
env.reset()

# Setup Unitu MLAgent
behavior_name = list(env.behavior_specs.keys())[0]
# spec = env.behavior_specs[behavior_name]
engine_configuration_channel.set_configuration_parameters(time_scale=1.0)
dec, term = env.get_steps(behavior_name)

# DQNAgent
agent = DDPGAgent()
score = 0
score_lst = {}
episode = 0
pre_step = 0
for step in range(test_step):
    state = dec.obs[0]
    action = agent.get_action(state, train_mode)
    action_tuple = ActionTuple()
    action_tuple.add_continuous(action)
    env.set_actions(behavior_name, action_tuple)
    env.step()
    dec, term = env.get_steps(behavior_name)
    done = len(term.agent_id) > 0
    reward = term.reward if done else dec.reward
    score += reward[0]
    if done:
        episode +=1
        score_lst[episode] = score
        print(f"{episode} - episode ({step - pre_step} steps) is done with the score of {score}")
        pre_step = step
        score = 0

env.close()

print(score_lst)

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-allocator-temp-initial-block-size-main=262144"
    "memorysetup-allocator-temp-initial-block-size-worker=262144"
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler