# GridWorld_A2C

In [1]:
import numpy as np
import random
import copy
import datetime
import platform
import torch
import os
import torch.nn.functional as F
from torchvision.utils import save_image
from torch.utils.tensorboard import SummaryWriter
from collections import deque
from pathlib import Path
from mlagents_envs.environment import UnityEnvironment, ActionTuple
from mlagents_envs.side_channel.engine_configuration_channel\
                             import EngineConfigurationChannel


## Setting environments

In [2]:
# Global Setting
cur_dir = os.getcwd()
env_dir = os.path.abspath(os.path.join(cur_dir, "..", "Unity6000_Envs"))
output_dir = os.path.abspath(os.path.join(cur_dir, "temp", "pytorch_output"))


### Pytorch Device

In [3]:
# Pytorch Device
if torch.backends.mps.is_available():
    g_device = torch.device("mps")
elif torch.cuda.is_available():
    g_device = torch.device("cuda")
else:
    g_device = torch.device("cpu")

print(g_device)


mps


### Unity Enviroment

In [4]:
# Unity Enviroment
game = "GridWorld"
os_name = platform.system()

if os_name == 'Linux':
    env_name = os.path.join(env_dir, f"{game}_{os_name}.x86_64")
elif os_name == 'Darwin':
    env_name = os.path.join(env_dir, f"{game}_{os_name}.app")

### Seting parameters for DQN Network

In [5]:
# Seting parameters for A2C Network
state_size = 6*2
action_size = 4

load_model = False
train_mode = True

discount_factor = 0.9
learning_rate = 0.00025

run_step = 50000 if train_mode else 0
test_step = 500

print_interval = 10
save_interval = 100

VISUAL_OBS = 0
GOAL_OBS = 1
VECTOR_OBS = 2
OBS = VECTOR_OBS

unity_base_port = 1901

In [6]:
# NN model : Save and Load
date_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
save_path = os.path.join(output_dir, f"saved_models/{game}/A2C/{date_time}")
Path(save_path).mkdir(parents=True, exist_ok=True)
save_model_path = os.path.join(save_path, 'GridWrod_A2C.ckpt')
# print(f"save_path :{save_path}")
# print(f"save_model_path :{save_model_path}")
load_path = "" # Need to update

## A2C Class

In [7]:
# A2C Class
class A2C(torch.nn.Module):
    def __init__(self, **kwargs):
        super(A2C, self).__init__(**kwargs)
        self.d1 = torch.nn.Linear(state_size, 128)
        self.d2 = torch.nn.Linear(128, 128)
        self.pi = torch.nn.Linear(128, action_size)
        self.v = torch.nn.Linear(128, 1)

    def forward(self, x):
        x = F.relu(self.d1(x))
        x = F.relu(self.d2(x))
        return F.softmax(self.pi(x), dim=1), self.v(x)

## A2C Agent Class

In [8]:
# A2CAgent class
class A2CAgent:
    def __init__(self):
        self.a2c = A2C().to(g_device)
        self.optimizer = torch.optim.Adam(self.a2c.parameters(), lr=learning_rate)
        self.writer = SummaryWriter(save_path)

        if load_model == True:
            print(f"... Load Model from {load_path} ...")
            checkpoint = torch.load(load_path, map_location=g_device)
            self.a2c.load_state_dict(checkpoint["network"])
            self.optimizer.load_state_dict(checkpoint["optimizer"])

    # Action based on Epsilon greedy
    def get_action(self, state, training=True):
        #
        self.a2c.train(training)

        # Policy Gradient
        pi, _ = self.a2c(torch.FloatTensor(state).to(g_device))
        action = torch.multinomial(pi, num_samples=1).cpu().numpy()
        return action

    # Train model
    def train_model(self, state, action, reward, next_state, done):
        state, action, reward, next_state, done = map(lambda x: torch.FloatTensor(x).to(g_device),
                                                        [state, action, reward, next_state, done])
        pi, value = self.a2c(state)

        #Value Gradient
        with torch.no_grad():
            _, next_value = self.a2c(next_state)
            target_value  = reward + (1-done) * discount_factor * next_value
        critic_loss = F.mse_loss(target_value, value)

        #Policy Gradient
        eye = torch.eye(action_size).to(g_device)
        one_hot_action = eye[action.view(-1).long()]
        advantage = (target_value - value).detach()
        actor_loss = -(torch.log((one_hot_action * pi).sum(1))*advantage).mean()
        total_loss = critic_loss + actor_loss

        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()

        return actor_loss.item(), critic_loss.item()


    # Save Network Modelk
    def save_model(self):
        print(f"... Save Model to {save_model_path} ...")
        torch.save({
            "network" : self.a2c.state_dict(),
            "optimizer" : self.optimizer.state_dict(),
        }, save_model_path)

    # 학습 기록
    def write_summray(self, score, actor_loss, critic_loss, step):
        self.writer.add_scalar("a2c_run/score", score, step)
        self.writer.add_scalar("a2c_model/actor_loss", actor_loss, step)
        self.writer.add_scalar("a2c_model/critic_loss", critic_loss, step)
        self.writer.add_scalar("a2c_model/total_loss", actor_loss + critic_loss, step)

## Train DQN Model

In [9]:
engine_configuration_channel = EngineConfigurationChannel()
env = UnityEnvironment(file_name=env_name,
                        side_channels=[engine_configuration_channel],
                        base_port=unity_base_port)
env.reset()

# Setup Unitu MLAgent
behavior_name = list(env.behavior_specs.keys())[0]
spec = env.behavior_specs[behavior_name]
engine_configuration_channel.set_configuration_parameters(time_scale=12.0)
dec, term = env.get_steps(behavior_name)

agent = A2CAgent()
actor_losses, critic_losses, scores, episode, score = [], [], [], 0, 0
for step in range(run_step + test_step):
    if step == run_step:
        if train_mode:
            agent.save_model()
        print("TEST START")
        train_mode = False
        engine_configuration_channel.set_configuration_parameters(time_scale=1.0)

    preprocess = lambda obs, goal: np.concatenate((obs*goal[0][0], obs*goal[0][1]), axis=-1)
    state = preprocess(dec.obs[OBS],dec.obs[GOAL_OBS])
    action = agent.get_action(state, train_mode)
    real_action = action + 1
    action_tuple = ActionTuple()
    action_tuple.add_discrete(real_action)
    env.set_actions(behavior_name, action_tuple)
    env.step()

    #information from environment
    dec, term = env.get_steps(behavior_name)
    done = len(term.agent_id) > 0
    reward = term.reward if done else dec.reward
    next_state = preprocess(term.obs[OBS], term.obs[GOAL_OBS]) if done\
                    else preprocess(dec.obs[OBS], dec.obs[GOAL_OBS])
    score += reward[0]

    if train_mode:
        #training
        actor_loss, critic_loss = agent.train_model(state, action[0], [reward], next_state, [done])
        actor_losses.append(actor_loss)
        critic_losses.append(critic_loss)

    if done:
        episode +=1
        scores.append(score)
        score = 0

        # tensorboard
        if episode % print_interval == 0:
            mean_score = np.mean(scores)
            mean_actor_loss = np.mean(actor_losses) if len(actor_losses) > 0 else 0
            mean_critic_loss = np.mean(critic_losses)  if len(critic_losses) > 0 else 0
            agent.write_summray(mean_score, mean_actor_loss, mean_critic_loss, step)
            actor_losses, critic_losses, scores = [], [], []

            print(f"{episode} Episode / Step: {step} / Score: {mean_score:.2f} / " +\
                    f"Actor loss: {mean_actor_loss:.2f} / Critic loss: {mean_critic_loss:.4f}")

        # model saving
        if train_mode and episode % save_interval == 0:
            agent.save_model()
env.close()

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

  state, action, reward, next_state, done = map(lambda x: torch.FloatTensor(x).to(g_device),


10 Episode / Step: 293 / Score: -0.18 / Actor loss: 0.00 / Critic loss: 0.0315
20 Episode / Step: 598 / Score: -0.49 / Actor loss: 0.00 / Critic loss: 0.0364
30 Episode / Step: 1000 / Score: -0.39 / Actor loss: 0.00 / Critic loss: 0.0203
40 Episode / Step: 1355 / Score: 0.15 / Actor loss: 0.01 / Critic loss: 0.0256
50 Episode / Step: 1699 / Score: -0.34 / Actor loss: -0.01 / Critic loss: 0.0223
60 Episode / Step: 2020 / Score: -0.31 / Actor loss: -0.00 / Critic loss: 0.0349
70 Episode / Step: 2454 / Score: -0.12 / Actor loss: 0.01 / Critic loss: 0.0232
80 Episode / Step: 2648 / Score: -0.18 / Actor loss: -0.01 / Critic loss: 0.0471
90 Episode / Step: 2864 / Score: -0.31 / Actor loss: -0.01 / Critic loss: 0.0428
100 Episode / Step: 3058 / Score: 0.02 / Actor loss: 0.02 / Critic loss: 0.0551
... Save Model to /Users/hyunjae.k/110_HyunJae_Git/2025_Playgrounds/Unity_Robotics_Playgrounds/Agent_Scripts/temp/pytorch_output/saved_models/GridWorld/A2C/20250918071211/GridWrod_A2C.ckpt ...
110 Ep

## Test the pretrained DQN Model

In [10]:
load_model = True
train_mode = False

load_path = save_model_path

engine_configuration_channel = EngineConfigurationChannel()
env = UnityEnvironment(file_name=env_name,
                        side_channels=[engine_configuration_channel],
                        base_port=unity_base_port)
env.reset()

# Setup Unitu MLAgent
behavior_name = list(env.behavior_specs.keys())[0]
# spec = env.behavior_specs[behavior_name]
engine_configuration_channel.set_configuration_parameters(time_scale=1.0)
dec, term = env.get_steps(behavior_name)

# DQNAgent
agent = A2CAgent()
score = 0
score_lst = {}
episode = 0
pre_step = 0
for step in range(test_step):
    preprocess = lambda obs, goal: np.concatenate((obs*goal[0][0], obs*goal[0][1]), axis=-1)
    state = preprocess(dec.obs[OBS],dec.obs[GOAL_OBS])
    action = agent.get_action(state, train_mode)
    real_action = action + 1
    action_tuple = ActionTuple()
    action_tuple.add_discrete(real_action)
    env.set_actions(behavior_name, action_tuple)
    env.step()
    dec, term = env.get_steps(behavior_name)
    done = len(term.agent_id) > 0
    reward = term.reward if done else dec.reward
    score += reward[0]
    if done:
        episode +=1
        score_lst[episode] = score
        print(f"{episode} - episode ({step - pre_step} steps) is done with the score of {score}")
        pre_step = step
        score = 0

env.close()

print(score_lst)

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz