# GridWorld_DQN

In [1]:
import numpy as np
import random
import copy
import datetime
import platform
import torch
import os
import torch.nn.functional as F
from torchvision.utils import save_image
from torch.utils.tensorboard import SummaryWriter
from collections import deque
from pathlib import Path
from mlagents_envs.environment import UnityEnvironment, ActionTuple
from mlagents_envs.side_channel.engine_configuration_channel\
                             import EngineConfigurationChannel


## Setting environments

In [2]:
# Global settings: directories the rest of the notebook reads from / writes to.
cur_dir = os.getcwd()
# os.path.abspath() resolves relative paths against the current working
# directory, so these are anchored at the same place as cur_dir.
env_dir = os.path.abspath(os.path.join(os.pardir, "Unity6000_Envs"))
test_dir = os.path.abspath(os.path.join("temp", "pytorch_output"))


### Pytorch Device

In [3]:
# PyTorch device: prefer Apple MPS, then CUDA, and fall back to the CPU.
g_device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(g_device)


mps


### Unity Environment

In [4]:
# Unity Environment
# Build the path of the platform-specific Unity executable inside env_dir.
game = "GridWorld"
os_name = platform.system()

# File extension of the Unity build for each operating system handled here.
_build_extension = {"Linux": "x86_64", "Darwin": "app"}

if os_name not in _build_extension:
    # Without this check env_name would simply stay undefined and the failure
    # would only show up later as a NameError when the environment is launched.
    raise RuntimeError(
        f"No {game} Unity build is configured for platform '{os_name}' "
        f"(supported: {', '.join(sorted(_build_extension))})"
    )

env_name = os.path.join(env_dir, f"{game}_{os_name}.{_build_extension[os_name]}")

### Setting parameters for DQN Network

In [5]:
# Setting parameters for the DQN network

# Network input / output sizes
state_size = [3 * 2, 64, 84]  # Channel, Height, Width
action_size = 4

# Run mode switches
load_model = False
train_mode = True

# Replay memory and optimisation
batch_size = 32
mem_maxlen = 10000
discount_factor = 0.9
learning_rate = 0.00025

# Epsilon-greedy bounds
epsilon_eval = 0.05
epsilon_min = 0.1

# Values that depend on whether this run trains or only evaluates
if train_mode:
    run_step = 50000
    epsilon_init = 1.0
else:
    run_step = 0
    epsilon_init = epsilon_eval

# Step schedule
test_step = 5000
train_start_step = 5000
target_update_step = 500

# Logging / checkpoint cadence, counted in episodes
print_interval = 10
save_interval = 100

# Amount subtracted from epsilon per update, sized so that epsilon goes from
# epsilon_init to epsilon_min over explore_step updates.
explore_step = run_step * 0.8
if train_mode:
    eplsilon_delta = (epsilon_init - epsilon_min) / explore_step
else:
    eplsilon_delta = 0.

# Indices into the observation list returned by the environment
VISUAL_OBS = 0
GOAL_OBS = 1
VECTOR_OBS = 2
OBS = VISUAL_OBS

unity_base_port = 1901

In [6]:
# NN model: save and load locations
# Every run writes into its own timestamped directory below test_dir.
date_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
save_path = os.path.join(test_dir, f"saved_models/{game}/DQN/{date_time}")
Path(save_path).mkdir(parents=True, exist_ok=True)
# NOTE: the 'GridWrod' spelling is kept on purpose so that checkpoints written
# by earlier runs can still be found under the same file name.
save_model_path = os.path.join(save_path, 'GridWrod_DQN.ckpt')

# load_path has to name the checkpoint *file*, not the run directory: the agent
# hands it straight to torch.load(). Replace the timestamp with the run to load.
load_path = os.path.join(test_dir, f"saved_models/{game}/DQN/20210514201212",
                         'GridWrod_DQN.ckpt')  # Need to update

## DQN Class

In [7]:
# DQN Class
class DQN(torch.nn.Module):
    """Convolutional Q-network: three conv layers, one hidden linear layer and
    a linear head producing one Q-value per action.

    The input layout is taken from the module-level ``state_size``
    (channel, height, width) and the output width from ``action_size``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # (out_channels, kernel_size, stride) of conv1, conv2 and conv3.
        conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

        channels, height, width = state_size
        for index, (out_channels, kernel, stride) in enumerate(conv_specs, start=1):
            layer = torch.nn.Conv2d(channels, out_channels,
                                    kernel_size=kernel, stride=stride)
            setattr(self, f"conv{index}", layer)
            # Track the spatial size after this unpadded convolution so the
            # first linear layer can be sized without a dummy forward pass.
            height = (height - kernel) // stride + 1
            width = (width - kernel) // stride + 1
            channels = out_channels

        self.flat = torch.nn.Flatten()
        self.fc1 = torch.nn.Linear(channels * height * width, 512)
        self.q = torch.nn.Linear(512, action_size)

    def forward(self, x):
        """Return the Q-values for a batch ``x`` laid out as (N, C, H, W)."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x))
        hidden = F.relu(self.fc1(self.flat(x)))
        return self.q(hidden)

## DQN Agent Class

In [8]:
# DQNAgent Class
class DQNAgent:
    """Epsilon-greedy DQN agent with a replay memory and a target network.

    All hyper-parameters (learning rate, memory size, epsilon schedule, device,
    save/load paths, ...) are read from the module-level settings.
    """

    def __init__(self):
        self.network = DQN().to(g_device)
        self.target_network = copy.deepcopy(self.network)
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr=learning_rate)
        self.memory = deque(maxlen=mem_maxlen)
        self.epsilon = epsilon_init
        self.writer = SummaryWriter(save_path)

        if load_model:
            print(f"... Load Model from {load_path} ...")
            # The checkpoint is the dict written by save_model().
            checkpoint = torch.load(load_path, map_location=g_device)
            self.network.load_state_dict(checkpoint["network"])
            self.target_network.load_state_dict(checkpoint["network"])
            self.optimizer.load_state_dict(checkpoint["optimizer"])

    # Action based on Epsilon greedy
    def get_action(self, state, training=True):
        """Choose one action per row of ``state``.

        Args:
            state: batch of observations; its first axis is the batch axis.
            training: when True explore with ``self.epsilon``, otherwise use
                the fixed ``epsilon_eval``.

        Returns:
            Integer numpy array of shape ``(state.shape[0], 1)``.
        """
        self.network.train(training)
        epsilon = self.epsilon if training else epsilon_eval

        # Random action
        if epsilon > random.random():
            return np.random.randint(0, action_size, size=(state.shape[0], 1))

        # Action based on network output. No gradient is needed to pick an
        # action, so skip building the autograd graph for this forward pass.
        with torch.no_grad():
            q = self.network(torch.as_tensor(state, dtype=torch.float32, device=g_device))
        return torch.argmax(q, dim=-1, keepdim=True).cpu().numpy()

    # Replay Memory (State, action, Reward, next_state, done)
    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition; the oldest one is dropped once the memory is full."""
        self.memory.append((state, action, reward, next_state, done))

    # Train model
    def train_model(self):
        """Run one optimisation step on a random minibatch from the memory.

        Also lowers ``self.epsilon`` by one step of the schedule.

        Returns:
            The minibatch loss as a Python float.

        Raises:
            ValueError: if the memory holds fewer than ``batch_size`` samples
                (raised by ``random.sample``).
        """
        batch = random.sample(self.memory, batch_size)
        # Transpose the list of transitions into one stacked array per field.
        state, action, reward, next_state, done = (
            np.stack(field, axis=0) for field in zip(*batch)
        )

        state = torch.as_tensor(state, dtype=torch.float32, device=g_device)
        next_state = torch.as_tensor(next_state, dtype=torch.float32, device=g_device)
        reward = torch.as_tensor(reward, dtype=torch.float32, device=g_device)
        done = torch.as_tensor(done, dtype=torch.float32, device=g_device)
        action = torch.as_tensor(action, dtype=torch.long, device=g_device).reshape(-1, 1)

        # Q-value of the action that was actually taken in each transition.
        q = self.network(state).gather(1, action)

        with torch.no_grad():
            next_q = self.target_network(next_state)
            # (1 - done) removes the bootstrap term for terminal transitions.
            target_q = reward + next_q.max(1, keepdim=True).values * ((1 - done) * discount_factor)

        loss = F.smooth_l1_loss(q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update Epsilon
        self.epsilon = max(epsilon_min, self.epsilon - eplsilon_delta)

        return loss.item()

    # Update target Network
    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.network.state_dict())

    # Save Network Model
    def save_model(self):
        """Write the network and optimizer state dicts to ``save_model_path``."""
        print(f"... Save Model to {save_model_path}")
        torch.save({
            "network" : self.network.state_dict(),
            "optimizer" : self.optimizer.state_dict(),
        }, save_model_path)

    # tensorboard
    def write_summray(self, score, loss, epsilon, step):
        """Log score, loss and epsilon as TensorBoard scalars at ``step``."""
        self.writer.add_scalar("run/score", score, step)
        self.writer.add_scalar("model/loss", loss, step)
        self.writer.add_scalar("model/epsilon", epsilon, step)

    # Correctly spelled alias; the original name is kept for existing callers.
    write_summary = write_summray

## Train DQN Model

In [9]:
# Launch the Unity build and train the DQN agent for run_step steps, followed
# by test_step evaluation steps in the same environment.
engine_configuration_channel = EngineConfigurationChannel()
env = UnityEnvironment(file_name=env_name,
                        side_channels=[engine_configuration_channel],
                        base_port=unity_base_port)


def preprocess(obs, goal):
    """Weight the observation by each of the two goal flags and concatenate
    both copies along axis 1 (presumably the channel axis; this matches the
    doubled channel count in state_size)."""
    return np.concatenate((obs*goal[0][0], obs*goal[0][1]), axis=1)


# try/finally: always shut the Unity process down, otherwise it keeps running
# (and keeps unity_base_port busy) when anything below raises.
try:
    env.reset()

    # Setup Unity ML-Agents
    behavior_name = list(env.behavior_specs.keys())[0]
    spec = env.behavior_specs[behavior_name]
    engine_configuration_channel.set_configuration_parameters(time_scale=12.0)
    dec, term = env.get_steps(behavior_name)

    #### Debug: save the first observation, raw and weighted by each goal flag
    img1 = (dec.obs[OBS])[0]
    img2 = (dec.obs[OBS]*dec.obs[GOAL_OBS][0][0])[0]
    img3 = (dec.obs[OBS]*dec.obs[GOAL_OBS][0][1])[0]
    save_image(torch.FloatTensor(img1), os.path.join(save_path, "image1.png"))
    save_image(torch.FloatTensor(img2), os.path.join(save_path, "image2.png"))
    save_image(torch.FloatTensor(img3), os.path.join(save_path, "image3.png"))
    ####

    # DQNAgent
    agent = DQNAgent()

    losses, scores, episode, score = [], [], 0, 0
    for step in range(run_step + test_step):
        # Switch from training to evaluation once the training budget is used.
        if step == run_step:
            if train_mode:
                agent.save_model()
            print("TEST START")
            train_mode = False
            engine_configuration_channel.set_configuration_parameters(time_scale=1.0)

        state = preprocess(dec.obs[OBS], dec.obs[GOAL_OBS])

        action = agent.get_action(state, train_mode)
        # The network's action indices start at 0; the environment is sent index + 1.
        real_action = action + 1
        action_tuple = ActionTuple()
        action_tuple.add_discrete(real_action)
        env.set_actions(behavior_name, action_tuple)
        env.step()

        dec, term = env.get_steps(behavior_name)
        done = len(term.agent_id) > 0
        # On termination the final reward/observation come from the terminal steps.
        reward = term.reward if done else dec.reward
        next_state = preprocess(term.obs[OBS], term.obs[GOAL_OBS]) if done\
                        else preprocess(dec.obs[OBS], dec.obs[GOAL_OBS])
        score += reward[0]

        if train_mode:
            agent.append_sample(state[0], action[0], reward, next_state[0], [done])

        if train_mode and step > max(batch_size, train_start_step):
            # Training
            loss = agent.train_model()
            losses.append(loss)

            # Update Target Network
            if step % target_update_step == 0:
                agent.update_target()

        if done:
            episode += 1
            scores.append(score)
            score = 0

            # Logging record
            if episode % print_interval == 0:
                mean_score = np.mean(scores)
                # No loss exists before training starts (or during the test
                # phase); np.mean([]) would emit a RuntimeWarning, so report
                # NaN explicitly instead.
                mean_loss = np.mean(losses) if losses else float("nan")
                agent.write_summray(mean_score, mean_loss, agent.epsilon, step)
                losses, scores = [], []

                print(f"{episode} Episode / Step: {step} / Score: {mean_score:.2f} / " +\
                        f"Loss: {mean_loss:.4f} / Epsilon: {agent.epsilon:.4f}")

            # Save Model
            if train_mode and episode % save_interval == 0:
                agent.save_model()
finally:
    env.close()

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


10 Episode / Step: 201 / Score: 0.21 / Loss: nan / Epsilon: 1.0000
20 Episode / Step: 392 / Score: -0.18 / Loss: nan / Epsilon: 1.0000
30 Episode / Step: 709 / Score: -0.31 / Loss: nan / Epsilon: 1.0000
40 Episode / Step: 916 / Score: -0.20 / Loss: nan / Epsilon: 1.0000
50 Episode / Step: 1045 / Score: -0.72 / Loss: nan / Epsilon: 1.0000
60 Episode / Step: 1290 / Score: -0.23 / Loss: nan / Epsilon: 1.0000
70 Episode / Step: 1504 / Score: 0.40 / Loss: nan / Epsilon: 1.0000
80 Episode / Step: 1625 / Score: -0.31 / Loss: nan / Epsilon: 1.0000
90 Episode / Step: 1901 / Score: -0.87 / Loss: nan / Epsilon: 1.0000
100 Episode / Step: 2255 / Score: -0.95 / Loss: nan / Epsilon: 1.0000
... Save Model to /Users/hyunjae.k/110_HyunJae_Git/2025_Playgrounds/Unity_Robotics_Playgrounds/Agent_Scripts/temp/pytorch_output/saved_models/GridWorld/DQN/20250917140151/GridWrod_DQN.ckpt
110 Episode / Step: 2446 / Score: -0.18 / Loss: nan / Epsilon: 1.0000
120 Episode / Step: 2639 / Score: -0.78 / Loss: nan / Ep

## Test the pretrained DQN Model

In [10]:
# Evaluate the checkpoint written by the training run above for test_step steps.
load_model = True
train_mode = False

load_path = save_model_path

engine_configuration_channel = EngineConfigurationChannel()
env = UnityEnvironment(file_name=env_name,
                        side_channels=[engine_configuration_channel],
                        base_port=unity_base_port)


def preprocess(obs, goal):
    """Weight the observation by each of the two goal flags and concatenate
    both copies along axis 1."""
    return np.concatenate((obs*goal[0][0], obs*goal[0][1]), axis=1)


score = 0
score_lst = {}   # episode number -> total score of that episode
episode = 0
pre_step = 0     # step index at which the previous episode ended

# try/finally: always shut the Unity process down, even if loading the model
# or stepping the environment raises.
try:
    env.reset()

    # Setup Unity ML-Agents
    behavior_name = list(env.behavior_specs.keys())[0]
    engine_configuration_channel.set_configuration_parameters(time_scale=1.0)
    dec, term = env.get_steps(behavior_name)

    # DQNAgent (loads the checkpoint because load_model is True)
    agent = DQNAgent()

    for step in range(test_step):
        state = preprocess(dec.obs[OBS], dec.obs[GOAL_OBS])
        action = agent.get_action(state, train_mode)
        # The network's action indices start at 0; the environment is sent index + 1.
        real_action = action + 1
        action_tuple = ActionTuple()
        action_tuple.add_discrete(real_action)
        env.set_actions(behavior_name, action_tuple)
        env.step()

        dec, term = env.get_steps(behavior_name)
        done = len(term.agent_id) > 0
        reward = term.reward if done else dec.reward
        score += reward[0]
        if done:
            episode += 1
            score_lst[episode] = score
            print(f"{episode} - episode ({step - pre_step} steps) is done with the score of {score}")
            pre_step = step
            score = 0
finally:
    env.close()

print(score_lst)

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz