# Testing for DQN

In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random

# Put the parent of the current working directory at the front of sys.path so
# that the project-local imports below resolve (presumably the repository root
# holding the `Gyms` and `Algorithms` packages -- confirm against the layout).
# NOTE: this depends on the directory the kernel was started in; launching the
# notebook from a different working directory changes `root_dir`.
import sys
from pathlib import Path
current_dir = Path().resolve()
root_dir = current_dir.parent
# Guard against inserting the same entry again when the cell is re-run.
if str(root_dir) not in sys.path:
    sys.path.insert(0,str(root_dir))

from Gyms.SimulatedNetworkSync import SimulatedNetworkSync
from Gyms.RealNetworkSync import RealNetworkSync
from Algorithms.DQN import DQN, ReplayBuffer

In [2]:
# ---- Discretize MultiDiscrete Actions ----
def index_to_action(index, action_dim, action_n):
    """Decode a flat action index into a MultiDiscrete action vector.

    ``index`` is read as a base-``action_n`` number and its ``action_dim``
    lowest digits are returned, most significant digit first. Any higher-order
    digits are discarded. This mirrors the digit order used by
    ``action_to_index``.

    Args:
        index: Flat action index.
        action_dim: Number of components in the action vector.
        action_n: Number of choices per component (shared by all components).

    Returns:
        np.ndarray with ``action_dim`` entries, one digit per component.
    """
    digits = []
    remaining = index
    for _ in range(action_dim):
        # Peel off the least significant digit first ...
        remaining, digit = divmod(remaining, action_n)
        digits.append(digit)
    # ... then flip the order so the most significant digit leads.
    return np.array(digits[::-1])

def action_to_index(action, action_n):
    """Encode a MultiDiscrete action vector as a single flat index.

    The components of ``action`` are treated as the digits of a
    base-``action_n`` number, most significant digit first.

    Args:
        action: Iterable of per-component choices.
        action_n: Number of choices per component (shared by all components).

    Returns:
        The flat index; ``0`` for an empty ``action``.
    """
    flat = 0
    for digit in action:
        # Horner step: shift the accumulated value by one digit, then add.
        shifted = flat * action_n
        flat = shifted + digit
    return flat

def select_action(env, state, policy_net, epsilon, action_dim, action_n):
    """Choose an action with an epsilon-greedy rule.

    With probability ``epsilon`` an action is sampled from
    ``env.action_space``; otherwise the flat index with the largest value in
    ``policy_net``'s output for ``state`` is chosen.

    Args:
        env: Environment whose ``action_space.sample()`` supplies random actions.
        state: Current observation, convertible by ``torch.FloatTensor``.
        policy_net: Network called on a batch of one state.
        epsilon: Exploration probability.
        action_dim: Number of components in an action vector.
        action_n: Number of choices per component.

    Returns:
        Tuple ``(action, action_idx)``: the MultiDiscrete action and its flat
        index.
    """
    explore = random.random() < epsilon
    if explore:
        sampled = env.action_space.sample()
        return sampled, action_to_index(sampled, action_n)

    # Greedy branch: no gradients are needed for action selection.
    with torch.no_grad():
        state_batch = torch.FloatTensor(state).unsqueeze(0)
        best_idx = torch.argmax(policy_net(state_batch)).item()
    return index_to_action(best_idx, action_dim, action_n), best_idx

# ---- Train Loop ----
def train(env, episodes=200, steps_per_episode=100, batch_size=64, gamma=0.99,
          epsilon_start=1.0, epsilon_end=0.01, epsilon_decay=0.995,
          buffer_capacity=10000, lr=1e-3, target_update_freq=10):
    """Train a DQN on ``env`` using epsilon-greedy exploration and a target network.

    Each episode resets the environment and runs exactly ``steps_per_episode``
    steps. Every transition is stored in a replay buffer, and once the buffer
    holds at least ``batch_size`` transitions one gradient step is taken per
    environment step on an MSE loss against the one-step TD target
    ``reward + gamma * max_a' Q_target(next_state, a')``.

    Args:
        env: Environment with a ``MultiDiscrete``-style ``action_space``
            (exposing ``nvec`` and ``sample()``) and an ``observation_space``
            with a ``shape``. ``reset()`` must return ``(state, info)`` and
            ``step()`` a 5-tuple.
        episodes: Number of training episodes.
        steps_per_episode: Environment steps taken in every episode.
        batch_size: Number of transitions sampled per gradient step.
        gamma: Discount factor.
        epsilon_start: Initial exploration probability.
        epsilon_end: Lower bound for the exploration probability.
        epsilon_decay: Multiplicative decay applied to epsilon after each episode.
        buffer_capacity: Capacity passed to ``ReplayBuffer``.
        lr: Learning rate of the Adam optimizer.
        target_update_freq: The target network is synchronized with the policy
            network on every episode whose index is a multiple of this value
            (episode 0 included).

    Returns:
        The trained policy network.
    """
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.nvec.shape[0]
    # Only the first entry of ``nvec`` is consulted, i.e. every action
    # component is assumed to offer the same number of choices.
    action_n = env.action_space.nvec[0]

    policy_net = DQN(state_dim, action_dim, action_n)
    target_net = DQN(state_dim, action_dim, action_n)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr=lr)
    replay_buffer = ReplayBuffer(buffer_capacity)
    # The loss module is stateless, so build it once instead of on every step.
    loss_fn = nn.MSELoss()

    epsilon = epsilon_start

    for episode in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        # Keeps the log line below valid even when steps_per_episode == 0.
        reward = 0.0

        for _step in range(steps_per_episode):
            action, action_idx = select_action(env, state, policy_net, epsilon, action_dim, action_n)

            # NOTE: the terminated/truncated flags returned by env.step are
            # discarded and every transition is stored with done=False, so the
            # task is treated as continuing with a fixed step budget. If the
            # environment can actually end an episode, these flags must be
            # honored here (store `terminated`, stop stepping).
            next_state, reward, _, _, _ = env.step(action)
            replay_buffer.push(state, action_idx, reward, next_state, False)

            state = next_state
            total_reward += reward

            # Train step: wait until a full batch can be sampled.
            if len(replay_buffer) >= batch_size:
                states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

                states = torch.FloatTensor(states)
                # Column vectors so they line up with gather() / the TD target.
                actions = torch.LongTensor(actions).unsqueeze(1)
                rewards = torch.FloatTensor(rewards).unsqueeze(1)
                next_states = torch.FloatTensor(next_states)
                dones = torch.BoolTensor(dones).unsqueeze(1)

                # Q-value of the flat action index that was actually taken.
                q_values = policy_net(states).gather(1, actions)

                with torch.no_grad():
                    next_q_values = target_net(next_states).max(1)[0].unsqueeze(1)
                    # (~dones) zeroes the bootstrap term for terminal transitions.
                    target_q = rewards + gamma * next_q_values * (~dones)

                loss = loss_fn(q_values, target_q)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # Decay epsilon once per episode, never below epsilon_end.
        epsilon = max(epsilon_end, epsilon * epsilon_decay)

        # Periodically update the target network
        if episode % target_update_freq == 0:
            target_net.load_state_dict(policy_net.state_dict())

        # Logs the reward of the episode's LAST step; `total_reward` holds the
        # episode return if that is the quantity of interest instead.
        print(f"Episode {episode} | Current Reward: {reward:.2f} | Epsilon: {epsilon:.3f}")

    return policy_net


In [3]:
def test_policy(env, model, episodes=10, steps_per_episode=100, render=False):
    """Roll out the greedy policy of ``model`` and print each episode's return.

    The model is switched to eval mode (and left there). Every episode resets
    the environment and runs exactly ``steps_per_episode`` steps; the
    terminated/truncated flags returned by ``env.step`` are ignored.

    Args:
        env: Environment with a ``MultiDiscrete``-style ``action_space``.
        model: Network called on a batch of one state; the arg-max of its
            output is used as the flat action index.
        episodes: Number of evaluation episodes.
        steps_per_episode: Environment steps taken in every episode.
        render: When true, ``env.render()`` is called after every step.
    """
    model.eval()

    n_components = env.action_space.nvec.shape[0]
    n_choices = env.action_space.nvec[0]

    for ep in range(episodes):
        obs, _ = env.reset()
        episode_return = 0

        for _step in range(steps_per_episode):
            obs_batch = torch.FloatTensor(obs).unsqueeze(0)
            # Pure inference: pick the highest-valued flat action index.
            with torch.no_grad():
                greedy_idx = torch.argmax(model(obs_batch)).item()
            greedy_action = index_to_action(greedy_idx, n_components, n_choices)

            obs, step_reward, _, _, _ = env.step(greedy_action)
            episode_return += step_reward

            if render:
                env.render()

        print(f"Test Episode {ep} | Total Reward: {episode_return:.2f}")

In [4]:
# Environment parameters passed to the environment constructor below.
action_dim = 5 # Number of dimensions in each action (5 time steps)
state_dim = 10 # Number of features in the state representation

# Alternative environment; swap the two lines to train against it instead:
# env = SimulatedNetworkSync(action_dim=action_dim, state_dim=state_dim)
env = RealNetworkSync(action_dim=action_dim, state_dim=state_dim, circuit_id=3)
# Short run: 40 episodes x 20 steps = 800 environment steps. With a per-episode
# decay of 0.99, epsilon only falls to about 0.99**40 ~= 0.67, so most actions
# taken during training are still random.
trained_model = train(env, episodes=40, steps_per_episode=20, epsilon_decay=0.99)

Host/Port open and accessable
Episode 0 | Current Reward: 1.00 | Epsilon: 0.990
Episode 1 | Current Reward: -1.00 | Epsilon: 0.980
Episode 2 | Current Reward: 3.00 | Epsilon: 0.970
Episode 3 | Current Reward: 1.00 | Epsilon: 0.961
Episode 4 | Current Reward: 1.00 | Epsilon: 0.951
Episode 5 | Current Reward: 1.00 | Epsilon: 0.941
Episode 6 | Current Reward: 2.00 | Epsilon: 0.932
Episode 7 | Current Reward: -2.00 | Epsilon: 0.923
Episode 8 | Current Reward: 0.00 | Epsilon: 0.914
Episode 9 | Current Reward: 1.00 | Epsilon: 0.904
Episode 10 | Current Reward: -2.00 | Epsilon: 0.895
Episode 11 | Current Reward: 1.00 | Epsilon: 0.886
Episode 12 | Current Reward: 1.00 | Epsilon: 0.878
Episode 13 | Current Reward: -1.00 | Epsilon: 0.869
Episode 14 | Current Reward: 0.00 | Epsilon: 0.860
Episode 15 | Current Reward: 4.00 | Epsilon: 0.851
Episode 16 | Current Reward: 2.00 | Epsilon: 0.843
Episode 17 | Current Reward: -1.00 | Epsilon: 0.835
Episode 18 | Current Reward: 0.00 | Epsilon: 0.826
Episod

In [5]:
# Evaluate the greedy policy of the trained network on the same `env` instance
# used for training: 5 episodes of 20 steps, printing each episode's total reward.
test_policy(env, trained_model, episodes=5, steps_per_episode=20)

Test Episode 0 | Total Reward: 46.00
Test Episode 1 | Total Reward: 41.00
Test Episode 2 | Total Reward: 37.00
Test Episode 3 | Total Reward: 22.00
Test Episode 4 | Total Reward: 34.00
