In [3]:
from shared_DQN import SharedDQN
from spider_fly_env.envs.grid_MA_pettingzoo_testing import SpiderFlyEnvMA
from spider_fly_env.wrappers.pettingzoo_wrapper import PettingZooWrapper

import numpy as np

import pandas as pd

#### DQN (Shared parameters)

In [4]:
# Build the environment with render_mode="ascii" and wrap it in one step.
env = PettingZooWrapper(SpiderFlyEnvMA(render_mode="ascii"))

# Draw one random sample from entry 0 of the wrapped observation space
# (shown as the cell output).
env.observation_space[0].sample()

[' ' ' ' 'X' 'X' ' ' 'O' 'O']


array([2, 5, 6, 6])

In [5]:
# Replace the environment with a wrapped instance built with max_steps=200
# and no render mode.
env = PettingZooWrapper(SpiderFlyEnvMA(max_steps=200))

In [6]:
# Instantiate the shared-parameter DQN on the wrapped environment.
DQN = SharedDQN(
    env,
    eps_steps=50000,
    layer_sizes=(64, 64),
    tau=0.0025,
    buffer_max_size=100000,
)

In [7]:
# Train the shared DQN and keep the two returned histories for the analysis
# cells below. The argument 1000 is presumably the number of episodes --
# TODO confirm against SharedDQN.train.
# NOTE(review): the recorded output of this cell is a RuntimeError about
# Double vs Float dtypes; the observations probably need casting to float32
# somewhere inside SharedDQN -- confirm in shared_DQN.py.
rewards, losses = DQN.train(1000)

RuntimeError: mat1 and mat2 must have the same dtype, but got Double and Float

In [None]:
# Stack the training histories into 2-D arrays; each row becomes one
# "Episode" entry below (assumes one history entry per episode -- TODO confirm).
data1 = np.vstack(rewards)
data2 = np.vstack(losses)

# Wide frame of rewards: one column per agent plus an episode counter.
df1 = pd.DataFrame(data1, columns = ["agent_1", "agent_2"])
df1["Episode"] = np.arange(data1.shape[0])

# Wide frame of losses. The episode counter is sized from the loss array
# itself (previously data1.shape[0]), so the assignment cannot fail or
# mislabel rows if the two histories ever differ in length.
df2 = pd.DataFrame(data2, columns = ["agent_1", "agent_2"])
df2["Episode"] = np.arange(data2.shape[0])

# Reshape to long format: one row per (Episode, Agent) pair.
df1 = df1.melt('Episode', var_name='Agent', value_name='Rewards')
# The loss values keep the column name 'Rewards' because the rolling-average
# cell further down reads df2["Rewards"].
df2 = df2.melt('Episode', var_name='Agent', value_name='Rewards')

In [None]:
# The next cell passes `step=` to `Series.rolling`, which requires
# pandas >= 1.5.0; display the installed version to check.
pd.__version__

In [None]:
# Show the long-format reward and loss frames one after the other.
display(df1, df2)

In [None]:
# Rolling mean over 5 rows, evaluated only every 5th row (`step=5`); all other
# rows get NaN in the new column.
# NOTE(review): the frames are in long format, so a window can straddle the
# boundary between two agents' rows -- confirm whether that matters here.
df1["Avg_Reward"] = df1["Rewards"].rolling(window = 5, step = 5).mean()
df1 = df1[df1.Episode > 5]
# dropna() returns a new frame; the result must be assigned or nothing is dropped.
df1 = df1.dropna()

# Same treatment for the losses (the loss values live in the "Rewards" column).
df2["Avg_Loss"] = df2["Rewards"].rolling(window = 5, step = 5).mean()
df2 = df2[df2.Episode > 5]
df2 = df2.dropna()

# Keep the cleaned loss frame as the cell output, as before.
df2

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One figure per entry: (frame, y-column, title, hue column or None).
# Each plot is drawn on its own explicit Axes and titled there, so every
# figure gets a title. Previously "Rewards" was set twice on the first figure
# and the per-agent rewards figure had none.
plot_specs = (
    (df1, "Avg_Reward", "Rewards", None),
    (df1, "Avg_Reward", "Rewards", "Agent"),
    (df2, "Avg_Loss", "Losses", None),
    (df2, "Avg_Loss", "Losses", "Agent"),
)

for frame, y_column, title, hue in plot_specs:
    fig, ax = plt.subplots()
    sns.lineplot(data = frame, x = "Episode", y = y_column, hue = hue, ax = ax)
    ax.set_title(title)

#### Sequential Q-learning

Tabular Q-learning:
$$
\begin{align*}
    & s = env.reset()\\
    &\text{while not } done:\\
    & \quad\quad \text{for } i \text{ in } (1, \ldots, m):\\
    & \quad\quad\quad\quad a_i = \operatorname*{argmax}_{a_i} Q_i(s_i, a_i)\\
    & \quad\quad s', r, d = env.step(a_1, \ldots, a_m)\\
    & \quad\quad \text{for } i \text{ in } (1, \ldots, m):\\
    & \quad\quad\quad\quad Q_i(s, a_i) = Q_i(s, a_i) + lr * ((mean(r) + \gamma * \max_{a'_i} Q_i(s', a'_i)) - Q_i(s, a_i))\\
\end{align*}
$$


Sequential Tabular Q-learning:
$$
\begin{align*}
    & s = env.reset()\\
    &\text{while not } done:\\
    & \quad\quad \text{for } i \text{ in } (1, \ldots, m):\\
    & \quad\quad\quad\quad a_i = \operatorname*{argmax}_{a_i} Q_i(s, a_1, \ldots, a_i)\\
    & \quad\quad s', r, d = env.step(a_1, \ldots, a_m)\\
    & \quad\quad \text{for } i \text{ in } (1, \ldots, m-1):\\
    & \quad\quad\quad\quad Q_i(s, a_1, \ldots, a_i) = Q_i(s, a_1, \ldots, a_i) + (i/m) * lr * (\max_{a_{i+1}} Q_{i+1}(s, a_1, \ldots, a_{i+1}) - Q_i(s, a_1, \ldots, a_i))\\
    & \quad\quad Q_m(s, a_1, \ldots, a_m) = Q_m(s, a_1, \ldots, a_m) + lr * ((mean(r) + \gamma * \max_{a'_1} Q_1(s', a'_1)) - Q_m(s, a_1, \ldots, a_m))\\
\end{align*}
$$

### Sequential DQN

In [None]:
# data1 = np.vstack(rewards)
# data2 = np.vstack(losses)

# df1 = pd.DataFrame(data1, columns = ["agent_1", "agent_2"])
# df1["Episode"] = list(range(data1.shape[0]))

# df2 = pd.DataFrame(data2, columns = ["agent_1", "agent_2"])
# df2["Episode"] = list(range(data1.shape[0]))

# df1 = df1.melt('Episode', var_name='Agent', value_name='Rewards')
# df2 = df2.melt('Episode', var_name='Agent', value_name='Rewards')
# df1["Avg_Reward"] = df1["Rewards"].rolling(window = 10, step = 10).mean()
# df1 = df1[df1.Episode > 10]
# df1.dropna()

# df2["Avg_Loss"] = df2["Rewards"].rolling(window = 10, step = 10).mean()
# df2 = df2[df2.Episode > 10]
# df2.dropna()

# import seaborn as sns
# import matplotlib.pyplot as plt

# plt.title("Rewards")
# sns.lineplot(data = df1, x = "Episode", y = "Avg_Reward")
# plt.title("Rewards")
# plt.figure()
# sns.lineplot(data = df1, x = "Episode", y = "Avg_Reward", hue = "Agent")

# plt.figure()
# plt.title("Losses")
# sns.lineplot(data = df2, x = "Episode", y = "Avg_Loss")
# plt.figure()
# plt.title("Losses")
# sns.lineplot(data = df2, x = "Episode", y = "Avg_Loss", hue = "Agent")

In [2]:
# from shared_seq_double_DQN import seqDoubleDQN
from custom_agents.CTCE_algorithms.shared_seq_DQN import seqDQN
from custom_spider_env.spider_fly_env.envs.grid_MA_pettingzoo import SpiderFlyEnvMA
from spider_fly_env.wrappers.pettingzoo_wrapper import PettingZooWrapper

import numpy as np

import pandas as pd

# First construct the 3x3, two-spider environment with render_mode="ascii" ...
env = SpiderFlyEnvMA(size=3, spiders=2, max_timesteps=100, render_mode="ascii")
# ... then replace it with an otherwise identical instance without a render
# mode and wrap that one.
env = PettingZooWrapper(SpiderFlyEnvMA(size=3, spiders=2, max_timesteps=100))

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


[['X' 'X' ' ']
 [' ' ' ' ' ']
 [' ' 'O' ' ']]


In [4]:
# Build the sequential DQN on the two-spider environment.
sequential_DQN = seqDQN(
    env,
    eps_steps=100 * 100,
    batch_size=256,
    layer_sizes=(32, 32),
    tau=0.0025,
    buffer_max_size=100000,
    global_observations=True,
)

# Start training; the return value of train() is the cell output.
sequential_DQN.train(200)

  return torch._C._cuda_getDeviceCount() > 0


Episode: 0 - Reward:[38.563 38.59 ] - Avg loss (last ep): None
Episode: 10 - Reward:[37.549 37.594] - Avg loss (last ep): [0.0046275  0.05785612]
Episode: 20 - Reward:[47.674 47.644] - Avg loss (last ep): [0.00955623 0.06173425]
Episode: 30 - Reward:[58.738 58.723] - Avg loss (last ep): [0.01511669 0.06929759]


KeyboardInterrupt: 

In [5]:
from centralized_DQN import DQN

# Centralized DQN with the same hyper-parameters as the sequential DQN above.
mdDQN = DQN(
    env,
    eps_steps=100 * 100,
    batch_size=256,
    layer_sizes=(32, 32),
    tau=0.0025,
    buffer_max_size=100000,
    global_observations=True,
)
mdDQN.train(200)

SyntaxError: keyword argument repeated: eps_steps (2258089495.py, line 3)

In [None]:
# Save the shared network of the trained sequential DQN. The two arguments
# are presumably the target directory and the model name -- TODO confirm
# against the save() implementation.
sequential_DQN.shared_DQN.save("models", "seqDQN_hard2")

In [None]:
# Build a fresh seqDQN and load the saved "seqDQN_hard2" weights into its
# shared network.
# NOTE(review): layer_sizes here is (64, 64) while the model trained above was
# created with (32, 32); confirm the checkpoint on disk matches this
# architecture, otherwise loading is likely to fail.
model = seqDQN(env, layer_sizes = (64,64), global_observations = True)
model.shared_DQN.load("models/seqDQN_hard2")

In [None]:
# Roll out one episode with the loaded model acting deterministically,
# printing the chosen move of every agent at each step.
# NOTE(review): this environment uses size = 4 while the environment `model`
# was constructed with above uses size = 3 -- confirm the model supports this.
env = SpiderFlyEnvMA(size = 4, spiders = 2, max_timesteps = 100, render_mode = "ascii")
env = PettingZooWrapper(env)

terminal = False
print("start ----------------------------")
obs, _ = env.reset()
while not terminal:
    actions = model.get_actions(obs, deterministic = True)
    print([env.action_to_direction_string[act] for act in actions])

    obs, rewards, terminals, truncations, infos = env.step(actions)
    # Stop as soon as entry 0 of the rewards equals 1.
    if rewards[0] == 1:
        break
    # print(obs, rewards, terminals, truncations, infos, actions)
    # print("--------------------")

    # An episode ends on termination OR truncation (e.g. the time limit);
    # checking only `terminals` could keep stepping a finished episode.
    # Assumes `truncations` is indexed like `terminals` -- TODO confirm.
    terminal = bool(terminals[0]) or bool(truncations[0])

### 3 Spiders

In [1]:
from custom_agents.CTCE_algorithms.shared_seq_DQN import seqDQN
from custom_spider_env.spider_fly_env.envs.grid_MA_pettingzoo2 import SpiderFlyEnvMA
from spider_fly_env.wrappers.pettingzoo_wrapper import PettingZooWrapper

import numpy as np

import pandas as pd

# First build the 4x4, three-spider environment with render_mode="ascii" and
# wrap it with normalize=True ...
env = PettingZooWrapper(
    SpiderFlyEnvMA(size=4, spiders=3, max_timesteps=100, render_mode="ascii"),
    normalize=True,
)

# ... then replace it with the same configuration without a render mode.
env = PettingZooWrapper(
    SpiderFlyEnvMA(size=4, spiders=3, max_timesteps=100),
    normalize=True,
)


2024-05-20 19:02:35.615916: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[[' ' ' ' ' ' 'X']
 [' ' ' ' ' ' ' ']
 ['O' ' ' ' ' ' ']
 [' ' 'X' 'X' ' ']]


  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


In [2]:
# Sequential DQN for the three-spider environment; lr and log_dir are set
# explicitly for this run.
sequential_DQN = seqDQN(
    env,
    eps_steps=100 * 1000,
    layer_sizes=(128, 128),
    tau=0.0025,
    buffer_max_size=1000000,
    batch_size=256,
    global_observations=True,
    log_dir="tensorboard_logs_seqDQN_hard3_LOW_LR",
    lr=0.0001,
)

# Train and keep both returned histories.
rewards, losses = sequential_DQN.train(100 * 100000)

# save model
# sequential_DQN.shared_DQN.save("models", "seqDQN_hard3")

Episode: 0 - Reward:[-0.066 -0.024 -0.064] - Avg loss (last ep): None


In [None]:
# sequential_DQN.shared_DQN.save("models", "seqDQN_hard3")

In [None]:
# Build a fresh seqDQN and load the saved "seqDQN_hard3" weights into its
# shared network.
# NOTE(review): layer_sizes here is (64, 64) while the training cell above
# uses (128, 128), and the save calls are commented out; confirm which
# architecture the checkpoint on disk was produced with.
model = seqDQN(env, layer_sizes = (64,64), global_observations = True)
model.shared_DQN.load("models/seqDQN_hard3")

In [None]:
# Roll out one episode with the loaded model acting deterministically,
# printing the chosen move of every agent at each step.
# NOTE(review): the wrapper is created without normalize = True here, unlike
# the environment `model` was constructed with above -- confirm the model
# receives observations in the form it expects.
env = SpiderFlyEnvMA(size = 4, spiders = 3, max_timesteps = 100, render_mode = "ascii")
env = PettingZooWrapper(env)

terminal = False
print("start ----------------------------")
obs, _ = env.reset()
while not terminal:
    actions = model.get_actions(obs, deterministic = True)
    print([env.action_to_direction_string[act] for act in actions])

    obs, rewards, terminals, truncations, infos = env.step(actions)
    # Stop as soon as entry 0 of the rewards equals 1.
    if rewards[0] == 1:
        break
    # print(obs, rewards, terminals, truncations, infos, actions)
    # print("--------------------")

    # An episode ends on termination OR truncation (e.g. the time limit);
    # checking only `terminals` could keep stepping a finished episode.
    # Assumes `truncations` is indexed like `terminals` -- TODO confirm.
    terminal = bool(terminals[0]) or bool(truncations[0])