In [None]:
from shared_DQN import IndependentDQN
from spider_fly_env.envs.grid_MA_pettingzoo_testing import SpiderFlyEnvMA
from spider_fly_env.wrappers.pettingzoo_wrapper import PettingZooWrapper

import numpy as np

import pandas as pd

#### DQN (Shared parameters)

In [None]:
# Build the ASCII-rendering environment and wrap it in a single expression.
env = PettingZooWrapper(SpiderFlyEnvMA(render_mode="ascii"))

# Draw a random sample from entry 0 of the wrapped observation space
# (presumably the first agent's space); as the cell's last expression it is displayed.
env.observation_space[0].sample()

In [None]:
# Replace the preview environment with a wrapped one built with max_steps=200
# and no render mode.
env = PettingZooWrapper(SpiderFlyEnvMA(max_steps=200))

In [None]:
# Shared-parameter DQN learner bound to the wrapped environment;
# hyperparameters are listed one per line for easier tuning.
IDQN = IndependentDQN(
    env,
    eps_steps=50000,
    layer_sizes=(64, 64),
    tau=0.0025,
    buffer_max_size=50000,
)

In [None]:
# Run training and keep both returned logs; the following cell stacks them into
# two-column (agent_1 / agent_2) arrays.
# NOTE(review): the argument is presumably the number of episodes - confirm
# against IndependentDQN.train.
rewards, losses = IDQN.train(1000)

In [None]:
# Stack the training logs into 2-D arrays; the column labels below assume
# one column per agent (two agents).
data1 = np.vstack(rewards)
data2 = np.vstack(losses)

# Wide reward frame plus an explicit episode counter.
df1 = pd.DataFrame(data1, columns=["agent_1", "agent_2"])
df1["Episode"] = np.arange(data1.shape[0])

# Wide loss frame. Its episode counter is sized from the loss array itself
# (previously it reused the reward array's length, which only works when both
# logs happen to have the same number of rows).
df2 = pd.DataFrame(data2, columns=["agent_1", "agent_2"])
df2["Episode"] = np.arange(data2.shape[0])

# Reshape both frames to long format: one row per (Episode, Agent) pair.
df1 = df1.melt("Episode", var_name="Agent", value_name="Rewards")
# NOTE: the loss values are also stored under the column name "Rewards";
# the smoothing cell reads df2["Rewards"], so the name is kept as is.
df2 = df2.melt("Episode", var_name="Agent", value_name="Rewards")

In [None]:
# The next cell computes a rolling average with `rolling(..., step=...)`, which
# needs pandas 1.5.0 or newer - show the installed version to verify.
pd.__version__

In [None]:
# Render both long-format frames (rewards first, then losses) with one call.
display(df1, df2)

In [None]:
def _blockwise_mean(frame, value_col, window=5):
    """Mean of `value_col` over consecutive `window`-row blocks, per agent.

    The rolling window is evaluated separately inside every "Agent" group so a
    window never mixes values of two agents and always lines up with that
    agent's episode counter. Only every `window`-th row of a group receives a
    value; the returned Series keeps the original row index, so assigning it to
    a column leaves NaN on all other rows.
    """
    return pd.concat(
        [
            agent_rows[value_col].rolling(window=window, step=window).mean()
            for _, agent_rows in frame.groupby("Agent", sort=False)
        ]
    )


# Rewards: smooth, skip the first episodes, and actually keep the NaN-free rows
# (the result of dropna() used to be thrown away).
df1["Avg_Reward"] = _blockwise_mean(df1, "Rewards")
df1 = df1[df1.Episode > 5].dropna()

# Losses: same treatment; the loss values live in the column named "Rewards".
df2["Avg_Loss"] = _blockwise_mean(df2, "Rewards")
df2 = df2[df2.Episode > 5].dropna()

# Show the cleaned loss frame as the cell output, as the original cell did.
df2

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Four figures: for each metric, first the curve aggregated over agents and then
# one curve per agent. Every figure gets its own axes and its own title
# (previously the second "Rewards" title was set before the new figure was
# opened, so the per-agent reward plot ended up untitled).
for frame, y_col, title in (
    (df1, "Avg_Reward", "Rewards"),
    (df2, "Avg_Loss", "Losses"),
):
    for hue in (None, "Agent"):
        fig, ax = plt.subplots()
        sns.lineplot(data=frame, x="Episode", y=y_col, hue=hue, ax=ax)
        ax.set_title(title)

#### Sequential Q-learning

Tabular Q-learning:
$$
\begin{align*}
    & s = env.reset()\\
    &\text{while not } done:\\
    & \quad\quad \text{for } i \text{ in } (1, \ldots, m):\\
    & \quad\quad\quad\quad a_i = \operatorname*{argmax}_{a_i} Q_i(s_i, a_i)\\
    & \quad\quad s', r, d = env.step(a_1, \ldots, a_m)\\
    & \quad\quad \text{for } i \text{ in } (1, \ldots, m):\\
    & \quad\quad\quad\quad Q_i(s, a_i) = Q_i(s, a_i) + lr * ((mean(r) + \gamma * \max_{a'_i} Q_i(s', a'_i)) - Q_i(s, a_i))\\
\end{align*}
$$


Sequential Tabular Q-learning:
$$
\begin{align*}
    & s = env.reset()\\
    &\text{while not } done:\\
    & \quad\quad \text{for } i \text{ in } (1, \ldots, m):\\
    & \quad\quad\quad\quad a_i = \operatorname*{argmax}_{a_i} Q_i(s, a_1, \ldots, a_i)\\
    & \quad\quad s', r, d = env.step(a_1, \ldots, a_m)\\
    & \quad\quad \text{for } i \text{ in } (1, \ldots, m-1):\\
    & \quad\quad\quad\quad Q_i(s, a_1, \ldots, a_i) = Q_i(s, a_1, \ldots, a_i) + (i/m) * lr * (\max_{a_{i+1}} Q_{i+1}(s, a_1, \ldots, a_{i+1}) - Q_i(s, a_1, \ldots, a_i))\\
    & \quad\quad Q_m(s, a_1, \ldots, a_m) = Q_m(s, a_1, \ldots, a_m) + lr * ((mean(r) + \gamma * \max_{a'_1} Q_1(s', a'_1)) - Q_m(s, a_1, \ldots, a_m))\\
\end{align*}
$$

### Sequential DQN

In [None]:
# NOTE: everything in this cell is commented out and nothing here executes.
# It is a copy of the frame-building, smoothing and plotting steps from the
# shared-parameter DQN section, using a window/step of 10 and `Episode > 10`.

# data1 = np.vstack(rewards)
# data2 = np.vstack(losses)

# df1 = pd.DataFrame(data1, columns = ["agent_1", "agent_2"])
# df1["Episode"] = list(range(data1.shape[0]))

# df2 = pd.DataFrame(data2, columns = ["agent_1", "agent_2"])
# df2["Episode"] = list(range(data1.shape[0]))

# df1 = df1.melt('Episode', var_name='Agent', value_name='Rewards')
# df2 = df2.melt('Episode', var_name='Agent', value_name='Rewards')
# df1["Avg_Reward"] = df1["Rewards"].rolling(window = 10, step = 10).mean()
# df1 = df1[df1.Episode > 10]
# df1.dropna()

# df2["Avg_Loss"] = df2["Rewards"].rolling(window = 10, step = 10).mean()
# df2 = df2[df2.Episode > 10]
# df2.dropna()

# import seaborn as sns
# import matplotlib.pyplot as plt

# plt.title("Rewards")
# sns.lineplot(data = df1, x = "Episode", y = "Avg_Reward")
# plt.title("Rewards")
# plt.figure()
# sns.lineplot(data = df1, x = "Episode", y = "Avg_Reward", hue = "Agent")

# plt.figure()
# plt.title("Losses")
# sns.lineplot(data = df2, x = "Episode", y = "Avg_Loss")
# plt.figure()
# plt.title("Losses")
# sns.lineplot(data = df2, x = "Episode", y = "Avg_Loss", hue = "Agent")

In [1]:
from shared_seq_double_DQN import seqDoubleDQN
from shared_seq_DQN import seqDQN
from shared_DQN import IndependentDQN
from custom_spider_env.spider_fly_env.envs.grid_MA_pettingzoo import SpiderFlyEnvMA
from spider_fly_env.wrappers.pettingzoo_wrapper import PettingZooWrapper

import numpy as np

import pandas as pd

# ASCII-rendering environment (size 4, two spiders, 100 timesteps), wrapped in one go.
env = PettingZooWrapper(
    SpiderFlyEnvMA(size=4, spiders=2, max_timesteps=100, render_mode="ascii")
)

# Sample from entry 0 of the observation space; the value is not shown because
# this is not the last expression of the cell.
env.observation_space[0].sample()

# Swap in an otherwise identical environment without a render mode.
env = PettingZooWrapper(SpiderFlyEnvMA(size=4, spiders=2, max_timesteps=100))

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


[['O' ' ' ' ' ' ']
 ['X' ' ' ' ' ' ']
 [' ' ' ' 'X' ' ']
 [' ' ' ' ' ' ' ']]


In [2]:
# Sequential DQN learner for the two-spider task; one hyperparameter per line.
sequential_DQN = seqDQN(
    env,
    eps_steps=100 * 100,
    layer_sizes=(64, 64),
    tau=0.0025,
    buffer_max_size=100000,
    batch_size=256,
    global_observations=True,
    log_dir="tensorboard_logs_seqDQN_hard2_eq_lr",
)

# Train and keep the returned reward / loss logs.
rewards, losses = sequential_DQN.train(200)

# Store the shared network under the name "seqDQN_hard2" in "models".
sequential_DQN.shared_DQN.save("models", "seqDQN_hard2")

  return torch._C._cuda_getDeviceCount() > 0
  avg_loss = loss_sum / learn_steps
  loss_log.append(loss_sum / learn_steps)


Episode: 0 - Reward:[1.825 1.888] - Avg loss (last ep): [nan nan]
Episode: 10 - Reward:[0.838 0.883] - Avg loss (last ep): [0.00032585 0.00187178]
Episode: 20 - Reward:[4.927 4.846] - Avg loss (last ep): [0.00075622 0.00281553]
Episode: 30 - Reward:[3.868 3.856] - Avg loss (last ep): [0.0009669  0.00291591]
Episode: 40 - Reward:[7.    6.823] - Avg loss (last ep): [0.00143454 0.00401261]
Episode: 50 - Reward:[11.965 11.86 ] - Avg loss (last ep): [0.00189193 0.00633617]
Episode: 60 - Reward:[14.944 14.866] - Avg loss (last ep): [0.0025698  0.00751691]
Episode: 70 - Reward:[24.958 24.883] - Avg loss (last ep): [0.00299741 0.00939011]
Episode: 80 - Reward:[26.908 26.914] - Avg loss (last ep): [0.00378357 0.01105038]
Episode: 90 - Reward:[40.933 40.942] - Avg loss (last ep): [0.00431614 0.01386695]
Episode: 100 - Reward:[35.956 35.908] - Avg loss (last ep): [0.00457349 0.01610767]
Episode: 110 - Reward:[33.913 33.973] - Avg loss (last ep): [0.00505176 0.01920811]
Episode: 120 - Reward:[28.9

In [3]:
# Save the shared network again; this is the same call that ends the training
# cell, so it only matters when that cell was interrupted before reaching it.
sequential_DQN.shared_DQN.save("models", "seqDQN_hard2")

In [4]:
# Build a new learner on the current environment and load the saved
# "seqDQN_hard2" weights into its shared network.
model = seqDQN(
    env,
    layer_sizes=(64, 64),
    global_observations=True,
)
model.shared_DQN.load("models/seqDQN_hard2")

In [31]:
# Roll out the loaded model greedily in an ASCII-rendering environment and
# print the chosen move of every spider at each step.
env = SpiderFlyEnvMA(size = 4, spiders = 2, max_timesteps = 100, render_mode = "ascii")
env = PettingZooWrapper(env)

print("start ----------------------------")
obs, _ = env.reset()
done = False
while not done:
    actions = model.get_actions(obs, deterministic = True)
    print([env.action_to_direction_string[act] for act in actions])

    # `step_rewards` (not `rewards`) so the training log returned by train()
    # is not overwritten by this evaluation loop.
    obs, step_rewards, terminals, truncations, infos = env.step(actions)
    if step_rewards[0] == 1:
        # A reward of exactly 1 for entry 0 ends the demonstration early.
        break

    # Stop on termination *or* truncation, so the loop also ends when the
    # environment cuts the episode off (e.g. at its step limit).
    # NOTE(review): assumes `truncations` is indexable like `terminals` - confirm.
    done = terminals[0] or truncations[0]

[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 ['X' ' ' ' ' ' ']
 [' ' 'O' 'X' ' ']]
start ----------------------------
[[' ' ' ' ' ' ' ']
 [' ' ' ' 'X' ' ']
 [' ' ' ' ' ' ' ']
 [' ' 'O' ' ' 'X']]
['left', 'left']
[[' ' ' ' ' ' ' ']
 [' ' 'X' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' 'O' 'X' ' ']]
['nothing', 'down']
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' 'X' ' ' ' ']
 [' ' 'O' 'X' ' ']]


### 3 Spiders

In [1]:
from shared_seq_double_DQN import seqDoubleDQN
from shared_seq_DQN import seqDQN
from shared_DQN import IndependentDQN
from custom_spider_env.spider_fly_env.envs.grid_MA_pettingzoo import SpiderFlyEnvMA
from spider_fly_env.wrappers.pettingzoo_wrapper import PettingZooWrapper

from pettingzoo.test import parallel_api_test
from supersuit import normalize_obs_v0, dtype_v0

import numpy as np

import pandas as pd

# Three-spider ASCII-rendering environment, wrapped with normalize=False.
env = PettingZooWrapper(
    SpiderFlyEnvMA(size=4, spiders=3, max_timesteps=100, render_mode="ascii"),
    normalize=False,
)


# Replace it with the same configuration minus the render mode.
env = PettingZooWrapper(
    SpiderFlyEnvMA(size=4, spiders=3, max_timesteps=100),
    normalize=False,
)


  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


[['X' ' ' ' ' ' ']
 ['O' 'X' ' ' ' ']
 ['X' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']]


In [13]:
# Sequential DQN learner for the three-spider task; one hyperparameter per line.
sequential_DQN = seqDQN(
    env,
    eps_steps=100 * 10,
    layer_sizes=(64, 64),
    tau=0.0025,
    buffer_max_size=100000,
    batch_size=2,
    global_observations=True,
    log_dir="tensorboard_logs_seqDQN_hard3",
)

# Train and keep the returned reward / loss logs.
rewards, losses = sequential_DQN.train(25)

# save model
# sequential_DQN.shared_DQN.save("models", "seqDQN_hard3")

GET ACTIONS -------
last action sampled:  1
last action sampled:  1
last action sampled:  0
Transition added:  [[0, 3, 2, 1, 1, 3, 3, 1], [0, 3, 2, 1, 1, 3, 3, 1], [0, 3, 2, 1, 1, 3, 3, 1]] [1, 1, 0] [-0.002, 0.001, -0.002] [[0, 3, 2, 1, 1, 3, 3, 1], [0, 3, 2, 1, 1, 3, 3, 1], [0, 3, 2, 1, 1, 3, 3, 1]] [False, False, False]
GET ACTIONS -------
last action sampled:  3
last action sampled:  3
last action sampled:  1
Transition added:  [[0, 3, 2, 1, 1, 3, 3, 1], [0, 3, 2, 1, 1, 3, 3, 1], [0, 3, 2, 1, 1, 3, 3, 1]] [3, 3, 1] [-0.002, -0.002, -0.002] [[0, 3, 2, 2, 2, 3, 3, 1], [0, 3, 2, 2, 2, 3, 3, 1], [0, 3, 2, 2, 2, 3, 3, 1]] [False, False, False]
TRAINING STEP --------------
---------------
input tensor agent  0
tensor([[0., 3., 2., 1., 1., 3., 3., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 0.],
        [0., 3., 2., 1., 1., 3., 3., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 0.]])
Qvals
tensor([[ 0.0692, -0.2229,  0.0311,  0.1646, -0.1517],
        [ 0.06

KeyboardInterrupt: 

In [None]:
# Saving is disabled here: nothing in this cell executes (the same call is also
# commented out at the end of the training cell above).
# sequential_DQN.shared_DQN.save("models", "seqDQN_hard3")

In [6]:
# Build a new learner on the current environment and load the saved
# "seqDQN_hard3" weights into its shared network.
model = seqDQN(
    env,
    layer_sizes=(64, 64),
    global_observations=True,
)
model.shared_DQN.load("models/seqDQN_hard3")

In [7]:
# Roll out the loaded model greedily in an ASCII-rendering environment and
# print the chosen move of every spider at each step.
# NOTE(review): the three-spider environments at the top of this section are
# wrapped with `normalize = False`, this one uses the wrapper's default -
# confirm that the observations match what the loaded model expects.
env = SpiderFlyEnvMA(size = 4, spiders = 3, max_timesteps = 100, render_mode = "ascii")
env = PettingZooWrapper(env)

print("start ----------------------------")
obs, _ = env.reset()
done = False
while not done:
    actions = model.get_actions(obs, deterministic = True)
    print([env.action_to_direction_string[act] for act in actions])

    # `step_rewards` (not `rewards`) so the training log returned by train()
    # is not overwritten by this evaluation loop.
    obs, step_rewards, terminals, truncations, infos = env.step(actions)
    if step_rewards[0] == 1:
        # A reward of exactly 1 for entry 0 ends the demonstration early.
        break

    # Stop on termination *or* truncation, so the loop also ends when the
    # environment cuts the episode off (e.g. at its step limit).
    # NOTE(review): assumes `truncations` is indexable like `terminals` - confirm.
    done = terminals[0] or truncations[0]

[['O' ' ' ' ' 'X']
 [' ' ' ' ' ' ' ']
 ['X' ' ' 'X' ' ']
 [' ' ' ' ' ' ' ']]
start ----------------------------
[[' ' ' ' 'X' ' ']
 ['X' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' 'X' 'O']]
['left', 'left', 'down']
[[' ' ' ' ' ' ' ']
 ['X' ' ' 'X' ' ']
 [' ' ' ' ' ' ' ']
 [' ' 'X' ' ' 'O']]
['left', 'left', 'right']
[[' ' ' ' ' ' ' ']
 ['X' ' ' ' ' 'X']
 [' ' ' ' ' ' ' ']
 ['X' ' ' ' ' 'O']]
['down', 'down', 'down']
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 ['X' ' ' ' ' 'X']
 ['X' ' ' ' ' 'O']]
['right', 'right', 'right']
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' 'X' ' ' 'X']
 [' ' 'X' ' ' 'O']]
['right', 'right', 'down']
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' 'X' 'X']
 [' ' ' ' 'X' 'O']]
