# Linear network training and replay

In [14]:
from yawning_titan.networks.node import Node
from yawning_titan.networks.network import Network

import time
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.ppo import MlpPolicy as PPOMlp

from yawning_titan.envs.generic.core.blue_interface import BlueInterface
from yawning_titan.envs.generic.core.red_interface import RedInterface
from yawning_titan.envs.generic.generic_env import GenericNetworkEnv
from yawning_titan.envs.generic.core.action_loops import ActionLoop
from yawning_titan.envs.generic.core.network_interface import NetworkInterface
from yawning_titan.networks.network_db import default_18_node_network
import yawning_titan.game_modes
from yawning_titan.envs.generic.core.action_loops import ActionLoop

## Using DB to retrieve game modes
from yawning_titan.game_modes.game_mode_db import GameModeDB, GameModeSchema
from yawning_titan.db.doc_metadata import DocMetadataSchema
# Handle on the game mode database; later cells use it to list the stored
# game modes (db.show) and to fetch one by its uuid (db.get).
db = GameModeDB()

### Network Setup

Network of 5 nodes, connected in a line. Node 1 is the entry node and node 5 is the high value target. All nodes begin with an initial vulnerability of 0.8.

In [2]:
# Start from an empty Network
network = Network()

# The five PCs sit on a horizontal line: x runs from -1.00 to 1.00, y is fixed.
x_positions = (-1.00, -0.50, 0.00, 0.50, 1.00)

# Create the nodes left to right. Each Node is added to the Network first and
# only then positioned, flagged and given its vulnerability.
line_nodes = []
for index, x_pos in enumerate(x_positions, start=1):
    node = Node(f"PC {index}")
    network.add_node(node)
    node.x_pos = x_pos
    node.y_pos = 0.01
    if index == 1:
        # The first PC in the line is the entry node
        node.entry_node = True
    if index == len(x_positions):
        # The last PC in the line is the high value node
        node.high_value_node = True
    node.vulnerability = 0.8
    line_nodes.append(node)

# Keep the individual node names available for later cells
pc_1, pc_2, pc_3, pc_4, pc_5 = line_nodes

# Join each PC to its right-hand neighbour: 1-2, 2-3, 3-4, 4-5
for left, right in zip(line_nodes, line_nodes[1:]):
    network.add_edge(left, right)

# network.reset() is deliberately not called here; it would reset the entry
# nodes, high value nodes and vulnerability scores assigned above.

# Print the node details table for the finished Network
network.show(verbose=True)

UUID                                  Name    High Value Node    Entry Node      Vulnerability  Position (x,y)
------------------------------------  ------  -----------------  ------------  ---------------  ----------------
0fdc2c71-fb51-4e14-a01d-80325bee970f  PC 1    False              True                      0.8  -1.00, 0.01
0cf00849-52e1-42ce-bccf-56589a31c272  PC 2    False              False                     0.8  -0.50, 0.01
4c1fc42a-18b2-4bad-8b40-159ab6cc4b5d  PC 3    False              False                     0.8  0.00, 0.01
0c9f0109-50e9-4bb9-a163-335ae86f141d  PC 4    False              False                     0.8  0.50, 0.01
b40d296d-3027-4e76-b06e-5d14a8870bef  PC 5    True               False                     0.8  1.00, 0.01


## Creating environments and training

In [3]:
# Load (or reload, if it is already loaded) the TensorBoard notebook extension,
# which provides the %tensorboard magic used after training.
%reload_ext tensorboard

In [4]:
# List the game modes stored in the DB. The uuid column of this listing is what
# db.get() is given to fetch a game mode.
# NOTE(review): the positional True presumably switches on the verbose listing - confirm.
db.show(True)

name               author              locked    uuid
-----------------  ------------------  --------  ------------------------------------
DCBO Agent Config  dstl/YAWNING-TITAN  True      bac2cb9d-b24b-426c-88a5-5edd0c2de413
Default Game Mode  dstl/YAWNING-TITAN  True      900a704f-6271-4994-ade7-40b74d3199b1
Low skill red      dstl/YAWNING-TITAN  True      3ccd9988-8781-4c3e-9c75-44cc987ae6af
XAI_mode           H Harrison          False     f5665563-d91a-4164-9e19-c67ce3db0066


### Game mode

#### Red: 

Can attack from any node it controls 

Only basic attack and zero day enabled. 

Starts with one zero day attack and gains another every 5 timesteps 

No natural spreading 

Target mechanism: random (changed from prioritising vulnerable nodes, i.e. sorting the nodes red can attack and selecting the most vulnerable)

#### Blue: 

Action set: reduce vulnerability, restore node (considering removing restore node, as blue kept winning by immediately restoring the first node red attacks)

100% chance of immediately discovering intrusions 

#### Game rules: 

Max steps: 30, no grace period 

Blue loss if high value node lost

#### Observation space 

Compromised status, vulnerability scores and node connections. Special nodes were removed from the observation space, as they can largely be inferred from the rewards and add a lot of dimensions.

#### Rewards 

-100 for loss, 100 for reaching end.  

The negative reward for a loss is reduced the closer the loss occurs to the final timestep


In [13]:
# uuid of the "XAI_mode" entry shown in the game mode listing
XAI_MODE_UUID = "f5665563-d91a-4164-9e19-c67ce3db0066"

# Fetch that game mode from the DB.
# NOTE(review): the saved output of this cell is a JSONDecodeError ("Extra data"),
# so the stored game mode data may be malformed - verify before relying on it.
simple_mode = db.get(XAI_MODE_UUID)

JSONDecodeError: Extra data: line 1 column 12621 (char 12620)

In [12]:
## Build the network interface from the selected game mode and the `network` object
s_network_interface = NetworkInterface(game_mode=simple_mode, network=network)

## Create the red and blue agent interfaces; both are given the same network interface
red = RedInterface(s_network_interface)
blue = BlueInterface(s_network_interface)

## Create the environment from the two agent interfaces and the network interface
s_env = GenericNetworkEnv(red, blue, s_network_interface)

## Run Stable-Baselines3's environment checker on it, with warnings enabled
check_env(s_env, warn=True)
## Reset the environment after the check; the value returned by reset() is discarded
_ = s_env.reset()

In [9]:
## Initialise environment callback
eval_callback = EvalCallback(Monitor(s_env), eval_freq=1000, deterministic=False, render=False)

## Create agent
s_agent_6 = PPO(PPOMlp, s_env, verbose=1, tensorboard_log="./logs/tensorboard_ex/")

## Train agent for 200,000 timesteps
s_agent_6.learn(total_timesteps=100000, n_eval_episodes=1, callback=eval_callback)
%tensorboard --logdir ./logs/tensorboard_ex/

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./logs/tensorboard_ex/PPO_1
Eval num_timesteps=1000, episode_reward=-34.35 +/- 8.24
Episode length: 14.40 +/- 3.93
---------------------------------
| eval/              |          |
|    mean_ep_length  | 14.4     |
|    mean_reward     | -34.4    |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
New best mean reward!
Eval num_timesteps=2000, episode_reward=-38.90 +/- 8.04
Episode length: 12.00 +/- 6.26
---------------------------------
| eval/              |          |
|    mean_ep_length  | 12       |
|    mean_reward     | -38.9    |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 12       |
|    ep_rew_mean     | -40.4    |
| time/              |          |
|    fps             | 83

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 15.1     |
|    ep_rew_mean     | -25.3    |
| time/              |          |
|    fps             | 610      |
|    iterations      | 6        |
|    time_elapsed    | 20       |
|    total_timesteps | 12288    |
---------------------------------
Eval num_timesteps=13000, episode_reward=-42.95 +/- 5.27
Episode length: 8.40 +/- 3.07
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 8.4         |
|    mean_reward          | -42.9       |
| time/                   |             |
|    total_timesteps      | 13000       |
| train/                  |             |
|    approx_kl            | 0.006221044 |
|    clip_fraction        | 0.0214      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.19       |
|    explained_variance   | 0.515       |
|    learning_rate        | 0.0003      |
|    loss                 |

Eval num_timesteps=24000, episode_reward=0.76 +/- 30.91
Episode length: 24.60 +/- 5.92
---------------------------------
| eval/              |          |
|    mean_ep_length  | 24.6     |
|    mean_reward     | 0.764    |
| time/              |          |
|    total_timesteps | 24000    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 19.8     |
|    ep_rew_mean     | -8.76    |
| time/              |          |
|    fps             | 613      |
|    iterations      | 12       |
|    time_elapsed    | 40       |
|    total_timesteps | 24576    |
---------------------------------
Eval num_timesteps=25000, episode_reward=-22.78 +/- 15.61
Episode length: 16.80 +/- 7.98
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 16.8        |
|    mean_reward          | -22.8       |
| time/                   |             |
|    total_timesteps      | 25000   

Eval num_timesteps=35000, episode_reward=11.00 +/- 42.35
Episode length: 22.20 +/- 9.81
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 22.2        |
|    mean_reward          | 11          |
| time/                   |             |
|    total_timesteps      | 35000       |
| train/                  |             |
|    approx_kl            | 0.010766781 |
|    clip_fraction        | 0.0931      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.61       |
|    explained_variance   | 0.38        |
|    learning_rate        | 0.0003      |
|    loss                 | 129         |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0109     |
|    value_loss           | 254         |
-----------------------------------------
Eval num_timesteps=36000, episode_reward=48.15 +/- 4.81
Episode length: 30.00 +/- 0.00
---------------------------------
| eval/              |          |
|

Eval num_timesteps=47000, episode_reward=3.88 +/- 44.28
Episode length: 19.40 +/- 9.48
---------------------------------
| eval/              |          |
|    mean_ep_length  | 19.4     |
|    mean_reward     | 3.88     |
| time/              |          |
|    total_timesteps | 47000    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 26.4     |
|    ep_rew_mean     | 30       |
| time/              |          |
|    fps             | 598      |
|    iterations      | 23       |
|    time_elapsed    | 78       |
|    total_timesteps | 47104    |
---------------------------------
Eval num_timesteps=48000, episode_reward=49.66 +/- 1.60
Episode length: 30.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 30           |
|    mean_reward          | 49.7         |
| time/                   |              |
|    total_timesteps      | 48000

Eval num_timesteps=58000, episode_reward=35.92 +/- 38.96
Episode length: 25.40 +/- 9.20
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 25.4        |
|    mean_reward          | 35.9        |
| time/                   |             |
|    total_timesteps      | 58000       |
| train/                  |             |
|    approx_kl            | 0.007951869 |
|    clip_fraction        | 0.0636      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.998      |
|    explained_variance   | 0.213       |
|    learning_rate        | 0.0003      |
|    loss                 | 45.2        |
|    n_updates            | 280         |
|    policy_gradient_loss | -0.00734    |
|    value_loss           | 123         |
-----------------------------------------
Eval num_timesteps=59000, episode_reward=51.76 +/- 3.93
Episode length: 30.00 +/- 0.00
---------------------------------
| eval/              |          |
|

Eval num_timesteps=69000, episode_reward=39.79 +/- 33.02
Episode length: 26.60 +/- 6.80
---------------------------------
| eval/              |          |
|    mean_ep_length  | 26.6     |
|    mean_reward     | 39.8     |
| time/              |          |
|    total_timesteps | 69000    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 28.8     |
|    ep_rew_mean     | 49.4     |
| time/              |          |
|    fps             | 603      |
|    iterations      | 34       |
|    time_elapsed    | 115      |
|    total_timesteps | 69632    |
---------------------------------
Eval num_timesteps=70000, episode_reward=59.18 +/- 1.36
Episode length: 30.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 30          |
|    mean_reward          | 59.2        |
| time/                   |             |
|    total_timesteps      | 70000    

Eval num_timesteps=80000, episode_reward=60.06 +/- 1.76
Episode length: 30.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 30           |
|    mean_reward          | 60.1         |
| time/                   |              |
|    total_timesteps      | 80000        |
| train/                  |              |
|    approx_kl            | 0.0045377775 |
|    clip_fraction        | 0.0642       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.861       |
|    explained_variance   | 0.106        |
|    learning_rate        | 0.0003       |
|    loss                 | 18.9         |
|    n_updates            | 390          |
|    policy_gradient_loss | -0.00457     |
|    value_loss           | 34.9         |
------------------------------------------
Eval num_timesteps=81000, episode_reward=58.30 +/- 1.82
Episode length: 30.00 +/- 0.00
---------------------------------
| eval/           

Eval num_timesteps=92000, episode_reward=59.31 +/- 1.18
Episode length: 30.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 30       |
|    mean_reward     | 59.3     |
| time/              |          |
|    total_timesteps | 92000    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30       |
|    ep_rew_mean     | 58.9     |
| time/              |          |
|    fps             | 604      |
|    iterations      | 45       |
|    time_elapsed    | 152      |
|    total_timesteps | 92160    |
---------------------------------
Eval num_timesteps=93000, episode_reward=60.55 +/- 0.63
Episode length: 30.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 30          |
|    mean_reward          | 60.6        |
| time/                   |             |
|    total_timesteps      | 93000     

In [None]:
## Save trained agent.
## The trained PPO model is bound to `s_agent_6`; no `s_agent` is defined in this
## notebook, so saving through that name would raise a NameError.
s_agent_name = "ppo-s-linear"
s_agent_6.save(s_agent_name)

In [None]:
# Evaluation

# Create an environment object for evaluation, wrapped in Monitor so that
# evaluate_policy can read episode rewards and lengths from it.
# NOTE(review): it is built from the same red, blue and s_network_interface
# objects as the training environment, so it presumably shares their state.
# Confirm whether fresh interfaces are needed for an independent evaluation.
eval_env = Monitor(GenericNetworkEnv(red, blue, s_network_interface))

# Evaluate the trained model (`s_agent_6`) with 10 evaluation episodes and deterministic=True
mean_reward, std_reward = evaluate_policy(s_agent_6, eval_env, n_eval_episodes=10, deterministic=True)

# Print the results
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")