# Linear network training and replay

In [1]:
from yawning_titan.networks.node import Node
from yawning_titan.networks.network import Network

import time
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.ppo import MlpPolicy as PPOMlp

from yawning_titan.envs.generic.core.blue_interface import BlueInterface
from yawning_titan.envs.generic.core.red_interface import RedInterface
from yawning_titan.envs.generic.generic_env import GenericNetworkEnv
from yawning_titan.envs.generic.core.action_loops import ActionLoop
from yawning_titan.envs.generic.core.network_interface import NetworkInterface
from yawning_titan.networks.network_db import default_18_node_network
import yawning_titan.game_modes
from yawning_titan.envs.generic.core.action_loops import ActionLoop

## Open the game mode database; `db` is used further down to list and fetch game modes
from yawning_titan.game_modes.game_mode_db import GameModeDB, GameModeSchema
from yawning_titan.db.doc_metadata import DocMetadataSchema
db = GameModeDB()

  from .autonotebook import tqdm as notebook_tqdm


### Network Setup

Network of 5 nodes, connected in a line. Node 1 is the entry node and node 5 is the high value target. All nodes begin with an initial vulnerability of 0.8.

In [2]:
# Instantiate the Network
network = Network()


def _add_pc(name, x_pos, entry=False, high_value=False):
    """Create a Node called ``name``, add it to ``network`` and position it on the line.

    Every node gets the same y position (0.01) and vulnerability (0.8); only the
    x position and the optional entry / high value flags differ between nodes.
    Returns the new Node.
    """
    node = Node(name)
    # The node is added to the network first and configured afterwards.
    network.add_node(node)
    node.x_pos = x_pos
    node.y_pos = 0.01
    # Flags are only assigned when requested, so unflagged nodes keep Node's defaults.
    if entry:
        node.entry_node = True
    if high_value:
        node.high_value_node = True
    node.vulnerability = 0.8
    return node


# Five PCs evenly spaced from left to right: PC 1 is the entry node, PC 5 the high value node.
pc_1 = _add_pc("PC 1", -1.00, entry=True)
pc_2 = _add_pc("PC 2", -0.50)
pc_3 = _add_pc("PC 3", 0.00)
pc_4 = _add_pc("PC 4", 0.50)
pc_5 = _add_pc("PC 5", 1.00, high_value=True)

# Join each node to its right-hand neighbour so the topology is a single line 1-2-3-4-5.
pc_chain = [pc_1, pc_2, pc_3, pc_4, pc_5]
for left_pc, right_pc in zip(pc_chain, pc_chain[1:]):
    network.add_edge(left_pc, right_pc)

# The reset call is left disabled.
# NOTE(review): presumably a reset would re-derive the entry nodes, high value nodes and
# vulnerability scores, overwriting the values set by hand above - confirm before enabling.
# network.reset()

# View the Networks Node Details
network.show(verbose=True)

UUID                                  Name    High Value Node    Entry Node      Vulnerability  Position (x,y)
------------------------------------  ------  -----------------  ------------  ---------------  ----------------
2c1b1c0a-10cd-422d-ae74-b97ce23b85c0  PC 1    False              True                      0.8  -1.00, 0.01
a54a8f17-4ad0-48f4-b73e-765f8dd9760f  PC 2    False              False                     0.8  -0.50, 0.01
92334edb-d719-4dee-adce-acf3e010c951  PC 3    False              False                     0.8  0.00, 0.01
1b957f58-a67a-4480-b133-bf43ff56fbe9  PC 4    False              False                     0.8  0.50, 0.01
875e4334-2e8b-4ccf-a558-7b21b13d2a2d  PC 5    True               False                     0.8  1.00, 0.01


## Creating environments and training

In [3]:
# Load (or reload) the TensorBoard notebook extension so the %tensorboard magic
# used after training is available
%reload_ext tensorboard

In [4]:
# List the game modes held in the database.
# NOTE(review): the positional True is presumably a "verbose" flag - confirm against GameModeDB.show
db.show(True)

JSONDecodeError: Extra data: line 1 column 15878 (char 15877)

### Simple mode:
simple_mode 

#### Red: 

Can attack from any node it controls 

Only basic attack and zero day enabled. 

Starts with one zero day attack and gains another every 5 timesteps 

No natural spreading 

Target mechanism: random (changed from prioritising vulnerable nodes, i.e. sorting the nodes it can attack and selecting the most vulnerable) 

#### Blue: 

Action set: reduce vulnerability, restore node (considering removing restore node, as blue kept winning by immediately restoring the first node that red attacks) 

100% chance of immediately discovering intrusions 

#### Game rules: 

Max steps: 30, no grace period 

Blue loss if high value node lost

#### Observation space 

Compromised status, vulnerability scores and node connections. I removed special nodes from the observation space, as they can largely be inferred from the rewards and add a lot of dimensions.

#### Rewards 

-100 for a loss, +100 for reaching the end of the episode.  

The negative reward is reduced for losses that happen closer to the final timestep. 


In [5]:
# Fetch the game mode used as "simple_mode" from the database by its id string
# (the id below looks like a UUID)
simple_mode = db.get("919da33c-7bc9-4d29-99eb-097a7e9bb016")

JSONDecodeError: Extra data: line 1 column 15878 (char 15877)

In [None]:
## Build the network interface from the simple game mode and the network
s_network_interface = NetworkInterface(game_mode=simple_mode, network=network)

## Create the red and blue agent interfaces; both are built on the same network interface
red = RedInterface(s_network_interface)
blue = BlueInterface(s_network_interface)

## Create the environment from the two agent interfaces and the network interface
s_env = GenericNetworkEnv(red, blue, s_network_interface)

## Check the environment is compliant with the OpenAI gym interface (warnings enabled),
## then reset it; the value returned by reset() is discarded
check_env(s_env, warn=True)
_ = s_env.reset()

In [None]:
## Initialise environment callback
eval_callback = EvalCallback(Monitor(s_env), eval_freq=1000, deterministic=False, render=False)

## Create agent
s_agent = PPO(PPOMlp, s_env, verbose=1, tensorboard_log="./logs/ppo_linear_tensorboard2/")

## Train agent for 200,000 timesteps
s_agent.learn(total_timesteps=200000, n_eval_episodes=1, callback=eval_callback)
%tensorboard --logdir ./logs/ppo_linear_tensorboard2/

In [None]:
## Save the trained agent under the name below (a path relative to the working directory)
## NOTE(review): stable-baselines3 presumably appends a ".zip" extension - confirm
s_agent_name = "ppo-s-linear"
s_agent.save(s_agent_name)

In [None]:
# Evaluation

# Create a new environment object for evaluation. It is built from the same red, blue
# and s_network_interface objects as the training environment.
# NOTE(review): because those objects are shared, state is presumably shared with s_env - confirm
eval_env =GenericNetworkEnv(red, blue, s_network_interface)

# Evaluate the model with 10 evaluation episodes and deterministic=True;
# returns the mean and standard deviation of the per-episode reward
mean_reward, std_reward = evaluate_policy(s_agent, eval_env, n_eval_episodes=10, deterministic=True)

# Print the results (the mean is rounded to 2 decimal places, the std is printed unrounded)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
# Create gif: run one episode of the trained agent on the evaluation environment
loop = ActionLoop(eval_env, s_agent, episode_count=1)
# Forward slashes are used for the output directories: '.\gifs' contained the invalid
# escape sequence '\g' and only names a sub-directory on Windows, whereas './gifs'
# resolves to the same relative directory on every platform.
loop.gif_action_loop(save_gif=True, render_network=True, gif_output_directory='./gifs', webm_output_directory='./gifs')

In [None]:
# Build a fresh single-episode action loop and print its string representation
loop = ActionLoop(eval_env, s_agent, episode_count=1)
print(loop)

## Replay

In [None]:
import pandas as pd

In [None]:
        # Replay the trained agent for 1000 episodes on the evaluation environment.
        # complete_results ends up holding one DataFrame per episode, one row per step,
        # with the columns: action, rewards, info, old_state, new_state.
        complete_results = []
        for episode in range(1000):
            # Steps are collected in a plain list and converted to a DataFrame once per
            # episode; growing a DataFrame row by row re-allocates it on every step.
            episode_rows = []
            obs = eval_env.reset()
            done = False

            while not done:
                # gets the agents prediction for the best next action to take
                old_state = obs
                action, _states = s_agent.predict(obs, deterministic=True)

                # step the env
                obs, rewards, done, info = eval_env.step(action)

                episode_rows.append(
                    {
                        "action": action,
                        "rewards": rewards,
                        "info": info,
                        "old_state": old_state,
                        "new_state": obs,
                    }
                )

            results = pd.DataFrame(
                episode_rows,
                columns=["action", "rewards", "info", "old_state", "new_state"],
            )
            complete_results.append(results)

In [None]:
# Display the step-by-step record of the second replay episode (list index 1)
complete_results[1]

In [None]:
# Display the `info` value recorded at the row labelled 11 of the second episode
# NOTE(review): presumably fails with a KeyError if that episode has fewer than 12 rows - confirm
complete_results[1]['info'][11]

In [None]:
# Display the observation stored as `new_state` at the row labelled 11 of the second episode
# NOTE(review): presumably fails with a KeyError if that episode has fewer than 12 rows - confirm
complete_results[1]['new_state'][11]


In [None]:
# complete_results is a list of per-episode DataFrames, not an attribute of the pandas
# module, so the episodes are stacked into a single frame (indexed by episode number and
# step number) before being written out.
pd.concat(
    complete_results,
    keys=range(len(complete_results)),
    names=["episode", "step"],
).to_csv("replay data")
