## First, create the network


In [1]:
from yawning_titan.networks.node import Node
from yawning_titan.networks.network import Network

#Import packages - SB3
import time
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.ppo import MlpPolicy as PPOMlp

from yawning_titan.envs.generic.core.blue_interface import BlueInterface
from yawning_titan.envs.generic.core.red_interface import RedInterface
from yawning_titan.envs.generic.generic_env import GenericNetworkEnv
from yawning_titan.envs.generic.core.action_loops import ActionLoop
from yawning_titan.envs.generic.core.network_interface import NetworkInterface
from yawning_titan.networks.network_db import default_18_node_network
import yawning_titan.game_modes
from yawning_titan.envs.generic.core.action_loops import ActionLoop

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Start from an empty Network and populate it with a five-node chain.
network = Network()

# One entry per node, left to right: (name, x position, extra attributes to set).
# Only PC 1 is flagged as an entry node and only PC 5 as a high value node;
# the other nodes keep whatever defaults Node gives those flags.
node_specs = [
    ("PC 1", -1.00, {"entry_node": True}),
    ("PC 2", -0.50, {}),
    ("PC 3", 0.00, {}),
    ("PC 4", 0.50, {}),
    ("PC 5", 1.00, {"high_value_node": True}),
]

chain = []
for node_name, x_coord, extra_attrs in node_specs:
    # Each node is registered with the network first and configured afterwards.
    node = Node(node_name)
    network.add_node(node)
    node.x_pos = x_coord
    node.y_pos = 0.01
    for attr_name, attr_value in extra_attrs.items():
        setattr(node, attr_name, attr_value)
    node.vulnerability = 0.5
    chain.append(node)

# Keep the individual node handles available under their usual names.
pc_1, pc_2, pc_3, pc_4, pc_5 = chain

# Link each node to its right-hand neighbour: PC 1 - PC 2 - PC 3 - PC 4 - PC 5.
for left_node, right_node in zip(chain, chain[1:]):
    network.add_edge(left_node, right_node)

# NOTE(review): a `network.reset()` call used to sit here but is disabled, so the
# attribute values assigned above are the ones displayed below.

# Print the per-node details of the finished network.
network.show(verbose=True)

UUID                                  Name    High Value Node    Entry Node      Vulnerability  Position (x,y)
------------------------------------  ------  -----------------  ------------  ---------------  ----------------
688308b1-8c7b-4fc0-9bef-f21af31eab8e  PC 1    False              True                      0.5  -1.00, 0.01
54cf8097-6b21-4437-87c8-f7f01c539274  PC 2    False              False                     0.5  -0.50, 0.01
37d7fb0d-5f6c-4abe-892f-49fbf75e37eb  PC 3    False              False                     0.5  0.00, 0.01
3f4cc577-52b8-45c7-8159-3354ef0a4fee  PC 4    False              False                     0.5  0.50, 0.01
27893026-be4c-4120-a82b-833e3eb4abed  PC 5    True               False                     0.5  1.00, 0.01


In [3]:
## Using DB
# Instantiate the game mode database handle; it is used in the following cells
# to list the stored game modes and to fetch one of them.
from yawning_titan.game_modes.game_mode_db import GameModeDB, GameModeSchema
from yawning_titan.db.doc_metadata import DocMetadataSchema
db = GameModeDB()

In [4]:
# List the game modes held in the database.
# NOTE(review): the positional True is presumably a verbose flag — confirm against GameModeDB.show.
db.show(True)

name               author              locked    uuid
-----------------  ------------------  --------  ------------------------------------
DCBO Agent Config  dstl/YAWNING-TITAN  True      bac2cb9d-b24b-426c-88a5-5edd0c2de413
Default Game Mode  dstl/YAWNING-TITAN  True      900a704f-6271-4994-ade7-40b74d3199b1
Low skill red      dstl/YAWNING-TITAN  True      3ccd9988-8781-4c3e-9c75-44cc987ae6af
simple_mode        Hannah Harrison     False     919da33c-7bc9-4d29-99eb-097a7e9bb016
no_zero_day        Hannah Harrison     False     fe76bb6c-4806-41af-aaf3-ac78d2942021


In [5]:
# uuid of the "simple_mode" entry shown in the game mode listing
simple_mode_uuid = "919da33c-7bc9-4d29-99eb-097a7e9bb016"
simple_mode = db.get(simple_mode_uuid)

In [6]:
## Build network interface
# Combine the game mode fetched from the database with the hand-built network.
s_network_interface = NetworkInterface(game_mode=simple_mode, network=network)

In [7]:
## Name agents
# Both the red and the blue interface are built on the same network interface.
red = RedInterface(s_network_interface)
blue = BlueInterface(s_network_interface)

In [8]:
## Create environment
# The environment is assembled from the two agent interfaces and the network
# interface they share.
s_env = GenericNetworkEnv(red, blue, s_network_interface)

In [9]:
## Check compliant with OpenAI gym
# check_env is the Stable-Baselines3 environment checker imported at the top of
# the notebook; warn=True is passed through to it.
check_env(s_env, warn=True)
# Reset the environment after the check; the return value is not needed here.
_ = s_env.reset()

## Creating a dataset of states and actions
Reference: [Stable-Baselines3 pretraining Colab notebook](https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/pretraining.ipynb#scrollTo=Tgx4AMZo8anP)

In [10]:
import gymnasium as gym
from tqdm import tqdm
import numpy as np

# Show which version of the package imported under the alias `gym` is installed.
print(gym.__version__)

0.26.3


In [11]:
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

In [12]:
from stable_baselines3 import PPO, A2C, SAC, TD3
from stable_baselines3.common.evaluation import evaluate_policy

In [21]:
# Load the saved PPO model from disk and attach it to the environment built above.
model_path = './ppo-s-linear.zip'
agent = PPO.load(model_path, env=s_env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [23]:
# Evaluate the loaded agent over 10 episodes, then report the two statistics
# returned by evaluate_policy.
evaluation = evaluate_policy(agent, s_env, n_eval_episodes=10)
mean_reward, std_reward = evaluation

print(f"Mean reward = {mean_reward} +/- {std_reward}")

Mean reward = -88.50666600763797 +/- 0.3711846905218861


In [24]:
def interact_with_agent(env, agent, max_steps=30):
    """
    Roll out one episode of ``agent`` acting in ``env`` and record every step.

    The environment is reset once, then the agent is queried deterministically
    until the environment reports ``done`` or ``max_steps`` steps were taken.

    Parameters:
        env: Environment exposing ``reset()`` returning the initial state and
            ``step(action)`` returning ``(new_state, reward, done, info)``.
        agent: Object exposing ``predict(state, deterministic=True)`` that
            returns a pair whose first element is the action, e.g. a
            Stable Baselines 3 PPO agent.
        max_steps (int): The maximum number of steps to interact with the agent.

    Returns:
        List of 5-tuples ``(action, prev_state, new_state, reward, info)``, one
        per step taken. ``prev_state`` and ``new_state`` are NumPy arrays built
        from the states; ``info`` is whatever ``env.step`` returned. The list
        is empty when ``max_steps`` is 0.
    """
    interactions = []

    # Reset the environment to get the initial state
    state = env.reset()

    for _ in range(max_steps):
        # Snapshot the state the agent is about to act on
        prev_state = np.array(state)

        # Get the action from the agent (second return value is not needed)
        action, _ = agent.predict(state, deterministic=True)

        # Perform the action in the environment
        new_state, reward, done, info = env.step(action)

        # Record the full transition, including the info returned by the env
        interactions.append((action, prev_state, np.array(new_state), reward, info))

        # Update the state for the next step
        state = new_state

        # If the episode is done, break out of the loop
        if done:
            break

    return interactions

In [25]:
if __name__ == "__main__":
    # Roll out one episode with the loaded agent and print a line per step.
    interactions = interact_with_agent(s_env, agent)

    # Each record is (action, prev_state, new_state, reward, info). Unpacking only
    # four names raises "too many values to unpack"; the info entry is not printed.
    for action, prev_state, new_state, reward, _info in interactions:
        print(f"Action: {action}, Prev State: {prev_state}, New State: {new_state}, Reward: {reward}")

Action: 0, Prev State: [0.  1.  0.  0.  0.  1.  0.  1.  0.  0.  0.  1.  0.  1.  0.  0.  0.  1.
 0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5
 0.5 0.5 0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
 0.  0.  0.  0.  0.  1.  0.7], New State: [0.  1.  0.  0.  0.  1.  0.  1.  0.  0.  0.  1.  0.  1.  0.  0.  0.  1.
 0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5
 0.5 0.5 0.5 0.3 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
 0.  0.  0.  0.  0.  1.  0.7], Reward: 0.3000000000000007
Action: 0, Prev State: [0.  1.  0.  0.  0.  1.  0.  1.  0.  0.  0.  1.  0.  1.  0.  0.  0.  1.
 0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5
 0.5 0.5 0.5 0.3 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
 0.  0.  0.  0.  0.  1.  0.7], New State: [0.  1.  0.  0.  0.  1.  0.  1.  0.  0.  0.  1.  0.  1.  0.  0.  0.  1.
 0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5
 0.5 0.5 0.5 0.1

### Method 2
Using network_interface.py

In [26]:
import pandas as pd

In [27]:
        # Run 100 episodes with the loaded agent; each episode yields one DataFrame
        # with a row per step (columns: action, rewards, info).
        complete_results = []
        for _episode in range(100):
            # Collect the steps in a plain list and build the DataFrame once per
            # episode; growing a DataFrame row by row re-allocates on every step.
            step_records = []
            obs = s_env.reset()
            done = False

            while not done:
                # gets the agents prediction for the best next action to take
                action, _states = agent.predict(obs, deterministic=True)

                # step the env
                obs, rewards, done, info = s_env.step(action)

                step_records.append({"action": action, "rewards": rewards, "info": info})

            results = pd.DataFrame(step_records, columns=["action", "rewards", "info"])
            complete_results.append(results)


In [28]:
complete_results[0]

Unnamed: 0,action,rewards,info
0,0,0.3,{'initial_state': {'688308b1-8c7b-4fc0-9bef-f2...
1,0,0.3,{'initial_state': {'688308b1-8c7b-4fc0-9bef-f2...
2,0,-0.14,{'initial_state': {'688308b1-8c7b-4fc0-9bef-f2...
3,0,-1.0,{'initial_state': {'688308b1-8c7b-4fc0-9bef-f2...
4,1,-3.5,{'initial_state': {'688308b1-8c7b-4fc0-9bef-f2...
5,1,-3.5,{'initial_state': {'688308b1-8c7b-4fc0-9bef-f2...
6,1,-3.5,{'initial_state': {'688308b1-8c7b-4fc0-9bef-f2...
7,1,-3.5,{'initial_state': {'688308b1-8c7b-4fc0-9bef-f2...
8,1,-3.5,{'initial_state': {'688308b1-8c7b-4fc0-9bef-f2...
9,1,-3.5,{'initial_state': {'688308b1-8c7b-4fc0-9bef-f2...


In [29]:
complete_results[1]['info'][2]

{'initial_state': {'688308b1-8c7b-4fc0-9bef-f21af31eab8e': 0,
  '54cf8097-6b21-4437-87c8-f7f01c539274': 0,
  '37d7fb0d-5f6c-4abe-892f-49fbf75e37eb': 0,
  '3f4cc577-52b8-45c7-8159-3354ef0a4fee': 0,
  '27893026-be4c-4120-a82b-833e3eb4abed': 0},
 'initial_blue_view': {'688308b1-8c7b-4fc0-9bef-f21af31eab8e': 0,
  '54cf8097-6b21-4437-87c8-f7f01c539274': 0,
  '37d7fb0d-5f6c-4abe-892f-49fbf75e37eb': 0,
  '3f4cc577-52b8-45c7-8159-3354ef0a4fee': 0,
  '27893026-be4c-4120-a82b-833e3eb4abed': 0},
 'initial_vulnerabilities': {'688308b1-8c7b-4fc0-9bef-f21af31eab8e': 0.5,
  '54cf8097-6b21-4437-87c8-f7f01c539274': 0.5,
  '37d7fb0d-5f6c-4abe-892f-49fbf75e37eb': 0.5,
  '3f4cc577-52b8-45c7-8159-3354ef0a4fee': 0.5,
  '27893026-be4c-4120-a82b-833e3eb4abed': 0.09999999999999998},
 'initial_red_location': None,
 'initial_graph': {Node(uuid='688308b1-8c7b-4fc0-9bef-f21af31eab8e', name='PC 1', high_value_node=False, entry_node=True, vulnerability=0.5, x_pos=-1.0, y_pos=0.01): {Node(uuid='54cf8097-6b21-4437-87c