## 7/5: File used for initial testing of running randomization agent

This is all some intro code just to visualize some of the settings and get a baseline. You can run through it if you want to get familiar with what the scenario arguments, agents, and training looks like.

In [14]:
testfile = 'data.yaml'


def writeToYAML():
    with open(testfile, 'w') as f:
        data = yaml.dump(curr_data, f, sort_keys=False, default_flow_style=False)

In [15]:
def print_nested_dict(dict_obj, indent = 0):
    ''' Pretty Print nested dictionary with given indent level  
    '''
    # Iterate over all key-value pairs of dictionary
    for key, value in dict_obj.items():
        # If value is dict type, then print nested dict 
        if isinstance(value, dict):
            print(' ' * indent, key, ':', '{')
            print_nested_dict(value, indent + 4)
            print(' ' * indent, '}')
        else:
            print(' ' * indent, key, ':', value)

In [16]:
scenario_args={
    "num_hosts": 5,         # Number of hosts in the network 
    
    "num_services": 3,      # Number of services on the network (ssh, ftp, http)
    
    "num_os": 2,            # Number of operatings systems on the network (windows, linux, etc)
    
    "num_processes": 2,     # Number of processes on the network (tomcat, daclsvc, etc)
    
    "num_exploits": None,   # Number of exploits to use
    
    "num_privescs": None,   # Number of privilege escalation actions
    
    "r_sensitive": 10,      # Reward for sensitive subnet documents (default 10)
    
    "r_user": 10,           # Reward for user subnet documents      (default 10)
    
    "exploit_cost": 1,      # Cost to use an exploit (default 1)
    
    "exploit_probs": 1.0,   # Sucess probability of exploits (default 1.0)
    
    "privesc_cost": 1,      # Cost of privilege escalation action (default 1)
    
    "privesc_probs": 1.0,   # Sucess probability of privilege escalation action (default 1.0)
    
    "service_scan_cost": 1, # Cost for a service scan (default 1)
    
    "os_scan_cost": 1,      # Cost for an OS scan (default 1)
    
    "subnet_scan_cost": 1,  # Cost for a subnet scan (default 1)
    
    "process_scan_cost": 1, # Cost for a process scan (default 1)
    
    "uniform": False,       # Whether to use uniform distribution or correlaed host configuration (default false)
    
    "alpha_H": 2.0,         # Scaling or concentration parameter for controlling corelation between host configurations (default 2.0)
    
    "alpha_V": 2.0,         # Scaling or concentration parameter for controlling corelation between services across host configruations (default 2.0)
    
    "lambda_V": 1.0,        # Parameter for controlling average number of services running per host configuration (default 1.0)
    
    "restrictiveness": 5,   # Maximum number of services allowed to pass through firewalls between zones (default 5)
    
    "random_goal": False,   # Whether to randomly assign the goal user host or not (default False)
    
    "base_host_value": 1,   # Value of non sensitive hosts (default 1)
    
    "host_discovery_value": 1,  # Value of discovering a host for the first time (default 1)
    
    "seed": None,           # Random number generator seed (default None)
    
    "name": None,           # Name of the scenario, one will be generated if None (default None)
    
    "step_limit": None}     # Max number of steps permitted in a single episode, None means no limit (default None)

#Scenario Generator Parameter List: https://networkattacksimulator.readthedocs.io/en/latest/reference/scenarios/generator.html#scenario-generator

In [18]:
import nasim
import json
env = nasim.generate(**scenario_args)
env = nasim.make_benchmark("huge-gen")
env = nasim.load("unreachable.yaml")


scenario_desc = env.scenario.get_description() #get_description found in scenario.py file under nasim->scenarios
scenario_dict = env.scenario.scenario_dict
#scenario_exploit_map = env.scenario.exploit_map # A nested dictionary for all exploits in scenario.
#scenario_privesc_map = env.scenario.privesc_map # A nested dictionary for all privilege escalation actions in scenario.

print("Scenario Description: ")
print_nested_dict(scenario_desc,4)

print("\nScenario Dictionary: ")
print_nested_dict(scenario_dict,6)

Scenario Description: 
     Name : unreachable
     Type : static
     Subnets : 4
     Hosts : 3
     OS : 1
     Services : 1
     Processes : 1
     Exploits : 1
     PrivEscs : 1
     Actions : 18
     Observation Dims : (4, 14)
     States : 576
     Step Limit : 1000

Scenario Dictionary: 
       subnets : [1, 1, 1, 1]
       topology : [[1, 1, 0, 0], [1, 1, 1, 1], [0, 1, 1, 1], [0, 1, 1, 1]]
       os : ['linux']
       services : ['ssh']
       processes : ['tomcat']
       sensitive_hosts : {
           (2, 0) : 100
           (3, 0) : 100
       }
       exploits : {
           e_ssh : {
               service : ssh
               os : linux
               prob : 0.8
               cost : 1
               access : 1
           }
       }
       privilege_escalation : {
           pe_tomcat : {
               process : tomcat
               os : linux
               prob : 1.0
               cost : 1
               access : 2
           }
       }
       os_scan_cost : 1
     

In [19]:
#env.get_minimum_actions()

In [20]:
 """An example Tabular, epsilon greedy Q-Learning Agent.

This agent does not use an Experience replay (see the 'ql_replay_agent.py')

It uses pytorch 1.5+ tensorboard library for logging (HINT: these dependencies
can be installed by running pip install nasim[dqn])

To run 'tiny' benchmark scenario with default settings, run the following from
the nasim/agents dir:

$ python ql_agent.py tiny

To see detailed results using tensorboard:

$ tensorboard --logdir runs/

To see available hyperparameters:

$ python ql_agent.py --help

Notes
-----

This is by no means a state of the art implementation of Tabular Q-Learning.
It is designed to be an example implementation that can be used as a reference
for building your own agents and for simple experimental comparisons.
"""
import random
import numpy as np
from pprint import pprint

import nasim

try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError as e:
    from gymnasium import error
    raise error.DependencyNotInstalled(
        f"{e}. (HINT: you can install tabular_q_learning_agent dependencies "
        "by running 'pip install nasim[dqn]'.)"
    )


class TabularQFunction:
    """Tabular Q-Function """

    def __init__(self, num_actions):
        self.q_func = dict()
        self.num_actions = num_actions

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        if isinstance(x, np.ndarray):
            x = str(x.astype(int))
        if x not in self.q_func:
            self.q_func[x] = np.zeros(self.num_actions, dtype=np.float32)
        return self.q_func[x]

    def forward_batch(self, x_batch):
        return np.asarray([self.forward(x) for x in x_batch])

    def update_batch(self, s_batch, a_batch, delta_batch):
        for s, a, delta in zip(s_batch, a_batch, delta_batch):
            q_vals = self.forward(s)
            q_vals[a] += delta

    def update(self, s, a, delta):
        q_vals = self.forward(s)
        q_vals[a] += delta

    def get_action(self, x):
        return int(self.forward(x).argmax())

    def display(self):
        pprint(self.q_func)


class TabularQLearningAgent:
    """A Tabular. epsilon greedy Q-Learning Agent using Experience Replay """

    def __init__(self,
                 env,
                 seed=None,
                 lr=0.001,
                 training_steps=10000,
                 final_epsilon=0.05,
                 exploration_steps=10000,
                 gamma=0.99,
                 verbose=True,
                 **kwargs):

        # This implementation only works for flat actions
        assert env.flat_actions
        self.verbose = verbose
        if self.verbose:
            print("\nRunning Tabular Q-Learning with config:")
            pprint(locals())

        # set seeds
        self.seed = seed
        if self.seed is not None:
            np.random.seed(self.seed)

        # envirnment setup
        self.env = env

        self.num_actions = self.env.action_space.n
        self.obs_dim = self.env.observation_space.shape

        # logger setup
        self.logger = SummaryWriter()

        # Training related attributes
        self.lr = lr
        self.exploration_steps = exploration_steps
        self.final_epsilon = final_epsilon
        self.epsilon_schedule = np.linspace(
            1.0, self.final_epsilon, self.exploration_steps
        )
        self.discount = gamma
        self.training_steps = training_steps
        self.steps_done = 0

        # Q-Function
        self.qfunc = TabularQFunction(self.num_actions)

    def get_epsilon(self):
        if self.steps_done < self.exploration_steps:
            return self.epsilon_schedule[self.steps_done]
        return self.final_epsilon

    def get_egreedy_action(self, o, epsilon):
        if random.random() > epsilon:
            return self.qfunc.get_action(o)
        return random.randint(0, self.num_actions-1)

    def optimize(self, s, a, next_s, r, done):
        # get q_val for state and action performed in that state
        q_vals_raw = self.qfunc.forward(s)
        q_val = q_vals_raw[a]

        # get target q val = max val of next state
        target_q_val = self.qfunc.forward(next_s).max()
        target = r + self.discount * (1-done) * target_q_val

        # calculate error and update
        td_error = target - q_val
        td_delta = self.lr * td_error

        # optimize the model
        self.qfunc.update(s, a, td_delta)

        s_value = q_vals_raw.max()
        return td_error, s_value

    def train(self):
        if self.verbose:
            print("\nStarting training")

        num_episodes = 0
        training_steps_remaining = self.training_steps

        while self.steps_done < self.training_steps:
            ep_results = self.run_train_episode(training_steps_remaining)
            ep_return, ep_steps, goal = ep_results
            num_episodes += 1
            training_steps_remaining -= ep_steps

            self.logger.add_scalar("episode", num_episodes, self.steps_done)
            self.logger.add_scalar(
                "epsilon", self.get_epsilon(), self.steps_done
            )
            self.logger.add_scalar(
                "episode_return", ep_return, self.steps_done
            )
            self.logger.add_scalar(
                "episode_steps", ep_steps, self.steps_done
            )
            self.logger.add_scalar(
                "episode_goal_reached", int(goal), self.steps_done
            )

            if num_episodes % 10 == 0 and self.verbose:
                print(f"\nEpisode {num_episodes}:")
                print(f"\tsteps done = {self.steps_done} / "
                      f"{self.training_steps}")
                print(f"\treturn = {ep_return}")
                print(f"\tgoal = {goal}")

        self.logger.close()
        if self.verbose:
            print("Training complete")
            print(f"\nEpisode {num_episodes}:")
            print(f"\tsteps done = {self.steps_done} / {self.training_steps}")
            print(f"\treturn = {ep_return}")
            print(f"\tgoal = {goal}")

    def run_train_episode(self, step_limit):
        s, _ = self.env.reset()
        done = False
        env_step_limit_reached = False

        steps = 0
        episode_return = 0

        while not done and not env_step_limit_reached and steps < step_limit:
            a = self.get_egreedy_action(s, self.get_epsilon())

            next_s, r, done, env_step_limit_reached, _ = self.env.step(a)
            self.steps_done += 1
            td_error, s_value = self.optimize(s, a, next_s, r, done)
            self.logger.add_scalar("td_error", td_error, self.steps_done)
            self.logger.add_scalar("s_value", s_value, self.steps_done)

            s = next_s
            episode_return += r
            steps += 1

        return episode_return, steps, self.env.goal_reached()

    def run_eval_episode(self,
                         env=None,
                         render=False,
                         eval_epsilon=0.05,
                         render_mode="human"):
        if env is None:
            env = self.env

        original_render_mode = env.render_mode
        env.render_mode = render_mode

        s, _ = env.reset()
        done = False
        env_step_limit_reached = False

        steps = 0
        episode_return = 0

        line_break = "="*60
        if render:
            print("\n" + line_break)
            print(f"Running EVALUATION using epsilon = {eval_epsilon:.4f}")
            print(line_break)
            env.render()
            input("Initial state. Press enter to continue..")

        while not done and not env_step_limit_reached:
            a = self.get_egreedy_action(s, eval_epsilon)
            next_s, r, done, env_step_limit_reached, _ = env.step(a)
            s = next_s
            episode_return += r
            steps += 1
            if render:
                print("\n" + line_break)
                print(f"Step {steps}")
                print(line_break)
                print(f"Action Performed = {env.action_space.get_action(a)}")
                env.render()
                print(f"Reward = {r}")
                print(f"Done = {done}")
                print(f"Step limit reached = {env_step_limit_reached}")
                input("Press enter to continue..")

                if done or env_step_limit_reached:
                    print("\n" + line_break)
                    print("EPISODE FINISHED")
                    print(line_break)
                    print(f"Goal reached = {env.goal_reached()}")
                    print(f"Total steps = {steps}")
                    print(f"Total reward = {episode_return}")

        env.render_mode = original_render_mode
        return episode_return, steps, env.goal_reached()


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("env_name", type=str, help="benchmark scenario name")
    parser.add_argument("--render_eval", action="store_true",
                        help="Renders final policy")
    parser.add_argument("--lr", type=float, default=0.001,
                        help="Learning rate (default=0.001)")
    parser.add_argument("-t", "--training_steps", type=int, default=10000,
                        help="training steps (default=10000)")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="(default=32)")
    parser.add_argument("--seed", type=int, default=0,
                        help="(default=0)")
    parser.add_argument("--replay_size", type=int, default=100000,
                        help="(default=100000)")
    parser.add_argument("--final_epsilon", type=float, default=0.05,
                        help="(default=0.05)")
    parser.add_argument("--init_epsilon", type=float, default=1.0,
                        help="(default=1.0)")
    parser.add_argument("-e", "--exploration_steps", type=int, default=10000,
                        help="(default=10000)")
    parser.add_argument("--gamma", type=float, default=0.99,
                        help="(default=0.99)")
    parser.add_argument("--quite", action="store_false",
                        help="Run in Quite mode")
    args = parser.parse_args()

    env = nasim.make_benchmark(
        args.env_name,
        args.seed,
        fully_obs=True,
        flat_actions=True,
        flat_obs=True
    )
    ql_agent = TabularQLearningAgent(
        env, verbose=args.quite, **vars(args)
    )
    #ql_agent.train()
    #ql_agent.run_eval_episode(render=args.render_eval)


usage: ipykernel_launcher.py [-h] [--render_eval] [--lr LR]
                             [-t TRAINING_STEPS] [--batch_size BATCH_SIZE]
                             [--seed SEED] [--replay_size REPLAY_SIZE]
                             [--final_epsilon FINAL_EPSILON]
                             [--init_epsilon INIT_EPSILON]
                             [-e EXPLORATION_STEPS] [--gamma GAMMA] [--quite]
                             env_name
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

In [21]:
#NOW USING AGENT ABOVE INSTEAD OF IMPORTING AGENT:
#USED TO BE: from nasim.agents.ql_agent import TabularQLearningAgent

ql_agent = TabularQLearningAgent(env, verbose=1, training_steps=50000)
ql_agent.train()


Running Tabular Q-Learning with config:
{'env': <nasim.envs.environment.NASimEnv object at 0x7f7a1c172a40>,
 'exploration_steps': 10000,
 'final_epsilon': 0.05,
 'gamma': 0.99,
 'kwargs': {},
 'lr': 0.001,
 'seed': None,
 'self': <__main__.TabularQLearningAgent object at 0x7f78b88675b0>,
 'training_steps': 50000,
 'verbose': 1}

Starting training

Episode 10:
	steps done = 10000 / 50000
	return = -1000.0
	goal = False

Episode 20:
	steps done = 20000 / 50000
	return = -1000.0
	goal = False

Episode 30:
	steps done = 30000 / 50000
	return = -1000.0
	goal = False

Episode 40:
	steps done = 40000 / 50000
	return = -1000.0
	goal = False

Episode 50:
	steps done = 50000 / 50000
	return = -1000.0
	goal = False
Training complete

Episode 50:
	steps done = 50000 / 50000
	return = -1000.0
	goal = False


## Current Code 
Here is the main code to test/run.

In [22]:
# Initial scenario arguments... we will be editing the number of hosts by marking actions involving them as invalid
scenario_args={
    "num_hosts": 5,         # Number of hosts in the network 
    
    "num_services": 3,      # Number of services on the network (ssh, ftp, http)
    
    "num_os": 2,            # Number of operatings systems on the network (windows, linux, etc)
    
    "num_processes": 2,     # Number of processes on the network (tomcat, daclsvc, etc)
    
    "num_exploits": None,   # Number of exploits to use
    
    "num_privescs": None,   # Number of privilege escalation actions
    
    "r_sensitive": 10,      # Reward for sensitive subnet documents (default 10)
    
    "r_user": 10,           # Reward for user subnet documents      (default 10)
    
    "exploit_cost": 1,      # Cost to use an exploit (default 1)
    
    "exploit_probs": 1.0,   # Sucess probability of exploits (default 1.0)
    
    "privesc_cost": 1,      # Cost of privilege escalation action (default 1)
    
    "privesc_probs": 1.0,   # Sucess probability of privilege escalation action (default 1.0)
    
    "service_scan_cost": 1, # Cost for a service scan (default 1)
    
    "os_scan_cost": 1,      # Cost for an OS scan (default 1)
    
    "subnet_scan_cost": 1,  # Cost for a subnet scan (default 1)
    
    "process_scan_cost": 1, # Cost for a process scan (default 1)
    
    "uniform": False,       # Whether to use uniform distribution or correlaed host configuration (default false)
    
    "alpha_H": 2.0,         # Scaling or concentration parameter for controlling corelation between host configurations (default 2.0)
    
    "alpha_V": 2.0,         # Scaling or concentration parameter for controlling corelation between services across host configruations (default 2.0)
    
    "lambda_V": 1.0,        # Parameter for controlling average number of services running per host configuration (default 1.0)
    
    "restrictiveness": 5,   # Maximum number of services allowed to pass through firewalls between zones (default 5)
    
    "random_goal": False,   # Whether to randomly assign the goal user host or not (default False)
    
    "base_host_value": 1,   # Value of non sensitive hosts (default 1)
    
    "host_discovery_value": 1,  # Value of discovering a host for the first time (default 1)
    
    "seed": None,           # Random number generator seed (default None)
    
    "name": None,           # Name of the scenario, one will be generated if None (default None)
    
    "step_limit": None}     # Max number of steps permitted in a single episode, None means no limit (default None)

#Scenario Generator Parameter List: https://networkattacksimulator.readthedocs.io/en/latest/reference/scenarios/generator.html#scenario-generator

In [23]:
# Defining Python user-defined exceptions
class SensitiveHostRemovalException(Exception):
    "Raised when selected network host cannot be removed (sensitive host needs to remain in network)"
    pass

class PublicHostRemovalException(Exception):
    "Raised when selected network host cannot be removed (public host to enter the network... specific to this configuration)"
    pass

In [24]:
# DEFAULT DQN AGENT AS CONTROL

"""An example DQN Agent.

It uses pytorch 1.5+ and tensorboard libraries (HINT: these dependencies can
be installed by running pip install nasim[dqn])

To run 'tiny' benchmark scenario with default settings, run the following from
the nasim/agents dir:

$ python dqn_agent.py tiny

To see detailed results using tensorboard:

$ tensorboard --logdir runs/

To see available hyperparameters:

$ python dqn_agent.py --help

Notes
-----

This is by no means a state of the art implementation of DQN, but is designed
to be an example implementation that can be used as a reference for building
your own agents.
"""
import random
from pprint import pprint

from gymnasium import error
import numpy as np

import nasim

try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    import torch.nn.functional as F
    from torch.utils.tensorboard import SummaryWriter
except ImportError as e:
    raise error.DependencyNotInstalled(
        f"{e}. (HINT: you can install dqn_agent dependencies by running "
        "'pip install nasim[dqn]'.)"
    )


class ReplayMemory:

    def __init__(self, capacity, s_dims, device="cpu"):
        self.capacity = capacity
        self.device = device
        self.s_buf = np.zeros((capacity, *s_dims), dtype=np.float32)
        self.a_buf = np.zeros((capacity, 1), dtype=np.int64)
        self.next_s_buf = np.zeros((capacity, *s_dims), dtype=np.float32)
        self.r_buf = np.zeros(capacity, dtype=np.float32)
        self.done_buf = np.zeros(capacity, dtype=np.float32)
        self.ptr, self.size = 0, 0

    def store(self, s, a, next_s, r, done):
        self.s_buf[self.ptr] = s
        self.a_buf[self.ptr] = a
        self.next_s_buf[self.ptr] = next_s
        self.r_buf[self.ptr] = r
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.capacity
        self.size = min(self.size+1, self.capacity)

    def sample_batch(self, batch_size):
        sample_idxs = np.random.choice(self.size, batch_size)
        batch = [self.s_buf[sample_idxs],
                 self.a_buf[sample_idxs],
                 self.next_s_buf[sample_idxs],
                 self.r_buf[sample_idxs],
                 self.done_buf[sample_idxs]]
        return [torch.from_numpy(buf).to(self.device) for buf in batch]


class DQN(nn.Module):
    """A simple Deep Q-Network """

    def __init__(self, input_dim, layers, num_actions):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(input_dim[0], layers[0])])
        for l in range(1, len(layers)):
            self.layers.append(nn.Linear(layers[l-1], layers[l]))
        self.out = nn.Linear(layers[-1], num_actions)

    def forward(self, x):
        for layer in self.layers:
            x = F.relu(layer(x))
        x = self.out(x)
        return x

    def save_DQN(self, file_path):
        torch.save(self.state_dict(), file_path)

    def load_DQN(self, file_path):
        self.load_state_dict(torch.load(file_path))

    def get_action(self, x):
        with torch.no_grad():
            if len(x.shape) == 1:
                x = x.view(1, -1)
            return self.forward(x).max(1)[1]


class DQNAgent:
    """A simple Deep Q-Network Agent """

    def __init__(self,
                 env,
                 seed=None,
                 lr=0.001,
                 training_steps=20000,
                 batch_size=32,
                 replay_size=10000,
                 final_epsilon=0.05,
                 exploration_steps=10000,
                 gamma=0.99,
                 hidden_sizes=[64, 64],
                 target_update_freq=1000,
                 verbose=True,
                 **kwargs):

        # This DQN implementation only works for flat actions
        assert env.flat_actions
        self.verbose = verbose
        if self.verbose:
            print(f"\nRunning DQN with config:")
            pprint(locals())

        # set seeds
        self.seed = seed
        if self.seed is not None:
            np.random.seed(self.seed)

        # environment setup
        self.env = env

        self.num_actions = self.env.action_space.n
        self.obs_dim = self.env.observation_space.shape

        # logger setup
        self.logger = SummaryWriter()

        # Training related attributes
        self.lr = lr
        self.exploration_steps = exploration_steps
        self.final_epsilon = final_epsilon
        self.epsilon_schedule = np.linspace(1.0,
                                            self.final_epsilon,
                                            self.exploration_steps)
        self.batch_size = batch_size
        self.discount = gamma
        self.training_steps = training_steps
        self.steps_done = 0

        # Neural Network related attributes
        self.device = torch.device("cuda"
                                   if torch.cuda.is_available()
                                   else "cpu")
        self.dqn = DQN(self.obs_dim,
                       hidden_sizes,
                       self.num_actions).to(self.device)
        if self.verbose:
            print(f"\nUsing Neural Network running on device={self.device}:")
            print(self.dqn)

        self.target_dqn = DQN(self.obs_dim,
                              hidden_sizes,
                              self.num_actions).to(self.device)
        self.target_update_freq = target_update_freq

        self.optimizer = optim.Adam(self.dqn.parameters(), lr=self.lr)
        self.loss_fn = nn.SmoothL1Loss()

        # replay setup
        self.replay = ReplayMemory(replay_size,
                                   self.obs_dim,
                                   self.device)

    def save(self, save_path):
        self.dqn.save_DQN(save_path)

    def load(self, load_path):
        self.dqn.load_DQN(load_path)

    def get_epsilon(self):
        if self.steps_done < self.exploration_steps:
            return self.epsilon_schedule[self.steps_done]
        return self.final_epsilon

    def get_egreedy_action(self, o, epsilon):
        if random.random() > epsilon:
            o = torch.from_numpy(o).float().to(self.device)
            return self.dqn.get_action(o).cpu().item()
        return random.randint(0, self.num_actions-1)

    def optimize(self):
        batch = self.replay.sample_batch(self.batch_size)
        s_batch, a_batch, next_s_batch, r_batch, d_batch = batch

        # get q_vals for each state and the action performed in that state
        q_vals_raw = self.dqn(s_batch)
        q_vals = q_vals_raw.gather(1, a_batch).squeeze()

        # get target q val = max val of next state
        with torch.no_grad():
            target_q_val_raw = self.target_dqn(next_s_batch)
            target_q_val = target_q_val_raw.max(1)[0]
            target = r_batch + self.discount*(1-d_batch)*target_q_val

        # calculate loss
        loss = self.loss_fn(q_vals, target)

        # optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.steps_done % self.target_update_freq == 0:
            self.target_dqn.load_state_dict(self.dqn.state_dict())

        q_vals_max = q_vals_raw.max(1)[0]
        mean_v = q_vals_max.mean().item()
        return loss.item(), mean_v

    def train(self):
        if self.verbose:
            print("\nStarting training")

        num_episodes = 0
        training_steps_remaining = self.training_steps

        while self.steps_done < self.training_steps:
            ep_results = self.run_train_episode(training_steps_remaining)
            ep_return, ep_steps, goal = ep_results #ep_return, ep_steps, and goal equal ep_results
            num_episodes += 1
            training_steps_remaining -= ep_steps

            self.logger.add_scalar("episode", num_episodes, self.steps_done)
            self.logger.add_scalar(
                "epsilon", self.get_epsilon(), self.steps_done
            )
            self.logger.add_scalar(
                "episode_return", ep_return, self.steps_done
            )
            self.logger.add_scalar(
                "episode_steps", ep_steps, self.steps_done
            )
            self.logger.add_scalar(
                "episode_goal_reached", int(goal), self.steps_done
            )

            if num_episodes % 10 == 0 and self.verbose:
                print(f"\nEpisode {num_episodes}:")
                print(f"\tsteps done = {self.steps_done} / "
                      f"{self.training_steps}")
                print(f"\treturn = {ep_return}")
                print(f"\tgoal = {goal}")

        self.logger.close()
        if self.verbose:
            print("Training complete")
            print(f"\nEpisode {num_episodes}:")
            print(f"\tsteps done = {self.steps_done} / {self.training_steps}")
            print(f"\treturn = {ep_return}")
            print(f"\tgoal = {goal}")

    def run_train_episode(self, step_limit):
        o, _ = self.env.reset()
        done = False
        env_step_limit_reached = False

        steps = 0
        episode_return = 0

        while not done and not env_step_limit_reached and steps < step_limit:
            a = self.get_egreedy_action(o, self.get_epsilon())

            next_o, r, done, env_step_limit_reached, _ = self.env.step(a)
            self.replay.store(o, a, next_o, r, done)
            self.steps_done += 1
            loss, mean_v = self.optimize()
            self.logger.add_scalar("loss", loss, self.steps_done)
            self.logger.add_scalar("mean_v", mean_v, self.steps_done)

            o = next_o
            episode_return += r
            steps += 1

        return episode_return, steps, self.env.goal_reached()

    def run_eval_episode(self,
                         env=None,
                         render=False,
                         eval_epsilon=0.05,
                         render_mode="human"):
        if env is None:
            env = self.env

        original_render_mode = env.render_mode
        env.render_mode = render_mode

        o, _ = env.reset()
        done = False
        env_step_limit_reached = False

        steps = 0
        episode_return = 0

        line_break = "="*60
        if render:
            print("\n" + line_break)
            print(f"Running EVALUATION using epsilon = {eval_epsilon:.4f}")
            print(line_break)
            env.render()
            input("Initial state. Press enter to continue..")

        while not done and not env_step_limit_reached:
            a = self.get_egreedy_action(o, eval_epsilon)
            next_o, r, done, env_step_limit_reached, _ = env.step(a)
            o = next_o
            episode_return += r
            steps += 1
            if render:
                print("\n" + line_break)
                print(f"Step {steps}")
                print(line_break)
                print(f"Action Performed = {env.action_space.get_action(a)}")
                env.render()
                print(f"Reward = {r}")
                print(f"Done = {done}")
                print(f"Step limit reached = {env_step_limit_reached}")
                input("Press enter to continue..")

                if done or env_step_limit_reached:
                    print("\n" + line_break)
                    print("EPISODE FINISHED")
                    print(line_break)
                    print(f"Goal reached = {env.goal_reached()}")
                    print(f"Total steps = {steps}")
                    print(f"Total reward = {episode_return}")

        env.render_mode = original_render_mode
        return episode_return, steps, env.goal_reached()


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("env_name", type=str, help="benchmark scenario name")
    parser.add_argument("--render_eval", action="store_true",
                        help="Renders final policy")
    parser.add_argument("-o", "--partially_obs", action="store_true",
                        help="Partially Observable Mode")
    parser.add_argument("--hidden_sizes", type=int, nargs="*",
                        default=[64, 64],
                        help="(default=[64. 64])")
    parser.add_argument("--lr", type=float, default=0.001,
                        help="Learning rate (default=0.001)")
    parser.add_argument("-t", "--training_steps", type=int, default=20000,
                        help="training steps (default=20000)")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="(default=32)")
    parser.add_argument("--target_update_freq", type=int, default=1000,
                        help="(default=1000)")
    parser.add_argument("--seed", type=int, default=0,
                        help="(default=0)")
    parser.add_argument("--replay_size", type=int, default=100000,
                        help="(default=100000)")
    parser.add_argument("--final_epsilon", type=float, default=0.05,
                        help="(default=0.05)")
    parser.add_argument("--init_epsilon", type=float, default=1.0,
                        help="(default=1.0)")
    parser.add_argument("--exploration_steps", type=int, default=10000,
                        help="(default=10000)")
    parser.add_argument("--gamma", type=float, default=0.99,
                        help="(default=0.99)")
    parser.add_argument("--quite", action="store_false",
                        help="Run in Quite mode")
    args = parser.parse_args()

    env = nasim.make_benchmark(args.env_name,
                               args.seed,
                               fully_obs=not args.partially_obs,
                               flat_actions=True,
                               flat_obs=True)
    dqn_agent = DQNAgent(env, verbose=args.quite, **vars(args))
    dqn_agent.train()
    dqn_agent.run_eval_episode(render=args.render_eval)

usage: ipykernel_launcher.py [-h] [--render_eval] [-o]
                             [--hidden_sizes [HIDDEN_SIZES ...]] [--lr LR]
                             [-t TRAINING_STEPS] [--batch_size BATCH_SIZE]
                             [--target_update_freq TARGET_UPDATE_FREQ]
                             [--seed SEED] [--replay_size REPLAY_SIZE]
                             [--final_epsilon FINAL_EPSILON]
                             [--init_epsilon INIT_EPSILON]
                             [--exploration_steps EXPLORATION_STEPS]
                             [--gamma GAMMA] [--quite]
                             env_name
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

In [26]:
#NOW USING AGENT ABOVE INSTEAD OF IMPORTANT AGENT:
#USED TO BE: from nasim.agents.ql_agent import TabularQLearningAgent

baseline_dqn_agent = DQNAgent(env, verbose=1, training_steps=200000, max_episodes=1000)
baseline_dqn_agent.train()
baseline_dqn_agent.run_eval_episode(render_mode="human")


Running DQN with config:
{'batch_size': 32,
 'env': <nasim.envs.environment.NASimEnv object at 0x7f7a1c172a40>,
 'exploration_steps': 10000,
 'final_epsilon': 0.05,
 'gamma': 0.99,
 'hidden_sizes': [64, 64],
 'kwargs': {'max_episodes': 1000},
 'lr': 0.001,
 'replay_size': 10000,
 'seed': None,
 'self': <__main__.DQNAgent object at 0x7f7a1e36d360>,
 'target_update_freq': 1000,
 'training_steps': 200000,
 'verbose': 1}

Using Neural Network running on device=cuda:
DQN(
  (layers): ModuleList(
    (0): Linear(in_features=56, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=64, bias=True)
  )
  (out): Linear(in_features=64, out_features=18, bias=True)
)

Starting training

Episode 10:
	steps done = 10000 / 200000
	return = -1000.0
	goal = False

Episode 20:
	steps done = 20000 / 200000
	return = -1000.0
	goal = False

Episode 30:
	steps done = 30000 / 200000
	return = -1000.0
	goal = False

Episode 40:
	steps done = 40000 / 200000
	return = -1000.0
	goal = False

E

(-1000.0, 1000, False)

In [6]:
baseline_dqn_agent = DQNAgent(env, verbose=1, training_steps=2000000) #10-20k steps, 10k episodes
baseline_dqn_agent.train()
baseline_dqn_agent.run_eval_episode(render_mode="human")

NameError: name 'env' is not defined

In [23]:
baseline_dqn_agent = DQNAgent(env, verbose=1, training_steps=150000) #10-20k steps, 10k episodes
baseline_dqn_agent.train()
baseline_dqn_agent.run_eval_episode(render_mode="human")


Running DQN with config:
{'batch_size': 32,
 'env': <nasim.envs.environment.NASimEnv object at 0x7f7fe1337910>,
 'exploration_steps': 10000,
 'final_epsilon': 0.05,
 'gamma': 0.99,
 'hidden_sizes': [64, 64],
 'kwargs': {},
 'lr': 0.001,
 'replay_size': 10000,
 'seed': None,
 'self': <nasim.agents.dqn_agent.DQNAgent object at 0x7f7fe80167a0>,
 'target_update_freq': 1000,
 'training_steps': 150000,
 'verbose': 1}

Using Neural Network running on device=cuda:
DQN(
  (layers): ModuleList(
    (0): Linear(in_features=207, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=64, bias=True)
  )
  (out): Linear(in_features=64, out_features=72, bias=True)
)

Starting training
Blocked host index:  -1


ValueError: setting an array element with a sequence. The requested array would exceed the maximum number of dimension of 1.


Running DQN with config:
{'batch_size': 32,
 'env': <nasim.envs.environment.NASimEnv object at 0x7f3692615e70>,
 'exploration_steps': 10000,
 'final_epsilon': 0.05,
 'gamma': 0.99,
 'hidden_sizes': [64, 64],
 'kwargs': {},
 'lr': 0.001,
 'replay_size': 10000,
 'seed': None,
 'self': <__main__.DQNAgent object at 0x7f34f4a52e90>,
 'target_update_freq': 1000,
 'training_steps': 150000,
 'verbose': 1}

Using Neural Network running on device=cuda:
DQN(
  (layers): ModuleList(
    (0): Linear(in_features=207, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=64, bias=True)
  )
  (out): Linear(in_features=64, out_features=72, bias=True)
)

Starting training

Episode 10:
	steps done = 4608 / 150000
	return = -204.0
	goal = True

Episode 20:
	steps done = 13583 / 150000
	return = -1031.0
	goal = False

Episode 30:
	steps done = 23583 / 150000
	return = -1717.0
	goal = False

Episode 40:
	steps done = 33243 / 150000
	return = -953.0
	goal = False

Episode 50:
	steps done 

(-1011.0, 1000, False)

In [18]:
baseline_dqn_agent.run_eval_episode(render_mode="human")

(-1019.0, 1000, False)

In [None]:
# Import necessary libraries, including which methods will be redefined
import nasim
import random
from nasim.envs.action import Action
from nasim.agents.dqn_agent import DQNAgent
from nasim.envs.environment import NASimEnv

# User-defined Python method to check whether the selected blocked_host is valid to select
def check_host_valid(self, blocked_host):
    if blocked_host == -1:
        return
    elif self.env.network.address_space[blocked_host] in self.env.network.get_sensitive_hosts():
        raise SensitiveHostRemovalException
    elif blocked_host == 0:
        raise PublicHostRemovalException
    else:
        return

# Setting the method
DQNAgent.check_host_valid = check_host_valid
    
# Redefining the DQNAgent run_train_episode method
def run_train_episode(self, step_limit):
        done = False
        env_step_limit_reached = False #Unnecessary now with loop below using steps < step_limit
        steps = 0
        episode_return = 0
        max_host_index = len(self.env.network.host_num_map) - 1
        
        # Choosing random host index to be invalid... try/catch loop until valid host selected to block. Note: If -1, no host will be marked invalid
        blocked_host = -1
        if self.steps_done > 0:
            while True:
                try:
                    blocked_host = random.randint(-1,max_host_index)
                    self.check_host_valid(blocked_host)
                    break
                except SensitiveHostRemovalException:
                    pass
                except PublicHostRemovalException:
                    pass
                
        o, _ = self.env.reset()
        
        # If you wanted to see which host was blocked... used for the logging
        print("Blocked host index:  " + str(blocked_host))
        
        while not done and not env_step_limit_reached: #steps < step_limit: #J: changed from env_step_limit_reached:     
            #J: steps continuously updated at the bottom and will break as soon as step limit is reached
            # Keep generating an action in the action space until it does not involve a blocked host
            while True:
                a = self.get_egreedy_action(o, self.get_epsilon())
                
                if blocked_host == -1:
                    break
                else:
                    action = self.env.action_space.get_action(a)
                    target_host_index = self.env.network.host_num_map[action.target]
                    if target_host_index != blocked_host:
                        break
                
            next_o, r, done, env_step_limit_reached, _ = self.env.step(a)
            self.replay.store(o, a, next_o, r, done)
            self.steps_done += 1
            loss, mean_v = self.optimize()
            
            o = next_o
            episode_return += r
            steps += 1

        return episode_return, steps, self.env.goal_reached()

# Setting the method
DQNAgent.run_train_episode = run_train_episode

# Training function... redefined because it wasn't converging originally
def train(self):
    if self.verbose:
        print("\nStarting training")

    num_episodes = 0
    training_steps_remaining = self.training_steps
    og_env = self.env
    
    while self.steps_done < self.training_steps:
        self.env = og_env
        ep_results = self.run_train_episode(training_steps_remaining)
        ep_return, ep_steps, goal = ep_results
        num_episodes += 1
        training_steps_remaining -= ep_steps

        self.logger.add_scalar("episode", num_episodes, self.steps_done)
        self.logger.add_scalar(
            "epsilon", self.get_epsilon(), self.steps_done
        )
        self.logger.add_scalar(
            "episode_return", ep_return, self.steps_done
        )
        self.logger.add_scalar(
            "episode_steps", ep_steps, self.steps_done
        )
        self.logger.add_scalar(
            "episode_goal_reached", int(goal), self.steps_done
        )

        if num_episodes % 10 == 0 and self.verbose:
            print(f"\nEpisode {num_episodes}:")
            print(f"\tsteps done = {self.steps_done} / "
                    f"{self.training_steps}")
            print(f"\treturn = {ep_return}")
            print(f"\tgoal = {goal}")

    self.logger.close()
    if self.verbose:
        print("Training complete")
        print(f"\nEpisode {num_episodes}:")
        print(f"\tsteps done = {self.steps_done} / {self.training_steps}")
        print(f"\treturn = {ep_return}")
        print(f"\tgoal = {goal}")
# Set the method        
DQNAgent.train = train

# You can switch to a different benchmark if you want... like the scenario args posted or your own
env = nasim.make_benchmark("small")
#Liam does it this way: env = nasim.load("small.yaml")
# Initializing and training agent
dqn_agent = DQNAgent(env, verbose=1, training_steps=50000000)
dqn_agent.train()


Running DQN with config:
{'batch_size': 32,
 'env': <nasim.envs.environment.NASimEnv object at 0x7f78b8965fc0>,
 'exploration_steps': 10000,
 'final_epsilon': 0.05,
 'gamma': 0.99,
 'hidden_sizes': [64, 64],
 'kwargs': {},
 'lr': 0.001,
 'replay_size': 10000,
 'seed': None,
 'self': <nasim.agents.dqn_agent.DQNAgent object at 0x7f78b89653c0>,
 'target_update_freq': 1000,
 'training_steps': 50000000,
 'verbose': 1}

Using Neural Network running on device=cuda:
DQN(
  (layers): ModuleList(
    (0): Linear(in_features=207, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=64, bias=True)
  )
  (out): Linear(in_features=64, out_features=72, bias=True)
)

Starting training
Blocked host index:  -1
Blocked host index:  6
Blocked host index:  2
Blocked host index:  2
Blocked host index:  2
Blocked host index:  3
Blocked host index:  4
Blocked host index:  -1
Blocked host index:  5
Blocked host index:  6

Episode 10:
	steps done = 5662 / 50000000
	return = -593.0
	goal = T

In [12]:
dqn_agent.run_eval_episode(render=False)

(-1018.0, 1000, False)

In [13]:
dqn_agent.run_eval_episode(render=False)

(-1011.0, 1000, False)

## Past Attempts

This was some code that didn't end up working if you wanted to see a previous attempt

In [None]:
import numpy as np

capacity = 10
s_dims = (5,)
s_buf = np.zeros((capacity, *s_dims), dtype=np.float32)
#test_tuple.resize(test_tuple, [3,2])

print(s_buf)

In [5]:
import nasim
import random
from nasim.agents.dqn_agent import DQNAgent

def run_train_episode(self, step_limit):
        done = False
        env_step_limit_reached = False
        steps = 0
        episode_return = 0
        
        o = self.env.reset()
        
        while not done and not env_step_limit_reached: #and steps < step_limit:
            a = self.get_egreedy_action(o, self.get_epsilon())
        
            next_o, r, done, env_step_limit_reached, _ = self.env.step(a)
            self.replay.store(o, a, next_o, r, done)
            self.steps_done += 1
            loss, mean_v = self.optimize()
            
            o = next_o
            episode_return += r
            steps += 1

        return episode_return, steps, self.env.goal_reached()
    
DQNAgent.run_train_episode = run_train_episode

def train(self):
    if self.verbose:
        print("\nStarting training")

    num_episodes = 0
    training_steps_remaining = self.training_steps
    max_hosts = (self.env.scenario.get_description())['Hosts']
    max_obs_dim = self.env.observation_space.shape
    
    while self.steps_done < self.training_steps:
        if self.steps_done > 0:
            print(self.env.network.address_space)
            print(self.env.network.host_num_map)
            print(self.env.network.subnets)
            print(self.env.network.topology)
            print(self.env.network.firewall)
            print(self.env.network.address_space)
            print(self.env.network.address_space_bounds)
            print(self.env.network.sensitive_addresses)
            print(self.env.network.sensitive_hosts)

            self.env.observation_space = prev_observation_space
            self.num_actions = prev_num_actions
            self.obs_dim = prev_obs_dim
            self.replay = ReplayMemory(prev_replay_size,
                                   #self.obs_dim,
                                   #self.device)
            
            prev_observation_space = self.env.observation_space
            prev_num_actions = self.num_actions
            prev_obs_dim = self.obs_dim
            prev_replay = self.replay
            
            scenario_args.update(num_hosts=random.randint(3,max_hosts))
            
            self.env =  nasim.generate(**scenario_args)
            self.env.observation_space = prev_observation_space
            self.num_actions = prev_num_actions
            self.obs_dim = prev_obs_dim
            self.replay = prev_replay
            
        ep_results = self.run_train_episode(training_steps_remaining)
        ep_return, ep_steps, goal = ep_results
        num_episodes += 1
        training_steps_remaining -= ep_steps

        self.logger.add_scalar("episode", num_episodes, self.steps_done)
        self.logger.add_scalar(
            "epsilon", self.get_epsilon(), self.steps_done
        )
        self.logger.add_scalar(
            "episode_return", ep_return, self.steps_done
        )
        self.logger.add_scalar(
            "episode_steps", ep_steps, self.steps_done
        )
        self.logger.add_scalar(
            "episode_goal_reached", int(goal), self.steps_done
        )

        if num_episodes % 10 == 0 and self.verbose:
            print(f"\nEpisode {num_episodes}:")
            print(f"\tsteps done = {self.steps_done} / "
                    f"{self.training_steps}")
            print(f"\treturn = {ep_return}")
            print(f"\tgoal = {goal}")

    self.logger.close()
    if self.verbose:
        print("Training complete")
        print(f"\nEpisode {num_episodes}:")
        print(f"\tsteps done = {self.steps_done} / {self.training_steps}")
        print(f"\treturn = {ep_return}")
        print(f"\tgoal = {goal}")
            
DQNAgent.train = train

print(scenario_args)
env = nasim.generate(**scenario_args)
dqn_agent = DQNAgent(env, verbose=1, training_steps=100000)
dqn_agent.train()
dqn_agent.run_eval_episode(render=args.render_eval)

SyntaxError: invalid syntax (<ipython-input-5-e80566c61b5c>, line 58)