## 7/24: File used to test NaSim QL Agent on custom network built with scenario args AND NaSim Small Network

This is all some intro code just to visualize some of the settings and get a baseline. You can run through it if you want to get familiar with what the scenario arguments, agents, and training looks like.

In [5]:
testfile = 'data.yaml'


def writeToYAML():
    with open(testfile, 'w') as f:
        data = yaml.dump(curr_data, f, sort_keys=False, default_flow_style=False)

In [6]:
def print_nested_dict(dict_obj, indent = 0):
    ''' Pretty Print nested dictionary with given indent level  
    '''
    # Iterate over all key-value pairs of dictionary
    for key, value in dict_obj.items():
        # If value is dict type, then print nested dict 
        if isinstance(value, dict):
            print(' ' * indent, key, ':', '{')
            print_nested_dict(value, indent + 4)
            print(' ' * indent, '}')
        else:
            print(' ' * indent, key, ':', value)

In [7]:
import nasim
import json
env = nasim.generate(**scenario_args)
env = nasim.make_benchmark("huge-gen")
env = nasim.load("unreachable.yaml")
env2 = env = nasim.make_benchmark("small")


scenario_desc = env.scenario.get_description() #get_description found in scenario.py file under nasim->scenarios
scenario_dict = env.scenario.scenario_dict
#scenario_exploit_map = env.scenario.exploit_map # A nested dictionary for all exploits in scenario.
#scenario_privesc_map = env.scenario.privesc_map # A nested dictionary for all privilege escalation actions in scenario.

print("Scenario Description: ")
print_nested_dict(scenario_desc,4)

print("\nScenario Dictionary: ")
print_nested_dict(scenario_dict,6)

NameError: name 'scenario_args' is not defined

In [5]:
#env.get_minimum_actions()

In [8]:
#NOW USING AGENT ABOVE INSTEAD OF IMPORTING AGENT:
#USED TO BE: from nasim.agents.ql_agent import TabularQLearningAgent

ql_agent = TabularQLearningAgent(env2, verbose=1, training_steps=500)
training_outputs = ql_agent.train()


Running Tabular Q-Learning with config:
{'env': <nasim.envs.environment.NASimEnv object at 0x7f0bc6879c60>,
 'exploration_steps': 10000,
 'final_epsilon': 0.05,
 'gamma': 0.99,
 'kwargs': {},
 'lr': 0.001,
 'seed': None,
 'self': <__main__.TabularQLearningAgent object at 0x7f0b2c276080>,
 'training_steps': 500,
 'verbose': 1}

Starting training
Training complete

Episode 2:
	steps done = 500 / 500
	return = -339.0
	goal = False


## Current Code 
Here is the main code to test/run.

In [1]:
# Initial scenario arguments... we will be editing the number of hosts by marking actions involving them as invalid
scenario_args2={
    "num_hosts": 5,         # Number of hosts in the network 
    
    "num_services": 3,      # Number of services on the network (ssh, ftp, http)
    
    "num_os": 2,            # Number of operatings systems on the network (windows, linux, etc)
    
    "num_processes": 2,     # Number of processes on the network (tomcat, daclsvc, etc)
    
    "num_exploits": None,   # Number of exploits to use
    
    "num_privescs": None,   # Number of privilege escalation actions
    
    "r_sensitive": 10,      # Reward for sensitive subnet documents (default 10)
    
    "r_user": 10,           # Reward for user subnet documents      (default 10)
    
    "exploit_cost": 1,      # Cost to use an exploit (default 1)
    
    "exploit_probs": 1.0,   # Sucess probability of exploits (default 1.0)
    
    "privesc_cost": 1,      # Cost of privilege escalation action (default 1)
    
    "privesc_probs": 1.0,   # Sucess probability of privilege escalation action (default 1.0)
    
    "service_scan_cost": 1, # Cost for a service scan (default 1)
    
    "os_scan_cost": 1,      # Cost for an OS scan (default 1)
    
    "subnet_scan_cost": 1,  # Cost for a subnet scan (default 1)
    
    "process_scan_cost": 1, # Cost for a process scan (default 1)
    
    "uniform": False,       # Whether to use uniform distribution or correlaed host configuration (default false)
    
    "alpha_H": 2.0,         # Scaling or concentration parameter for controlling corelation between host configurations (default 2.0)
    
    "alpha_V": 2.0,         # Scaling or concentration parameter for controlling corelation between services across host configruations (default 2.0)
    
    "lambda_V": 1.0,        # Parameter for controlling average number of services running per host configuration (default 1.0)
    
    "restrictiveness": 5,   # Maximum number of services allowed to pass through firewalls between zones (default 5)
    
    "random_goal": False,   # Whether to randomly assign the goal user host or not (default False)
    
    "base_host_value": 1,   # Value of non sensitive hosts (default 1)
    
    "host_discovery_value": 1,  # Value of discovering a host for the first time (default 1)
    
    "seed": None,           # Random number generator seed (default None)
    
    "name": None,           # Name of the scenario, one will be generated if None (default None)
    
    "step_limit": None}     # Max number of steps permitted in a single episode, None means no limit (default None)

#Scenario Generator Parameter List: https://networkattacksimulator.readthedocs.io/en/latest/reference/scenarios/generator.html#scenario-generator

In [2]:
# Defining Python user-defined exceptions
class SensitiveHostRemovalException(Exception):
    "Raised when selected network host cannot be removed (sensitive host needs to remain in network)"
    pass

class PublicHostRemovalException(Exception):
    "Raised when selected network host cannot be removed (public host to enter the network... specific to this configuration)"
    pass

In [3]:
"""An example Tabular, epsilon greedy Q-Learning Agent.

This agent does not use an Experience replay (see the 'ql_replay_agent.py')

It uses pytorch 1.5+ tensorboard library for logging (HINT: these dependencies
can be installed by running pip install nasim[dqn])

To run 'tiny' benchmark scenario with default settings, run the following from
the nasim/agents dir:

$ python ql_agent.py tiny

To see detailed results using tensorboard:

$ tensorboard --logdir runs/

To see available hyperparameters:

$ python ql_agent.py --help

Notes
-----

This is by no means a state of the art implementation of Tabular Q-Learning.
It is designed to be an example implementation that can be used as a reference
for building your own agents and for simple experimental comparisons.
"""
import random
import numpy as np
from pprint import pprint

import nasim

try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError as e:
    from gymnasium import error
    raise error.DependencyNotInstalled(
        f"{e}. (HINT: you can install tabular_q_learning_agent dependencies "
        "by running 'pip install nasim[dqn]'.)"
    )


class TabularQFunction:
    """Tabular Q-Function """

    def __init__(self, num_actions):
        self.q_func = dict()
        self.num_actions = num_actions

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        if isinstance(x, np.ndarray):
            x = str(x.astype(int))
        if x not in self.q_func:
            self.q_func[x] = np.zeros(self.num_actions, dtype=np.float32)
        return self.q_func[x]

    def forward_batch(self, x_batch):
        return np.asarray([self.forward(x) for x in x_batch])

    def update_batch(self, s_batch, a_batch, delta_batch):
        for s, a, delta in zip(s_batch, a_batch, delta_batch):
            q_vals = self.forward(s)
            q_vals[a] += delta

    def update(self, s, a, delta):
        q_vals = self.forward(s)
        q_vals[a] += delta

    def get_action(self, x):
        return int(self.forward(x).argmax())

    def display(self):
        pprint(self.q_func)


class TabularQLearningAgent:
    """A Tabular. epsilon greedy Q-Learning Agent using Experience Replay """

    def __init__(self,
                 env,
                 seed=None,
                 lr=0.001,
                 training_steps=10000,
                 final_epsilon=0.05,
                 exploration_steps=10000,
                 gamma=0.99,
                 verbose=True,
                 **kwargs):

        # This implementation only works for flat actions
        assert env.flat_actions
        self.verbose = verbose
        if self.verbose:
            print("\nRunning Tabular Q-Learning with config:")
            pprint(locals())

        # set seeds
        self.seed = seed
        if self.seed is not None:
            np.random.seed(self.seed)

        # envirnment setup
        self.env = env

        self.num_actions = self.env.action_space.n
        self.obs_dim = self.env.observation_space.shape

        # logger setup
        self.logger = SummaryWriter()

        # Training related attributes
        self.lr = lr
        self.exploration_steps = exploration_steps
        self.final_epsilon = final_epsilon
        self.epsilon_schedule = np.linspace(
            1.0, self.final_epsilon, self.exploration_steps
        )
        self.discount = gamma
        self.training_steps = training_steps
        self.steps_done = 0

        # Q-Function
        self.qfunc = TabularQFunction(self.num_actions)

    def get_epsilon(self):
        if self.steps_done < self.exploration_steps:
            return self.epsilon_schedule[self.steps_done]
        return self.final_epsilon

    def get_egreedy_action(self, o, epsilon):
        if random.random() > epsilon:
            return self.qfunc.get_action(o)
        return random.randint(0, self.num_actions-1)

    def optimize(self, s, a, next_s, r, done):
        # get q_val for state and action performed in that state
        q_vals_raw = self.qfunc.forward(s)
        q_val = q_vals_raw[a]

        # get target q val = max val of next state
        target_q_val = self.qfunc.forward(next_s).max()
        target = r + self.discount * (1-done) * target_q_val

        # calculate error and update
        td_error = target - q_val
        td_delta = self.lr * td_error

        # optimize the model
        self.qfunc.update(s, a, td_delta)

        s_value = q_vals_raw.max()
        return td_error, s_value

    def train(self):
        if self.verbose:
            print("\nStarting training")

        num_episodes = 0
        training_steps_remaining = self.training_steps
        elems_to_avg = []
        all_avgs = []

        while self.steps_done < self.training_steps:
            ep_results = self.run_train_episode(training_steps_remaining)
            ep_return, ep_steps, goal = ep_results
            num_episodes += 1
            training_steps_remaining -= ep_steps

            self.logger.add_scalar("episode", num_episodes, self.steps_done)
            self.logger.add_scalar(
                "epsilon", self.get_epsilon(), self.steps_done
            )
            self.logger.add_scalar(
                "episode_return", ep_return, self.steps_done
            )
            self.logger.add_scalar(
                "episode_steps", ep_steps, self.steps_done
            )
            self.logger.add_scalar(
                "episode_goal_reached", int(goal), self.steps_done
            )

            if num_episodes % 10 == 0 and self.verbose:
                print(f"\nEpisode {num_episodes}:")
                print(f"\tsteps done = {self.steps_done} / "
                      f"{self.training_steps}")
                print(f"\treturn = {ep_return}")
                print(f"\tgoal = {goal}")
                print(f"\t")
    
                elems_to_avg.append(ep_return)
                
            if num_episodes % 50 == 0 and self.verbose:
                avg = (sum(elems_to_avg) / len(elems_to_avg))
                all_avgs.append(avg)

                print(f"\t")
                print(f"\tRunning_Average = {avg}")
                print(f"\t")
                
            if num_episodes % 1000 == 0 and self.verbose:
                for i in range(len(all_avgs)):
                    print("Episode " + str((i+1)*50), end=": ")
                    print(all_avgs[i])
                print(f"\t")
            
                for i in range(len(elems_to_avg)):
                    print("Episode " + str((i+1)*10), end=": ")
                    print(elems_to_avg[i])
                print(f"\t")

                for i in range(len(elems_to_avg)):
                    print(elems_to_avg[i])

        self.logger.close()
        if self.verbose:
            print("Training complete")
            print(f"\nEpisode {num_episodes}:")
            print(f"\tsteps done = {self.steps_done} / {self.training_steps}")
            print(f"\treturn = {ep_return}")
            print(f"\tgoal = {goal}")
            
            print(f"\t")
            print("Running_Average List:")
            print(f"\t")
            
            for i in range(len(all_avgs)):
                print("Episode " + str((i+1)*50), end=": ")
                print(all_avgs[i])
            print(f"\t")
            
            for i in range(len(elems_to_avg)):
                print("Episode " + str((i+1)*10), end=": ")
                print(elems_to_avg[i])
            print(f"\t")
            
            for i in range(len(elems_to_avg)):
                print(elems_to_avg[i])
            print(f"\t")

    def run_train_episode(self, step_limit):
        s, _ = self.env.reset()
        done = False
        env_step_limit_reached = False

        steps = 0
        episode_return = 0

        while not done and not env_step_limit_reached and steps < step_limit:
            a = self.get_egreedy_action(s, self.get_epsilon())

            next_s, r, done, env_step_limit_reached, _ = self.env.step(a)
            self.steps_done += 1
            td_error, s_value = self.optimize(s, a, next_s, r, done)
            self.logger.add_scalar("td_error", td_error, self.steps_done)
            self.logger.add_scalar("s_value", s_value, self.steps_done)

            s = next_s
            episode_return += r
            steps += 1

        return episode_return, steps, self.env.goal_reached()

    def run_eval_episode(self,
                         env=None,
                         render=False,
                         eval_epsilon=0.05,
                         render_mode="human"):
        if env is None:
            env = self.env

        original_render_mode = env.render_mode
        env.render_mode = render_mode

        s, _ = env.reset()
        done = False
        env_step_limit_reached = False

        steps = 0
        episode_return = 0

        line_break = "="*60
        if render:
            print("\n" + line_break)
            print(f"Running EVALUATION using epsilon = {eval_epsilon:.4f}")
            print(line_break)
            env.render()
            input("Initial state. Press enter to continue..")

        while not done and not env_step_limit_reached:
            a = self.get_egreedy_action(s, eval_epsilon)
            next_s, r, done, env_step_limit_reached, _ = env.step(a)
            s = next_s
            episode_return += r
            steps += 1
            if render:
                print("\n" + line_break)
                print(f"Step {steps}")
                print(line_break)
                print(f"Action Performed = {env.action_space.get_action(a)}")
                env.render()
                print(f"Reward = {r}")
                print(f"Done = {done}")
                print(f"Step limit reached = {env_step_limit_reached}")
                input("Press enter to continue..")

                if done or env_step_limit_reached:
                    print("\n" + line_break)
                    print("EPISODE FINISHED")
                    print(line_break)
                    print(f"Goal reached = {env.goal_reached()}")
                    print(f"Total steps = {steps}")
                    print(f"Total reward = {episode_return}")

        env.render_mode = original_render_mode
        return episode_return, steps, env.goal_reached()


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("env_name", type=str, help="benchmark scenario name")
    parser.add_argument("--render_eval", action="store_true",
                        help="Renders final policy")
    parser.add_argument("--lr", type=float, default=0.001,
                        help="Learning rate (default=0.001)")
    parser.add_argument("-t", "--training_steps", type=int, default=10000,
                        help="training steps (default=10000)")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="(default=32)")
    parser.add_argument("--seed", type=int, default=0,
                        help="(default=0)")
    parser.add_argument("--replay_size", type=int, default=100000,
                        help="(default=100000)")
    parser.add_argument("--final_epsilon", type=float, default=0.05,
                        help="(default=0.05)")
    parser.add_argument("--init_epsilon", type=float, default=1.0,
                        help="(default=1.0)")
    parser.add_argument("-e", "--exploration_steps", type=int, default=10000,
                        help="(default=10000)")
    parser.add_argument("--gamma", type=float, default=0.99,
                        help="(default=0.99)")
    parser.add_argument("--quite", action="store_false",
                        help="Run in Quite mode")
    args = parser.parse_args()

    env = nasim.make_benchmark(
        args.env_name,
        args.seed,
        fully_obs=True,
        flat_actions=True,
        flat_obs=True
    )
    ql_agent = TabularQLearningAgent(
        env, verbose=args.quite, **vars(args)
    )
    ql_agent.train()
    ql_agent.run_eval_episode(render=args.render_eval)

usage: ipykernel_launcher.py [-h] [--render_eval] [--lr LR]
                             [-t TRAINING_STEPS] [--batch_size BATCH_SIZE]
                             [--seed SEED] [--replay_size REPLAY_SIZE]
                             [--final_epsilon FINAL_EPSILON]
                             [--init_epsilon INIT_EPSILON]
                             [-e EXPLORATION_STEPS] [--gamma GAMMA] [--quite]
                             env_name
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [22]:
#NOW USING AGENT ABOVE INSTEAD OF IMPORTANT AGENT:
#USED TO BE: from nasim.agents.ql_agent import TabularQLearningAgent

env3 = nasim.generate(**scenario_args2)
env4 = nasim.make_benchmark("small")
baseline_ql_agent = TabularQLearningAgent(env4, verbose=1, training_steps=300000, max_episodes=2000)
baseline_ql_agent.train()
baseline_ql_agent.run_eval_episode(render_mode="human")

print("QL AGENT NOW ON scenario_args2 NETWORK")

baseline_ql_agent = TabularQLearningAgent(env4, verbose=1, training_steps=300000, max_episodes=2000) #This actually ran the small network again, scenario_args2 network done below this cell
baseline_ql_agent.train()
baseline_ql_agent.run_eval_episode(render_mode="human")


Running Tabular Q-Learning with config:
{'env': <nasim.envs.environment.NASimEnv object at 0x7f45e465ee60>,
 'exploration_steps': 10000,
 'final_epsilon': 0.05,
 'gamma': 0.99,
 'kwargs': {'max_episodes': 2000},
 'lr': 0.001,
 'seed': None,
 'self': <__main__.TabularQLearningAgent object at 0x7f47535b7eb0>,
 'training_steps': 300000,
 'verbose': 1}

Starting training

Episode 10:
	steps done = 5328 / 300000
	return = -347.0
	goal = True
	

Episode 20:
	steps done = 13517 / 300000
	return = -965.0
	goal = False
	

Episode 30:
	steps done = 22012 / 300000
	return = -150.0
	goal = True
	

Episode 40:
	steps done = 29040 / 300000
	return = -1029.0
	goal = False
	

Episode 50:
	steps done = 35554 / 300000
	return = -40.0
	goal = True
	
	
	Running_Average = -506.2
	

Episode 60:
	steps done = 41017 / 300000
	return = -204.0
	goal = True
	

Episode 70:
	steps done = 47746 / 300000
	return = -907.0
	goal = True
	

Episode 80:
	steps done = 53228 / 300000
	return = -360.0
	goal = True
	

Episo

(186.0, 8, True)

In [4]:
env3 = nasim.generate(**scenario_args2)
baseline_ql_agent2 = TabularQLearningAgent(env3, verbose=1, training_steps=300000) #10-20k steps, 10k episodes
baseline_ql_agent2.train()
baseline_ql_agent2.run_eval_episode(render_mode="human")


Running Tabular Q-Learning with config:
{'env': <nasim.envs.environment.NASimEnv object at 0x7f8c842fba00>,
 'exploration_steps': 10000,
 'final_epsilon': 0.05,
 'gamma': 0.99,
 'kwargs': {},
 'lr': 0.001,
 'seed': None,
 'self': <__main__.TabularQLearningAgent object at 0x7f8c842fb1c0>,
 'training_steps': 300000,
 'verbose': 1}

Starting training

Episode 10:
	steps done = 806 / 300000
	return = -40.0
	goal = True
	

Episode 20:
	steps done = 1642 / 300000
	return = -74.0
	goal = True
	

Episode 30:
	steps done = 2081 / 300000
	return = -56.0
	goal = True
	

Episode 40:
	steps done = 2489 / 300000
	return = -39.0
	goal = True
	

Episode 50:
	steps done = 2901 / 300000
	return = -9.0
	goal = True
	
	
	Running_Average = -43.6
	

Episode 60:
	steps done = 3511 / 300000
	return = -106.0
	goal = True
	

Episode 70:
	steps done = 3884 / 300000
	return = -16.0
	goal = True
	

Episode 80:
	steps done = 4147 / 300000
	return = 17.0
	goal = True
	

Episode 90:
	steps done = 4390 / 300000
	retu

KeyboardInterrupt: 

In [19]:
baseline_dqn_agent.run_eval_episode(render_mode="human")

(-1014.0, 1000, False)

In [1]:
# Import necessary libraries, including which methods will be redefined
import nasim
import random
from nasim.envs.action import Action
from nasim.agents.dqn_agent import DQNAgent
from nasim.envs.environment import NASimEnv

# User-defined Python method to check whether the selected blocked_host is valid to select
def check_host_valid(self, blocked_host):
    if blocked_host == -1:
        return
    elif self.env.network.address_space[blocked_host] in self.env.network.get_sensitive_hosts():
        raise SensitiveHostRemovalException
    elif blocked_host == 0:
        raise PublicHostRemovalException
    else:
        return

# Setting the method
DQNAgent.check_host_valid = check_host_valid
    
# Redefining the DQNAgent run_train_episode method
def run_train_episode(self, step_limit):
        done = False
        env_step_limit_reached = False #Unnecessary now with loop below using steps < step_limit
        steps = 0
        episode_return = 0
        max_host_index = len(self.env.network.host_num_map) - 1
        
        # Choosing random host index to be invalid... try/catch loop until valid host selected to block. Note: If -1, no host will be marked invalid
        blocked_host = -1
        if self.steps_done > 0:
            while True:
                try:
                    blocked_host = random.randint(-1,max_host_index)
                    self.check_host_valid(blocked_host)
                    break
                except SensitiveHostRemovalException:
                    pass
                except PublicHostRemovalException:
                    pass
                
        o, _ = self.env.reset()
        
        # If you wanted to see which host was blocked... used for the logging
        print("Blocked host index:  " + str(blocked_host))
        
        while not done and not env_step_limit_reached: #steps < step_limit: #J: changed from env_step_limit_reached:     
            #J: steps continuously updated at the bottom and will break as soon as step limit is reached
            # Keep generating an action in the action space until it does not involve a blocked host
            while True:
                a = self.get_egreedy_action(o, self.get_epsilon())
                
                if blocked_host == -1:
                    break
                else:
                    action = self.env.action_space.get_action(a)
                    target_host_index = self.env.network.host_num_map[action.target]
                    if target_host_index != blocked_host:
                        break
                
            next_o, r, done, env_step_limit_reached, _ = self.env.step(a)
            self.replay.store(o, a, next_o, r, done)
            self.steps_done += 1
            loss, mean_v = self.optimize()
            
            o = next_o
            episode_return += r
            steps += 1

        return episode_return, steps, self.env.goal_reached()

# Setting the method
DQNAgent.run_train_episode = run_train_episode

# Training function... redefined because it wasn't converging originally
def train(self):
    if self.verbose:
        print("\nStarting training")

    num_episodes = 0
    training_steps_remaining = self.training_steps
    og_env = self.env
    
    elems_to_avg = []
    all_avgs = []
    
    while self.steps_done < self.training_steps:
        self.env = og_env
        ep_results = self.run_train_episode(training_steps_remaining)
        ep_return, ep_steps, goal = ep_results
        num_episodes += 1
        training_steps_remaining -= ep_steps

        self.logger.add_scalar("episode", num_episodes, self.steps_done)
        self.logger.add_scalar(
            "epsilon", self.get_epsilon(), self.steps_done
        )
        self.logger.add_scalar(
            "episode_return", ep_return, self.steps_done
        )
        self.logger.add_scalar(
            "episode_steps", ep_steps, self.steps_done
        )
        self.logger.add_scalar(
            "episode_goal_reached", int(goal), self.steps_done
        )

        if num_episodes % 10 == 0 and self.verbose:
            print(f"\nEpisode {num_episodes}:")
            print(f"\tsteps done = {self.steps_done} / "
                f"{self.training_steps}")
            print(f"\treturn = {ep_return}")
            print(f"\tgoal = {goal}")
            print(f"\t")
            
            elems_to_avg.append(ep_return) #Jacob edit
        
        if num_episodes % 50 == 0 and self.verbose:
            avg = (sum(elems_to_avg) / len(elems_to_avg))
            all_avgs.append(avg)
            
            print(f"\t")
            print(f"\tRunning_Average = {avg}")
            print(f"\t")
            
            print("Running_Average List:")
            print(f"\t")
            
            for i in range(len(all_avgs)):
                print("Episode " + str((i+1)*50), end=": ")
                print(all_avgs[i])
            print(f"\t")
                

    self.logger.close()
    if self.verbose:
            print("Training complete")
            print(f"\nEpisode {num_episodes}:")
            print(f"\tsteps done = {self.steps_done} / {self.training_steps}")
            print(f"\treturn = {ep_return}")
            print(f"\tgoal = {goal}")
            
            print(f"\t")
            print("Running_Average List:")
            print(f"\t")
            
            
            print("Running_Average List:")
            print(f"\t")
            
            for i in range(len(all_avgs)):
                print("Episode " + str((i+1)*50), end=": ")
                print(all_avgs[i])
            print(f"\t")
            
            for i in range(len(elems_to_avg)):
                print("Episode " + str((i+1)*10), end=": ")
                print(elems_to_avg[i])
            print(f"\t")
            
            for i in range(len(elems_to_avg)):
                print(elems_to_avg[i])
            print(f"\t")
            
            
            #plot_average_bar_chart(elems_to_avg)

# Set the method        
DQNAgent.train = train

# You can switch to a different benchmark if you want... like the scenario args posted or your own
env = nasim.make_benchmark("small")
#Liam does it this way: env = nasim.load("small.yaml")
# Initializing and training agent
dqn_agent = DQNAgent(env, verbose=1, training_steps=5000000)
dqn_agent.train()


Running DQN with config:
{'batch_size': 32,
 'env': <nasim.envs.environment.NASimEnv object at 0x7fb48a1bdd50>,
 'exploration_steps': 10000,
 'final_epsilon': 0.05,
 'gamma': 0.99,
 'hidden_sizes': [64, 64],
 'kwargs': {},
 'lr': 0.001,
 'replay_size': 10000,
 'seed': None,
 'self': <nasim.agents.dqn_agent.DQNAgent object at 0x7fb3bfd09ea0>,
 'target_update_freq': 1000,
 'training_steps': 5000000,
 'verbose': 1}


KeyboardInterrupt: 

In [None]:
dqn_agent.run_eval_episode(render=False)

## Past Attempts

This was some code that didn't end up working if you wanted to see a previous attempt

In [None]:
import numpy as np

capacity = 10
s_dims = (5,)
s_buf = np.zeros((capacity, *s_dims), dtype=np.float32)
#test_tuple.resize(test_tuple, [3,2])

print(s_buf)

In [None]:
import nasim
import random
from nasim.agents.dqn_agent import DQNAgent

def run_train_episode(self, step_limit):
        done = False
        env_step_limit_reached = False
        steps = 0
        episode_return = 0
        
        o = self.env.reset()
        
        while not done and not env_step_limit_reached: #and steps < step_limit:
            a = self.get_egreedy_action(o, self.get_epsilon())
        
            next_o, r, done, env_step_limit_reached, _ = self.env.step(a)
            self.replay.store(o, a, next_o, r, done)
            self.steps_done += 1
            loss, mean_v = self.optimize()
            
            o = next_o
            episode_return += r
            steps += 1

        return episode_return, steps, self.env.goal_reached()
    
DQNAgent.run_train_episode = run_train_episode

def train(self):
    if self.verbose:
        print("\nStarting training")

    num_episodes = 0
    training_steps_remaining = self.training_steps
    max_hosts = (self.env.scenario.get_description())['Hosts']
    max_obs_dim = self.env.observation_space.shape
    
    while self.steps_done < self.training_steps:
        if self.steps_done > 0:
            print(self.env.network.address_space)
            print(self.env.network.host_num_map)
            print(self.env.network.subnets)
            print(self.env.network.topology)
            print(self.env.network.firewall)
            print(self.env.network.address_space)
            print(self.env.network.address_space_bounds)
            print(self.env.network.sensitive_addresses)
            print(self.env.network.sensitive_hosts)

            self.env.observation_space = prev_observation_space
            self.num_actions = prev_num_actions
            self.obs_dim = prev_obs_dim
            self.replay = ReplayMemory(prev_replay_size,
                                   #self.obs_dim,
                                   #self.device)
            
            prev_observation_space = self.env.observation_space
            prev_num_actions = self.num_actions
            prev_obs_dim = self.obs_dim
            prev_replay = self.replay
            
            scenario_args.update(num_hosts=random.randint(3,max_hosts))
            
            self.env =  nasim.generate(**scenario_args)
            self.env.observation_space = prev_observation_space
            self.num_actions = prev_num_actions
            self.obs_dim = prev_obs_dim
            self.replay = prev_replay
            
        ep_results = self.run_train_episode(training_steps_remaining)
        ep_return, ep_steps, goal = ep_results
        num_episodes += 1
        training_steps_remaining -= ep_steps

        self.logger.add_scalar("episode", num_episodes, self.steps_done)
        self.logger.add_scalar(
            "epsilon", self.get_epsilon(), self.steps_done
        )
        self.logger.add_scalar(
            "episode_return", ep_return, self.steps_done
        )
        self.logger.add_scalar(
            "episode_steps", ep_steps, self.steps_done
        )
        self.logger.add_scalar(
            "episode_goal_reached", int(goal), self.steps_done
        )

        if num_episodes % 10 == 0 and self.verbose:
            print(f"\nEpisode {num_episodes}:")
            print(f"\tsteps done = {self.steps_done} / "
                    f"{self.training_steps}")
            print(f"\treturn = {ep_return}")
            print(f"\tgoal = {goal}")

    self.logger.close()
    if self.verbose:
        print("Training complete")
        print(f"\nEpisode {num_episodes}:")
        print(f"\tsteps done = {self.steps_done} / {self.training_steps}")
        print(f"\treturn = {ep_return}")
        print(f"\tgoal = {goal}")
            
DQNAgent.train = train

print(scenario_args)
env = nasim.generate(**scenario_args)
dqn_agent = DQNAgent(env, verbose=1, training_steps=100000)
dqn_agent.train()
dqn_agent.run_eval_episode(render=args.render_eval)