In [1]:
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Notebook used for debugging purpose to train the
DQL agent and then run it one step at a time.
"""

# pylint: disable=invalid-name

'Notebook used for debugging purpose to train the\nthe DQL agent and then run it one step at a time.\n'

In [2]:
import os
from dotenv import load_dotenv
import pandas as pd
import datetime
import cyberbattle.agents.baseline.learner as learner
import cyberbattle.agents.baseline.agent_wrapper as w
import cyberbattle.agents.baseline.agent_dql as dqla
import logging
from cyberbattle.agents.baseline.agent_wrapper import ActionTrackingStateAugmentation, AgentWrapper, Verbosity
from IPython.display import display
import progressbar
import gym
from cyberbattle.simulation.config import configuration, logger

# Load variables from a local .env file (if any) into the process environment
# so that the os.getenv() lookups in the following cells can pick them up.
load_dotenv()

True

In [3]:
# Run configuration.
# Settings backed by os.getenv() can be overridden through the environment;
# the second argument of each lookup is the fallback used when the variable
# is not set.
_TRUTHY_SPELLINGS = ('true', '1', 't')


def _env_flag(name, default):
    """Return True when the environment variable `name` (or `default`) is a truthy spelling."""
    return os.getenv(name, default).lower() in _TRUTHY_SPELLINGS


# Fixed settings
max_episode_steps = 50
exploit_train = "exploit_train"   # "exploit_manual"

# Left as None here; resolved to concrete values in a later cell
iteration_count = None
training_episode_count = None

# Environment-backed settings
gymid = os.getenv("GYMID", 'CyberBattleTinyMicro-v0')
log_level = os.getenv('LOG_LEVEL', "info")
log_results = _env_flag("LOG_RESULTS", 'False')
train_while_exploit = _env_flag("TRAIN_WHILE_EXPLOIT", 'True')
eval_episode_count = int(os.getenv('EVAL_EPISODE_COUNT', 0))
eval_freq = int(os.getenv('EVAL_FREQ', 0))
mean_reward_window = int(os.getenv('MEAN_REWARD_WINDOW', 10))
epsilon_exponential_decay = int(os.getenv('EPS_EXP_DECAY', max_episode_steps * 4000))  # 5000

# Per-run log directory: <experiment root>/<gymid>/<YYYYmmdd_HHMMSS>
# NOTE(review): the path captures the value gymid has at this point; a later
# reassignment of gymid does not update log_dir.
experiment_root = 'logs/exper/' + "notebook_dql_debug_with_tinymicro"
datetime_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
log_dir = os.path.join(experiment_root, gymid, datetime_str)

In [4]:
# Parameters
# Overrides the gymid default chosen in the configuration cell above
# (this looks like a parameters cell injected by a parameterized run, e.g.
# papermill — TODO confirm).
# NOTE(review): log_dir was already assembled from the previous gymid value in
# the configuration cell, so the log path does not reflect this override.
gymid = "CyberBattleTinyMicro-v1"


In [5]:
# Resolve the settings that the configuration cell left as None: an explicitly
# supplied value wins, otherwise fall back to the environment / built-in default.
iteration_count = max_episode_steps if iteration_count is None else iteration_count

if training_episode_count is None:
    training_episode_count = os.getenv('TRAINING_EPISODE_COUNT', '1000')
training_episode_count = int(training_episode_count)

# Export the effective settings through os.environ (presumably for code that
# reads them from there — confirm). os.environ only accepts str values, so the
# episode count is converted explicitly; assigning an int raises TypeError.
os.environ['TRAINING_EPISODE_COUNT'] = str(training_episode_count)
os.environ['LOG_DIR'] = log_dir
os.environ['LOG_RESULTS'] = str(log_results).lower()

# Only touch the filesystem when results are actually being logged
if log_results:
    os.makedirs(log_dir, exist_ok=True)

configuration.update_globals(log_dir, gymid, log_level, log_results)
configuration.update_logger()

# if os.environ['RUN_IN_SILENT_MODE'] in ['true']:
#     f = open(os.devnull, 'w')
#     sys.stdout = f

progressbar.streams.wrap_stderr()

<progressbar.utils.WrappingIO at 0x7f5609e13c10>

In [6]:
# First pass: instantiate the environment once just to read the node-count
# bound and the identifiers needed to describe the environment bounds.
ctf_env = gym.make(gymid)
node_count_limit = ctf_env.bounds.maximum_node_count  # taken from the environment's own bounds
ep = w.EnvironmentBounds.of_identifiers(
    identifiers=ctf_env.identifiers,
    maximum_node_count=node_count_limit,
    maximum_total_credentials=1,
)

# Second pass: recreate the environment with the bounds computed above and cap
# the episode length recorded on its spec.
ctf_env = gym.make(gymid, env_bounds=ep)
ctf_env.spec.max_episode_steps = max_episode_steps


# if not log_results:
#     lhStdout = logger.handlers[0]  # stdout is the only handler initially
#     logger.removeHandler(lhStdout)

In [7]:
# Train the Deep Q-learning agent with an epsilon-greedy search

# Directory that receives the training artefacts; only created/used when
# results are being logged.
training_dir = os.path.join(log_dir, 'training/')
if log_results:
    os.makedirs(training_dir, exist_ok=True)

# File name of the best-model checkpoint. The middle tag is empty when
# train_while_exploit is disabled.
mode_tag = 'train_while_exploit' if train_while_exploit else ''
best_model_name = f"{exploit_train}_{mode_tag}_trainepisodes{training_episode_count}_best_model.tar"

learning_rate = 0.01  # 0.01
# NOTE(review): a discount factor this small makes the agent almost myopic —
# confirm it is intentional.
gamma = 0.015  # 0.015
dqn_learning_run = learner.epsilon_greedy_search(
    cyberbattle_gym_env=ctf_env,
    environment_properties=ep,
    learner=dqla.DeepQLearnerPolicy(
        ep=ep,
        gamma=gamma,
        replay_memory_size=10000,
        target_update=5,
        batch_size=512,  # TODO increase?
        learning_rate=learning_rate
    ),
    episode_count=training_episode_count,
    iteration_count=iteration_count,
    epsilon=0.90,
    epsilon_exponential_decay=epsilon_exponential_decay,
    epsilon_minimum=0.10,
    eval_episode_count=eval_episode_count,
    eval_freq=eval_freq,
    mean_reward_window=mean_reward_window,
    verbosity=Verbosity.Quiet,
    render=False,
    render_last_episode_rewards_to=training_dir if log_results else None,
    plot_episodes_length=False,
    title="DQL",
    # An empty string (not None) is passed when logging is off, matching the
    # value the previous `log_results * path` expression produced.
    save_model_filename=os.path.join(training_dir, best_model_name) if log_results else ''
)

if log_results:
    configuration.writer.close()

###### DQL
Learning with: episode_count=3000,iteration_count=50,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=250000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/3000 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1 ended at t=50 total_reward -1130.0 with 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/41 (0.05)
    explore-remote: 0/2 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/5 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 2/3000 'DQL' ϵ=0.8998, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2 ended at t=50 total_reward -1101.0 with 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/42 (0.05)
    explore-remote: 0/2 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/4 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 3/3000 'DQL' ϵ=0.8997, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3 ended at t=50 total_reward -1032.0 with 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/41 (0.05)
    explore-remote: 1/4 (0.20)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/2 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 4/3000 'DQL' ϵ=0.8995, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4 ended at t=50 total_reward -1130.0 with 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/42 (0.05)
    explore-remote: 0/1 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/5 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 5/3000 'DQL' ϵ=0.8994, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5 ended at t=50 total_reward -1207.0 with 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/39 (0.05)
    explore-remote: 0/1 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/8 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 6/3000 'DQL' ϵ=0.8992, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6 ended at t=50 total_reward -1101.0 with 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/42 (0.05)
    explore-remote: 0/2 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/4 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 7/3000 'DQL' ϵ=0.8990, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7 ended at t=50 total_reward -1092.0 with 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/43 (0.04)
    explore-remote: 0/2 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/3 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 8/3000 'DQL' ϵ=0.8989, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8 ended at t=50 total_reward -1111.0 with 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/42 (0.05)
    explore-remote: 0/2 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/4 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 9/3000 'DQL' ϵ=0.8987, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 9 ended at t=50 total_reward -1246.0 with 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/38 (0.05)
    explore-remote: 0/1 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/9 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 10/3000 'DQL' ϵ=0.8986, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 10 ended at t=50 total_reward -1137.0 with 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/39 (0.05)
    explore-remote: 1/1 (0.50)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/7 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 11/3000 'DQL' ϵ=0.8984, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 11 ended at t=50 total_reward -1139.0 with loss=5.314572334289551
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/42 (0.05)
    explore-remote: 0/0 (NaN)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/6 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 12/3000 'DQL' ϵ=0.8982, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 12 ended at t=50 total_reward -1120.0 with loss=3.390625
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/42 (0.05)
    explore-remote: 0/1 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/5 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 13/3000 'DQL' ϵ=0.8981, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 13 ended at t=50 total_reward -1234.0 with loss=2.547646999359131
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/35 (0.05)
    explore-remote: 1/2 (0.33)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/10 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 14/3000 'DQL' ϵ=0.8979, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 14 ended at t=50 total_reward -994.0 with loss=2.026846408843994
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/47 (0.04)
    explore-remote: 0/0 (NaN)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/1 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 15/3000 'DQL' ϵ=0.8978, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 15 ended at t=50 total_reward -1140.0 with loss=2.1726200580596924
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/40 (0.05)
    explore-remote: 0/3 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/5 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 16/3000 'DQL' ϵ=0.8976, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 16 ended at t=50 total_reward -1130.0 with loss=1.9407927989959717
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/41 (0.02)
    explore-remote: 0/2 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/5 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 17/3000 'DQL' ϵ=0.8974, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 17 ended at t=50 total_reward -1101.0 with loss=1.8770813941955566
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/43 (0.04)
    explore-remote: 0/1 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/4 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 18/3000 'DQL' ϵ=0.8973, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 18 ended at t=50 total_reward -1023.0 with loss=2.111839771270752
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/38 (0.03)
    explore-remote: 2/5 (0.29)
    explore-connect: 0/0 (NaN)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/3 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 19/3000 'DQL' ϵ=0.8971, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 19 ended at t=50 total_reward -1091.0 with loss=2.176680326461792
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/43 (0.04)
    explore-remote: 0/1 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/4 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 20/3000 'DQL' ϵ=0.8970, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 20 ended at t=50 total_reward -1140.0 with loss=1.9754343032836914
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/41 (0.02)
    explore-remote: 0/2 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/5 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 21/3000 'DQL' ϵ=0.8968, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 21 ended at t=50 total_reward -1160.0 with loss=1.7048293352127075
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/40 (0.05)
    explore-remote: 0/3 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/5 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 22/3000 'DQL' ϵ=0.8967, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 22 ended at t=50 total_reward -1081.0 with loss=1.8823269605636597
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/44 (0.02)
    explore-remote: 0/0 (NaN)
    explore-connect: 0/0 (NaN)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/4 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 23/3000 'DQL' ϵ=0.8965, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 23 ended at t=50 total_reward -1168.0 with loss=1.7933176755905151
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/41 (0.05)
    explore-remote: 0/0 (NaN)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/7 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 24/3000 'DQL' ϵ=0.8963, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 24 ended at t=50 total_reward -1111.0 with loss=1.5829453468322754
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/41 (0.05)
    explore-remote: 0/3 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/4 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 25/3000 'DQL' ϵ=0.8962, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 25 ended at t=50 total_reward -1099.0 with loss=1.4114421606063843
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/39 (0.05)
    explore-remote: 1/3 (0.25)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/5 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 26/3000 'DQL' ϵ=0.8960, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 26 ended at t=50 total_reward -1188.0 with loss=1.2944681644439697
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/40 (0.05)
    explore-remote: 0/1 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/7 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 27/3000 'DQL' ϵ=0.8959, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 27 ended at t=50 total_reward -1139.0 with loss=1.2660026550292969
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/42 (0.05)
    explore-remote: 0/0 (NaN)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/6 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 28/3000 'DQL' ϵ=0.8957, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 28 ended at t=50 total_reward -1188.0 with loss=1.4521808624267578
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/39 (0.05)
    explore-remote: 0/1 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/8 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 29/3000 'DQL' ϵ=0.8955, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 29 ended at t=50 total_reward -995.0 with loss=1.3645192384719849
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/45 (0.04)
    explore-remote: 0/2 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/1 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 30/3000 'DQL' ϵ=0.8954, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 30 ended at t=50 total_reward -1120.0 with loss=1.2996902465820312
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/42 (0.02)
    explore-remote: 0/1 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/5 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 31/3000 'DQL' ϵ=0.8952, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 31 ended at t=50 total_reward -1111.0 with loss=1.3280991315841675
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/41 (0.02)
    explore-remote: 0/3 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/4 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 32/3000 'DQL' ϵ=0.8951, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 32 ended at t=50 total_reward -1196.0 with loss=1.3655550479888916
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/35 (0.05)
    explore-remote: 1/4 (0.20)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/8 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 33/3000 'DQL' ϵ=0.8949, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 33 ended at t=50 total_reward -1053.0 with loss=1.1139122247695923
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/44 (0.02)
    explore-remote: 0/2 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/2 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 34/3000 'DQL' ϵ=0.8947, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 34 ended at t=50 total_reward -1188.0 with loss=1.4205697774887085
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/40 (0.05)
    explore-remote: 0/1 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/7 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 35/3000 'DQL' ϵ=0.8946, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 35 ended at t=50 total_reward -1110.0 with loss=1.197160243988037
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/43 (0.04)
    explore-remote: 0/0 (NaN)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/5 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 36/3000 'DQL' ϵ=0.8944, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 36 ended at t=50 total_reward -1110.0 with loss=1.3628863096237183
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/43 (0.04)
    explore-remote: 0/0 (NaN)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/5 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 37/3000 'DQL' ϵ=0.8943, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 37 ended at t=50 total_reward -1121.0 with loss=1.1552034616470337
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/41 (0.05)
    explore-remote: 0/3 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/4 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 38/3000 'DQL' ϵ=0.8941, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Unexpected exception formatting exception. Falling back to standard exception


In [None]:
# Retrieve the trained agent from the training run and, when results are
# being logged, write a checkpoint of it into the run's log directory.
DQL_agent = dqn_learning_run['learner']

if log_results:
    logger.setLevel(logging.INFO)
    logger.info("Saving model to directory " + log_dir)
    mode_tag = 'train_while_exploit' if train_while_exploit else ''
    DQL_agent.save(os.path.join(log_dir, f"{exploit_train}_{mode_tag}_trainepisodes{training_episode_count}_checkpoint.tar"))


logger.info("")
logger.info("Now evaluate trained network")

In [None]:
# Use the trained agent to run the steps one by one

max_steps = iteration_count
verbosity = Verbosity.Normal

# The best-model file is only written under <log_dir>/training when
# log_results is enabled; otherwise that directory is never created, so keep
# the weights the agent already holds in memory instead of trying to load.
if log_results:
    DQL_agent.load_best(os.path.join(log_dir, 'training'))
DQL_agent.train_while_exploit = train_while_exploit
DQL_agent.policy_net.eval()

# Wrap a freshly reset environment so the agent's actions are tracked in the
# augmented state.
current_o = ctf_env.reset()
wrapped_env = AgentWrapper(ctf_env, ActionTrackingStateAugmentation(ep, current_o))

In [None]:
# Evaluate the DQL agent 10 times, recording every executed step of each trial
result_columns = ["Step", "Reward", "Cumulative Reward", "Next action", "Processed by", "Precondition", "Profile", "Reward string"]

# Widen the column display once so the long action / reward strings are shown in full
pd.set_option("max_colwidth", 10**3)

for n_trial in range(10):
    h = []  # one tuple per executed step, in result_columns order
    done = False
    total_reward = 0
    df = None
    current_o = wrapped_env.reset()
    for i in range(max_steps):
        # Check for termination first so that only steps that really run are logged
        if done:
            break
        logger.info(f"Step {i}")

        # next action suggested by DQL agent
        action_style, next_action, _ = DQL_agent.exploit(wrapped_env, current_o)
        if next_action is None:
            logger.info(f"Inference ended with error: next action == None, returned with aciton_style {action_style}")
            break

        # run the suggested action
        current_o, reward, done, info = wrapped_env.step(next_action)
        total_reward += reward
        action_str, reward_str = wrapped_env.internal_action_to_pretty_print(next_action, output_reward_str=True)
        h.append((i, reward, total_reward,
                  action_str, action_style, info['precondition_str'], info['profile_str'], info["reward_string"]))

    # Build the per-trial table (and its CSV) once, after the trial has
    # finished, instead of rebuilding and rewriting it on every step.
    if h:
        df = pd.DataFrame(h, columns=result_columns).set_index("Step")
        if log_results:
            df.to_csv(os.path.join(log_dir, f'{exploit_train}_{train_while_exploit*"train_while_exploit"}_trial{n_trial}_trainepisodes{training_episode_count}_output.csv'))

    print(f'len: {len(h)}, total reward: {total_reward}')
    if df is not None:
        display(df)

In [None]:
    # Render the network discovered so far. n_trial is whatever value the
    # evaluation loop in the previous cell left behind, so the image is tagged
    # with that trial index. The image is written to a file only when results
    # are being logged.
    network_png = os.path.join(log_dir, f'{exploit_train}_{train_while_exploit*"train_while_exploit"}_trial{n_trial}_trainepisodes{training_episode_count}_discovered_network.png') if log_results else None
    wrapped_env.render(mode='rgb_array', filename=network_png)