#### Build

In [1]:
# Environment setup (run these steps in a terminal before using this notebook):
#   1. Upload the GitHub zip file
#   2. Unzip the GitHub zip file
#   3. Create a Python 3.8 env       conda create --name cse297 python=3.8.10
#   4. Activate the environment      source activate cse297
#   5. Install ipykernel in the env  pip install ipykernel
#   6. Register kernel w/ Jupyter    python -m ipykernel install --user --name=capstone
#   7. Change to the package dir     cd CybORG-Competitive/CybORG/
#   8. Install packages in the env   pip install -e .
#   9. Select the 'capstone' kernel in Jupyter
# Finally, confirm the kernel is running Python 3.8 (the environment above was
# created with python=3.8.10) by printing the interpreter version.
import sys
print(sys.version)

3.8.19 (default, Mar 20 2024, 19:55:45) [MSC v.1916 64 bit (AMD64)]


#### Import

In [2]:
from environments import build_blue_agent, build_red_agent, sample, get_timesteps, get_algorithm_select
from environments import build_cardiff_agent, sample_against_cardiff
from environments import build_mindrake_agent, sample_against_mindrake
import ray
import os, sys, shutil, time
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from time import sleep

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
import logging
logging.disable(logging.WARNING)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
logger = logging.getLogger(__name__)



#### Verify GPU

In [3]:
# List the GPUs TensorFlow can see and enable on-demand memory growth for each.
physical_devices = tf.config.list_physical_devices('GPU')
print(f"GPUs visible to TensorFlow: {len(physical_devices)}")
for device in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as err:
        # Memory growth can only be configured before TensorFlow initialises the
        # GPU; re-running this cell after agents have been built would otherwise
        # abort the notebook with a RuntimeError.
        print(f"Could not enable memory growth on {device.name}: {err}")

#### Train Competitive Red Agent

In [None]:
# Retrieve the timestep and the algorithm selected for training
selected_timestep = get_timesteps()
selected_algorithm = get_algorithm_select()

# Select which generation we want to start training from
# If we want to train from scratch set to 1 (default value is 1)
# If we want to train from the latest generation, set to latest generation in the competitive policy pool
start_from_generation = 1

# Best score recorded for each generation.
# When resuming from a checkpointed generation, fill these in with the scores
# recorded so far (floats or float strings; they are converted below).
blue_scores = []
red_scores = []

# Total number of generations to create for the red agent
generations = 10

# Error checking: 'start_from_generation' must not exceed 'generations'
if(start_from_generation > generations):
    raise ValueError("Starting generation and Total Generation incompatible")

# Number of batches without improvement before ending training
tolerance = 3

# Root directory holding both policy pools for the selected algorithm/timestep
policy_root = f"./policies/{selected_algorithm}/{selected_timestep}"


def _write_text(path, text):
    """Write `text` to `path`, closing the file even if the write fails."""
    with open(path, "w") as handle:
        handle.write(text)


def _batch_metrics(result, algorithm):
    """Extract (episode_reward_mean, entropy, vf_loss) from a train() result.

    For "ppo" and "dqn" the mean episode reward is read from
    result["sampler_results"]; for any other algorithm it is read from the top
    level of the result. Entropy and vf_loss come from the default policy's
    learner stats, except for "dqn", where both are reported as 0.
    """
    if algorithm in ("ppo", "dqn"):
        reward_mean = result["sampler_results"]["episode_reward_mean"]
    else:
        reward_mean = result["episode_reward_mean"]
    entropy = vf_loss = 0
    if algorithm != "dqn":
        learner_stats = result['info']['learner']['default_policy']['learner_stats']
        entropy = learner_stats['entropy']
        vf_loss = learner_stats['vf_loss']
    return reward_mean, entropy, vf_loss


def _train_until_plateau(agent, label, save_dir, algorithm, patience, maximize):
    """Train `agent` batch by batch until its score stops improving.

    The score is the mean episode reward when `maximize` is True and the negated
    mean episode reward otherwise; "improving" means a higher score when
    maximising and a lower score when minimising. Every improving batch is
    checkpointed into `save_dir`, and the checkpoint location is also written to
    `save_dir`/checkpoint_path. Training stops after `patience` consecutive
    evaluated batches without improvement, and the agent is then restored to
    its best checkpoint.

    The best score starts at -inf/+inf (rather than 0) so that the first
    evaluated batch always produces a checkpoint. This guarantees the restore
    below never falls back to a checkpoint belonging to another agent or to an
    undefined path.

    Returns:
        (best_score, best_checkpoint_path)

    Raises:
        RuntimeError: if no evaluated batch produced a comparable score
            (e.g. every score was NaN), so no checkpoint exists to restore.
    """
    best_score = float('-inf') if maximize else float('inf')
    best_checkpoint = None
    remaining = patience
    batch = 0
    loop_start = time.time()
    while True:
        batch += 1
        start = time.time()
        result = agent.train()
        elapsed_time = time.time() - start

        reward_mean, entropy, vf_loss = _batch_metrics(result, algorithm)
        score = reward_mean if maximize else -reward_mean
        print(f"Batch {batch} -- {label} Score: {score:0.2f}    Entropy: {entropy:0.2f}    VF Loss: {vf_loss:0.2f}", end="    ")
        print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

        # The first batch is reported but never considered for checkpointing.
        if batch == 1:
            continue

        improved = score > best_score if maximize else score < best_score
        if improved:
            best_score = score
            remaining = patience
            best_checkpoint = agent.save(checkpoint_dir=save_dir)
            _write_text(f"{save_dir}/checkpoint_path", best_checkpoint)
        elif remaining > 1:
            remaining -= 1
        else:
            # The agent is no longer improving: stop and keep the best checkpoint.
            break

    if best_checkpoint is None:
        raise RuntimeError(f"{label} agent produced no usable score; no checkpoint was saved in {save_dir}")
    agent.restore(best_checkpoint)
    print('Batch time:', time.strftime("%H:%M:%S", time.gmtime(time.time()-loop_start)))
    print(best_checkpoint)
    return best_score, best_checkpoint


# Create Initial Policies
ray.init(ignore_reinit_error=True, log_to_driver=False)
blue = build_blue_agent(fresh=True, opponent=True)
print("Pass 1")
red = build_red_agent(fresh=True)
print("Pass 2")

# Convert float string into a float
# Useful if loading rewards from certain generation
blue_scores = [float(item) for item in blue_scores]
red_scores = [float(item) for item in red_scores]

print()
print("+--------------------------------+")
print("| Red Competitive Training Start |")
print("+--------------------------------+")
print()

for g in range(start_from_generation, generations+1):

    # Time how long each generation takes
    g_time = time.time()

    # Size the banner rule from the banner itself so it fits any generation number
    banner = f"| Generation {g} |"
    rule = '+' + '-'*(len(banner)-2) + '+'
    print(rule)
    print(banner)
    print(rule)
    print()

    # Each generation restarts the red agent from the generation-0 checkpoint
    red.restore(f"{policy_root}/red_competitive_pool/competitive_red_0/checkpoint_000000")
    red_max, checkpoint_path = _train_until_plateau(
        red, "Red", f"{policy_root}/red_competitive_pool/competitive_red_{g}",
        selected_algorithm, tolerance, maximize=True)
    red_scores.append(red_max)

    # Record how many generations the red pool now holds
    pool_size = g
    _write_text(f"{policy_root}/red_competitive_pool/pool_size", str(pool_size))
    print()

    # Each generation restarts the blue opponent from the generation-0 checkpoint
    blue.restore(f"{policy_root}/blue_opponent_pool/opponent_blue_0/checkpoint_000000")
    # The blue score is the negated mean episode reward, so lower is better for blue
    blue_min, checkpoint_path = _train_until_plateau(
        blue, "Blue", f"{policy_root}/blue_opponent_pool/opponent_blue_{g}",
        selected_algorithm, tolerance, maximize=False)
    blue_scores.append(blue_min)

    _write_text(f"{policy_root}/blue_opponent_pool/pool_size", str(pool_size))
    print()

    print(f'Blue Scores so far {["%.2f" % i for i in blue_scores]}')
    print(f'Red Scores so far {["%.2f" % i for i in red_scores]}')
    print("Total time: ",time.strftime("%H:%M:%S", time.gmtime(time.time()-g_time)))
    print()
    
    # print(f'-------- Sample Game for Generation {g} --------')
    # sample(red, blue, verbose=True, show_policy=True)
    # print()

selecting blue ppo config


#### Evaluate Red MinMax Performance

In [None]:
# Number of games sampled for each (red, blue) pairing (default is 50)
sample_games = 1

# Indexed by red generation; index 0 is unused.
r_min = [float('inf')]*(generations+1) # red agent minimum (worst-case) scores
r_min_op = [0]*(generations+1) # id of the blue opponent that produced that minimum
# Start from -inf (not 0) so a best agent is still selected when every
# worst-case score is zero or negative.
r_best_score = float('-inf')
r_best_id = 0

red_pool_dir = f"./policies/{selected_algorithm}/{selected_timestep}/red_competitive_pool"
blue_pool_dir = f"./policies/{selected_algorithm}/{selected_timestep}/blue_opponent_pool"

# Resolve the checkpoint of every existing blue opponent once, newest first,
# instead of re-reading the pointer files for each red agent.
blue_checkpoints = {}
for b in range(generations,0,-1):
    blue_pointer = f"{blue_pool_dir}/opponent_blue_{b}/checkpoint_path"
    if os.path.exists(blue_pointer):
        with open(blue_pointer, "r") as path_file:
            blue_checkpoints[b] = path_file.read()

# evaluate existing pool of agents
print('Evaluating Agents...')
g_start = time.time()

# iteration through red agents, newest first
for r in range(generations,0,-1):

    # Skip generations that were never trained
    red_pointer = f"{red_pool_dir}/competitive_red_{r}/checkpoint_path"
    if not os.path.exists(red_pointer):
        continue

    start = time.time()
    with open(red_pointer, "r") as path_file:
        red_restore_path = path_file.read()
    red.restore(red_restore_path)

    # iterate through blue opponents and keep the lowest score red achieves
    for b, blue_restore_path in blue_checkpoints.items():
        blue.restore(blue_restore_path)
        score = sample(red, blue, games=sample_games)
        if score < r_min[r]:
            r_min[r] = score
            r_min_op[r] = b

    print(f'Red Agent {r} expects a minimum of {r_min[r]:0.2f} points, against Blue Opponent {r_min_op[r]}.', end="\t")
    print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(time.time()-start)))

    # Only rank agents that actually played an opponent: an untouched 'inf'
    # placeholder must not win the comparison.
    if r_min[r] != float('inf') and r_min[r] > r_best_score:
        r_best_score = r_min[r]
        r_best_id = r
print()

if r_best_id == 0:
    raise RuntimeError("No red agent could be evaluated: check that both policy pools contain trained generations")

print(f'Top performing Red Agent is generation {r_best_id}')
print("Total Execution Time: ", time.strftime("%H:%M:%S", time.gmtime(time.time()-g_start)))

# Publish the checkpoint of the best red agent as the competitive red policy
with open(f"{red_pool_dir}/competitive_red_{r_best_id}/checkpoint_path", "r") as path_file:
    red_competitive_path = path_file.read()
with open(f"./policies/{selected_algorithm}/{selected_timestep}/competitive_red_policy", "w") as path_file:
    path_file.write(red_competitive_path)

ray.shutdown()

#### Observe Exploitability of each Red Policy

In [None]:
# Exploitability of each Red Policy: the gap between the best worst-case score
# found during evaluation and that policy's own worst-case score.
r_exp = [r_best_score - worst_case for worst_case in r_min[1:]]
print(r_exp)

#### Plot Red Training Scores

In [None]:
# Scores During Training (generations are numbered from 1)
id_plot = list(range(1, len(red_scores)+1))
data_plot = pd.DataFrame({"Generation":id_plot, "Training Score":red_scores, "Blue Scores":blue_scores})
plt.figure()
sns.lineplot(x = "Generation", y = "Training Score", data=data_plot, color='red', label="Red Agent Training Score")
sns.lineplot(x = "Generation", y = "Blue Scores", data=data_plot, color='blue', label="Blue Opponent Score")
plt.show()

# Minmax Evaluation
# r_min[0] is an unused placeholder, so r_min[1:] holds generations 1..N.
# The axis is sized from the plotted data itself so it always matches in length.
min_expected = r_min[1:]
id_plot = list(range(1, len(min_expected)+1))
data_plot = pd.DataFrame({"Generation":id_plot, "Min Expected Score":min_expected})
plt.figure()
sns.regplot(x = "Generation", y = "Min Expected Score", data=data_plot, color='red', scatter_kws={'s':5}, label="Red Policy Min Expected Score")
plt.legend()
plt.show()

# Exploitability (one value per generation, starting at generation 1)
id_plot = list(range(1, len(r_exp)+1))
data_plot = pd.DataFrame({"Generation":id_plot, "Exploitability":r_exp})
plt.figure()
sns.regplot(x = "Generation", y = "Exploitability", data=data_plot, color='red', scatter_kws={'s':5}, label="RedPolicy Exploitability")
plt.legend()
plt.show()

#### Plot Red Training Scores (First 5 points dropped)

In [None]:
# Number of leading generations to leave out of every plot in this cell
drop_first = 5

# Scores During Training (generations are numbered from 1)
id_plot = list(range(1, len(red_scores)+1))
data_plot = pd.DataFrame({"Generation":id_plot[drop_first:], "Training Score":red_scores[drop_first:], "Blue Scores":blue_scores[drop_first:]})
plt.figure()
sns.lineplot(x = "Generation", y = "Training Score", data=data_plot, color='red', label="Red Agent Training Score")
sns.lineplot(x = "Generation", y = "Blue Scores", data=data_plot, color='blue', label="Blue Opponent Score")
plt.show()

# Minmax Evaluation
# r_min[0] is an unused placeholder, so r_min[1:] holds generations 1..N.
# The axis is sized from the plotted data so labels and values stay aligned.
min_expected = r_min[1:]
id_plot = list(range(1, len(min_expected)+1))
data_plot = pd.DataFrame({"Generation":id_plot[drop_first:], "Min Expected Score":min_expected[drop_first:]})
plt.figure()
sns.regplot(x = "Generation", y = "Min Expected Score", data=data_plot, color='red', scatter_kws={'s':5}, label="Red Policy Min Expected Score")
plt.legend()
plt.show()

# Exploitability (one value per generation, starting at generation 1)
id_plot = list(range(1, len(r_exp)+1))
data_plot = pd.DataFrame({"Generation":id_plot[drop_first:], "Exploitability":r_exp[drop_first:]})
plt.figure()
sns.regplot(x = "Generation", y = "Exploitability", data=data_plot, color='red', scatter_kws={'s':5}, label="Red Policy Exploitability")
plt.legend()
plt.show()

#### START - ISOLATED TESTING ####

In [4]:
# How many games to play when measuring performance
sample_games = 1000

# Training configuration that identifies which policy directory to read from
selected_timestep = get_timesteps()
selected_algorithm = get_algorithm_select()

# Only the Mindrake blue agent is built for this isolated test
# cardiff_agent = build_cardiff_agent()
mindrake_agent = build_mindrake_agent()

# Rebuild the red agent and load the checkpoint recorded as the best competitive policy
red = build_red_agent(fresh=False)
with open(f"./policies/{selected_algorithm}/{selected_timestep}/competitive_red_policy", "r") as red_file:
    red_optimal_filepath = red_file.read()
print(red_optimal_filepath)
red.restore(red_optimal_filepath)

# Cardiff evaluation is disabled here
# sample_against_cardiff(test_red=red, test_blue=cardiff_agent, verbose=False, games=sample_games)

# Play the restored red agent against Mindrake
sample_against_mindrake(
    test_red=red,
    test_blue=mindrake_agent,
    verbose=False,
    games=sample_games,
)

Using checkpoint file (Controller): C:\Users\takys\OneDrive\Documents\UCSC\Year 6\cse297\Graduate-Capstone\CybORG-Competitive\mindrake/logs/bandits/controller_bandit/bandit_controller_15000.pkl
Using checkpoint file (B-line): C:\Users\takys\OneDrive\Documents\UCSC\Year 6\cse297\Graduate-Capstone\CybORG-Competitive\mindrake/logs/various/SR/PPO/checkpoint_001916/checkpoint-1916
Using checkpoint file (Red Meander): C:\Users\takys\OneDrive\Documents\UCSC\Year 6\cse297\Graduate-Capstone\CybORG-Competitive\mindrake/logs/various/PPO/PPO/checkpoint_001829/checkpoint-1829


[2m[36m(RolloutWorker pid=4300)[0m   import pkg_resources
[2m[36m(RolloutWorker pid=17352)[0m   import pkg_resources
[2m[36m(RolloutWorker pid=4300)[0m Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
[2m[36m(RolloutWorker pid=4300)[0m   declare_namespace(pkg)
[2m[36m(RolloutWorker pid=4300)[0m Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
[2m[36m(RolloutWorker pid=4300)[0m   declare_namespace(pkg)
[2m[36m(RolloutWorker pid=17352)[0m Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
[2

selecting red ppo config


[2m[36m(RolloutWorker pid=18056)[0m   import pkg_resources
[2m[36m(RolloutWorker pid=18056)[0m Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
[2m[36m(RolloutWorker pid=18056)[0m   declare_namespace(pkg)
[2m[36m(RolloutWorker pid=18056)[0m Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
[2m[36m(RolloutWorker pid=18056)[0m   declare_namespace(pkg)


[2m[36m(RolloutWorker pid=18056)[0m Creating red trainer
[2m[36m(RolloutWorker pid=18056)[0m opponent is selecting ppo




./policies/ppo/100/red_competitive_pool/competitive_red_9/checkpoint_000106
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanned column likely inaccurate.
Scanne

#### STOP - ISOLATED TESTING ####

#### Competitive Red Agent Performance against Cardiff & Mindrake Blue Agents

In [None]:
# How many games to play against each blue agent
sample_games = 100

# Training configuration that identifies which policy directory to read from
selected_timestep = get_timesteps()
selected_algorithm = get_algorithm_select()

# Build the two blue agents to evaluate against
cardiff_agent = build_cardiff_agent()
mindrake_agent = build_mindrake_agent()

# Rebuild the red agent and load the checkpoint recorded as the best competitive policy
red = build_red_agent(fresh=False)
with open(f"./policies/{selected_algorithm}/{selected_timestep}/competitive_red_policy", "r") as red_file:
    red_optimal_filepath = red_file.read()
print(red_optimal_filepath)
red.restore(red_optimal_filepath)

# Play the restored red agent against Cardiff
sample_against_cardiff(
    test_red=red,
    test_blue=cardiff_agent,
    verbose=False,
    games=sample_games,
)

# Play the restored red agent against Mindrake
sample_against_mindrake(
    test_red=red,
    test_blue=mindrake_agent,
    verbose=False,
    games=sample_games,
)

#### Results saved in 'games' folder ####