In [None]:
import torch as t
import numpy as np
import os
import json
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
import pandas
import pandas
from hyperparam_opt_wrapper import HyperparamOptimizerWrapper
import json
from hyperparameter_search_policies import opt_types
import statistics

In [None]:
FILE_PATH = "/workspace/results/[run]2025-2-20_12:18:54/"
result_files = os.listdir(FILE_PATH)
result_paths = []
for file in result_files:
    result_paths.append(f"{FILE_PATH}{file}")
FEATURE_NAMES = ['length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore', 'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma', 'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com', 'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url', 'ratio_digits_host', 'punycode', 'port', 'tld_in_path', 'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains', 'prefix_suffix', 'random_domain', 'shortening_service', 'path_extension', 'nb_redirection', 'nb_external_redirection', 'length_words_raw', 'char_repeat', 'shortest_words_raw', 'shortest_word_host', 'shortest_word_path', 'longest_words_raw', 'longest_word_host', 'longest_word_path', 'avg_words_raw', 'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand', 'brand_in_subdomain', 'brand_in_path', 'suspecious_tld', 'statistical_report', 'whois_registered_domain', 'domain_registration_length', 'domain_age', 'web_traffic', 'dns_record', 'google_index', 'page_rank']

In [None]:
def select_by_identifier(file_names, identifier):
    remaining_file_names = []
    selected_file_names = []
    for file_name in file_names:
        if identifier in file_name:
            selected_file_names.append(file_name)
        else:
            remaining_file_names.append(file_name)
    return selected_file_names, remaining_file_names

In [None]:
def update_search_run_grid(data, axis=None):
    if axis is None:
        _, axis = plt.subplots(3, 2)
        axis[0,0].set_title("probability difference")
        axis[0,1].set_title("total target probability")
        axis[1,0].set_title("gower distance")
        axis[1,1].set_title("constraint loss")
        axis[2,0].set_title("cost function")

    # prob_diff
    axis[0,0].plot(data["step"], data["prob_diff"])

    #total_node_prob
    axis[0,1].plot(data["step"], data["total_node_prob"])

    # gower_dist
    axis[1,0].plot(data["step"], data["gower_dist"])

    # const_loss
    axis[1,1].plot(data["step"], data["const_loss"])

    # cost_func
    axis[2,0].plot(data["step"], data["cost_func"])

    return axis

In [None]:
def display_search_runs(logs, qualitative_samples = 2):
    for log in logs:
        with open(log, 'r') as file:
            log_dict = json.load(file)
        adversarial_id = log.split("/")[-1].split("]")[0][1:]
        quali_printed = 0
        while quali_printed < qualitative_samples:
            search_data = log_dict[list(log_dict.keys())[quali_printed]]
            plot_ax = update_search_run_grid(search_data)
            plt.show(False)
            plt.savefig(f"{FILE_PATH}[{adversarial_id}]quali_search_plot_{quali_printed}")
            plt.close("all")
            quali_printed += 1
        plot_ax = update_search_run_grid(log_dict[list(log_dict.keys())[0]])
        for key in list(log_dict.keys())[1:]:
            plot_ax = update_search_run_grid(log_dict[key], plot_ax)
        plt.show(False)
        plt.savefig(f"{FILE_PATH}[{adversarial_id}]quanti_search_plot_lines")   
        plt.close("all")

In [None]:
def get_best_node_metrics(log_dict):
    gower_dists = []
    total_probs = []
    prob_diffs = []
    for key in log_dict.keys():
        eval_func_search_trace = log_dict[key]["cost_func"]
        best_idx = eval_func_search_trace.index(min(eval_func_search_trace))
        gower_dists.append(log_dict[key]["gower_dist"][best_idx])
        total_probs.append(log_dict[key]["total_node_prob"][best_idx])
        prob_diffs.append(log_dict[key]["prob_diff"][best_idx])
    return gower_dists, total_probs, prob_diffs
        

In [None]:
def display_result_scatter(logs):
    for log in logs:
        with open(log, 'r') as file:
            log_dict = json.load(file)
        adversarial_id = log.split("/")[-1].split("]")[0][1:]
        gower_dists, total_probs, prob_diffs = get_best_node_metrics(log_dict)
        figure, axes = plt.subplots(1, 2)
        axes[0].scatter(gower_dists, total_probs)
        axes[0].set_title('gower by total probability')
        axes[0].set_xlabel("gower dist")
        axes[0].set_ylabel("total prob")
        axes[1].scatter(gower_dists, prob_diffs)
        axes[1].set_title('gower by probability change')
        axes[1].set_xlabel("gower dist")
        axes[1].set_ylabel("prob diff")
        plt.show(False)
        plt.savefig(f"{FILE_PATH}[{adversarial_id}]quanti_search_plot_scatter")   
        plt.close("all")

In [None]:
def display_conf_mats(logs):
    for log in logs:
        with open(log, 'r') as file:
            log_dict = json.load(file)
        adversarial_id = log.split("/")[-1].split("]")[0][1:]
        conf_mat_before = np.asarray(log_dict["before"])
        conf_mat_after = np.asarray(log_dict["after"])
        
        before_disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat_before)
        before_disp.plot()
        plt.show(False)
        plt.savefig(f"{FILE_PATH}[{adversarial_id}]conf_mat_before")   
        plt.close("all")

        after_disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat_after)
        after_disp.plot()
        plt.show(False)
        plt.savefig(f"{FILE_PATH}[{adversarial_id}]conf_mat_after")   
        plt.close("all")

In [None]:
def process_search_logs_runs(file_names):
    search_logs, remainder = select_by_identifier(file_names, "search_logs")
    display_search_runs(search_logs)
    return remainder

In [None]:
def process_search_logs_scatter(file_names):
    search_logs, remainder = select_by_identifier(file_names, "search_logs")
    display_result_scatter(search_logs)
    return remainder

In [None]:
def process_conf_mats(file_names):
    conf_mat_logs, remainder = select_by_identifier(file_names, "confusion_matrices")
    display_conf_mats(conf_mat_logs)
    return remainder

In [None]:
def get_feature_analysis_metrics(feature_distance_tensors):
    adj_feat_ratio_list = []
    mean_adj_list = []
    adj_var_list = []
    adv_id_map = {}
    tensor_idx = 0
    for tensor_name in feature_distance_tensors:
        feature_distance_tensor = t.load(tensor_name)
        if "]" in tensor_name:
            adversarial_id = tensor_name.split("/")[-1].split("]")[0][1:]
        else:
            identifier_list = tensor_name.split("/")[-1].split("_")
            adversarial_id = f"{identifier_list[0]}_{identifier_list[1]}"
        feat_adjusted_tensor = t.logical_not(t.eq(feature_distance_tensor, 0)).long()

        adjusted_feature_counts = t.sum(feat_adjusted_tensor, dim=0)
        adj_feat_ratio_list.append(t.div(adjusted_feature_counts, feature_distance_tensor.shape[0]))
        mean_adj_list.append(t.mean(feature_distance_tensor, dim=0))
        adj_var_list.append(t.var(feature_distance_tensor, dim=0))
        adv_id_map[tensor_idx] = adversarial_id
        tensor_idx += 1

    adj_feat_ratios = t.cat(adj_feat_ratio_list, dim=0)
    mean_adjs = t.cat(mean_adj_list, dim=0)
    adj_vars = t.cat(adj_var_list, dim=0)
    return adj_feat_ratios, mean_adjs, adj_vars, adv_id_map
        

In [None]:
def print_avg_feat_analysis(results_dict):
    avg_dict = {}
    for entry_id in results_dict.keys():
        local_dict = results_dict[entry_id]
        for feature in local_dict.keys():
            if feature not in avg_dict.keys():
                avg_dict[feature] = {}
            local_local_dict = local_dict[feature]
            for metric in local_local_dict.keys():
                if metric not in avg_dict[feature].keys():
                    avg_dict[feature][metric] = []
                avg_dict[feature][metric].append(results_dict[entry_id][feature][metric])
    for feature in avg_dict.keys():
        for metric in avg_dict[feature].keys():
            avg_dict[feature][metric] = statistics.mean(avg_dict[feature][metric])
    sortable_avg_dict = {}
    for feature_name in avg_dict.keys():
        if "feat_name" not in sortable_avg_dict.keys():
            sortable_avg_dict["feat_name"] = t.Tensor([feature_name])
        else:
            sortable_avg_dict["feat_name"] = t.cat([sortable_avg_dict["feat_name"], t.Tensor([feature_name])], dim=0)

        for metric in avg_dict[feature_name].keys():
            if metric not in sortable_avg_dict.keys():
                sortable_avg_dict[metric] = t.Tensor([feature_name])
            else:
                sortable_avg_dict[metric] = t.cat([sortable_avg_dict[metric], t.Tensor([feature_name])], dim=0) 

    sort_idxs = t.argsort(sortable_avg_dict["feat_ratio"])
    for key in sortable_avg_dict.keys():
        sortable_avg_dict[key] = sortable_avg_dict[key][sort_idxs]
    
    print("feat_name \t adj_ratio \t mean_dist \t dist_var")
    for idx, feature_name in enumerate(sortable_avg_dict["feat_name"]):
        print(f"{feature_name} \t {round(sortable_avg_dict['feat_ratio'][idx],5)} \t {round(sortable_avg_dict['mean_dist'][idx],5)} \t {round(sortable_avg_dict['dist_var'][idx],5)}")

In [None]:
def process_feature_analysis(file_names):
    feature_distances, remainder = select_by_identifier(file_names, "feat_dists")
    feat_ratios, mean_dists, dist_vars, adv_id_map = get_feature_analysis_metrics(feature_distances)
    result_dict = {}
    for idx in adv_id_map.keys():
        adv_id = adv_id_map[idx]
        sort_idxs = t.argsort(feat_ratios[idx])
        feat_ratios_sorted = feat_ratios[idx][sort_idxs]
        mean_dists_sorted = mean_dists[idx][sort_idxs]
        dist_vars_sorted = dist_vars[idx][sort_idxs]
        result_dict[adv_id] = {}
        for feat_idx in range(len(sort_idxs)):
            feat_ratio = feat_ratios_sorted[idx][feat_idx]
            mean_dist = mean_dists_sorted[idx][feat_idx]
            dist_var = dist_vars_sorted[idx][feat_idx]
            feat_name = FEATURE_NAMES[feat_idx]
            result_dict[adv_id][feat_name] = {"feat_ratio": feat_ratio, "mean_dist": mean_dist, "dist_var": dist_var}
    result_dict_obj = json.dumps(result_dict)
    with open(f"{FILE_PATH}feature_adj_analysis", "w") as outfile:
        outfile.write(result_dict_obj)
    print_avg_feat_analysis(result_dict)
    return result_dict, remainder

In [None]:
def stringify(dataframe):
    for key in dataframe.keys():
        if not isinstance(dataframe[key], dict) and not isinstance(dataframe[key], list):
            dataframe[key] = str(dataframe[key])
        elif isinstance(dataframe[key], dict):
            dataframe[key] = stringify(dataframe[key])
        else:
            dataframe[key] = [str(elem) for elem in dataframe[key]]
    return dataframe

In [None]:
def read_result_pkl(result_pkl_path, results=True):
    dictionary = pandas.read_pickle(result_pkl_path)
    print(type(dictionary))
    dictionary = stringify(dictionary)
    if results:
        with open(f"{FILE_PATH}results.json", "w") as outfile: 
            json.dump(dictionary, outfile)
    else:
        with open(f"{FILE_PATH}configs.json", "w") as outfile: 
            json.dump(dictionary, outfile)
    print(dictionary)

In [None]:
process_conf_mats(result_paths)

In [None]:
process_search_logs_runs(result_paths)

In [None]:
process_search_logs_scatter(result_paths)

In [None]:
read_result_pkl(f"{FILE_PATH}results.pkl")
read_result_pkl(f"{FILE_PATH}configs.pkl", False)

In [28]:
result_dict, remainder = process_feature_analysis(result_paths)
print_avg_feat_analysis(result_dict)

RuntimeError: Tensors must have same number of dimensions: got 1 and 2