In [1]:
# Set constants
# Label written into the "Dataset" column of every trial row.
DATASET: str = "Task 3"
# When True, only trial groups flagged with "HP Match" are kept for the final table.
ALLOW_ONLY_HP_MATCHES: bool = True
# Minimum number of autoencoder trials ("AE Count") a group needs to be reported.
TRIALS_REQUIRED: int = 3
# NOTE(review): no use of these thresholds is visible in this section; confirm their meaning with the consumer.
DIFFERENCE_THRESHS: list = [
    1.05,
    1.1,
    1.5,
    3,
    4.5,
]

In [2]:
# Load data
import pandas as pd

# Classifier trial results; each row references an autoencoder via "Based on AE".
latent_trials_path = "latent_trials.csv"
data = pd.read_csv(latent_trials_path)
data.head()

Unnamed: 0,Classifier,Based on AE,sigma,precision,recall,f-score,max_iter,auc,n_estimators,max_depth,...,gamma,kernel,probability,algorithm,leaf_size,n_neighbors,contamination,n_bins,sigma_base,sigma_mult
0,ReconstructionThreshold,20240209141704,0.8,0.434783,0.740741,0.547945,,,,,...,,,,,,,,,,
1,ReconstructionThreshold,20240209141704,1.0,0.472222,0.62963,0.539683,,,,,...,,,,,,,,,,
2,ReconstructionThreshold,20240209141704,1.2,0.555556,0.555556,0.555556,,,,,...,,,,,,,,,,
3,ReconstructionThreshold,20240209141704,1.4,0.578947,0.407407,0.478261,,,,,...,,,,,,,,,,
4,ReconstructionThreshold,20240209141704,1.6,0.5,0.222222,0.307692,,,,,...,,,,,,,,,,


In [3]:
# Add processed classifier column
# "ReconstructionThreshold" rows get their sigma appended (two decimals), so every
# threshold setting becomes a distinct classifier name; other names are kept as-is.
data["Processed Classifier"] = [
    name + "_%.2f" % sigma_value if name == "ReconstructionThreshold" else name
    for name, sigma_value in zip(data["Classifier"], data["sigma"])
]
# Alternative without the sigma suffix:
# data["Processed Classifier"] = data["Classifier"]
data = data.drop(columns=["Classifier"])

# Same dataset label on every row (scalar is broadcast down the column).
data["Dataset"] = DATASET

In [4]:
# Add AE data

# keep_default_na=False keeps empty cells as "" instead of NaN, so an empty
# "leader" is falsy in the fallback below.
ae_data = pd.read_csv("ae_trials.csv", keep_default_na=False)
# Keep only the columns needed for the AE lookup. The explicit .copy() makes this
# an independent frame, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
ae_data = ae_data[["name", "type", "input_size", "attempt", "leader", "masking", "mask_size"]].copy()

# Primary model: the leader's name when one is given, otherwise the AE's own name,
# formatted as an integer string.
ae_data["Primary Model"] = [
    "%d" % float(leader or name)
    for name, leader in zip(ae_data["name"], ae_data["leader"])
]
ae_data = ae_data.drop(columns=["leader"])

# Masked autoencoders get their mask size appended to the type label.
# (Loop variable is named ae_type so the builtin `type` is not shadowed.)
ae_data["type"] = [
    ae_type + " (%.2f Masked)" % mask_size if masked else ae_type
    for ae_type, masked, mask_size in zip(ae_data["type"], ae_data["masking"], ae_data["mask_size"])
]

ae_data.head()

Unnamed: 0,name,type,input_size,attempt,masking,mask_size,Primary Model
0,20240209141704,Adversarial AE (0.25 Masked),12288,0,True,0.25,20240209141704
1,20240209141849,Adversarial AE (0.25 Masked),12288,0,True,0.25,20240209141849
2,20240209141955,Adversarial AE (0.75 Masked),12288,0,True,0.75,20240209141955
3,20240212142033,Adversarial AE (0.50 Masked),12288,0,True,0.5,20240212142033
4,20240212142411,Adversarial AE,12288,0,False,0.25,20240212142411


In [5]:
# Convert AE data to dict and add to ll data

# Select the AE columns by name, in the order of the target columns assigned below,
# so the lookup does not depend on the column order of the ae_data frame.
ae_lookup_columns = ["name", "type", "input_size", "attempt", "Primary Model", "masking", "mask_size"]
# NOTE: ae_data is rebound from a DataFrame to a dict {AE name: [AE fields...]},
# so this cell cannot be re-run without re-running the cell that builds the frame.
ae_data = {row[0]: list(row) for row in ae_data[ae_lookup_columns].values}

# Keep only trials whose autoencoder is known. The .copy() makes the filtered frame
# independent, so the multi-column assignment below does not trigger pandas'
# SettingWithCopyWarning.
data = data[[based_on in ae_data for based_on in data["Based on AE"].values]].copy()
data[["AE", "Type", "Input Size", "Attempt", "Primary AE Model", "Mask", "Mask Size"]] = [ae_data[based_on] for based_on in data["Based on AE"].values.tolist()]

data.head()

Unnamed: 0,Based on AE,sigma,precision,recall,f-score,max_iter,auc,n_estimators,max_depth,min_samples_leaf,...,sigma_mult,Processed Classifier,Dataset,AE,Type,Input Size,Attempt,Primary AE Model,Mask,Mask Size
0,20240209141704,0.8,0.434783,0.740741,0.547945,,,,,,...,,ReconstructionThreshold_0.80,Task 3,20240209141704,Adversarial AE (0.25 Masked),12288,0,20240209141704,True,0.25
1,20240209141704,1.0,0.472222,0.62963,0.539683,,,,,,...,,ReconstructionThreshold_1.00,Task 3,20240209141704,Adversarial AE (0.25 Masked),12288,0,20240209141704,True,0.25
2,20240209141704,1.2,0.555556,0.555556,0.555556,,,,,,...,,ReconstructionThreshold_1.20,Task 3,20240209141704,Adversarial AE (0.25 Masked),12288,0,20240209141704,True,0.25
3,20240209141704,1.4,0.578947,0.407407,0.478261,,,,,,...,,ReconstructionThreshold_1.40,Task 3,20240209141704,Adversarial AE (0.25 Masked),12288,0,20240209141704,True,0.25
4,20240209141704,1.6,0.5,0.222222,0.307692,,,,,,...,,ReconstructionThreshold_1.60,Task 3,20240209141704,Adversarial AE (0.25 Masked),12288,0,20240209141704,True,0.25


In [6]:
# Create groups and check if hyper-params are the same
hyper_params_to_check = [
    "max_iter",
    "n_estimators",
    "max_depth",
    "min_samples_leaf",
    "numEstimators",
    "C",
    "gamma",
    "kernel",
    "probability",
    "algorithm",
    "leaf_size",
    "n_neighbors",
    "n_bins",
    # "sigma_base",
    # "sigma_mult",
    "sigma",
]

# One group per (primary autoencoder, classifier): count how many distinct values
# each hyper-parameter takes inside the group.
# NOTE(review): nunique() ignores NaN by default, so a group mixing NaN with a single
# real value still counts as one distinct value - confirm this is intended.
hp_group_keys = ["Primary AE Model", "Processed Classifier"]
hp_distinct_counts = (
    data[hp_group_keys + hyper_params_to_check]
    .groupby(hp_group_keys, group_keys=False)
    .nunique()
)

# (primary AE, classifier) -> True when no hyper-parameter has more than one distinct value.
data_uniqueness = dict(zip(hp_distinct_counts.index, (hp_distinct_counts <= 1).all(axis=1)))

data["HP Match"] = [
    data_uniqueness[(primary_model, classifier_name)]
    for primary_model, classifier_name in zip(data["Primary AE Model"], data["Processed Classifier"])
]

# The processed name becomes the reported classifier name; the column order is fixed
# by the explicit selection below.
data = data.rename(columns={"Processed Classifier": "Classifier"})
data = data[["Dataset", "Type", "Input Size", "Primary AE Model", "AE", "Classifier", "Attempt", "HP Match", "precision", "recall", "f-score", "auc"]]
data.head()

Unnamed: 0,Dataset,Type,Input Size,Primary AE Model,AE,Classifier,Attempt,HP Match,precision,recall,f-score,auc
0,Task 3,Adversarial AE (0.25 Masked),12288,20240209141704,20240209141704,ReconstructionThreshold_0.80,0,True,0.434783,0.740741,0.547945,
1,Task 3,Adversarial AE (0.25 Masked),12288,20240209141704,20240209141704,ReconstructionThreshold_1.00,0,True,0.472222,0.62963,0.539683,
2,Task 3,Adversarial AE (0.25 Masked),12288,20240209141704,20240209141704,ReconstructionThreshold_1.20,0,True,0.555556,0.555556,0.555556,
3,Task 3,Adversarial AE (0.25 Masked),12288,20240209141704,20240209141704,ReconstructionThreshold_1.40,0,True,0.578947,0.407407,0.478261,
4,Task 3,Adversarial AE (0.25 Masked),12288,20240209141704,20240209141704,ReconstructionThreshold_1.60,0,True,0.5,0.222222,0.307692,


In [7]:
# Rename p, r and f1 (and auc) to their display names
metric_display_names = {
    "precision": "Precision",
    "recall": "Recall",
    "f-score": "F1-Score",
    "auc": "Area Under Curve",
}
data = data.rename(columns=metric_display_names)

In [8]:
# Create final table
def row_max(df):
    """Return the first row of ``df`` whose "F1-Score" equals the column maximum.

    Args:
        df: DataFrame with an "F1-Score" column.

    Returns:
        The selected row as a Series. When several rows tie for the maximum,
        the first one in ``df``'s order is returned.

    Raises:
        IndexError: if no row equals the maximum (e.g. ``df`` is empty).
    """
    f1_scores = df["F1-Score"]
    is_best = f1_scores == f1_scores.max()
    return df[is_best].iloc[0]

# Optionally keep only the trials whose group was flagged as a hyper-parameter match.
if ALLOW_ONLY_HP_MATCHES:
    data = data[data["HP Match"]]

trial_group_keys = ["Dataset", "Type", "Input Size", "Primary AE Model", "Classifier"]
metric_columns = ["Precision", "Recall", "F1-Score", "Area Under Curve"]
std_columns = ["Precision Standard Deviation", "Recall Standard Deviation", "F1-Score Standard Deviation", "Area Under Curve Standard Deviation"]

# Aggregate the individual trials of each (dataset, AE type, input size, primary model,
# classifier) group: mean and standard deviation of the metrics, plus the list of AEs.
data = data[["Dataset", "Type", "Input Size", "Primary AE Model", "AE", "Classifier"] + metric_columns]
trials_by_group = data.groupby(trial_group_keys)
data_std = trials_by_group[metric_columns].std()
aes = trials_by_group["AE"].apply(lambda ae_ids: ":".join("%d" % ae_id for ae_id in ae_ids))
data = trials_by_group[metric_columns].mean()

for std_name, metric_name in zip(std_columns, metric_columns):
    data[std_name] = data_std[metric_name]
data["Autoencoders"] = aes
# The AE names are joined with ":", so the number of entries is separators + 1.
data["AE Count"] = [joined.count(":") + 1 for joined in data["Autoencoders"]]
print(data["AE Count"].max())
data = data[data["AE Count"] >= TRIALS_REQUIRED]

# Drop groups whose mean recall is exactly 1 while mean precision is below 0.6.
recall_one_low_precision = data["Recall"].eq(1) & data["Precision"].lt(0.6)
data = data[~recall_one_low_precision]

# Per (dataset, input size, AE type, classifier), keep the primary model with the best mean F1.
data = data.groupby(["Dataset", "Input Size", "Type", "Classifier"]).apply(row_max)

# "F1 (precision, recall)" with four decimals each.
formated_strings = [
    "%.4f (%.4f, %.4f)" % (f1, precision, recall)
    for precision, recall, f1 in zip(data["Precision"], data["Recall"], data["F1-Score"])
]
data["Formated Results"] = formated_strings
data = data.sort_values(["Type", "Input Size", "F1-Score"], ascending=False)

data

5


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Precision,Recall,F1-Score,Area Under Curve,Precision Standard Deviation,Recall Standard Deviation,F1-Score Standard Deviation,Area Under Curve Standard Deviation,Autoencoders,AE Count,Formated Results
Dataset,Input Size,Type,Classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Task 3,4096,Vanilla AE (0.75 Masked),ReconstructionThreshold_1.00,0.470774,0.850074,0.599097,,0.082598,0.097229,0.059678,,20240229000807:20240229001203:20240229001557:2...,5,"0.5991 (0.4708, 0.8501)"
Task 3,4096,Vanilla AE (0.75 Masked),ReconstructionThreshold_1.20,0.504099,0.729208,0.585571,,0.063050,0.158246,0.058545,,20240229000807:20240229001203:20240229001557:2...,5,"0.5856 (0.5041, 0.7292)"
Task 3,4096,Vanilla AE (0.75 Masked),ReconstructionThreshold_0.80,0.426074,0.887704,0.571588,,0.061711,0.077911,0.049588,,20240228234047:20240228234652:20240228235238:2...,5,"0.5716 (0.4261, 0.8877)"
Task 3,4096,Vanilla AE (0.75 Masked),ReconstructionThreshold_1.40,0.518160,0.668764,0.569412,,0.068650,0.174745,0.055031,,20240229000807:20240229001203:20240229001557:2...,5,"0.5694 (0.5182, 0.6688)"
Task 3,4096,Vanilla AE (0.75 Masked),ReconstructionThreshold_2.00,0.885263,0.394553,0.539987,,0.205277,0.039505,0.070861,,20240229004635:20240229004846:20240229005051:2...,5,"0.5400 (0.8853, 0.3946)"
Task 3,4096,...,...,...,...,...,...,...,...,...,...,...,...,...
Task 3,4096,Adversarial AE,ReconstructionThreshold_5.20,0.240400,0.794142,0.368932,,0.020211,0.062829,0.029581,,20240228220642:20240228221015:20240228221406:2...,5,"0.3689 (0.2404, 0.7941)"
Task 3,4096,Adversarial AE,ReconstructionThreshold_5.00,0.239375,0.794142,0.367697,,0.020036,0.062829,0.029174,,20240228220642:20240228221015:20240228221406:2...,5,"0.3677 (0.2394, 0.7941)"
Task 3,4096,Adversarial AE,ReconstructionThreshold_5.60,0.239691,0.779043,0.366395,,0.017027,0.055675,0.024420,,20240228220642:20240228221015:20240228221406:2...,5,"0.3664 (0.2397, 0.7790)"
Task 3,4096,Adversarial AE,ReconstructionThreshold_5.40,0.238516,0.779043,0.365051,,0.017466,0.055675,0.025253,,20240228220642:20240228221015:20240228221406:2...,5,"0.3651 (0.2385, 0.7790)"


In [9]:
# Save final table

# Written with the index, which carries the Dataset / Input Size / Type / Classifier keys.
final_table_path = "Final LL Classifiers Table.csv"
data.to_csv(final_table_path)