In [1]:
# Notebook-wide configuration
# Minimum number of autoencoder trials a group needs to stay in the final table.
TRIALS_REQUIRED = 3
# When True, only groups whose trials share identical hyper-parameters are kept.
ALLOW_ONLY_HP_MATCHES = True
# Label written into the "Dataset" column of every row.
DATASET = "Task 1"
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
DIFFERENCE_THRESHS = [
    1.05,
    1.1,
    1.5,
    3,
    4.5,
]

In [2]:
# Load data
import pandas as pd

# Read the classifier trial results; the preview below is the cell's displayed output.
latent_trials_path = "latent_trials.csv"
data = pd.read_csv(latent_trials_path)
data.head()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,Classifier,Based on AE,sigma,precision,recall,f-score,max_iter,auc,n_estimators,max_depth,...,gamma,kernel,probability,algorithm,leaf_size,n_neighbors,sigma_base,sigma_mult,contamination,n_bins
0,ReconstructionThreshold,20240209090027,0.8,0.368421,0.4375,0.4,,,,,...,,,,,,,,,,
1,ReconstructionThreshold,20240209090027,1.0,0.5,0.3125,0.384615,,,,,...,,,,,,,,,,
2,ReconstructionThreshold,20240209090027,1.2,0.666667,0.25,0.363636,,,,,...,,,,,,,,,,
3,ReconstructionThreshold,20240209090027,1.4,0.666667,0.25,0.363636,,,,,...,,,,,,,,,,
4,ReconstructionThreshold,20240209090027,1.6,0.666667,0.25,0.363636,,,,,...,,,,,,,,,,


In [3]:
# Add processed classifier column
# ReconstructionThreshold rows get their sigma appended (two decimals) so each
# threshold setting is treated as a distinct classifier; other names pass through.
processed_names = []
for clf_name, clf_sigma in zip(data["Classifier"], data["sigma"]):
    if clf_name == "ReconstructionThreshold":
        processed_names.append(clf_name + "_%.2f" % clf_sigma)
    else:
        processed_names.append(clf_name)
data["Processed Classifier"] = processed_names
data = data.drop(columns=["Classifier"])

# Tag every row with the dataset label from the configuration cell.
data["Dataset"] = DATASET

In [4]:
# Load the autoencoder (AE) trials and derive the columns used for grouping

# keep_default_na=False leaves empty cells as "" instead of NaN, which the
# truthiness test on `leader` below relies on.
ae_columns = ["name", "type", "input_size", "attempt", "leader", "masking", "mask_size"]
ae_data = pd.read_csv("ae_trials.csv", keep_default_na=False)
ae_data = ae_data[ae_columns]

# A trial with a leader is filed under that leader; otherwise it is its own primary model.
primary_models = []
for trial_name, trial_leader in zip(ae_data["name"], ae_data["leader"]):
    primary_source = trial_leader if trial_leader else trial_name
    primary_models.append("%d" % float(primary_source))
ae_data["Primary Model"] = primary_models
ae_data = ae_data.drop(columns=["leader"])

# Masked autoencoders carry their mask size in the type label.
type_labels = []
for ae_type, is_masked, mask_size_value in zip(ae_data["type"], ae_data["masking"], ae_data["mask_size"]):
    if is_masked:
        ae_type = ae_type + " (%.2f Masked)" % mask_size_value
    type_labels.append(ae_type)
ae_data["type"] = type_labels

ae_data.head()

Unnamed: 0,name,type,input_size,attempt,masking,mask_size,Primary Model
0,20240209090027,Adversarial AE (0.75 Masked),4096,0,True,0.75,20240209090027
1,20240209090242,Vanilla AE (0.75 Masked),4096,0,True,0.75,20240209090027
2,20240209090429,Adversarial AE (0.75 Masked),4096,1,True,0.75,20240209090027
3,20240209090636,Vanilla AE (0.75 Masked),4096,1,True,0.75,20240209090027
4,20240209090821,Adversarial AE (0.75 Masked),4096,2,True,0.75,20240209090027


In [5]:
# Convert AE data to a lookup table and attach it to the classifier trial data

# The lookup gets its own name so `ae_data` remains a DataFrame; rebinding it to
# a dict made this cell fail on `ae_data.values` when executed a second time.
# Each entry is ordered to match the columns assigned below.
ae_lookup = {
    name: [name, ae_type, input_size, attempt, primary, masking, mask_size]
    for name, ae_type, input_size, attempt, masking, mask_size, primary in ae_data.values
}

# Keep only trials whose autoencoder is known. The explicit copy makes the
# filtered frame independent of the original before new columns are written.
data = data[data["Based on AE"].isin(list(ae_lookup))].copy()
data[["AE", "Type", "Input Size", "Attempt", "Primary AE Model", "Mask", "Mask Size"]] = [
    ae_lookup[based_on] for based_on in data["Based on AE"].values.tolist()
]

data.head()

Unnamed: 0,Based on AE,sigma,precision,recall,f-score,max_iter,auc,n_estimators,max_depth,min_samples_leaf,...,n_bins,Processed Classifier,Dataset,AE,Type,Input Size,Attempt,Primary AE Model,Mask,Mask Size
0,20240209090027,0.8,0.368421,0.4375,0.4,,,,,,...,,ReconstructionThreshold_0.80,Task 1,20240209090027,Adversarial AE (0.75 Masked),4096,0,20240209090027,True,0.75
1,20240209090027,1.0,0.5,0.3125,0.384615,,,,,,...,,ReconstructionThreshold_1.00,Task 1,20240209090027,Adversarial AE (0.75 Masked),4096,0,20240209090027,True,0.75
2,20240209090027,1.2,0.666667,0.25,0.363636,,,,,,...,,ReconstructionThreshold_1.20,Task 1,20240209090027,Adversarial AE (0.75 Masked),4096,0,20240209090027,True,0.75
3,20240209090027,1.4,0.666667,0.25,0.363636,,,,,,...,,ReconstructionThreshold_1.40,Task 1,20240209090027,Adversarial AE (0.75 Masked),4096,0,20240209090027,True,0.75
4,20240209090027,1.6,0.666667,0.25,0.363636,,,,,,...,,ReconstructionThreshold_1.60,Task 1,20240209090027,Adversarial AE (0.75 Masked),4096,0,20240209090027,True,0.75


In [6]:
# Group trials by (primary AE model, classifier) and flag groups whose
# hyper-parameters agree across all trials
hyper_params_to_check = [
    "max_iter",
    "n_estimators",
    "max_depth",
    "min_samples_leaf",
    "numEstimators",
    "C",
    "gamma",
    "kernel",
    "probability",
    "algorithm",
    "leaf_size",
    "n_neighbors",
    "n_bins",
    "sigma_base",
    "sigma_mult",
    "sigma",
]

hp_group_keys = ["Primary AE Model", "Processed Classifier"]

# Number of distinct values per hyper-parameter within each group
# (nunique ignores missing values by default).
data_uniqueness = data.groupby(hp_group_keys)[hyper_params_to_check].nunique()
# A group matches when no hyper-parameter takes more than one distinct value.
data_uniqueness["HP Match"] = (data_uniqueness <= 1).all(axis=1)

# Collapse to {(primary model, classifier): bool} for the row-wise lookup below.
data_uniqueness = data_uniqueness["HP Match"].to_dict()

data["HP Match"] = [
    data_uniqueness[primary_model, classifier_name]
    for primary_model, classifier_name in zip(data["Primary AE Model"], data["Processed Classifier"])
]

# The processed label becomes the classifier name from here on.
data = data.assign(Classifier=data["Processed Classifier"]).drop(columns=["Processed Classifier"])

report_columns = [
    "Dataset",
    "Type",
    "Input Size",
    "Primary AE Model",
    "AE",
    "Classifier",
    "Attempt",
    "HP Match",
    "precision",
    "recall",
    "f-score",
    "auc",
]
data = data[report_columns]

data.head()

Unnamed: 0,Dataset,Type,Input Size,Primary AE Model,AE,Classifier,Attempt,HP Match,precision,recall,f-score,auc
0,Task 1,Adversarial AE (0.75 Masked),4096,20240209090027,20240209090027,ReconstructionThreshold_0.80,0,True,0.368421,0.4375,0.4,
1,Task 1,Adversarial AE (0.75 Masked),4096,20240209090027,20240209090027,ReconstructionThreshold_1.00,0,True,0.5,0.3125,0.384615,
2,Task 1,Adversarial AE (0.75 Masked),4096,20240209090027,20240209090027,ReconstructionThreshold_1.20,0,True,0.666667,0.25,0.363636,
3,Task 1,Adversarial AE (0.75 Masked),4096,20240209090027,20240209090027,ReconstructionThreshold_1.40,0,True,0.666667,0.25,0.363636,
4,Task 1,Adversarial AE (0.75 Masked),4096,20240209090027,20240209090027,ReconstructionThreshold_1.60,0,True,0.666667,0.25,0.363636,


In [7]:
# Give the metric columns (precision, recall, f-score, auc) their display names
metric_display_names = {
    "precision": "Precision",
    "recall": "Recall",
    "f-score": "F1-Score",
    "auc": "Area Under Curve",
}
data = data.rename(columns=metric_display_names)

In [8]:
# Create final table
def row_max(df):
    """Return the first row of ``df`` whose "F1-Score" equals the column maximum.

    Ties are resolved by position: the earliest matching row wins. The result
    is a Series (one row), named after that row's index label.
    """
    f1_scores = df["F1-Score"]
    top_rows = df.loc[f1_scores.eq(f1_scores.max())]
    return top_rows.iloc[0]

if ALLOW_ONLY_HP_MATCHES:
    # Discard every row belonging to a group with inconsistent hyper-parameters.
    data = data.loc[data["HP Match"]]

group_keys = ["Dataset", "Type", "Input Size", "Primary AE Model", "Classifier"]
metric_columns = ["Precision", "Recall", "F1-Score", "Area Under Curve"]

data = data[["Dataset", "Type", "Input Size", "Primary AE Model", "AE", "Classifier"] + metric_columns]

# One group per (dataset, AE type, input size, primary AE model, classifier);
# its rows are the individual autoencoder trials.
trials = data.groupby(group_keys)
data_std = trials[metric_columns].std()
aes = trials["AE"].apply(lambda ae_names: ":".join("%d" % ae_name for ae_name in ae_names))
data = trials[metric_columns].mean()

# Attach the per-group spread and the list of contributing autoencoders to the means.
std_columns = [metric + " Standard Deviation" for metric in metric_columns]
data[std_columns] = data_std[metric_columns]
data["Autoencoders"] = aes
data["AE Count"] = [len(joined_names.split(":")) for joined_names in data["Autoencoders"].values]
print(data["AE Count"].max())

# Require a minimum number of trials per group.
has_enough_trials = data["AE Count"] >= TRIALS_REQUIRED
data = data[has_enough_trials]

# Drop groups with perfect mean recall but mean precision under 0.6
# (presumably classifiers that flag almost everything - confirm the 0.6 cut-off).
flags_everything = (data["Recall"] == 1) & (data["Precision"] < 0.6)
data = data[~flags_everything]

# Within each (dataset, input size, type, classifier) keep the primary model with the best mean F1.
data = data.groupby(["Dataset", "Input Size", "Type", "Classifier"]).apply(row_max)

# Human-readable summary: "F1 (precision, recall)".
formated_strings = [
    "%.4f (%.4f, %.4f)" % (f1_value, precision_value, recall_value)
    for precision_value, recall_value, f1_value in zip(data["Precision"], data["Recall"], data["F1-Score"])
]
data["Formated Results"] = formated_strings
data = data.sort_values(by=["Type", "Input Size", "F1-Score"], ascending=False)

data.head()

6


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Precision,Recall,F1-Score,Area Under Curve,Precision Standard Deviation,Recall Standard Deviation,F1-Score Standard Deviation,Area Under Curve Standard Deviation,Autoencoders,AE Count,Formated Results
Dataset,Input Size,Type,Classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Task 1,12288,Vanilla AE (0.75 Masked),ReconstructionThreshold_1.40,0.756566,0.4625,0.572233,,0.178076,0.071261,0.103908,,20240223074531:20240223074734:20240223074924:2...,5,"0.5722 (0.7566, 0.4625)"
Task 1,12288,Vanilla AE (0.75 Masked),ReconstructionThreshold_1.60,0.792439,0.45,0.571174,,0.148965,0.08149,0.096432,,20240223074531:20240223074734:20240223074924:2...,5,"0.5712 (0.7924, 0.4500)"
Task 1,12288,Vanilla AE (0.75 Masked),ReconstructionThreshold_1.20,0.659801,0.475,0.549435,,0.143219,0.071261,0.090882,,20240223074531:20240223074734:20240223074924:2...,5,"0.5494 (0.6598, 0.4750)"
Task 1,12288,Vanilla AE (0.75 Masked),GradientBoostingClassifier,0.527778,0.577778,0.528571,0.540741,0.240563,0.367171,0.258331,0.407542,20240223074531:20240223074924:20240223104052,3,"0.5286 (0.5278, 0.5778)"
Task 1,12288,Vanilla AE (0.75 Masked),ReconstructionThreshold_1.00,0.56548,0.475,0.513512,,0.12416,0.071261,0.087667,,20240223074531:20240223074734:20240223074924:2...,5,"0.5135 (0.5655, 0.4750)"


In [9]:
# Write the final table (index levels included) to disk
final_table_path = "Final LL Classifiers Table.csv"
data.to_csv(final_table_path)