In [1]:
# Set constants
# Label written to the "Dataset" column of every trial row.
DATASET = "Task 2 (10% Noise)"
# When True, the final table is built only from rows whose "HP Match" flag is True.
ALLOW_ONLY_HP_MATCHES = True
# Minimum "AE Count" an aggregated group needs in order to stay in the final table.
TRIALS_REQUIRED = 3
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
DIFFERENCE_THRESHS = [1.05, 1.1, 1.5, 3, 4.5]

In [2]:
# Load data
import pandas as pd

# Per-trial classifier results; later cells filter this frame and add columns to it.
data = pd.read_csv("latent_trials.csv")
# Preview the first rows as the cell output.
data.head()

Unnamed: 0,Classifier,Based on AE,sigma,precision,recall,f-score,n_estimators,auc,max_depth,min_samples_leaf,...,kernel,probability,contamination,n_neighbors,n_bins,max_iter,algorithm,leaf_size,sigma_base,sigma_mult
0,ReconstructionThreshold,20240210000000.0,0.8,0.0,0.0,0.0,,,,,...,,,,,,,,,,
1,ReconstructionThreshold,20240210000000.0,1.0,0.0,0.0,0.0,,,,,...,,,,,,,,,,
2,ReconstructionThreshold,20240210000000.0,1.2,0.0,0.0,0.0,,,,,...,,,,,,,,,,
3,ReconstructionThreshold,20240210000000.0,1.4,0.0,0.0,0.0,,,,,...,,,,,,,,,,
4,ReconstructionThreshold,20240210000000.0,1.6,0.0,0.0,0.0,,,,,...,,,,,,,,,,


In [3]:
# Add processed classifier column: ReconstructionThreshold rows get their sigma
# (two decimals) appended, so every sigma yields its own classifier label.
data["Processed Classifier"] = [
    classifier + "_%.2f" % sigma if classifier == "ReconstructionThreshold" else classifier
    for classifier, sigma in zip(data["Classifier"], data["sigma"])
]
data = data.drop(columns=["Classifier"])

# Tag every row with the dataset label (a scalar is broadcast to all rows).
data["Dataset"] = DATASET

In [4]:
# Add AE data

# keep_default_na=False leaves blank cells as empty strings, which the
# truthiness test on `leader` below relies on.
ae_data = pd.read_csv("ae_trials.csv", keep_default_na=False)[
    ["name", "type", "input_size", "attempt", "leader", "masking", "mask_size"]
]

# Primary model = the leader when one is recorded, otherwise the AE's own name.
ae_data["Primary Model"] = [
    "%d" % float(leader if leader else name)
    for name, leader in zip(ae_data["name"], ae_data["leader"])
]
ae_data = ae_data.drop(columns=["leader"])

# Masked autoencoders get the mask size appended to their type label.
ae_data["type"] = [
    ae_type + " (%.2f Masked)" % mask_size if is_masked else ae_type
    for ae_type, is_masked, mask_size in zip(ae_data["type"], ae_data["masking"], ae_data["mask_size"])
]

ae_data.head()

Unnamed: 0,name,type,input_size,attempt,masking,mask_size,Primary Model
0,20240209091837,Adversarial AE (0.75 Masked),4096,0,True,0.75,20240209091837
1,20240209091932,Vanilla AE (0.75 Masked),4096,0,True,0.75,20240209091837
2,20240209092014,Adversarial AE (0.75 Masked),4096,1,True,0.75,20240209091837
3,20240209092121,Vanilla AE (0.75 Masked),4096,1,True,0.75,20240209091837
4,20240209092202,Adversarial AE (0.75 Masked),4096,2,True,0.75,20240209091837


In [5]:
# Convert AE data to dict and add to ll data

# Lookup keyed by AE name. Each value is ordered to match the columns assigned
# below. It gets its own name so `ae_data` stays a DataFrame and this cell can
# be re-run without failing on `ae_data.values`.
ae_lookup = {
    name: [name, ae_type, input_size, attempt, primary, masking, mask_size]
    for name, ae_type, input_size, attempt, masking, mask_size, primary in ae_data.values
}

# Keep only trials whose autoencoder appears in the lookup. The explicit copy
# makes the column assignment below write to an independent frame rather than
# to a slice of the previous one.
data = data[[based_on in ae_lookup for based_on in data["Based on AE"].values]].copy()
data[["AE", "Type", "Input Size", "Attempt", "Primary AE Model", "Mask", "Mask Size"]] = [
    ae_lookup[based_on] for based_on in data["Based on AE"].values.tolist()
]

data.head()

Unnamed: 0,Based on AE,sigma,precision,recall,f-score,n_estimators,auc,max_depth,min_samples_leaf,numEstimators,...,sigma_mult,Processed Classifier,Dataset,AE,Type,Input Size,Attempt,Primary AE Model,Mask,Mask Size
0,20240210000000.0,0.8,0.0,0.0,0.0,,,,,,...,,ReconstructionThreshold_0.80,Task 2 (10% Noise),20240209091837,Adversarial AE (0.75 Masked),4096,0,20240209091837,True,0.75
1,20240210000000.0,1.0,0.0,0.0,0.0,,,,,,...,,ReconstructionThreshold_1.00,Task 2 (10% Noise),20240209091837,Adversarial AE (0.75 Masked),4096,0,20240209091837,True,0.75
2,20240210000000.0,1.2,0.0,0.0,0.0,,,,,,...,,ReconstructionThreshold_1.20,Task 2 (10% Noise),20240209091837,Adversarial AE (0.75 Masked),4096,0,20240209091837,True,0.75
3,20240210000000.0,1.4,0.0,0.0,0.0,,,,,,...,,ReconstructionThreshold_1.40,Task 2 (10% Noise),20240209091837,Adversarial AE (0.75 Masked),4096,0,20240209091837,True,0.75
4,20240210000000.0,1.6,0.0,0.0,0.0,,,,,,...,,ReconstructionThreshold_1.60,Task 2 (10% Noise),20240209091837,Adversarial AE (0.75 Masked),4096,0,20240209091837,True,0.75


In [6]:
# Create groups and check if hyper-params are the same
hyper_params_to_check = [
    "max_iter",
    "n_estimators",
    "max_depth",
    "min_samples_leaf",
    "numEstimators",
    "C",
    "gamma",
    "kernel",
    "probability",
    "algorithm",
    "leaf_size",
    "n_neighbors",
    "n_bins",
    "sigma_base",
    "sigma_mult",
    "sigma",
]

# Count the distinct values of every hyper-parameter within each
# (primary AE, classifier) group.
# NOTE(review): nunique() skips NaN by default, so a group mixing NaN with a
# single real value is still treated as matching - confirm that is intended.
hp_group_keys = ["Primary AE Model", "Processed Classifier"]
hp_unique_counts = (
    data[hp_group_keys + hyper_params_to_check]
    .groupby(hp_group_keys, group_keys=False)
    .nunique()
)

# A group matches when no hyper-parameter takes more than one distinct value.
# Result: {(primary AE, classifier): bool}
data_uniqueness = (hp_unique_counts <= 1).all(axis=1).to_dict()

# Broadcast the per-group flag back onto every trial row.
data["HP Match"] = [
    data_uniqueness[group]
    for group in zip(data["Primary AE Model"], data["Processed Classifier"])
]

# From here on the processed label is the classifier name.
data["Classifier"] = data["Processed Classifier"]
data = data.drop(columns=["Processed Classifier"])

data = data[[
    "Dataset",
    "Type",
    "Input Size",
    "Primary AE Model",
    "AE",
    "Classifier",
    "Attempt",
    "HP Match",
    "precision",
    "recall",
    "f-score",
    "auc",
]]

data.head()

Unnamed: 0,Dataset,Type,Input Size,Primary AE Model,AE,Classifier,Attempt,HP Match,precision,recall,f-score,auc
0,Task 2 (10% Noise),Adversarial AE (0.75 Masked),4096,20240209091837,20240209091837,ReconstructionThreshold_0.80,0,True,0.0,0.0,0.0,
1,Task 2 (10% Noise),Adversarial AE (0.75 Masked),4096,20240209091837,20240209091837,ReconstructionThreshold_1.00,0,True,0.0,0.0,0.0,
2,Task 2 (10% Noise),Adversarial AE (0.75 Masked),4096,20240209091837,20240209091837,ReconstructionThreshold_1.20,0,True,0.0,0.0,0.0,
3,Task 2 (10% Noise),Adversarial AE (0.75 Masked),4096,20240209091837,20240209091837,ReconstructionThreshold_1.40,0,True,0.0,0.0,0.0,
4,Task 2 (10% Noise),Adversarial AE (0.75 Masked),4096,20240209091837,20240209091837,ReconstructionThreshold_1.60,0,True,0.0,0.0,0.0,


In [7]:
# Rename the metric columns (p, r, f1 and auc) to their display names
data = data.rename(
    columns={
        "precision": "Precision",
        "recall": "Recall",
        "f-score": "F1-Score",
        "auc": "Area Under Curve",
    }
)

In [8]:
# Create final table
def row_max(df):
    """Return the row of ``df`` with the highest "F1-Score".

    Ties are resolved in favour of the first such row in ``df``'s order.
    NaN scores are ignored when looking for the maximum. If every score is
    NaN, the first row is returned, so one unscored group does not abort a
    whole ``groupby(...).apply(row_max)`` with an IndexError.

    Parameters:
        df: DataFrame with an "F1-Score" column and at least one row.

    Returns:
        The selected row as a Series.

    Raises:
        IndexError: if ``df`` has no rows.
    """
    f1_scores = df["F1-Score"]
    if f1_scores.isna().all():
        # Nothing to rank: `== max` would select no rows, so fall back to row 0.
        return df.iloc[0]
    return df[f1_scores == f1_scores.max()].iloc[0]

if ALLOW_ONLY_HP_MATCHES:
    # Keep only trials whose "HP Match" flag is True.
    data = data[data["HP Match"]]

group_cols = ["Dataset", "Type", "Input Size", "Primary AE Model", "Classifier"]
metric_cols = ["Precision", "Recall", "F1-Score", "Area Under Curve"]

data = data[["Dataset", "Type", "Input Size", "Primary AE Model", "AE", "Classifier"] + metric_cols]

# Group the trials once and derive every aggregate from the same grouping.
trials = data.groupby(group_cols)
data_std = trials[metric_cols].std()
aes = trials["AE"].apply(lambda x: ":".join(["%d" % i for i in x]))
# Rows per group, taken directly from the grouping instead of re-splitting the
# joined AE string.
# NOTE(review): this counts rows, not distinct AEs - confirm each AE contributes
# exactly one row per group.
ae_counts = trials.size()
data = trials[metric_cols].mean()

data[[metric + " Standard Deviation" for metric in metric_cols]] = data_std[metric_cols]
data["Autoencoders"] = aes
data["AE Count"] = ae_counts
# Sanity check: largest number of trials behind any single group.
print(data["AE Count"].max())
data = data[data["AE Count"] >= TRIALS_REQUIRED]

# Drop groups with perfect mean recall but mean precision below 0.6.
# NOTE(review): presumably these are classifiers that flag almost everything -
# confirm the 0.6 cut-off.
data = data[~((data["Recall"] == 1) & (data["Precision"] < 0.6))]

# One row per (dataset, input size, type, classifier): the primary-AE family
# with the best mean F1-Score.
data = data.groupby(["Dataset", "Input Size", "Type", "Classifier"]).apply(row_max)

# Human-readable "F1 (precision, recall)" summary column.
formated_strings = ["%.4f (%.4f, %.4f)" % (f, p, r) for p, r, f in data[["Precision", "Recall", "F1-Score"]].values]
data["Formated Results"] = formated_strings
data = data.sort_values(["Type", "Input Size", "F1-Score"], ascending=False)

data.head()

6


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Precision,Recall,F1-Score,Area Under Curve,Precision Standard Deviation,Recall Standard Deviation,F1-Score Standard Deviation,Area Under Curve Standard Deviation,Autoencoders,AE Count,Formated Results
Dataset,Input Size,Type,Classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Task 2 (10% Noise),12288,Vanilla AE (0.75 Masked),SVC,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,20240219011356:20240219011549:20240219011752:2...,5,"1.0000 (1.0000, 1.0000)"
Task 2 (10% Noise),12288,Vanilla AE (0.75 Masked),GradientBoostingClassifier,0.9,1.0,0.9375,1.0,0.2,0.0,0.125,0.0,20240221230511:20240221230704:20240221230908:2...,4,"0.9375 (0.9000, 1.0000)"
Task 2 (10% Noise),12288,Vanilla AE (0.75 Masked),GMM,0.833333,1.0,0.90303,0.5,0.166667,0.0,0.100138,0.0,20240222182802:20240222183026:20240222183249,3,"0.9030 (0.8333, 1.0000)"
Task 2 (10% Noise),12288,Vanilla AE (0.75 Masked),XGBClassifier,0.875,0.9375,0.901786,0.854167,0.144338,0.125,0.121551,0.171796,20240221211209:20240221211556:20240221211747:2...,4,"0.9018 (0.8750, 0.9375)"
Task 2 (10% Noise),12288,Vanilla AE (0.75 Masked),LogisticRegression,0.805556,1.0,0.885714,0.638889,0.173472,0.0,0.103016,0.196419,20240221230511:20240221230704:20240221231256,3,"0.8857 (0.8056, 1.0000)"


In [9]:
# Save final table

# Writes the aggregated table to a CSV in the current working directory.
# to_csv keeps its default of writing the index, so the index levels shown in
# the preview (Dataset, Input Size, Type, Classifier) are saved too.
data.to_csv("Final LL Classifiers Table.csv")