In [1]:
# Set constants
# Label written into the "Dataset" column of every row of the results table.
DATASET = "Task 2 (50% Noise)"
# When True, only rows whose "HP Match" flag is True are kept for the final table.
ALLOW_ONLY_HP_MATCHES = True
# Minimum "AE Count" (autoencoders aggregated into a group) for a group to be kept.
TRIALS_REQUIRED = 3
# NOTE(review): not referenced by any cell visible in this notebook - confirm whether it is still needed.
DIFFERENCE_THRESHS = [1.05, 1.1, 1.5, 3, 4.5]

In [2]:
# Load data
# Reads the classifier trial results (metric columns plus classifier
# hyper-parameter columns) and previews the first rows.
import pandas as pd

data = pd.read_csv("latent_trials.csv")
data.head()

Unnamed: 0,Classifier,Based on AE,sigma,precision,recall,f-score,max_iter,auc,n_estimators,max_depth,...,gamma,kernel,probability,algorithm,leaf_size,n_neighbors,sigma_base,sigma_mult,contamination,n_bins
0,ReconstructionThreshold,20240209103156,0.8,0.517241,0.9375,0.666667,,,,,...,,,,,,,,,,
1,ReconstructionThreshold,20240209103156,1.0,0.481481,0.8125,0.604651,,,,,...,,,,,,,,,,
2,ReconstructionThreshold,20240209103156,1.2,0.416667,0.625,0.5,,,,,...,,,,,,,,,,
3,ReconstructionThreshold,20240209103156,1.4,0.263158,0.3125,0.285714,,,,,...,,,,,,,,,,
4,ReconstructionThreshold,20240209103156,1.6,0.176471,0.1875,0.181818,,,,,...,,,,,,,,,,


In [3]:
# Add processed classifier column.
# ReconstructionThreshold rows get their sigma appended (for example
# "ReconstructionThreshold_0.80"); every other classifier keeps its plain name.
data["Processed Classifier"] = [
    clf_name + "_%.2f" % sigma if clf_name == "ReconstructionThreshold" else clf_name
    for clf_name, sigma in zip(data["Classifier"], data["sigma"])
]
# Alternative that leaves the classifier names untouched:
# data["Processed Classifier"] = data["Classifier"]
data = data.drop(columns=["Classifier"])

# Tag every row with the dataset label (a scalar is broadcast to all rows).
data["Dataset"] = DATASET

In [4]:
# Add AE data

# keep_default_na=False makes pandas keep empty cells as "" instead of NaN, so an
# empty "leader" is falsy in the check below.
ae_data = pd.read_csv("ae_trials.csv", keep_default_na=False)
# Take an explicit copy of the column subset: adding a column to the bare
# selection would trigger pandas' SettingWithCopyWarning.
ae_data = ae_data[["name", "type", "input_size", "attempt", "leader", "masking", "mask_size"]].copy()

# Primary model: the leader's name when one is set, otherwise the AE's own name,
# rendered as an integer string.
ae_data["Primary Model"] = [
    "%d" % float(leader if leader else name)
    for name, leader in ae_data[["name", "leader"]].values
]
ae_data = ae_data.drop(columns=["leader"])

# Append the mask size to the type label of masked AEs, e.g. "Vanilla AE (0.75 Masked)".
# The loop variable is named ae_type so the builtin `type` is not shadowed.
ae_data["type"] = [
    ae_type + " (%.2f Masked)" % mask_size if masking else ae_type
    for ae_type, masking, mask_size in ae_data[["type", "masking", "mask_size"]].values
]

ae_data.head()

Unnamed: 0,name,type,input_size,attempt,masking,mask_size,Primary Model
0,20240209103156,Adversarial AE,4096,0,False,0.5,20240209103156
1,20240209103238,Vanilla AE,4096,0,False,0.5,20240209103156
2,20240209103321,Adversarial AE,4096,1,False,0.5,20240209103156
3,20240209103401,Vanilla AE,4096,1,False,0.5,20240209103156
4,20240209103446,Adversarial AE,4096,2,False,0.5,20240209103156


In [5]:
# Convert AE data to dict and add to ll data

# Map each AE name to the list of values that will be attached to its rows.
# NOTE: this rebinds `ae_data` from a DataFrame to a dict, so this cell cannot be
# re-run without first re-running the cell that loads ae_trials.csv.
ae_data = {
    name: [name, ae_type, input_size, attempt, primary, masking, mask_size]
    for name, ae_type, input_size, attempt, masking, mask_size, primary in ae_data.values
}

# Keep only rows whose AE is known. The explicit copy makes `data` an independent
# frame, so the column assignment below does not raise SettingWithCopyWarning.
has_known_ae = [based_on in ae_data for based_on in data["Based on AE"].values]
data = data[has_known_ae].copy()

# Attach the AE attributes; the order matches the lists built above.
data[["AE", "Type", "Input Size", "Attempt", "Primary AE Model", "Mask", "Mask Size"]] = [
    ae_data[based_on] for based_on in data["Based on AE"].values.tolist()
]

data.head()

Unnamed: 0,Based on AE,sigma,precision,recall,f-score,max_iter,auc,n_estimators,max_depth,min_samples_leaf,...,n_bins,Processed Classifier,Dataset,AE,Type,Input Size,Attempt,Primary AE Model,Mask,Mask Size
0,20240209103156,0.8,0.517241,0.9375,0.666667,,,,,,...,,ReconstructionThreshold_0.80,Task 2 (50% Noise),20240209103156,Adversarial AE,4096,0,20240209103156,False,0.5
1,20240209103156,1.0,0.481481,0.8125,0.604651,,,,,,...,,ReconstructionThreshold_1.00,Task 2 (50% Noise),20240209103156,Adversarial AE,4096,0,20240209103156,False,0.5
2,20240209103156,1.2,0.416667,0.625,0.5,,,,,,...,,ReconstructionThreshold_1.20,Task 2 (50% Noise),20240209103156,Adversarial AE,4096,0,20240209103156,False,0.5
3,20240209103156,1.4,0.263158,0.3125,0.285714,,,,,,...,,ReconstructionThreshold_1.40,Task 2 (50% Noise),20240209103156,Adversarial AE,4096,0,20240209103156,False,0.5
4,20240209103156,1.6,0.176471,0.1875,0.181818,,,,,,...,,ReconstructionThreshold_1.60,Task 2 (50% Noise),20240209103156,Adversarial AE,4096,0,20240209103156,False,0.5


In [6]:
# Group rows by (primary AE model, classifier) and flag the groups in which every
# hyper-parameter takes at most one distinct value.
hyper_params_to_check = [
    "max_iter",
    "n_estimators",
    "max_depth",
    "min_samples_leaf",
    "numEstimators",
    "C",
    "gamma",
    "kernel",
    "probability",
    "algorithm",
    "leaf_size",
    "n_neighbors",
    "n_bins",
    "sigma_base",
    "sigma_mult",
    "sigma",
]

hp_group_keys = ["Primary AE Model", "Processed Classifier"]

# Number of distinct values of each hyper-parameter within each group.
data_uniqueness = (
    data[hp_group_keys + hyper_params_to_check]
    .groupby(hp_group_keys, group_keys=False)
    .nunique()
)
# A group matches when no hyper-parameter has more than one distinct value.
data_uniqueness["HP Match"] = (data_uniqueness <= 1).all(axis=1)

# Lookup from (primary AE model, classifier) to the group's match flag.
data_uniqueness = dict(zip(data_uniqueness.index.values, data_uniqueness["HP Match"].values))

data["HP Match"] = [
    data_uniqueness[(primary_ae, classifier)]
    for primary_ae, classifier in zip(data["Primary AE Model"], data["Processed Classifier"])
]
# The processed name becomes the classifier name from here on.
data = data.rename(columns={"Processed Classifier": "Classifier"})

data = data[["Dataset", "Type", "Input Size", "Primary AE Model", "AE", "Classifier", "Attempt", "HP Match", "precision", "recall", "f-score", "auc"]]

data.head()

Unnamed: 0,Dataset,Type,Input Size,Primary AE Model,AE,Classifier,Attempt,HP Match,precision,recall,f-score,auc
0,Task 2 (50% Noise),Adversarial AE,4096,20240209103156,20240209103156,ReconstructionThreshold_0.80,0,True,0.517241,0.9375,0.666667,
1,Task 2 (50% Noise),Adversarial AE,4096,20240209103156,20240209103156,ReconstructionThreshold_1.00,0,True,0.481481,0.8125,0.604651,
2,Task 2 (50% Noise),Adversarial AE,4096,20240209103156,20240209103156,ReconstructionThreshold_1.20,0,True,0.416667,0.625,0.5,
3,Task 2 (50% Noise),Adversarial AE,4096,20240209103156,20240209103156,ReconstructionThreshold_1.40,0,True,0.263158,0.3125,0.285714,
4,Task 2 (50% Noise),Adversarial AE,4096,20240209103156,20240209103156,ReconstructionThreshold_1.60,0,True,0.176471,0.1875,0.181818,


In [7]:
# Give the metric columns their presentation names.
metric_display_names = {
    "precision": "Precision",
    "recall": "Recall",
    "f-score": "F1-Score",
    "auc": "Area Under Curve",
}
data = data.rename(columns=metric_display_names)

In [8]:
# Create final table
def row_max(df):
    """Return the first row of ``df`` whose "F1-Score" equals the column maximum.

    Ties are resolved in favour of the earliest row. The row is returned as a
    Series.
    """
    f1_scores = df["F1-Score"]
    is_best = f1_scores == f1_scores.max()
    return df[is_best].iloc[0]

# Optionally keep only rows whose group shared a single hyper-parameter setting.
if ALLOW_ONLY_HP_MATCHES:
    data = data[data["HP Match"]]

group_cols = ["Dataset", "Type", "Input Size", "Primary AE Model", "Classifier"]
metric_cols = ["Precision", "Recall", "F1-Score", "Area Under Curve"]

data = data[["Dataset", "Type", "Input Size", "Primary AE Model", "AE", "Classifier"] + metric_cols]

# One group per (dataset, AE type, input size, primary AE model, classifier).
grouped_trials = data.groupby(group_cols)
data_std = grouped_trials[metric_cols].std()
# Colon-separated list of the AEs that contributed to each group.
aes = grouped_trials["AE"].apply(lambda ae_ids: ":".join(["%d" % ae_id for ae_id in ae_ids]))
data = grouped_trials[metric_cols].mean()

data[["Precision Standard Deviation", "Recall Standard Deviation", "F1-Score Standard Deviation", "Area Under Curve Standard Deviation"]] = data_std[metric_cols]
data["Autoencoders"] = aes
# Number of AEs in the group = number of ":" separators + 1.
data["AE Count"] = [joined.count(":") + 1 for joined in data["Autoencoders"].values]
print(data["AE Count"].max())
data = data[data["AE Count"] >= TRIALS_REQUIRED]

# Drop groups that reach a mean recall of exactly 1 while mean precision is below 0.6.
perfect_recall = data["Recall"] == 1
low_precision = data["Precision"] < 0.6
data = data[~(perfect_recall & low_precision)]

# For every (dataset, input size, type, classifier) keep the row with the best F1-Score.
data = data.groupby(["Dataset", "Input Size", "Type", "Classifier"]).apply(row_max)

# Human-readable summary in the form "F1 (precision, recall)".
formated_strings = [
    "%.4f (%.4f, %.4f)" % (f1, precision, recall)
    for precision, recall, f1 in zip(data["Precision"], data["Recall"], data["F1-Score"])
]
data["Formated Results"] = formated_strings
data = data.sort_values(["Type", "Input Size", "F1-Score"], ascending=False)

data.head()

6


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Precision,Recall,F1-Score,Area Under Curve,Precision Standard Deviation,Recall Standard Deviation,F1-Score Standard Deviation,Area Under Curve Standard Deviation,Autoencoders,AE Count,Formated Results
Dataset,Input Size,Type,Classifier,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Task 2 (50% Noise),12288,Vanilla AE (0.75 Masked),SVC,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,20240228115759:20240228120005:20240228120231:2...,5,"1.0000 (1.0000, 1.0000)"
Task 2 (50% Noise),12288,Vanilla AE (0.75 Masked),GMM,0.833333,1.0,0.90303,0.5,0.166667,0.0,0.100138,0.0,20240228133608:20240228133818:20240228134500,3,"0.9030 (0.8333, 1.0000)"
Task 2 (50% Noise),12288,Vanilla AE (0.75 Masked),LogisticRegression,0.888889,0.888889,0.866667,0.6875,0.19245,0.19245,0.11547,0.441942,20240228133608:20240228133818:20240228134500,3,"0.8667 (0.8889, 0.8889)"
Task 2 (50% Noise),12288,Vanilla AE (0.75 Masked),XGBClassifier,0.7625,0.916667,0.829365,0.763889,0.205649,0.166667,0.18254,0.205537,20240228152834:20240228153240:20240228153452:2...,4,"0.8294 (0.7625, 0.9167)"
Task 2 (50% Noise),12288,Vanilla AE (0.75 Masked),GradientBoostingClassifier,0.75,0.9,0.811111,0.768056,0.165831,0.136931,0.132346,0.24363,20240228115759:20240228120005:20240228120231:2...,5,"0.8111 (0.7500, 0.9000)"


In [9]:
# Save final table
# Writes the aggregated table to CSV; to_csv includes the index by default, so
# the grouping keys held in the index are written as well.

data.to_csv("Final LL Classifiers Table.csv")