In [None]:
import comet_ml as comet

import IPython
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from typing import Dict, List, Any, DefaultDict
import numpy as np
from collections import defaultdict
from dotenv import load_dotenv

# Load environment variables from a local `.env` file, if one exists.
# NOTE(review): presumably this is where the Comet credentials come from — confirm.
load_dotenv()
# Shared Comet API client; `load_workspace_data` queries experiments through it.
API = comet.API()

### General Setup

**NOTE:** This notebook is expected to be run **inside VS Code**, because the path resolution for `task_configs` relies on the `__vsc_ipynb_file__` variable, which is looked up in the kernel's local variables. If the notebook is run outside VS Code, please adjust the following lines:

```py
notebook_name = "/".join(
    IPython.extract_module_locals()[1]["__vsc_ipynb_file__"].split("/")[-5:]
)
```

The cell below defines the following configuration attributes, which may need to be adjusted when the experimental design changes:

- `metrics`: The keys of this dictionary are the metric names exactly as they are displayed on `comet`; the values are the Python types to which the fetched metric values are converted.
  
- `parameters`: General information about the experimental setup that was passed as arguments when an experiment was executed; the values are the Python types to which the fetched parameter values are converted.
  
- `task_names`: The names of the data sets on which the outlier detection strategies were trained.
  
- `task_configs`: Maps each task name to the path of its configuration file.

In [None]:
# Every filter strategy logs the same eight metrics on Comet, named
# "<filter>_<metric>". Generating the keys keeps the six groups in sync and in
# one consistent order instead of maintaining 48 hand-written entries.
_METRIC_FILTERS = (
    "AutoFilter_Chen_Like",
    "HDBScanFilter",
    "IsolationForestFilter",
    "LocalOutlierFactorFilter",
    "LoserFilter_Plain",
    "SingleStepEntropy_SimplePseudo",
)
_METRIC_SUFFIXES = (
    "HTL Count",
    "avg_duration",
    "avgF1 (No HTL)",
    "avgF1 (With HTL)",
    "avgF1 (random replacement)",
    "medF1 (No HTL)",
    "medF1 (With HTL)",
    "medF1 (random replacement)",
)

# Metric name (as displayed on Comet) -> type the fetched value is cast to in
# `convert_dataframe_types`.
metrics = {
    f"{filter_name}_{suffix}": float
    for filter_name in _METRIC_FILTERS
    for suffix in _METRIC_SUFFIXES
}
# Experiment parameters to fetch from Comet, mapped to the type each fetched
# value is cast to in `convert_dataframe_types`.
parameters = dict(
    strategy_name=str,
    filter_strategy_name=str,
    seed=int,
    task=str,
)

# Tasks (data sets) to evaluate; uncomment an entry to include it.
# Every entry ends with a comma so that uncommenting a line can never merge two
# adjacent string literals into one name through implicit concatenation.
task_names = [
    "ag-news",
    "banking77",
    # "dbpedia",
    # "fnc_one",
    # "imdb",
    # "mnli",
    # "qnli",
    # "rotten-tomatoes",
    # "sst2",
    # "trec-coarse",
    # "trec",
    # "wiki-talk",
    # "yelp",
]

# Each task name is prefixed with `version`; the prefixed name (e.g. "xag-news")
# is what gets passed to `load_data` as the Comet project name.
version = "x"
task_names = [version + t for t in task_names]

# This gets the location of the Notebook, needs VSCode to be executed correctly
# `__vsc_ipynb_file__` is read from the locals returned by IPython; if that entry
# is absent (e.g. when not running under VS Code) the lookup fails.
# Only the last five "/"-separated components of the path are kept, so
# `notebook_name` is a relative path.
# NOTE(review): splitting on "/" assumes POSIX-style separators — confirm on Windows.
notebook_name = "/".join(
    IPython.extract_module_locals()[1]["__vsc_ipynb_file__"].split("/")[-5:]
)

# Directory containing the notebook (relative, derived from `notebook_name`).
BASE_PATH = Path(notebook_name).parent
# Task configuration files are looked up in `Configs/Tasks` next to BASE_PATH.
CONFIGS_PATH = BASE_PATH.parent / 'Configs' / 'Tasks'

# Task name -> path of its JSON configuration file below CONFIGS_PATH.
# NOTE(review): the keys carry no `version` prefix and use "fnc1", while
# `task_names` is prefixed and lists "fnc_one" — confirm which spelling lookups use.
task_configs = {
    task_name: CONFIGS_PATH / config_file
    for task_name, config_file in {
        "ag-news": "ag_news.json",
        "banking77": "bank77.json",
        "dbpedia": "dbpedia.json",
        "fnc1": "fnc_one.json",
        "imdb": "imdb.json",
        "mnli": "mnli.json",
        "qnli": "qnli.json",
        "rotten-tomatoes": "rotten_tomatoes.json",
        "sst2": "sst2.json",
        "trec-coarse": "trec_coarse.json",
        "trec": "trec.json",
        "wiki-talk": "wiki_talk.json",
        "yelp": "yelp.json",
    }.items()
}

# How many different seeds do we expect per task?
seed_count = 10

# A one-element list whose single entry is all filter names joined by spaces.
# NOTE(review): because this is one string rather than six list items,
# `isin(filter_names)` only matches rows whose `filter` value equals the whole
# string — confirm that this is the intended format.
filter_names = [
    " ".join(
        (
            "HDBScanFilter",
            "LocalOutlierFactorFilter",
            "IsolationForestFilter",
            "SimpleDSM",
            "SemanticAE",
            "SimpleSS",
        )
    )
]

In [None]:
def extract_metric_value(metrics_used: List[Dict[str, Any]], metric_name: str) -> str | float:
    """
    Extracts the value of a specified metric from a list of metrics.

    Args:
        metrics_used (List[Dict[str, Any]]): A list of dictionaries containing metrics information.
        metric_name (str): The name of the metric to extract.

    Returns:
        str | float: The value of the specified metric.
    """
    metrics_dict = [entry for entry in metrics_used if entry.get("metricName") == metric_name]
    return metrics_dict[0]["metricValue"]

def extract_paremeter_value(parameters_used: List[Dict[str, Any]], parameter_name: str) -> str | float:
    """
    Extracts the current value of a specified parameter from a list of parameters.

    Args:
        parameters_used (List[Dict[str, Any]]): A list of dictionaries containing parameter information.
        parameter_name (str): The name of the parameter to extract.

    Returns:
        str | float: The current value of the specified parameter.
    """
    parameters_dict = [entry for entry in parameters_used if entry.get("name") == parameter_name]
    return parameters_dict[0]["valueCurrent"]


def load_experiment_data(experiment: comet.APIExperiment) -> DefaultDict[str, DefaultDict[str, DefaultDict[str, Dict[str, Any]]]] | None:
    """
    Loads and organizes the metrics, parameters, and assets of one experiment.

    Args:
        experiment (comet.APIExperiment): A Comet APIExperiment object containing experiment data.

    Returns:
        A nested dictionary `data[task][seed][section]` with the sections
        "metrics", "parameters" and "assets", or None when the experiment lacks
        any metric listed in `metrics` or any parameter listed in `parameters`.
    """
    def _is_missing(value: Any) -> bool:
        # A numeric 0 is a legitimate value (e.g. a count), so only None and
        # the empty string count as "not recorded" — not general falsiness.
        return value is None or value == ""

    experiment_parameters = experiment.get_parameters_summary()
    metrics_used = experiment.get_metrics()

    try:
        task = extract_paremeter_value(experiment_parameters, "task")
        seed = extract_paremeter_value(experiment_parameters, "seed")
        metrics_dict = {
            metric_name: extract_metric_value(metrics_used, metric_name)
            for metric_name in metrics
        }
        params_dict = {
            param_name: extract_paremeter_value(experiment_parameters, param_name)
            for param_name in parameters
        }
    except IndexError:
        # The extract_* helpers raise IndexError when no record matches the
        # requested name: the experiment is incomplete, so it is skipped.
        return None

    # Check the fetched values themselves. The original tested
    # `experiment_parameters` in the parameter loop, which could never be empty
    # at that point, so missing parameter values were never detected.
    fetched_values = [*metrics_dict.values(), *params_dict.values()]
    if any(_is_missing(value) for value in fetched_values):
        return None

    assets = download_assets(experiment, task, seed)

    data = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
    sections = data[task][seed]
    sections["metrics"].update(metrics_dict)
    sections["parameters"].update(params_dict)
    sections["assets"].update(assets)
    return data

def convert_dataframe_types(df: pd.DataFrame) -> pd.DataFrame:
    """
    Casts every column that is listed in `metrics` or `parameters` to its configured type.

    Columns found in neither mapping are left untouched. The frame is modified
    in place and also returned.

    Args:
        df (pd.DataFrame): The DataFrame whose columns are cast.

    Returns:
        pd.DataFrame: The same DataFrame object, with the known columns cast.
    """
    for column in df.columns:
        # `metrics` takes precedence; `parameters` is the fallback.
        target_type = metrics.get(column, parameters.get(column))
        if target_type is not None:
            df[column] = df[column].astype(target_type)

    return df

def create_data_frames(experiment_data: List[DefaultDict[str, DefaultDict[str, DefaultDict[str, Dict[str, Any]]]]]) -> List[DefaultDict[str, DefaultDict[str, DefaultDict[str, pd.DataFrame]]]]:
    """
    Replaces every section dictionary of the experiment data with a typed one-row DataFrame.

    The nested dictionaries are modified in place: each
    `entry[task][seed][section]` mapping becomes a DataFrame built from that
    single mapping and passed through `convert_dataframe_types`.

    Args:
        experiment_data: A list of nested dictionaries (task -> seed -> section -> values).

    Returns:
        The same list object, whose sections now hold pd.DataFrames.
    """
    for entry in experiment_data:
        for seed_dict in entry.values():
            for section_dict in seed_dict.values():
                # Snapshot the keys: the values are swapped while walking the dict.
                for section in list(section_dict):
                    raw_values = section_dict[section]
                    frame = pd.DataFrame.from_dict([raw_values])
                    section_dict[section] = convert_dataframe_types(df=frame)

    return experiment_data

def load_workspace_data(project_name: str, workspace: str = "outlier-detection") -> List[DefaultDict[str, DefaultDict[str, DefaultDict[str, pd.DataFrame]]]]:
    """
    Loads the data of every usable experiment of a Comet project.

    Args:
        project_name (str): The name of the project to load data from.
        workspace (str): The Comet workspace that contains the project.

    Returns:
        A list with one nested dictionary (task -> seed -> section -> pd.DataFrame)
        per experiment that could be loaded. Experiments for which
        `load_experiment_data` returns None or raises are left out.
    """
    # `or []` guards against a falsy result from the API.
    # NOTE(review): confirm what `API.get` returns for an unknown project.
    experiments = API.get(workspace=workspace, project_name=project_name) or []

    experiment_data = []
    for exp in experiments:
        try:
            loaded_data = load_experiment_data(exp)
        except Exception as err:
            # Best effort: one broken experiment must not abort the whole project.
            # Catching `Exception` (not a bare `except`) lets KeyboardInterrupt
            # through, and the message records why the experiment was skipped.
            print(f"Skipping experiment {getattr(exp, 'id', exp)!r}: {err!r}")
            continue
        if loaded_data is not None:
            experiment_data.append(loaded_data)

    return create_data_frames(experiment_data=experiment_data)


def download_assets(experiment, task: str, seed: str) -> Dict[str, np.ndarray]:
    """
    Fetches the assets of an experiment, caches them on disk, and loads them with NumPy.

    Assets whose file name contains "durations" or ends with ".py" are skipped.
    Every remaining asset is written to `./cache/assets/<task>/<seed>/<fileName>`
    and then read back with `np.load`.

    Args:
        experiment: Object providing `get_asset_list()` and `get_asset(asset_id)`.
        task (str): The task name, used as a cache sub-directory.
        seed (str): The seed value, used as a cache sub-directory.

    Returns:
        Dict[str, np.ndarray]: The loaded assets, keyed by the file name without its last four characters.
    """
    wanted = [
        (entry["fileName"], entry["assetId"])
        for entry in experiment.get_asset_list()
        if not ("durations" in entry["fileName"] or entry["fileName"].endswith(".py"))
    ]

    loaded = {}
    for file_name, asset_id in wanted:
        payload = experiment.get_asset(asset_id)
        target = Path(f"./cache/assets/{task}/{seed}/{file_name}")
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(payload)
        # NOTE(review): `[:-4]` assumes a four-character extension such as ".npy" — confirm.
        loaded[file_name[:-4]] = np.load(target)
    return loaded


# df = load_workspace_data("xag-news")

In [None]:
def collect_all_seeds(data: List[DefaultDict[str, DefaultDict[str, DefaultDict[str, pd.DataFrame]]]], task_name: str):
    """
    Collects the first seed key stored under `task_name` in every experiment entry.

    Args:
        data: A list of nested dictionaries (task -> seed -> section).
        task_name (str): The task whose seeds are collected.

    Returns:
        np.ndarray: One seed per entry, converted to an integer array.
    """
    first_seeds = [next(iter(entry[task_name])) for entry in data]
    return np.array(first_seeds, dtype=int)

def get_filter_strategy_name(group: List[DefaultDict[str, DefaultDict[str, DefaultDict[str, pd.DataFrame]]]], task_name: str):
    """
    Reads the filter strategy name from the first experiment of a group.

    Only the first entry and its first seed are inspected.

    Args:
        group: A list of nested dictionaries (task -> seed -> section -> DataFrame).
        task_name (str): The task to look up in the first entry.

    Returns:
        The first value of the "filter_strategy_name" column of that experiment's "parameters" frame.
    """
    first_entry = group[0]
    first_seed = next(iter(first_entry[task_name]))
    parameters_frame = first_entry[task_name][first_seed]["parameters"]
    return parameters_frame["filter_strategy_name"].iat[0]

def process_group(group: List[DefaultDict[str, DefaultDict[str, DefaultDict[str, pd.DataFrame]]]], task_name: str, workspace_name: str, first_seed: int = 42):
    """
    Lists the expected seeds for which the group contains no experiment.

    The expected seeds are `first_seed, ..., first_seed + seed_count - 1`.

    Args:
        group: Loaded experiments of one project (task -> seed -> section -> DataFrame).
        task_name (str): The task the experiments belong to.
        workspace_name (str): Unused; kept so existing callers keep working.
        first_seed (int): The smallest expected seed.

    Returns:
        pd.DataFrame: One row per missing seed with the columns "seed", "filter"
        and "task". The columns exist even when no seed is missing, so callers
        can always select them.
    """
    seeds = collect_all_seeds(group, task_name=task_name)
    expected_seeds = np.arange(first_seed, first_seed + seed_count)
    missing_seeds = np.setdiff1d(expected_seeds, seeds)
    print(len(missing_seeds))

    # All experiments of a group are assumed to share one filter strategy, so the
    # name is read from the first experiment only.
    filter_strategy = get_filter_strategy_name(group, task_name)
    records = [
        {"seed": seed, "filter": filter_strategy, "task": task_name}
        for seed in missing_seeds
    ]

    # Explicit columns and dtype keep the schema stable for an empty result;
    # without them an empty frame has no columns and a later `["filter"]` fails.
    return pd.DataFrame(records, columns=["seed", "filter", "task"]).astype({"seed": "int64"})

def load_data(workspace_name: str):
    """
    Loads all experiments of a project and reports its missing seeds.

    Args:
        workspace_name (str): Passed to `load_workspace_data` as the project name.

    Returns:
        The result of `process_group` for the loaded experiments.
    """
    experiments = load_workspace_data(workspace_name)
    # Every entry is expected to hold the same task name, so the first one is used.
    first_experiment = experiments[0]
    task_name = next(iter(first_experiment))
    return process_group(experiments, task_name, workspace_name)

# load_data("xag-news")

In [None]:
# One DataFrame of missing (seed, filter, task) rows per task that could be loaded.
# NOTE(review): later cells index `results` by task name (`results[task]["f1s"]`),
# which does not match this list — confirm which object those cells expect.
results = []
for task in tqdm(task_names):
    try:
        results.append(load_data(task))
    except Exception as err:
        # Best effort: a task that cannot be loaded is reported and skipped.
        # `Exception` (not a bare `except`) keeps Ctrl-C working, and printing the
        # error shows why the task is reported as missing.
        print(f"{task}: Missing ({err!r})")

_missing_columns = ["seed", "filter", "task"]
if results:
    # `reindex` guarantees the expected columns even if every frame was empty.
    missing_experiments = pd.concat(results).reindex(columns=_missing_columns)
else:
    # `pd.concat` raises on an empty list; fall back to an empty table instead.
    missing_experiments = pd.DataFrame(columns=_missing_columns)

missing_experiments = missing_experiments[missing_experiments["filter"].isin(filter_names)]
missing_experiments.to_csv("missing_std_experiments.csv")
missing_experiments # 941, 856, 848, 762, 747, (856, 715, 650, 607, 516, 503, 384, 370, 322, 290, 66)

In [None]:
import seaborn as sns

def add_boxplots(results, filter, l:list):
    """
    Appends three pooled value lists to `l`: "NoneR", the given filter, and "NoneE".

    For each of the three keys, the values in `results[task]["f1s"][key]` of all
    tasks in the global `task_names` are concatenated into one list. Nothing is
    drawn here; the caller plots the collected lists.
    NOTE(review): "NoneR" / "NoneE" are presumably baseline runs — confirm.

    Args:
        results: Mapping task -> {"f1s": {key: iterable of values}}.
        filter: Key of the filter whose values are placed between the two "None*" entries.
        l (list): Output list; extended in place by three entries.
    """
    for series_key in ("NoneR", filter, "NoneE"):
        pooled = []
        for task in task_names:
            pooled.extend(results[task]["f1s"][series_key])
        l.append(pooled)
        


# One list of values per box, appended in plotting order by `add_boxplots`.
l = []
# Filters shown in the figure, from left to right; commented-out entries are skipped.
filter_names_ = [
    # "AutoFilter_LSTM_SIMPLE",
    # "AutoFilter_LSTM",
    "AutoFilter_Chen_Like",
    "LoserFilter_Plain",
    # "LoserFilter_Optimized_Pseudo_Labels",
    # "LoserFilter_SSL_Variety",
    # "TeachingFilter",
    # "TeachingFilter_WOW",
    # "TeachingFilter_Smooth",
    "SingleStepEntropy_SimplePseudo",
    # "SingleStepEntropy",
    "HDBScanFilter",
    "IsolationForestFilter",
    "LocalOutlierFactorFilter"
]

# Display label for each filter name; used for the x tick labels below.
filter_names_clean = {
    # "LoserFilter_SSL_Variety": "EXPANDED DSM", 
    "LoserFilter_Plain": "SIMPLE DSM",
    # "LoserFilter_Optimized_Pseudo_Labels": "MC DSM", 
    "AutoFilter_Chen_Like": "SEMANTIC AE", 
    # "AutoFilter_LSTM": "LSTM ENSEMBLE AE",
    # "AutoFilter_LSTM_SIMPLE": "SIMPLE LSTM AE", 
    # "SingleStepEntropy": "MC SS", 
    "SingleStepEntropy_SimplePseudo": "SIMPLE SS",
    # "TeachingFilter": "SIMPLE LE", 
    # "TeachingFilter_Smooth": "SMOOTH LE", 
    "TeachingFilter_WOW": "HIGH ENTROPY LE",
    "HDBScanFilter" : "HDBScan",
    "IsolationForestFilter": "IsolationForest",
    "LocalOutlierFactorFilter": "LocalOutlierFactor"
    }

# Collect three boxes ("NoneR", the filter, "NoneE") per filter.
# NOTE(review): `add_boxplots` indexes `results` by task name, whereas the earlier
# cell builds `results` as a list of DataFrames — this cell presumably expects a
# different `results` object; confirm before running.
# The loop variable shadows the built-in `filter` for the rest of the session.
for filter in filter_names_:
    add_boxplots(results, filter, l)


fig, ax = plt.subplots()
# Boxes come in groups of three (one group per filter): consecutive boxes are
# `intra_group_dist` apart, and every new group is shifted by an additional
# `inter_group_dist`.
intra_group_dist = 0.75
inter_group_dist = 1.5
positions = [i*intra_group_dist + (i//3)*inter_group_dist for i in range(len(l))]
bp = ax.boxplot(l, showfliers=False, positions=positions, patch_artist=True)
# One colour per position inside a group, cycling every three boxes.
colors = ['#FF7F50', '#7cda9e', '#8fdeff']
for i, patch in enumerate(bp['boxes']):
    patch.set_facecolor(colors[i%3])


# Replace each median line by a diamond marker.
for i, median in enumerate(bp['medians']):
    # Second point of the median line.
    # NOTE(review): presumably its right end, so subtracting `offset` below moves
    # the marker towards the centre of the box — confirm.
    median_x, median_y = median.get_xydata()[1]  # Get the median line's X and Y data
    # Hide the median line
    median.set_visible(False)
    # Plot a diamond marker at the median position
    offset = 0.25
    ax.plot(median_x-offset, median_y, 'd', color='#082239', markersize=3)

# One tick per filter: the mean position of its three boxes, moved left by 0.18
# per label character.
# NOTE(review): the shift presumably compensates for the rotated tick labels — confirm.
tick_positions = [np.mean(positions[(i*3):(i*3)+3])-len(filter_names_clean[f])*0.18 for i, f in enumerate(filter_names_)]

plt.xticks(tick_positions, [filter_names_clean[f] for f in filter_names_], rotation=45)
plt.tight_layout()

# Write the figure to disk before showing it.
plt.savefig("endresults.pdf")
plt.show()

In [None]:
# Per-task variant of the combined box plot: one figure and one PDF per task.
# NOTE(review): `results` is indexed by task name here, whereas the earlier cell
# builds `results` as a list of DataFrames — confirm which object this cell expects.
for task in task_names:
    # Re-defined on every iteration; this also rebinds the global `add_boxplots`,
    # replacing the multi-task version defined in an earlier cell.
    def add_boxplots(results, filter, l:list):
        """Append the values of "NoneR", `filter` and "NoneE" to `l`, for the loop's current `task` only."""
        # `task` is read from the enclosing cell scope at call time.
        for f in ["NoneR", filter, "NoneE"]:
            data = []
            data += list(results[task]["f1s"][f])
            l.append(data)
            
    
    
    # One list of values per box, appended in plotting order.
    l = []
    # Filters shown in the figure, from left to right.
    filter_names_ = [
        "AutoFilter_LSTM_SIMPLE",
        "AutoFilter_LSTM",
        "AutoFilter_Chen_Like",
        "LoserFilter_Plain",
        "LoserFilter_Optimized_Pseudo_Labels",
        "LoserFilter_SSL_Variety",
        "TeachingFilter",
        "TeachingFilter_WOW",
        "TeachingFilter_Smooth",
        "SingleStepEntropy_SimplePseudo",
        "SingleStepEntropy",
    ]
    
    # Display label for each filter name; used for the x tick labels below.
    filter_names_clean = {
        "LoserFilter_SSL_Variety": "EXPANDED DSM", 
        "LoserFilter_Plain": "SIMPLE DSM",
        "LoserFilter_Optimized_Pseudo_Labels": "MC DSM", 
        "AutoFilter_Chen_Like": "SEMANTIC AE", 
        "AutoFilter_LSTM": "LSTM ENSEMBLE AE",
        "AutoFilter_LSTM_SIMPLE": "SIMPLE LSTM AE", 
        "SingleStepEntropy": "MC SS", 
        "SingleStepEntropy_SimplePseudo": "SIMPLE SS",
        "TeachingFilter": "SIMPLE LE", 
        "TeachingFilter_Smooth": "SMOOTH LE", 
        "TeachingFilter_WOW": "HIGH ENTROPY LE"}
    
    # Collect three boxes per filter; the loop variable shadows the built-in `filter`.
    for filter in filter_names_:
        add_boxplots(results, filter, l)
    
    
    fig, ax = plt.subplots()
    # Boxes come in groups of three (one group per filter): consecutive boxes are
    # `intra_group_dist` apart, and every new group is shifted by an additional
    # `inter_group_dist`.
    intra_group_dist = 0.75
    inter_group_dist = 1.5
    positions = [i*intra_group_dist + (i//3)*inter_group_dist for i in range(len(l))]
    bp = ax.boxplot(l, showfliers=False, positions=positions, patch_artist=True)
    # One colour per position inside a group, cycling every three boxes.
    colors = ['#FF7F50', '#7cda9e', '#8fdeff']
    for i, patch in enumerate(bp['boxes']):
        patch.set_facecolor(colors[i%3])
    
    
    # Replace each median line by a diamond marker.
    for i, median in enumerate(bp['medians']):
        # Second point of the median line.
        # NOTE(review): presumably its right end, so subtracting `offset` below
        # moves the marker towards the centre of the box — confirm.
        median_x, median_y = median.get_xydata()[1]  # Get the median line's X and Y data
        # Hide the median line
        median.set_visible(False)
        # Plot a diamond marker at the median position
        offset = 0.25
        ax.plot(median_x-offset, median_y, 'd', color='#082239', markersize=3)
    
    # One tick per filter: the mean position of its three boxes, moved left by
    # 0.18 per label character.
    # NOTE(review): the shift presumably compensates for the rotated labels — confirm.
    tick_positions = [np.mean(positions[(i*3):(i*3)+3])-len(filter_names_clean[f])*0.18 for i, f in enumerate(filter_names_)]
    
    plt.xticks(tick_positions, [filter_names_clean[f] for f in filter_names_], rotation=45)
    plt.tight_layout()
    
    # One PDF per task, named after the task.
    plt.savefig(f"endresults-{task}.pdf")
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Demo: draw one boxplot per synthetic dataset.
# The generator is seeded so that re-running the cell reproduces the same figure.
rng = np.random.default_rng(42)

# Five normally distributed samples whose mean increases from 1 to 5.
datasets = [rng.normal(loc=i, scale=0.5, size=100) for i in range(1, 6)]

# The figure is drawn once, after all datasets exist. The original set a title,
# cleared the axes right away, and called plt.draw()/plt.pause(), none of which
# changed the final figure.
fig, ax = plt.subplots()
ax.boxplot(datasets)
ax.set_title('Incremental Boxplots')
ax.set_xlabel('Dataset')
ax.set_ylabel('Value')

plt.show()

In [None]:
import deepsig
import pandas as pd
# For every filter and task, compare the filter's scores against the "NoneR"
# scores with `deepsig.aso`; the result is stored per filter, keyed by
# "<task>_no_htl_is_better".
# NOTE(review): `filter_names` is a one-element list holding a single
# space-separated string, so this loop runs once with that whole string as
# `filter_name` — confirm this is the key used in `results[task]["f1s"]`.
# NOTE(review): `results` is indexed by task name here, whereas the earlier cell
# builds `results` as a list of DataFrames — confirm which object is expected.
aso_test = {}
for filter_name in filter_names:
    # `data` is not used in this cell, but the assignment rebinds the global name.
    data = []
    task_aso = {}
    for task in task_names:
        htl = results[task]["f1s"]["NoneR"]
        no_htl = results[task]["f1s"][filter_name]
        # Fixed seed for reproducibility of the test.
        # NOTE(review): deepsig's ASO presumably returns a score rather than a
        # boolean — confirm how "_is_better" should be read.
        better = deepsig.aso(no_htl, htl, seed=42)
        task_aso[task+"_no_htl_is_better"] = better
    aso_test[filter_name] = task_aso
    
# Display the scores as a table: one column per filter, one row per task key.
pd.DataFrame(aso_test)    

In [None]:
import seaborn as sns

# Creating a new DataFrame for the pairs
# NOTE(review): `data` must map "A".."F" to equally typed value sequences; no
# visible cell creates such an object — confirm where it comes from.
pair_members = {
    "Pair AB": ("A", "B"),
    "Pair CD": ("C", "D"),
    "Pair EF": ("E", "F"),
}
variable_names = [name for members in pair_members.values() for name in members]

# Label lengths are derived from the data instead of a hard-coded 100, so the
# three columns always line up, whatever the sample sizes are.
paired_data = pd.DataFrame({
    'Value': np.concatenate([data[name] for name in variable_names]),
    'Variable': np.concatenate([np.repeat(name, len(data[name])) for name in variable_names]),
    'Pair': np.concatenate([
        np.repeat(pair, sum(len(data[name]) for name in members))
        for pair, members in pair_members.items()
    ]),
})

# Plotting the paired boxplots on an explicit axes object
fig, ax = plt.subplots(figsize=(10, 6))

sns.boxplot(x="Pair", y="Value", hue="Variable", data=paired_data, palette="Set3", ax=ax)

ax.set_title("Paired Boxplots")
ax.set_xlabel("Pairs")
ax.set_ylabel("Values")

plt.show()