# GEMSS experiment evaluation per test case

Experiments on artificial data are organized into 7 tiers that together cover an even larger number of test cases. Since no experiments were repeated, experiments must be combined across tiers to answer questions about the algorithm's performance (e.g., How is the performance affected by dimension, missing data, or noise? How do binary classification and regression compare?).
Many test cases combine experiments from Tiers 2-7 with corresponding subsets of Tier 1.


In [None]:
import pandas as pd
import plotly.io as pio
from IPython.display import display, Markdown

from gemss.experiment_assessment.experiment_results_interactive import (
    show_interactive_performance_overview,
    show_interactive_solution_comparison,
    show_interactive_comparison_with_grouping,
    show_interactive_heatmap,
    show_interactive_si_asi_comparison,
)
from gemss.experiment_assessment.experiment_results_visualizations import (
    plot_metric_analysis_overview,
    plot_solution_comparison,
    plot_solution_grouped,
    plot_heatmap,
)
from gemss.experiment_assessment.experiment_results_analysis import (
    CATEGORY_ORDER,
    COVERAGE_METRICS,
    CORE_METRICS,
    SOLUTION_OPTIONS,
    ALL_PARAMETERS,
    DEFAULT_METRIC,
    DEFAULT_AGGREGATION_FUNC,
    DEFAULT_SOLUTION,
    get_all_experiment_results,
    choose_best_solution_per_group,
    filter_df_best_solutions,
    get_average_metrics_per_group,
    analyze_metric_results,
    compute_performance_overview,
    show_performance_overview,
)

from gemss.experiment_assessment.case_analysis import (
    CASE_DESCRIPTION,
    CASE_SET_RANGES,
    SUMMARY_CASES,
    COLORING_PARAM_PER_CASESET,
    SYMBOL_PARAM_PER_CASESET,
    COLORING_PARAM_PER_CASE,
    SYMBOL_PARAM_PER_CASE,
    get_df_cases,
    concatenate_cases,
)

# Choose the default Plotly renderer for this notebook.
# Only one of the assignments below should be active at a time.

# Alternative: ensures plotly plots show in VSCode
# pio.renderers.default = "vscode"

# Active choice, for HTML export compatibility
pio.renderers.default = "notebook_connected"  # Show plots in exported notebooks
# Alternative: use this for HTML exports with embedded plots
# pio.renderers.default = "iframe"
# Alternative: use this for completely static exports
# pio.renderers.default = "json"

In [None]:
# Load the results of all experiments (verbose output suppressed).
df = get_all_experiment_results(verbose=False)
# Store the tier identifier as an integer column.
df["TIER_ID"] = df["TIER_ID"].astype(int)

# Assign experiments to test cases



In [None]:
# Assign the experiments to test cases, then concatenate the cases into one
# table. The concatenated table carries the CASE_ID column that the rest of
# the notebook groups by.
df_cases = get_df_cases(df)
df_all_cases = concatenate_cases(df_cases)

In [None]:
# Print a summary of the combined table of all cases
# (presumably a pandas DataFrame, so this lists columns, dtypes and non-null counts).
df_all_cases.info()

# Interactive performance overview for all solution types

In [None]:
# Interactive performance overview of the unfiltered cases (all solution
# types), grouped by test case.
show_interactive_performance_overview(
    df_all_cases,
    group_identifier="CASE_ID",
)

# Select solution type

Find the best solution type for each test case. However, proceed with the "top" solution type for all cases.

In [None]:
# Determine the best solution type for each CASE_ID group, judged by
# DEFAULT_METRIC. The result is later indexed with keys of the form
# "CASE_ID = 3" (see analyze_case_set).
best_solutions = choose_best_solution_per_group(
    df_all_cases,
    group_identifier="CASE_ID",
    metric=DEFAULT_METRIC,
    verbose=True,
)

In [None]:
# Regardless of the per-case best solution types found above, use the
# default ("top") solution type for every test case.
chosen_solutions = dict.fromkeys(
    df_all_cases["CASE_ID"].unique(),
    DEFAULT_SOLUTION,
)

# Filter the combined table by the chosen solution types.
# To use the per-case best solution types instead, pass
# ``best_solutions=best_solutions``.
df_all_cases_filtered = filter_df_best_solutions(
    df_all_cases,
    group_identifier="CASE_ID",
    best_solutions=chosen_solutions,
    verbose=True,
)

## Overview of best solutions' performance

In [None]:
# Compute the performance overview of the filtered cases, restricted to the
# core metrics, and then show it.
df_performance_overview = compute_performance_overview(
    df_all_cases_filtered,
    select_metrics=CORE_METRICS,
)
show_performance_overview(
    df_performance_overview,
    select_metrics=CORE_METRICS,
)

# Analysis of test cases

## Interactive widgets

In [None]:
# Interactive comparison widget on the filtered cases, grouped by test case.
show_interactive_comparison_with_grouping(
    df_all_cases_filtered,
    group_identifier="CASE_ID",
)

In [None]:
# Interactive heatmap widget on the filtered cases, grouped by test case.
show_interactive_heatmap(
    df_all_cases_filtered,
    group_identifier="CASE_ID",
)

## Interactive plots

In [None]:
def analyze_case_set(case_set: str) -> None:
    """
    Run and display the analysis for all cases in a given case set.

    For every case ID in ``CASE_SET_RANGES[case_set]`` the following is shown:
    the case description, an overview plot of ``DEFAULT_METRIC``, the main
    grouped plot of ``DEFAULT_METRIC`` over ``EXPERIMENT_ID``, and tables of
    median and mean metrics grouped by the coloring parameter. Cases listed in
    ``SUMMARY_CASES`` additionally get a heatmap and a grouped plot of
    ``Adjusted_Success_Index``.

    The function reads the notebook-level variables ``df_all_cases_filtered``
    and ``best_solutions``; both must exist before it is called.

    Parameters
    ----------
    case_set : str
        Key of ``CASE_SET_RANGES``. The same key is used to look up the
        coloring and symbol parameters of the plots.

    Returns
    -------
    None
        Results are shown through ``display`` and the plotting helpers.
    """
    case_range = CASE_SET_RANGES[case_set]
    display(
        Markdown(
            f"## Analysis for case set **{case_set.upper()}** ({len(case_range)} test cases)"
        )
    )

    # Everything in this section depends only on the case set (or on the whole
    # filtered table), not on the individual case, so it is computed once
    # instead of once per case.
    # NOTE(review): these tables are named *_PER_CASE but are indexed by the
    # case-set name, and *_PER_CASESET variants are imported as well --
    # confirm which tables are intended here.
    coloring_param = COLORING_PARAM_PER_CASE[case_set]
    symbol_param = SYMBOL_PARAM_PER_CASE[case_set]
    # The heatmap needs two different axes; fall back to N_SAMPLES when the
    # coloring parameter already is N_FEATURES.
    heatmap_y_axis = "N_FEATURES" if coloring_param != "N_FEATURES" else "N_SAMPLES"
    # Only parameters that are present and take more than one value are shown
    # on hover.
    hover_params = [
        param
        for param in ALL_PARAMETERS
        if (param in df_all_cases_filtered.columns)
        and (df_all_cases_filtered[param].nunique() > 1)
    ]
    # sort_values returns a new frame, so a single sorted copy serves all plots.
    df_sorted = df_all_cases_filtered.sort_values(by="EXPERIMENT_ID")

    def _plot_grouped(case_id, solution_type, metric_name):
        """Show the grouped plot of one metric for a single case."""
        plot_solution_grouped(
            df=df_sorted,
            solution_type=solution_type,
            metric_name=metric_name,
            color_by=coloring_param,
            symbol_by=symbol_param,
            x_axis="EXPERIMENT_ID",  # subject to change
            group_identifier="CASE_ID",
            identifiers_list=[case_id],
            hover_params=hover_params,
        )

    for i in case_range:
        display(Markdown(f"### Performance for **CASE_ID = {i}**"))
        display(Markdown(CASE_DESCRIPTION[i]))

        # Quick performance overview
        plot_metric_analysis_overview(
            df_all_cases_filtered,
            identifiers_list=[i],
            group_identifier="CASE_ID",
            metric_name=DEFAULT_METRIC,
            solution_type="all types",
            custom_title=f"{DEFAULT_METRIC} performance for CASE_ID {i}",
        )

        # NOTE(review): the table was filtered with ``chosen_solutions``
        # (DEFAULT_SOLUTION for every case), yet the plots below request the
        # per-case entry of ``best_solutions`` -- confirm this is intended.
        solution_type = best_solutions[f"CASE_ID = {i}"]

        # Main performance plot
        _plot_grouped(i, solution_type, DEFAULT_METRIC)

        if i in SUMMARY_CASES:
            # Additional heatmap plot to show performance map for all N_FEATURES and N_SAMPLES combinations
            plot_heatmap(
                df=df_all_cases_filtered,
                solution_type=solution_type,
                x_axis=coloring_param,
                y_axis=heatmap_y_axis,
                metric_name=DEFAULT_METRIC,
                group_identifier="CASE_ID",
                identifiers_list=[i],
            )
            # Additional plot to compare Adjusted Success Index
            _plot_grouped(i, solution_type, "Adjusted_Success_Index")

        # Compute mean and median metrics for cases grouped by the coloring_param
        display(Markdown(f"### Summary statistics for CASE_ID = {i}"))
        df_case = df_all_cases_filtered[df_all_cases_filtered["CASE_ID"] == i]
        for agg_type in ["median", "mean"]:
            averages = get_average_metrics_per_group(
                df_case,
                group_identifier=coloring_param,
                aggregation_func=agg_type,
            )
            # Stack the per-group tables, labelling each with its group key.
            df_averages = pd.concat(averages.values(), keys=averages.keys())
            display(
                Markdown(
                    f"\n- **{agg_type.capitalize()}** values of core metrics grouped by **{coloring_param}**:"
                )
            )
            display(df_averages)

        display(Markdown("---"))
        display(Markdown("<br>"))
    display(Markdown("---"))
    display(Markdown("<br>"))

In [None]:
# Analyze the baseline, scalability and samplerich case sets, in the order
# in which they appear in CASE_SET_RANGES.
for case_set in CASE_SET_RANGES:
    if case_set not in ("baseline", "scalability", "samplerich"):
        continue
    analyze_case_set(case_set)

In [None]:
# Analyze the adversity and unbalanced case sets, in the order in which they
# appear in CASE_SET_RANGES.
for case_set in CASE_SET_RANGES:
    if case_set not in ("adversity", "unbalanced"):
        continue
    analyze_case_set(case_set)

In [None]:
# Analyze the jaccard case set (only if it is defined in CASE_SET_RANGES).
for case_set in CASE_SET_RANGES:
    if case_set != "jaccard":
        continue
    analyze_case_set(case_set)

In [None]:
# Analyze the reg_baseline, reg_scalability and reg_adversity case sets, in
# the order in which they appear in CASE_SET_RANGES.
for case_set in CASE_SET_RANGES:
    if case_set not in ("reg_baseline", "reg_scalability", "reg_adversity"):
        continue
    analyze_case_set(case_set)

In [None]:
# Analyze the reg_vs_class case set (only if it is defined in CASE_SET_RANGES).
for case_set in CASE_SET_RANGES:
    if case_set != "reg_vs_class":
        continue
    analyze_case_set(case_set)