# Setup

In [None]:
# When working in AI LRZ: change into the project directory so that the
# relative paths used below (the setup notebook and ./output) resolve.
# NOTE(review): this path is specific to that environment — adjust or skip
# this cell when running elsewhere.
%cd ~/cma/CMA_Fairness

In [None]:
# Execute the shared setup notebook before anything else.
# NOTE(review): `pd` is used below without being imported in this notebook,
# so it is presumably provided by this setup notebook — confirm.
%run analysis_setup_cp.ipynb

In [None]:
from pathlib import Path

# Identifier of the run whose outputs this notebook analyses.
RUN_TO_ANALYSE = "1"

# Root folder below which run data and analysis results are stored.
OUTPUT_DIR = Path("output")

# Input: data written by the selected run.
DATA_DIR = OUTPUT_DIR.joinpath("runs", str(RUN_TO_ANALYSE), "data")

# Output: directory that will contain the results of this analysis.
ANALYSIS_OUTPUT_DIR = OUTPUT_DIR.joinpath("analyses", str(RUN_TO_ANALYSE))

In [None]:
# Column-name prefixes used to group columns by the kind of information they
# hold. The evaluation prefix extends the settings prefix, so every
# "sett_eval_" column also matches "sett_".
PREFIX_SETTINGS, PREFIX_EVAL = "sett_", "sett_eval_"
PREFIX_PERFORMANCE, PREFIX_FAIRNESS = "perf_", "fair_"

In [None]:
# Name of the fairness column that the scatter and violin plots below use as
# their y-axis.
main_fairness_metric = "fair_main_equalized_odds_difference"

# Data Loading

In [None]:
# Read the aggregated outputs of the selected run from its data directory.
df_agg_raw = pd.read_csv(
    DATA_DIR.joinpath(f"agg_{RUN_TO_ANALYSE}_run_outputs.csv.gz")
)

In [None]:
import json

# Every row carries its settings as a JSON string in "universe_settings":
# decode each string, spread the decoded keys over columns and mark those
# columns with the settings prefix.
settings_records = df_agg_raw["universe_settings"].apply(json.loads)
df_settings = pd.json_normalize(settings_records).add_prefix(PREFIX_SETTINGS)

# Index-aligned join: the parsed settings columns come first, followed by all
# columns of the raw data.
df_agg_full = df_settings.join(df_agg_raw)
df_agg_full.head()

In [None]:
# Report the size of the combined dataframe.
rows = len(df_agg_full.index)
columns = len(df_agg_full.columns)
print(f"The data has N = {rows} rows and N = {columns} columns.")

In [None]:
# Group the dataframe's column names by prefix. Each list keeps the column
# order of df_agg_full.
cols_settings = list(df_agg_full.columns[df_agg_full.columns.str.startswith(PREFIX_SETTINGS)])
cols_eval = list(df_agg_full.columns[df_agg_full.columns.str.startswith(PREFIX_EVAL)])
# Settings columns that are not evaluation settings. The ordered list is
# filtered (instead of taking a set difference) so that the result keeps the
# dataframe's column order and is identical on every kernel restart; the
# iteration order of a set of strings is not stable across Python processes.
cols_non_eval = [col for col in cols_settings if col not in set(cols_eval)]
cols_performance = list(df_agg_full.columns[df_agg_full.columns.str.startswith(PREFIX_PERFORMANCE)])
cols_fairness = list(df_agg_full.columns[df_agg_full.columns.str.startswith(PREFIX_FAIRNESS)])

In [None]:
# Inspect the universe identifier column.
df_agg_full["universe_id"]

In [None]:
# Rows produced with this subgroup-exclusion setting are removed from the
# analysis; the counts are printed before the rows are dropped.
drop_mask = df_agg_full["sett_exclude_subgroups"].isin(['keep-largest_race_1'])
n_dropped = drop_mask.sum()
n_kept = len(drop_mask) - n_dropped
print(f"Dropping N = {n_dropped} rows, keeping N = {n_kept}")

df_agg_full = df_agg_full.loc[~drop_mask]

In [None]:
# List the distinct values of the subgroup-exclusion setting present in the data.
df_agg_full["sett_exclude_subgroups"].unique()

In [None]:

# List the distinct fairness-grouping values present in the data.
df_agg_full["sett_eval_fairness_grouping"].unique()

# Save Parsed Data

In [None]:
# Make sure the analysis output directory exists before writing into it:
# to_csv does not create missing parent directories and would fail otherwise.
ANALYSIS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
df_agg_full.to_csv(ANALYSIS_OUTPUT_DIR / "df_agg_full.csv.gz")

## Filter Data by Fairness Grouping

Most analyses only make sense for a single value of the fairness grouping, so we explicitly filter the data here to one of the two values (`majority-minority`). The filtered rows are stored in a new dataframe, `df_agg`, while `df_agg_full` continues to hold the full dataset.

In [None]:
# Keep only the rows evaluated with the "majority-minority" fairness grouping;
# df_agg_full itself is left unchanged.
is_majority_minority = df_agg_full["sett_eval_fairness_grouping"].eq("majority-minority")
df_agg = df_agg_full.loc[is_majority_minority]
rows = len(df_agg.index)
columns = len(df_agg.columns)
print(f"The data has N = {rows} rows and N = {columns} columns.")

In [None]:
import plotly.express as px

# Distribution of the main fairness metric over the full dataset, i.e. across
# all fairness groupings (df_agg_full rather than the filtered df_agg).
# The column comes from main_fairness_metric so that this plot follows the
# configured metric, like the scatter and violin plots further down.
fig = px.histogram(
    df_agg_full,
    x=main_fairness_metric,
    marginal="rug",
    title=f"Distribution of {main_fairness_metric} (all fairness groupings)",
)
fig.show(renderer="notebook")


In [None]:
import plotly.express as px

# Same distribution as a histogram over the full dataset, but coloured by
# fairness grouping so the groupings can be compared.
# The column comes from main_fairness_metric so that this plot follows the
# configured metric, like the scatter and violin plots further down.
fig = px.histogram(
    df_agg_full,
    x=main_fairness_metric,
    color="sett_eval_fairness_grouping",
    marginal="rug",
    title=f"Distribution of {main_fairness_metric} by fairness grouping",
)
fig.show(renderer="notebook")


In [None]:
# Distinct fairness-grouping values in the full dataset (the values used for
# colouring the histogram above).
df_agg_full["sett_eval_fairness_grouping"].unique()

# Descriptive Analysis

In [None]:
import ipywidgets as widgets
from IPython.display import clear_output

# Helper Function to get interactive refreshing dropdowns
def interactive_single_var_dropdown(options, render_function, description='Column:'):
    """Show a dropdown and re-render a view whenever its selection changes.

    The dropdown is displayed together with whatever ``render_function``
    outputs for the currently selected option. Each time the selection
    changes, the output is cleared and both are drawn again.

    Parameters
    ----------
    options
        Choices offered by the dropdown; passed to ``widgets.Dropdown``
        unchanged.
    render_function : callable
        Called with the currently selected value. It is expected to display
        its own output; its return value is ignored.
    description : str, optional
        Label of the dropdown.

    Returns
    -------
    None
    """
    dd = widgets.Dropdown(
        options=options,
        description=description,
    )

    def refresh():
        # Re-display the dropdown itself as well, because clearing the output
        # removes it together with the previous rendering.
        display(dd)
        render_function(dd.value)

    def on_change(change):
        # wait=True postpones the clearing until new output arrives, which
        # avoids the output area collapsing and flickering between renders.
        clear_output(wait=True)
        refresh()

    # Subscribe to changes of the selected value only, rather than receiving
    # a notification for every trait of the widget and filtering by hand.
    dd.observe(on_change, names="value")

    refresh()

In [None]:
def render_simple_density(colname):
    """Draw pandas' default kernel-density plot for one column of ``df_agg``."""
    # NOTE(review): when this runs from the dropdown callback instead of a
    # normal cell execution, the figure may not be flushed to the output
    # automatically — confirm that the plot refreshes on selection changes.
    selected = df_agg[colname]
    selected.plot.kde()


# Let the user pick any fairness or performance column.
interactive_single_var_dropdown(
    options=cols_fairness + cols_performance,
    render_function=render_simple_density,
)

In [None]:
import plotly.express as px


def render_plotly_density(colname):
    """Show an interactive histogram (with rug marginal) of one ``df_agg`` column.

    The values of all settings columns are attached as hover data.
    """
    histogram = px.histogram(
        data_frame=df_agg,
        x=colname,
        hover_data=cols_settings,
        marginal="rug",
    )
    histogram.show(renderer="notebook")


# Let the user pick any fairness or performance column.
interactive_single_var_dropdown(
    options=cols_fairness + cols_performance,
    render_function=render_plotly_density,
)

In [None]:
# Overall accuracy against the main fairness metric, with a violin plot of
# each marginal distribution; the settings columns are attached as hover data.
px.scatter(
    data_frame=df_agg,
    x="perf_ovrl_accuracy",
    y=main_fairness_metric,
    title="Accuracy x Fairness",
    hover_data=cols_settings,
    marginal_x="violin",
    marginal_y="violin",
)

## Exploratory Analysis of Fairness based on Settings

In [None]:
import plotly.express as px


def fairness_violin(column_to_compare):
    """Compare the main fairness metric across the values of one column.

    Draws a violin plot of ``main_fairness_metric`` from ``df_agg``, split and
    coloured by ``column_to_compare``, with all individual points shown and
    the settings columns attached as hover data.
    """
    violin = px.violin(
        data_frame=df_agg,
        x=column_to_compare,
        y=main_fairness_metric,
        color=column_to_compare,
        points="all",
        hover_data=cols_settings,
    )
    # NOTE(review): the histogram helper renders with
    # fig.show(renderer="notebook") while this one uses display() — confirm
    # the difference is intended.
    display(violin)


# Let the user pick which settings column to compare.
interactive_single_var_dropdown(
    options=cols_settings,
    render_function=fairness_violin,
)