# Setup Notebook

In [1]:
# When working in AI LRZ
%cd ~/cma/CMA_Fairness_v2

/dss/dsshome1/0C/ra93lal2/cma/CMA_Fairness_v2


  bkms = self.shell.db.get('bookmarks', {})
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
%run analysis_setup_cp.ipynb

/dss/dsshome1/0C/ra93lal2/cma/CMA_Fairness_v2
The data has N = 900 rows and N = 14 columns.


## Prepare Data


In [3]:
import pandas as pd
from pathlib import Path

# Identifier of the run whose combined metrics file is analysed in this notebook.
RUN_TO_ANALYSE = "16"

# Directory of that run: the metrics file is read from here and the tables and
# figures produced further down are written back into it.
CP_DIR = Path("output", "runs", str(RUN_TO_ANALYSE))

# Combined conformal-prediction metrics of the selected run.
metrics_path = CP_DIR / "combined_cp_metrics.csv"
df_agg = pd.read_csv(metrics_path)

FileNotFoundError: [Errno 2] No such file or directory: 'output/runs/16/combined_cp_metrics.csv'

In [None]:
df_agg.head()

In [None]:
df_agg.shape

In [None]:
# Outcome column of df_agg that the Lasso and fANOVA cells below analyse; it is
# also embedded in the names of the exported files. Known options are listed
# in the trailing comment.
main_cp_metric = "avg_size" # "avg_size", "cov_nongerman_female"

# Calculate Variable Importance

## Use a Lasso Regression to estimate Importance of Settings

In [None]:
# Design decisions (the "universe_*" columns) used as predictors of the metric.
cols_design_dec = [
    "universe_training_year",
    "universe_training_size",
    "universe_scale",
    "universe_model",
    "universe_exclude_features",
    "universe_exclude_subgroups",
]

# Predictors: one column per design decision.
X = df_agg.loc[:, cols_design_dec]
# Outcome: the metric selected in `main_cp_metric`.
y = df_agg.loc[:, main_cp_metric]

In [None]:
X.head()

In [None]:
X.shape

In [None]:
y.shape

### Main Effects of Settings Only (i.e. no interactions)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LassoCV

# The encoding lives in its own pipeline, separate from the estimator, so that
# eli5 can be handed the bare model plus the encoder's feature names.
preprocessor = make_pipeline(OneHotEncoder())
X_processed = preprocessor.fit_transform(X)

# Lasso regression whose penalty is chosen by 5-fold cross-validation.
lasso_reg = LassoCV(cv=5, random_state=0)
lasso_reg.fit(X_processed, y)

# Sanity check: R^2 on the training data itself (not an out-of-sample score).
lasso_reg.score(X_processed, y)

R² = 1.0: Perfect predictions

R² = 0.0: Model does no better than predicting the mean

R² < 0.0: Model is worse than just predicting the mean

Weights of importance:

In [None]:
# Visualize the importance of each feature in trained lasso_reg model
import eli5
eli5.show_weights(lasso_reg, top=-1, feature_names = preprocessor.get_feature_names_out())

### Including Interactions

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LassoCV

# Preprocessing is kept in its own pipeline, separate from the estimator, so
# that eli5 receives the bare model plus the generated feature names.
preprocessor = make_pipeline(
    OneHotEncoder(),
    # interaction_only=True: for 0/1 dummy columns x**2 == x, so plain degree-2
    # features would add an exact copy of every main effect and let the Lasso
    # split each weight arbitrarily between the two copies. Only products of
    # two *different* dummies (the actual interactions) are generated.
    PolynomialFeatures(degree=2, interaction_only=True),
).fit(X)
X_processed = preprocessor.transform(X)

# Lasso regression whose penalty is chosen by 5-fold cross-validation.
lasso_reg = LassoCV(cv=5, random_state=0).fit(X_processed, y)

# Check whether it's predictive at all
lasso_reg.score(X_processed, y) # R^2 on the training data (in-sample)

Weights of importance:

In [None]:
import eli5
eli5.show_weights(lasso_reg, top=-1, feature_names = preprocessor.get_feature_names_out())

## Use a functional ANOVA (fANOVA) to Analyze Setting Importance

Based on the following paper:

Hutter, F., Hoos, H., & Leyton-Brown, K. (2014). An Efficient Approach for Assessing Hyperparameter Importance. Proceedings of the 31st International Conference on Machine Learning, 754–762. https://proceedings.mlr.press/v32/hutter14.html


In [None]:
from fairness_multiverse.analysis import MultiverseFanova

m_fanova = MultiverseFanova(features = df_agg[cols_design_dec], outcome = df_agg[main_cp_metric])

In [None]:
m_fanova.quantify_individual_importance()

In [None]:
m_fanova.quantify_importance(save_to = CP_DIR/ f"fanova_importance_interactions-overall_{main_cp_metric}.csv")

In [None]:
# LaTeX table "10 most important decisions or decision interactions"

# Lift pandas' column-width limit (presumably so that long decision names are
# not truncated in the exported table — TODO confirm this is still needed).
pd.set_option('display.max_colwidth', None)

fanova_df = m_fanova.quantify_importance()

# Ten rows with the largest individual importance.
top10 = (
    fanova_df
    .sort_values('individual importance', ascending=False)
    .head(10)
    .reset_index(drop=True)
)

# Discover the level_<i> columns instead of assuming exactly level_0..level_4:
# with six design decisions an interaction can involve more than five of them,
# and a hard-coded range(5) would silently drop the extra factor.
LEVEL_PREFIX = "level_"
level_cols = sorted(
    (
        col for col in top10.columns
        if str(col).startswith(LEVEL_PREFIX) and str(col)[len(LEVEL_PREFIX):].isdigit()
    ),
    key=lambda col: int(str(col)[len(LEVEL_PREFIX):]),
)

def involved_levels(row):
    """Return the non-missing level_<i> entries of one importance row, in column order."""
    return [row[col] for col in level_cols if pd.notna(row[col])]

# Detect main vs. N-way interaction
def effect_type(row):
    """Label a row 'main' when it names one decision, else '<N>-way int.'."""
    n_levels = len(involved_levels(row))
    return 'main' if n_levels == 1 else f'{n_levels}-way int.'

top10['Effect Type'] = top10.apply(effect_type, axis=1)

def clean_level(name: str) -> str:
    """Turn a decision name such as 'universe_training_size' into 'TrainingSize'."""
    if pd.isna(name):
        return ""
    # strip leading universe_
    if name.startswith("universe_"):
        name = name[len("universe_"):]
    # snake_case -> CamelCase
    return "".join(part.capitalize() for part in name.split("_"))

def fmt_level(name: str) -> str:
    """Wrap the cleaned decision name in a LaTeX \\textit{...} command."""
    return f"\\textit{{{clean_level(name)}}}"

# Join all decisions of a row with a LaTeX multiplication sign.
top10["Decision / Interaction of Decisions"] = top10.apply(
    lambda row: " $\\times$ ".join(fmt_level(level) for level in involved_levels(row)),
    axis=1,
)

out = top10[[
    'Effect Type',
    'Decision / Interaction of Decisions',
    'individual importance',
    'individual std'
]].copy()
out.columns = [
    'Effect Type',
    'Decision / Interaction of Decisions',
    'Importance',
    'Std. Deviation'
]

out['Importance']    = out['Importance'].round(3)
out['Std. Deviation']= out['Std. Deviation'].round(3)

# escape=False keeps the \textit{...} and $\times$ markup intact.
raw = out.to_latex(
    index=False,
    escape=False,
    column_format='llrr'
)

# Replace plain \hline rules, if the installed pandas emits any, by the
# booktabs rules: first -> \toprule, second -> \midrule, rest -> \bottomrule.
lines = raw.splitlines()
new_lines = []
hcount = 0
for ln in lines:
    if ln.strip() == r'\hline':
        hcount += 1
        if hcount == 1:
            new_lines.append(r'\toprule')
        elif hcount == 2:
            new_lines.append(r'\midrule')
        else:
            new_lines.append(r'\bottomrule')
    else:
        new_lines.append(ln)
fixed_tabular = "\n".join(new_lines)

# Wrap the tabular in a full table environment.
# NOTE(review): caption and label are fixed to the average prediction set size,
# while the file name follows main_cp_metric — adjust when exporting another metric.
latex = f"""\\begin{{table}}
\\centering
\\caption{{The 10 most important decisions or decision interactions and their relative importance for the average prediction set size.}}
\\label{{tab:fanova_top10}}
{fixed_tabular}
\\end{{table}}
"""

output_path = CP_DIR / f"fanova_top10_{main_cp_metric}.tex"
output_path.write_text(latex)

# Visualizations Avg. Prediction Set Size

In [None]:
# Plot 1D marginal
#from fanova import visualizer

#vis = visualizer.Visualizer(m_fanova.fanova, m_fanova.configuration_space, directory = str(CP_DIR))
#vis.plot_marginal(4)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec

# Column of df_agg shown in this figure and its axis label. The file name below
# is derived from this column (not from main_cp_metric), so the saved file is
# always named after what it actually shows.
violin_metric = 'avg_size'
violin_ylabel = 'Avg. Prediction Set Size'

# Label mapping
label_map = {
    'universe_training_size': 'Training Size',
    'universe_training_year': 'Training Year',
    'universe_scale': 'Scaling',
    'universe_model': 'Model',
    'universe_exclude_features': 'Excluded Features',
    'universe_exclude_subgroups': 'Excluded Subgroups'
}

# Custom x-tick renaming
category_renames = {
    '2010_14': '2010-14',
    '2012_14': '2012-14',
    'scale': 'scaled',
    'do-not-scale': 'not-scaled',
    'drop-non-german': 'drop',
    'penalized_logreg': 'pen. logreg',
    'rf': 'rand. forest'
}

# Custom order per parameter (parameters not listed are sorted alphabetically)
custom_orders = {
    'universe_training_size': ['1k', '5k', '25k'],
    'universe_model': ['logreg', 'penalized_logreg', 'elasticnet', 'gbm', 'rf'],
    'universe_exclude_features': ['none', 'age', 'nationality', 'sex', 'nationality-sex']
}

# Plot layout
row1 = [
    'universe_training_size',
    'universe_training_year',
    'universe_scale',
    'universe_exclude_subgroups'
]
row2 = ['universe_model', 'universe_exclude_features']

widths_row1 = [1.3, 1.3, 1.0, 1.0]
widths_row2 = [2.0, 2.0]

colors_row1 = ['#FFFF99', '#FDBF6F', '#FB9A99', '#CAB2D6']
colors_row2 = ['aquamarine', 'lightskyblue']

# Styling
sns.set(style="whitegrid", palette="pastel", font_scale=1.2)
fig = plt.figure(figsize=(24, 15))
outer_gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1], hspace=0.5)

# Font sizes
axis_label_size = 24
tick_label_size = 20
category_label_size = 24


def draw_violin_row(params, colors, width_ratios, subplot_spec, wspace):
    """Draw one row of violin plots of `violin_metric`, one panel per parameter.

    params       -- df_agg column names, one per panel (left to right)
    colors       -- fill colour of each panel's violins
    width_ratios -- relative panel widths
    subplot_spec -- cell of the outer grid that hosts this row
    wspace       -- horizontal spacing between the panels
    """
    gs_row = gridspec.GridSpecFromSubplotSpec(
        nrows=1, ncols=len(params), subplot_spec=subplot_spec,
        width_ratios=width_ratios, wspace=wspace
    )

    for idx, (param, color) in enumerate(zip(params, colors)):
        ax = fig.add_subplot(gs_row[0, idx])

        order = custom_orders.get(param, sorted(df_agg[param].dropna().unique()))
        sns.violinplot(
            data=df_agg, x=param, y=violin_metric, ax=ax,
            inner='box', cut=0, order=order, color=color
        )

        # Overlay median dots
        medians = df_agg.groupby(param)[violin_metric].median()
        for pos, cat in enumerate(order):
            if cat in medians:
                ax.plot(pos, medians[cat], color='lemonchiffon', marker='o', markersize=8, zorder=3)

        # Rename x-tick labels
        xticks = [label.get_text() for label in ax.get_xticklabels()]
        renamed = [category_renames.get(label, label) for label in xticks]
        ax.set_xticklabels(renamed, fontsize=category_label_size, rotation=30)

        ax.set_xlabel(label_map[param], fontsize=axis_label_size)
        if idx == 0:
            # Only the left-most panel of a row carries the y-axis label and ticks.
            ax.set_ylabel(violin_ylabel, fontsize=axis_label_size)
        else:
            ax.set_ylabel('')
            ax.set_yticklabels([])
            ax.tick_params(axis='y', left=False)
        ax.tick_params(axis='y', labelsize=tick_label_size)
        ax.set_title("")


# Row 1: four narrow panels; Row 2: two wide panels
draw_violin_row(row1, colors_row1, widths_row1, outer_gs[0], wspace=0.3)
draw_violin_row(row2, colors_row2, widths_row2, outer_gs[1], wspace=0.1)

output_path = CP_DIR / f"violin_plots_{violin_metric}.png"
plt.savefig(output_path, dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# 5 most important 2-way interactions
best_p_margs = m_fanova.fanova.get_most_important_pairwise_marginals(n=5)
print(best_p_margs)

In [None]:
from fanova import visualizer

vis = visualizer.Visualizer(m_fanova.fanova, m_fanova.configuration_space, directory = str(CP_DIR))

In [None]:
# Plot 2D pairwise marginal
vis.plot_pairwise_marginal(['universe_model', 'universe_scale'])

In [None]:
# Recover the integer code -> category label mapping for one design decision.
# NOTE(review): this presumes the numeric codes on the fANOVA plot axes follow
# LabelEncoder's (sorted) ordering — confirm against MultiverseFanova.
col = "universe_model"

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(df_agg[col].values)

# classes_ holds the sorted labels, so the position of a label is its code.
# Stored under its own name so that the display-name dict `label_map` used by
# the violin-plot cells is not overwritten.
code_to_label = dict(enumerate(le.classes_))
print(code_to_label)

In [None]:
# Heatmap avg. prediction set size
# Draws the fANOVA pairwise marginal of model x training size, then restyles
# the current pyplot figure and saves it into CP_DIR.

vis.plot_pairwise_marginal(['universe_model', 'universe_training_size'])

# Replace the numeric category codes on the axes by readable labels.
# NOTE(review): the labels are listed in alphabetical order of the raw category
# names; this presumably matches the codes used by the plot — verify with the
# mapping printed by the "Recover mapping" cell.
plt.xticks(
    ticks=[0,1,2,3,4], 
    labels=['elasticnet', 'gbm', 'logreg', 'pen. logreg', 'rand. forest'], 
    rotation=45,
    fontsize=16)

plt.yticks(
    ticks=[0,1,2], 
    labels=['1k', '25k', '5k'],
    fontsize=16)

# Last axes of the figure — presumably the colorbar added by the visualizer.
cbar = plt.gcf().axes[-1] 
cbar.set_ylabel("Avg. Prediction Set Size", rotation=90, labelpad=15, fontsize=20)
cbar.tick_params(labelsize=16)

plt.xlabel("Model", fontsize=20)
plt.ylabel("Training Size", fontsize=20)

# Switch the first image on the current axes to the blue-white-red colormap.
ax = plt.gca()
im = ax.get_images()[0] 
im.set_cmap("bwr")

# Drop the title set by the visualizer, if any.
plt.title("")
plt.tight_layout()

output_path = CP_DIR / f"heatmap_{main_cp_metric}.png"  # or .jpg, .pdf, .svg
plt.savefig(output_path, dpi=300, bbox_inches='tight')

plt.show()

In [None]:
m_fanova.fanova.marginal_mean_prediction({"universe_model": 0, "universe_training_size": 1})


# Visualizations Conditional Coverage

In [None]:
# Violin plots

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec

# Column of df_agg shown in this figure and its axis label. The file name below
# is derived from this column (not from main_cp_metric): with
# main_cp_metric == "avg_size" the old name would overwrite the avg-size figure
# with this coverage figure.
violin_metric = 'cov_nongerman_female'
violin_ylabel = 'Cov. Non-German Female'

# Label mapping
label_map = {
    'universe_training_size': 'Training Size',
    'universe_training_year': 'Training Year',
    'universe_scale': 'Scaling',
    'universe_model': 'Model',
    'universe_exclude_features': 'Excluded Features',
    'universe_exclude_subgroups': 'Excluded Subgroups'
}

# Custom x-tick renaming
category_renames = {
    '2010_14': '2010-14',
    '2012_14': '2012-14',
    'scale': 'scaled',
    'do-not-scale': 'not-scaled',
    'drop-non-german': 'drop',
    'penalized_logreg': 'pen. logreg',
    'rf': 'rand. forest'
}

# Custom order per parameter (parameters not listed are sorted alphabetically)
custom_orders = {
    'universe_training_size': ['1k', '5k', '25k'],
    'universe_model': ['logreg', 'penalized_logreg', 'elasticnet', 'gbm', 'rf'],
    'universe_exclude_features': ['none', 'age', 'nationality', 'sex', 'nationality-sex']
}

# Plot layout
row1 = [
    'universe_training_size',
    'universe_training_year',
    'universe_scale',
    'universe_exclude_subgroups'
]
row2 = ['universe_model', 'universe_exclude_features']

widths_row1 = [1.3, 1.3, 1.0, 1.0]
widths_row2 = [2.0, 2.0]

colors_row1 = ['#FFFF99', '#FDBF6F', '#FB9A99', '#CAB2D6']
colors_row2 = ['aquamarine', 'lightskyblue']

# Styling
sns.set(style="whitegrid", palette="pastel", font_scale=1.2)
fig = plt.figure(figsize=(24, 15))
outer_gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1], hspace=0.5)

# Font sizes
axis_label_size = 24
tick_label_size = 20
category_label_size = 24


def draw_violin_row(params, colors, width_ratios, subplot_spec, wspace):
    """Draw one row of violin plots of `violin_metric`, one panel per parameter.

    params       -- df_agg column names, one per panel (left to right)
    colors       -- fill colour of each panel's violins
    width_ratios -- relative panel widths
    subplot_spec -- cell of the outer grid that hosts this row
    wspace       -- horizontal spacing between the panels
    """
    gs_row = gridspec.GridSpecFromSubplotSpec(
        nrows=1, ncols=len(params), subplot_spec=subplot_spec,
        width_ratios=width_ratios, wspace=wspace
    )

    for idx, (param, color) in enumerate(zip(params, colors)):
        ax = fig.add_subplot(gs_row[0, idx])

        order = custom_orders.get(param, sorted(df_agg[param].dropna().unique()))
        sns.violinplot(
            data=df_agg, x=param, y=violin_metric, ax=ax,
            inner='box', cut=0, order=order, color=color
        )

        # Overlay median dots
        medians = df_agg.groupby(param)[violin_metric].median()
        for pos, cat in enumerate(order):
            if cat in medians:
                ax.plot(pos, medians[cat], color='lemonchiffon', marker='o', markersize=8, zorder=3)

        # Rename x-tick labels
        xticks = [label.get_text() for label in ax.get_xticklabels()]
        renamed = [category_renames.get(label, label) for label in xticks]
        ax.set_xticklabels(renamed, fontsize=category_label_size, rotation=30)

        ax.set_xlabel(label_map[param], fontsize=axis_label_size)
        if idx == 0:
            # Only the left-most panel of a row carries the y-axis label and ticks.
            ax.set_ylabel(violin_ylabel, fontsize=axis_label_size)
        else:
            ax.set_ylabel('')
            ax.set_yticklabels([])
            ax.tick_params(axis='y', left=False)
        ax.tick_params(axis='y', labelsize=tick_label_size)
        ax.set_title("")


# Row 1: four narrow panels; Row 2: two wide panels
draw_violin_row(row1, colors_row1, widths_row1, outer_gs[0], wspace=0.3)
draw_violin_row(row2, colors_row2, widths_row2, outer_gs[1], wspace=0.1)

output_path = CP_DIR / f"violin_plots_{violin_metric}.png"
plt.savefig(output_path, dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Recover the integer code -> category label mapping for one design decision.
# NOTE(review): this presumes the numeric codes on the fANOVA plot axes follow
# LabelEncoder's (sorted) ordering — confirm against MultiverseFanova.
col = "universe_scale"

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(df_agg[col].values)

# classes_ holds the sorted labels, so the position of a label is its code.
# Stored under its own name so that the display-name dict `label_map` used by
# the violin-plot cells is not overwritten.
code_to_label = dict(enumerate(le.classes_))
print(code_to_label)

In [None]:
from fanova import visualizer

vis = visualizer.Visualizer(m_fanova.fanova, m_fanova.configuration_space, directory = str(CP_DIR))

In [None]:
# Heatmap: fANOVA pairwise marginal of model x scaling
# NOTE(review): m_fanova is fitted on df_agg[main_cp_metric]; the colorbar label
# below is only correct when main_cp_metric == "cov_nongerman_female".

import matplotlib.collections as mcoll

vis.plot_pairwise_marginal(['universe_model', 'universe_scale'])

# ── remove the white cell borders ───────────────────────────────
ax = plt.gca()
# Only applies when the plot was drawn as a QuadMesh (pcolormesh); the default
# of None avoids a StopIteration when the visualizer drew an image instead.
quad = next(
    (obj for obj in ax.get_children() if isinstance(obj, mcoll.QuadMesh)),
    None,
)
if quad is not None:
    quad.set_linewidth(0)          # no grid lines
    quad.set_edgecolor('face')     # edges take the same colour as the face
# ────────────────────────────────────────────────────────────────

plt.xticks(
    ticks=[0,1,2,3,4],
    labels=['elasticnet', 'gbm', 'logreg', 'pen. logreg', 'rand. forest'],
    rotation=45,
    fontsize=16)

plt.yticks(
    ticks=[0,1],
    labels=['not-scaled', 'scaled'],
    fontsize=16)

# Last axes of the figure — presumably the colorbar added by the visualizer.
cbar = plt.gcf().axes[-1]
cbar.set_ylabel("Cov. Non-German Female", rotation=90, labelpad=15, fontsize=20)
cbar.tick_params(labelsize=16)

plt.xlabel("Model", fontsize=20)
plt.ylabel("Scaling", fontsize=20)

# Reversed blue-white-red colormap: low values (undercoverage) are red and high
# values (overcoverage) are blue. Plain "bwr" maps low values to blue, which is
# the opposite of what is intended here.
ax = plt.gca()
images = ax.get_images()
heat_artist = images[0] if images else quad  # image if present, else the mesh
if heat_artist is not None:
    heat_artist.set_cmap("bwr_r")

plt.title("")
plt.tight_layout()

output_path = CP_DIR / f"heatmap_{main_cp_metric}.png"  # or .jpg, .pdf, .svg
#plt.savefig(output_path, dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Heatmap: fANOVA pairwise marginal with scaling on the x-axis and model on the y-axis
# NOTE(review): m_fanova is fitted on df_agg[main_cp_metric]; the colorbar label
# below is only correct when main_cp_metric == "cov_nongerman_female".

vis.plot_pairwise_marginal(['universe_scale', 'universe_model'])

plt.yticks(
    ticks=[0,1,2,3,4],
    labels=['elasticnet', 'gbm', 'logreg', 'pen. logreg', 'rand. forest'],
    fontsize=18)

plt.xticks(
    ticks=[0,1],
    labels=['not-scaled', 'scaled'],
    rotation=45,
    fontsize=18)

# Last axes of the figure — presumably the colorbar added by the visualizer.
cbar = plt.gcf().axes[-1]
cbar.set_ylabel("Cov. NG Female", rotation=90, labelpad=15, fontsize=20)
cbar.tick_params(labelsize=20)

# The x ticks carry the scaling levels and the y ticks the models, so the axis
# titles must follow suit (they were swapped before).
plt.xlabel("Scaling", fontsize=20)
plt.ylabel("Model", fontsize=20)

# Reversed blue-white-red colormap: low values (undercoverage) are red and high
# values (overcoverage) are blue. Plain "bwr" maps low values to blue.
ax = plt.gca()
im = ax.get_images()[0]
im.set_cmap("bwr_r")


plt.title("")
plt.tight_layout()

output_path = CP_DIR / f"heatmap_{main_cp_metric}.png"
#plt.savefig(output_path, dpi=300, bbox_inches='tight')

plt.show()

## Quantify Importance with Partial Data

In [None]:
from tqdm import tqdm

# Output directory for the fANOVA runs on random subsets of the data.
# NOTE(review): ANALYSIS_OUTPUT_DIR is not defined in this notebook; presumably
# it comes from analysis_setup_cp.ipynb, which is executed via %run at the top —
# confirm. All other outputs of this notebook are written to CP_DIR instead.
PARTIAL_FANOVA_DIR = ANALYSIS_OUTPUT_DIR / "partial_fanova" / "overall"
PARTIAL_FANOVA_DIR.mkdir(parents=True, exist_ok=True)

# Number of random subsets drawn per fraction in the loops below.
N_ITERATIONS = 10

In [None]:
from fairness_multiverse.analysis import MultiverseFanova
import joblib

def quantify_importance_for_fraction(fraction: float, base_directory = PARTIAL_FANOVA_DIR, random_state = None):
    """Run a fANOVA on a random subset of df_agg and save the importances as CSV.

    Parameters
    ----------
    fraction : float
        Share of the rows of df_agg to sample (passed to DataFrame.sample as frac).
    base_directory : pathlib.Path
        Parent directory; results go into its "fraction-<fraction>" sub-directory.
    random_state : optional
        Seed forwarded to DataFrame.sample. The default None keeps the previous
        behaviour (a different, non-reproducible subset on every call); pass
        e.g. the iteration index to make a run repeatable.

    The output file name contains a hash of the sampled data, so different
    subsets are written to different files.
    """
    # Get random subset of the data
    df = df_agg.sample(frac = fraction, random_state = random_state).reset_index(drop = True)
    data_hash = joblib.hash(df)

    # Create directory for this fraction (parents=True so a fresh
    # base_directory that does not exist yet works as well)
    directory = base_directory / f"fraction-{fraction}"
    directory.mkdir(parents = True, exist_ok = True)

    # Run FANOVA on subset
    # NOTE(review): only three of the six design decisions in cols_design_dec are
    # used here, and the file name says "majmin" although the directory is
    # "overall" — confirm both are intended.
    partial_fanova = MultiverseFanova(features = df[["universe_model", "universe_exclude_features", "universe_exclude_subgroups"]], outcome = df[main_cp_metric])
    partial_fanova.quantify_importance(save_to = directory / f"partial-fanova_importance_interactions-majmin-{fraction}-{data_hash}.csv")

In [None]:
for i in tqdm(range(N_ITERATIONS)):
    quantify_importance_for_fraction(0.01)

In [None]:
for i in tqdm(range(N_ITERATIONS)):
    quantify_importance_for_fraction(0.05)

In [None]:
for i in tqdm(range(N_ITERATIONS)):
    quantify_importance_for_fraction(0.1)

In [None]:
for i in tqdm(range(N_ITERATIONS)):
    quantify_importance_for_fraction(0.2)

# Old Visualizations - To be deleted?

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# 1. Category order handed to seaborn
desired_order = ["25k", "5k", "1k"]

# 2. Prep data
# The training sizes in df_agg are already the strings '1k' / '5k' / '25k'
# (they are used as such by the violin-plot cells above), so no relabelling is
# needed. The former lookup through `label_map` depended on whatever an earlier
# cell happened to leave in that variable and has been removed.
param = "universe_training_size"
target = main_cp_metric
df_plot = df_agg[[param, target]].copy()
df_plot[param] = pd.Categorical(df_plot[param], categories=desired_order, ordered=True)

# 3. Plot
# Violins are area-normalised by default; the explicit scale="area" argument
# was dropped because newer seaborn releases deprecate `scale`.
plt.figure(figsize=(6, 4))
sns.violinplot(
    data=df_plot,
    y=param,
    x=target,
    order=desired_order,      # explicitly enforce order
    palette="muted",
    inner="point",
    linewidth=1
)
plt.xlabel("Average Prediction Set Size")
plt.ylabel("Training Size")
plt.tight_layout()
plt.show()

In [None]:
# Horizontal violins of the selected metric, one per model.
param = "universe_model"

# Keep only the model column and the metric, under plot-friendly names.
df_plot = df_agg[[param, main_cp_metric]].rename(
    columns={param: "Model", main_cp_metric: "Prediction"}
)

# Models in alphabetical order (replace by a hand-written list for a custom order).
order = sorted(df_plot["Model"].unique())

plt.figure(figsize=(8, 4))
sns.violinplot(
    data=df_plot,
    y="Model",
    x="Prediction",
    order=order,
    palette="Set2",
    inner="point",
    linewidth=1,
)

plt.xlabel("Average Prediction Set Size")
plt.ylabel("Model")
plt.tight_layout()
plt.show()