# Journal of Medical Imaging Experiment

Submission title:
> Effective User Interaction in Online Interactive Semantic Segmentation for Glioblastoma MRI<br>
> Jens Petersen, Martin Bendszus, Jürgen Debus, Sabine Heiland, Klaus H. Maier-Hein

Methods:
1. UNCERTAIN
2. MISCLASS
3. MISCLASS-B
4. UNCERTAIN-MB
5. CERTAIN-MB

For a description of the methods, please see the publication.

The author of this document is Jens Petersen.

## Imports

In [16]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import scipy.stats as stats
from sklearn.ensemble import RandomForestClassifier
import time
import random
import pandas as pd
import pickle
import xarray as xr
from util import dataio, interactive, metrics, parse

## Configuration

In [17]:
# Experiment settings shared by all simulated interaction methods.
config = {
    # label values to evaluate, listed in the same order as "label_names";
    # the tuple presumably stands for the union of its members (TODO confirm
    # against metrics.EvaluationSuite)
    "labels": [1, 2, 3, 4, (1, 2, 3, 4)],
    "label_names": [
        "Tumor Core",
        "Edema",
        "Non-enhancing Abnormality",
        "Enhancing Tumor",
        "Whole Tumor",
    ],
    "subjects": [
        "HG0001", "HG0002", "HG0003", "HG0004", "HG0005",
        "HG0006", "HG0007", "HG0008", "HG0009", "HG0010",
        "HG0011", "HG0012", "HG0013", "HG0014", "HG0015",
        "HG0022", "HG0024", "HG0025", "HG0026", "HG0027",
    ],
    # names of the simulation functions below, with "-" in place of "_"
    "methods": ["UNCERTAIN", "MISCLASS", "MISCLASS-B", "UNCERTAIN-MB", "CERTAIN-MB"],
    # interaction rounds per run and runs per subject/method
    "epochs": 50,
    "repetitions": 5,
    # fraction of the uncertainty range used as cutoff by the *-MB methods
    "uncertainty_threshold": 0.8,
    # passed to interactive.random_stroke
    "stroke_length": 10,
    "uncertainty_type": "entropy",  # entropy, inv_margin, inv_confidence
    "uncertainty_func": metrics.entropy,
    "start_indices_type": "weighted",  # weighted, random
    # number of samples the classifier is initialized with
    "n_start_indices": 50,
}

# Keyword arguments for sklearn's RandomForestClassifier.
forest_config = dict(n_estimators=50, max_depth=10, n_jobs=-1)

# File locations, to be filled in before running the experiment: entry i is
# loaded for config["subjects"][i].
data_locations = []
truth_locations = []

## User Interactions

In [18]:
def UNCERTAIN(data, truth, config, forest_config):
    """Simulate the UNCERTAIN interaction method for one subject.

    A random forest is fitted on a few initial samples. In every epoch a
    random stroke is requested with the current maximum-uncertainty region as
    mask (without mask if that raises IndexError), the ground truth under the
    stroke is added to the training set, the forest is refitted and the new
    segmentation is evaluated against ``truth``.

    Args:
        data: Feature array, indexed with the same indices as ``truth``.
        truth: Ground truth labels; its shape is used as the image shape.
        config: Experiment configuration dictionary.
        forest_config: Keyword arguments for ``RandomForestClassifier``.

    Returns:
        A list with one ``EvaluationSuite.to_pandas()`` result per epoch.

    Raises:
        ValueError: If ``config["start_indices_type"]`` is neither
            "weighted" nor "random".
    """
    image_shape = truth.shape
    n_features = data.shape[-1]  # not used below

    # pick the samples the classifier starts from
    seeding = config["start_indices_type"]
    if seeding == "weighted":
        seeds = interactive.weighted_start_indices(truth, config["n_start_indices"])
    elif seeding == "random":
        seeds = interactive.random_indices(image_shape, config["n_start_indices"])
    else:
        raise ValueError("Unknown option for start_indices_type.")

    samples, targets = [], []

    def annotate(indices):
        # one pass only, so one-shot iterables are consumed correctly
        for idx in indices:
            samples.append(data[idx])
            targets.append(truth[idx])

    def uncertain_region(proba):
        # region of maximum uncertainty for image-shaped class probabilities
        uncertainty = config["uncertainty_func"](proba)
        region_kwargs = parse.config_for_function(interactive.maximum_uncertainty_region, config)
        return interactive.maximum_uncertainty_region(uncertainty, **region_kwargs)

    # initial fit and first mask
    annotate(seeds)
    forest = RandomForestClassifier(**forest_config)
    forest.fit(samples, targets)
    flat_proba = forest.predict_proba(dataio.flat(data))
    mask = uncertain_region(dataio.inflate(flat_proba, image_shape))

    eval_suite = metrics.EvaluationSuite(reference=truth,
                                         labels=config["labels"],
                                         label_names=config["label_names"])
    scores = []

    for _ in range(config["epochs"]):

        # simulated user input; fall back to an unmasked stroke
        try:
            stroke = interactive.random_stroke(image_shape, config["stroke_length"],
                                               mask=mask, groundtruth=truth)
        except IndexError:
            stroke = interactive.random_stroke(image_shape, config["stroke_length"],
                                               groundtruth=truth)
        annotate(stroke)

        # retrain on everything annotated so far and re-segment
        forest.fit(samples, targets)
        flat_proba = forest.predict_proba(dataio.flat(data))
        segmentation = forest.classes_.take(np.argmax(flat_proba, axis=1), axis=0).reshape(image_shape)
        mask = uncertain_region(dataio.inflate(flat_proba, image_shape))

        eval_suite.set_test(segmentation)
        eval_suite.evaluate()
        scores.append(eval_suite.to_pandas())

    return scores

def MISCLASS(data, truth, config, forest_config):
    """Simulate the MISCLASS interaction method for one subject.

    A random forest is fitted on a few initial samples. In every epoch a
    random stroke is requested with the currently misclassified voxels as
    mask (without mask if that raises IndexError), the ground truth under the
    stroke is added to the training set, the forest is refitted and the new
    segmentation is evaluated against ``truth``.

    Args:
        data: Feature array, indexed with the same indices as ``truth``.
        truth: Ground truth labels; its shape is used as the image shape.
        config: Experiment configuration dictionary.
        forest_config: Keyword arguments for ``RandomForestClassifier``.

    Returns:
        A list with one ``EvaluationSuite.to_pandas()`` result per epoch.

    Raises:
        ValueError: If ``config["start_indices_type"]`` is neither
            "weighted" nor "random".
    """
    image_shape = truth.shape
    n_features = data.shape[-1]  # not used below

    # pick the samples the classifier starts from
    seeding = config["start_indices_type"]
    if seeding == "weighted":
        seeds = interactive.weighted_start_indices(truth, config["n_start_indices"])
    elif seeding == "random":
        seeds = interactive.random_indices(image_shape, config["n_start_indices"])
    else:
        raise ValueError("Unknown option for start_indices_type.")

    samples, targets = [], []

    def annotate(indices):
        # one pass only, so one-shot iterables are consumed correctly
        for idx in indices:
            samples.append(data[idx])
            targets.append(truth[idx])

    annotate(seeds)
    forest = RandomForestClassifier(**forest_config)

    def segment():
        # hard labels for every voxel, in image shape
        return forest.predict(dataio.flat(data)).reshape(image_shape)

    # initial fit; the first mask marks the misclassified voxels
    forest.fit(samples, targets)
    segmentation = segment()
    mask = segmentation != truth

    eval_suite = metrics.EvaluationSuite(reference=truth,
                                         labels=config["labels"],
                                         label_names=config["label_names"])
    scores = []

    for _ in range(config["epochs"]):

        # simulated user input; fall back to an unmasked stroke
        try:
            stroke = interactive.random_stroke(image_shape, config["stroke_length"],
                                               mask=mask, groundtruth=truth)
        except IndexError:
            stroke = interactive.random_stroke(image_shape, config["stroke_length"],
                                               groundtruth=truth)
        annotate(stroke)

        # retrain on everything annotated so far and re-segment
        forest.fit(samples, targets)
        segmentation = segment()
        mask = segmentation != truth

        eval_suite.set_test(segmentation)
        eval_suite.evaluate()
        scores.append(eval_suite.to_pandas())

    return scores

def MISCLASS_B(data, truth, config, forest_config):
    """Simulate the MISCLASS-B interaction method for one subject.

    Works like MISCLASS, but in every epoch the mask of misclassified voxels
    is first restricted to one randomly chosen label: only voxels that were
    predicted as, or truly are, that label remain. The stroke is requested
    with this mask (without mask if that raises IndexError).

    Args:
        data: Feature array, indexed with the same indices as ``truth``.
        truth: Ground truth labels; its shape is used as the image shape.
        config: Experiment configuration dictionary.
        forest_config: Keyword arguments for ``RandomForestClassifier``.

    Returns:
        A list with one ``EvaluationSuite.to_pandas()`` result per epoch.

    Raises:
        ValueError: If ``config["start_indices_type"]`` is neither
            "weighted" nor "random".
    """
    image_shape = truth.shape
    n_features = data.shape[-1]  # not used below

    # pick the samples the classifier starts from
    seeding = config["start_indices_type"]
    if seeding == "weighted":
        seeds = interactive.weighted_start_indices(truth, config["n_start_indices"])
    elif seeding == "random":
        seeds = interactive.random_indices(image_shape, config["n_start_indices"])
    else:
        raise ValueError("Unknown option for start_indices_type.")

    samples, targets = [], []

    def annotate(indices):
        # one pass only, so one-shot iterables are consumed correctly
        for idx in indices:
            samples.append(data[idx])
            targets.append(truth[idx])

    annotate(seeds)
    forest = RandomForestClassifier(**forest_config)

    def segment():
        # hard labels for every voxel, in image shape
        return forest.predict(dataio.flat(data)).reshape(image_shape)

    # initial fit; the first mask marks the misclassified voxels
    forest.fit(samples, targets)
    segmentation = segment()
    mask = segmentation != truth

    eval_suite = metrics.EvaluationSuite(reference=truth,
                                         labels=config["labels"],
                                         label_names=config["label_names"])
    scores = []

    for _ in range(config["epochs"]):

        # keep only errors involving one random label; the comparisons give
        # boolean arrays, on which "+" acts as OR and "*=" as AND
        chosen_label = random.choice(np.unique(truth))
        involves_label = (segmentation == chosen_label) + (truth == chosen_label)
        mask *= involves_label

        # simulated user input; fall back to an unmasked stroke
        try:
            stroke = interactive.random_stroke(image_shape, config["stroke_length"],
                                               mask=mask, groundtruth=truth)
        except IndexError:
            stroke = interactive.random_stroke(image_shape, config["stroke_length"],
                                               groundtruth=truth)
        annotate(stroke)

        # retrain on everything annotated so far and re-segment
        forest.fit(samples, targets)
        segmentation = segment()
        mask = segmentation != truth

        eval_suite.set_test(segmentation)
        eval_suite.evaluate()
        scores.append(eval_suite.to_pandas())

    return scores

def UNCERTAIN_MB(data, truth, config, forest_config):
    """Simulate the UNCERTAIN-MB interaction method for one subject.

    In every epoch the mask of misclassified voxels is restricted to one
    randomly chosen label (voxels predicted as, or truly being, that label)
    and then to the voxels whose uncertainty lies above
    ``uncertainty_threshold`` of the uncertainty range inside the mask. If
    nothing is left, the whole image is used as mask. A random stroke is
    requested with this mask (without mask if that raises IndexError), the
    ground truth under the stroke is added to the training set, the forest is
    refitted and the new segmentation is evaluated against ``truth``.

    Args:
        data: Feature array, indexed with the same indices as ``truth``.
        truth: Ground truth labels; its shape is used as the image shape.
        config: Experiment configuration dictionary.
        forest_config: Keyword arguments for ``RandomForestClassifier``.

    Returns:
        A list with one ``EvaluationSuite.to_pandas()`` result per epoch.

    Raises:
        ValueError: If ``config["start_indices_type"]`` is neither
            "weighted" nor "random".
    """
    base_shape = truth.shape

    # points for initialization
    if config["start_indices_type"] == "weighted":
        start_indices = interactive.weighted_start_indices(truth, config["n_start_indices"])
    elif config["start_indices_type"] == "random":
        start_indices = interactive.random_indices(base_shape, config["n_start_indices"])
    else:
        raise ValueError("Unknown option for start_indices_type.")

    # construct training data
    training_data = []
    training_labels = []
    for index in start_indices:
        training_data.append(data[index])
        training_labels.append(truth[index])

    # initialize forest
    rf = RandomForestClassifier(**forest_config)
    rf.fit(training_data, training_labels)
    probabilities = rf.predict_proba(dataio.flat(data))
    segmentation = rf.classes_.take(np.argmax(probabilities, axis=1), axis=0).reshape(base_shape)
    probabilities = dataio.inflate(probabilities, base_shape)
    uncertainty = config["uncertainty_func"](probabilities)
    mask = segmentation != truth

    # evaluation
    eval_suite = metrics.EvaluationSuite(reference=truth,
                                         labels=config["labels"],
                                         label_names=config["label_names"])
    scores = []

    # do experiment
    for _ in range(config["epochs"]):

        # keep only errors involving one random label; the comparisons give
        # boolean arrays, on which "+" acts as OR and "*=" as AND
        random_label = random.choice(np.unique(truth))
        mask *= (segmentation == random_label) + (truth == random_label)

        # keep the most uncertain part of the mask; skipped for an empty mask
        # because np.min/np.max raise ValueError on empty input
        if np.any(mask):
            min_uncertainty = np.min(uncertainty[mask])
            max_uncertainty = np.max(uncertainty[mask])
            cutoff = config["uncertainty_threshold"] * (max_uncertainty - min_uncertainty) + min_uncertainty
            mask *= uncertainty > cutoff

        # nothing left (no matching errors, or constant uncertainty):
        # allow strokes anywhere
        if not np.any(mask):
            mask = np.ones(base_shape, dtype=bool)

        # simulated user input; fall back to an unmasked stroke
        try:
            current_indices = interactive.random_stroke(base_shape, config["stroke_length"],
                                                        mask=mask, groundtruth=truth)
        except IndexError:
            current_indices = interactive.random_stroke(base_shape, config["stroke_length"],
                                                        groundtruth=truth)
        for index in current_indices:
            training_data.append(data[index])
            training_labels.append(truth[index])

        # retrain on everything annotated so far and re-segment
        rf.fit(training_data, training_labels)
        probabilities = rf.predict_proba(dataio.flat(data))
        segmentation = rf.classes_.take(np.argmax(probabilities, axis=1), axis=0).reshape(base_shape)
        probabilities = dataio.inflate(probabilities, base_shape)
        uncertainty = config["uncertainty_func"](probabilities)
        mask = segmentation != truth

        eval_suite.set_test(segmentation)
        eval_suite.evaluate()
        scores.append(eval_suite.to_pandas())

    return scores

def CERTAIN_MB(data, truth, config, forest_config):
    """Simulate the CERTAIN-MB interaction method for one subject.

    In every epoch the mask of misclassified voxels is restricted to one
    randomly chosen label (voxels predicted as, or truly being, that label);
    if nothing is left, the whole image is used instead. The mask is then
    reduced to the voxels whose uncertainty lies below
    ``1 - uncertainty_threshold`` of the uncertainty range inside the mask.
    A random stroke is requested with this mask (without mask if that raises
    IndexError), the ground truth under the stroke is added to the training
    set, the forest is refitted and the new segmentation is evaluated against
    ``truth``.

    Args:
        data: Feature array, indexed with the same indices as ``truth``.
        truth: Ground truth labels; its shape is used as the image shape.
        config: Experiment configuration dictionary.
        forest_config: Keyword arguments for ``RandomForestClassifier``.

    Returns:
        A list with one ``EvaluationSuite.to_pandas()`` result per epoch.

    Raises:
        ValueError: If ``config["start_indices_type"]`` is neither
            "weighted" nor "random".
    """
    base_shape = truth.shape

    # points for initialization
    if config["start_indices_type"] == "weighted":
        start_indices = interactive.weighted_start_indices(truth, config["n_start_indices"])
    elif config["start_indices_type"] == "random":
        start_indices = interactive.random_indices(base_shape, config["n_start_indices"])
    else:
        raise ValueError("Unknown option for start_indices_type.")

    # construct training data
    training_data = []
    training_labels = []
    for index in start_indices:
        training_data.append(data[index])
        training_labels.append(truth[index])

    # initialize forest
    rf = RandomForestClassifier(**forest_config)
    rf.fit(training_data, training_labels)
    probabilities = rf.predict_proba(dataio.flat(data))
    segmentation = rf.classes_.take(np.argmax(probabilities, axis=1), axis=0).reshape(base_shape)
    probabilities = dataio.inflate(probabilities, base_shape)
    uncertainty = config["uncertainty_func"](probabilities)
    mask = segmentation != truth

    # evaluation
    eval_suite = metrics.EvaluationSuite(reference=truth,
                                         labels=config["labels"],
                                         label_names=config["label_names"])
    scores = []

    # do experiment
    for _ in range(config["epochs"]):

        # keep only errors involving one random label; the comparisons give
        # boolean arrays, on which "+" acts as OR and "*=" as AND
        random_label = random.choice(np.unique(truth))
        mask *= (segmentation == random_label) + (truth == random_label)

        # the empty-mask fallback has to come before np.min/np.max, which
        # raise ValueError on empty input (builtin bool: the np.bool alias
        # was removed in NumPy 1.24)
        if not np.any(mask):
            mask = np.ones(base_shape, dtype=bool)

        # keep the most certain part of the mask
        min_uncertainty = np.min(uncertainty[mask])
        max_uncertainty = np.max(uncertainty[mask])
        cutoff = (1 - config["uncertainty_threshold"]) * (max_uncertainty - min_uncertainty) + min_uncertainty
        mask *= uncertainty < cutoff

        # simulated user input; fall back to an unmasked stroke
        try:
            current_indices = interactive.random_stroke(base_shape, config["stroke_length"],
                                                        mask=mask, groundtruth=truth)
        except IndexError:
            current_indices = interactive.random_stroke(base_shape, config["stroke_length"],
                                                        groundtruth=truth)
        for index in current_indices:
            training_data.append(data[index])
            training_labels.append(truth[index])

        # retrain on everything annotated so far and re-segment
        rf.fit(training_data, training_labels)
        probabilities = rf.predict_proba(dataio.flat(data))
        segmentation = rf.classes_.take(np.argmax(probabilities, axis=1), axis=0).reshape(base_shape)
        probabilities = dataio.inflate(probabilities, base_shape)
        uncertainty = config["uncertainty_func"](probabilities)
        mask = segmentation != truth

        eval_suite.set_test(segmentation)
        eval_suite.evaluate()
        scores.append(eval_suite.to_pandas())

    return scores

## Run Experiment

In [None]:
# Result container: one value per method, subject, repetition, epoch, label
# and metric. Coordinates are materialized as lists so that this works the
# same on Python 2 and 3.
scores = xr.DataArray(np.zeros((len(config["methods"]),
                                len(config["subjects"]),
                                config["repetitions"],
                                config["epochs"],
                                len(config["label_names"]),
                                len(metrics.EvaluationSuite._metrics))),
                      [("Methods", config["methods"]),
                       ("Subjects", config["subjects"]),
                       ("Repetitions", list(range(config["repetitions"]))),
                       ("Epochs", list(range(config["epochs"]))),
                       ("Labels", config["label_names"]),
                       ("Metrics", metrics.EvaluationSuite._metrics)],
                      name="Scores")

# fail early with a clear message instead of an IndexError inside the loop
if min(len(data_locations), len(truth_locations)) < len(config["subjects"]):
    raise ValueError("data_locations and truth_locations need one entry per subject.")

for s, subject in enumerate(config["subjects"]):

    current_data = dataio.copy_from_file(data_locations[s], dtype=np.float32)
    # builtin int: the np.int alias was removed in NumPy 1.24
    current_truth = dataio.copy_from_file(truth_locations[s], dtype=int)

    for r in range(config["repetitions"]):
        for method in config["methods"]:

            # progress output; sys.stdout.write behaves the same on Python 2
            # and 3, unlike the print statement
            sys.stdout.write("{} {} {} ... ".format(subject, r, method))
            sys.stdout.flush()

            # look the simulation function up by name, e.g. "MISCLASS-B" ->
            # MISCLASS_B, without evaluating the string as code
            run_method = globals()[method.replace("-", "_")]
            current_scores = run_method(current_data, current_truth, config, forest_config)
            for epoch, score in enumerate(current_scores):
                scores.loc[method, subject, r, epoch, :, :] = score.values

            sys.stdout.write("done\n")

In [30]:
# per-subject scores, averaged over the repetitions
scores_rep_mean = scores.mean(dim="Repetitions")
# overall mean over subjects and repetitions
scores_mean = scores.mean(dim=["Subjects", "Repetitions"])
# spread of the per-subject means across subjects
scores_std = scores_rep_mean.std(dim="Subjects")

## Significance Test

In [33]:
# Pairwise comparison of all methods, separately for every epoch, label and
# metric, using the per-subject repetition means as samples.
test_func = stats.wilcoxon

method_names = scores_rep_mean["Methods"].values
epoch_values = scores_rep_mean["Epochs"].values
label_values = scores_rep_mean["Labels"].values
metric_values = scores_rep_mean["Metrics"].values

# one row per unordered pair of methods: 0 + 1 + ... + (n - 1)
n_comparisons = sum(range(len(method_names)))
test_scores = np.zeros((n_comparisons,
                        len(epoch_values),
                        len(label_values),
                        len(metric_values),
                        3))

comparison_labels = []
for first, method1 in enumerate(method_names[:-1]):
    for method2 in method_names[first + 1:]:

        # row of this pair in test_scores
        row = len(comparison_labels)
        comparison_labels.append("{} v {}".format(method1, method2))

        for e, epoch in enumerate(epoch_values):
            for l, label in enumerate(label_values):
                for m, metric in enumerate(metric_values):

                    # scores of all subjects for both methods
                    sample1 = scores_rep_mean.loc[method1, :, epoch, label, metric]
                    sample2 = scores_rep_mean.loc[method2, :, epoch, label, metric]

                    test_stat, p = test_func(sample1, sample2)
                    median_delta = np.median(sample1) - np.median(sample2)
                    test_scores[row, e, l, m] = [test_stat, p, median_delta]

# NOTE(review): the array is named "T-test" although test_func is the
# Wilcoxon test - confirm nothing depends on the name before renaming.
test_scores = xr.DataArray(test_scores,
                           [("Comparisons", comparison_labels),
                            ("Epochs", epoch_values),
                            ("Labels", label_values),
                            ("Metrics", metric_values),
                            ("Statistics", ["Statistic", "p", "Delta of Medians"])],
                           name="T-test")

## Plot

In [41]:
# Settings shared by both figures.

# one viridis color per method, sampled at i / n_methods
n_methods = len(config["methods"])
colors = [mpl.cm.viridis(float(i) / n_methods) for i in range(n_methods)]

# line style per method, in the order of config["methods"]
styles = ["--", "-", "-", "--", "--"]
linewidth = 1.5

# metric to display and its axis label
show_metric = "dice"
y_label = "Dice Score"

# legend text per method, in the order of config["methods"]
method_explanations = [
    "Annotate in most uncertain region",
    "Randomly correct classifier",
    "Correct classifier in random class",
    "Correct classifier in random class\nwhere most uncertain",
    "Correct classifier in random class\nwhere most certain",
]

Dice score over time

In [42]:
# Figure: mean score over the epochs for every method, one panel per label.
# The first label gets the large top panel, the others a 2x2 grid below.

# configure
n_cols = 2  # not used below
n_rows = 4  # not used below
show_std_for_method = "MISCLASS-B"
std_color = colors[list(scores_mean["Methods"].values).index(show_std_for_method)]
std_alpha = 0.2
# (x, y, horizontal alignment) of the label name in each panel
text_positions = [
    (2, 0.9, "left"),
    (4, 0.8, "left"),
    (4, 0.8, "left"),
    (4, 0.8, "left"),
    (45, 0.1, "right"),
]

# initialize figure and gridspec
fig = plt.figure(figsize=(8.27, 10))
gs0 = mpl.gridspec.GridSpec(2, 1)
gs1 = mpl.gridspec.GridSpecFromSubplotSpec(2, 2, subplot_spec=gs0[1], wspace=0, hspace=0)
gs0.update(left=0.1, right=0.95, top=0.98, bottom=0.05)

# invisible plot to carry axis label; booleans instead of "off", because
# current matplotlib treats the string as true and would draw the ticks
bg_ax = fig.add_subplot(gs0[1])
for s in bg_ax.spines:
    bg_ax.spines[s].set_color("none")
bg_ax.tick_params(labelcolor="w", top=False, bottom=False, left=False, right=False,
                  labelbottom=False, labelleft=False)
bg_ax.set_ylabel(y_label, labelpad=30)
bg_ax.patch.set_alpha(0)

# initialize single plots
axes = []
axes.append(fig.add_subplot(gs0[0]))
axes.append(fig.add_subplot(gs1[0, 0]))
axes.append(fig.add_subplot(gs1[0, 1], sharey=axes[1]))
axes.append(fig.add_subplot(gs1[1, 0]))
axes.append(fig.add_subplot(gs1[1, 1]))
# the panels of the grid touch, so inner tick labels are hidden
plt.setp(axes[1].get_xticklabels(), visible=False)
plt.setp(axes[2].get_xticklabels(), visible=False)
plt.setp(axes[2].get_yticklabels(), visible=False)
plt.setp(axes[4].get_yticklabels(), visible=False)

# plot
epoch_axis = scores_mean["Epochs"].values
for l, label in enumerate(scores_mean["Labels"].values):

    # one line per method
    for m, method in enumerate(scores_mean["Methods"].values):
        axes[l].plot(epoch_axis,
                     scores_mean.loc[method, :, label, show_metric],
                     color=colors[m],
                     linestyle=styles[m],
                     linewidth=linewidth)

    axes[l].set_xlim(epoch_axis[0], epoch_axis[-1])
    if l == 0:
        axes[l].set_ylim(0, 1)
    else:
        axes[l].set_ylim(0.01, 1)

    # band of +/- one standard deviation across subjects for one method
    band_center = scores_mean.loc[show_std_for_method, :, label, show_metric]
    band_width = scores_std.loc[show_std_for_method, :, label, show_metric]
    axes[l].fill_between(epoch_axis,
                         band_center - band_width,
                         band_center + band_width,
                         linewidth=0,
                         alpha=std_alpha,
                         facecolor=std_color)

    axes[l].text(text_positions[l][0], text_positions[l][1], r"{}".format(label),
                 ha=text_positions[l][2], multialignment=text_positions[l][2])

axes[0].set_xlabel("Epoch", labelpad=10)
axes[0].set_ylabel(y_label)

# legend in first subplot
lines = []
for m, method in enumerate(scores_mean["Methods"].values):
    lines.append(mpl.lines.Line2D([], [], color=colors[m], linestyle=styles[m], linewidth=linewidth))
# a real list: map() would be a lazy iterator on Python 3
labels = [name + " - " + explanation
          for name, explanation in zip(scores_mean["Methods"].values, method_explanations)]
axes[0].legend(handles=lines, labels=labels, loc=4, frameon=False, fontsize=12)

plt.show()

Bar plots

In [43]:
# Figure: mean score of every method at one epoch as bars, one panel per
# label, with brackets above the bars for significantly different pairs.

# configure
epoch = 19
bar_width = 0.9
significance_threshold = 0.001
significance_dash_length = 0.05
significance_spacing = 0.1
significance_text_spacing = 0.05
with_error = True
color_low_diff = "k"
color_high_diff = "k"
lower_lim = 0

# initialize figure and gridspec
fig = plt.figure(figsize=(8.27, 10))
gs0 = mpl.gridspec.GridSpec(2, 1, hspace=0.3)
gs1 = mpl.gridspec.GridSpecFromSubplotSpec(2, 2, subplot_spec=gs0[1], wspace=0, hspace=0)
gs0.update(left=0.1, right=0.95, top=0.98)

# invisible plot that carries labels; booleans instead of "off", because
# current matplotlib treats the string as true and would draw the ticks
bg_ax = fig.add_subplot(gs0[1])
for s in bg_ax.spines:
    bg_ax.spines[s].set_color("none")
bg_ax.tick_params(labelcolor="w", top=False, bottom=False, left=False, right=False,
                  labelbottom=False, labelleft=False)
bg_ax.set_ylabel(y_label, labelpad=35)
bg_ax.patch.set_alpha(0)

# create empty plots
axes = []
axes.append(fig.add_subplot(gs0[0]))
axes.append(fig.add_subplot(gs1[0, 0]))
axes.append(fig.add_subplot(gs1[0, 1], sharey=axes[1]))
axes.append(fig.add_subplot(gs1[1, 0]))
axes.append(fig.add_subplot(gs1[1, 1]))

# hide axes for subplots
plt.setp(axes[1].get_xticklabels(), visible=False)
plt.setp(axes[2].get_xticklabels(), visible=False)
plt.setp(axes[2].get_yticklabels(), visible=False)
plt.setp(axes[4].get_yticklabels(), visible=False)

axes[0].set_ylabel(y_label)

n_methods = len(scores_mean["Methods"])

# plot bars
for l, label in enumerate(scores_mean["Labels"].values):

    if l == 0:
        current_lw = linewidth
    else:
        current_lw = linewidth / 2.

    # get standard error of the mean
    current_error = np.copy(scores_std.loc[:, epoch, label, show_metric].values)
    current_error /= np.sqrt(len(scores["Subjects"]))

    # plot bars; align="edge" makes bar m span [m, m + bar_width], which the
    # tick and bracket positions (m + 0.5 * bar_width) below rely on -
    # matplotlib >= 2.0 would center the bars on m by default
    current_bars = axes[l].bar(range(n_methods),
                               scores_mean.loc[:, epoch, label, show_metric],
                               width=bar_width,
                               align="edge",
                               yerr=current_error,
                               error_kw=dict(lw=current_lw, ecolor="k", capthick=current_lw, capsize=3*current_lw))

    # set bar colors
    for current_bar, color in zip(current_bars, colors):
        current_bar.set_color(color)

    axes[l].set_xlim(axes[l].get_xlim()[0] - 0.5*bar_width, axes[l].get_xlim()[1] + 0.5*bar_width)
    axes[l].set_xticks(np.arange(n_methods) + 0.5*bar_width)
    axes[l].set_xticklabels(scores_mean["Methods"].values, rotation=35, ha="right")
    axes[l].set_yticks(np.linspace(0, 1, 5))

# highlight significant differences
max_height_all = 0
for l, label in enumerate(scores_mean["Labels"].values):

    if l == 0:
        current_lw = linewidth
    else:
        current_lw = linewidth / 2.

    # first bracket goes above the highest bar
    current_height = float(np.max(scores_mean.loc[:, epoch, label, show_metric])) + significance_spacing
    indices_sorted = np.argsort(scores_mean.loc[:, epoch, label, show_metric].values)
    if with_error:
        # NOTE(review): this adds the standard deviation, while the error
        # bars above show the standard error - confirm which one is intended
        current_height += scores_std.loc[:, epoch, label, show_metric].values[indices_sorted[-1]]

    # start at the bar top instead of 0, so the y limits always cover the
    # bars even when a label has no significant pair
    max_height = current_height

    # compare all pairs, from the lowest to the highest mean score
    for i1, index1 in enumerate(indices_sorted[:-1]):
        for index2 in indices_sorted[i1+1:]:

            method1 = scores_mean["Methods"].values[index1]
            method2 = scores_mean["Methods"].values[index2]

            # every pair was tested once, in either order
            comp1 = "{} v {}".format(method1, method2)
            comp2 = "{} v {}".format(method2, method1)

            try:
                current_p = test_scores.loc[comp1, epoch, label, show_metric, "p"].values
            except KeyError:
                current_p = test_scores.loc[comp2, epoch, label, show_metric, "p"].values

            if current_p < significance_threshold:

                dash_position = index1 + 0.5*bar_width

                if index1 < index2:
                    current_color = color_low_diff
                else:
                    current_color = color_high_diff

                # horizontal line between the two bars ...
                axes[l].plot([index1 + 0.5*bar_width, index2 + 0.5*bar_width],
                             [current_height, current_height],
                             color=current_color,
                             linewidth=current_lw,
                             solid_capstyle="round")
                # ... with a short dash down at the lower-scoring method
                axes[l].plot([dash_position, dash_position],
                             [current_height, current_height - significance_dash_length],
                             color=current_color,
                             linewidth=current_lw,
                             solid_capstyle="round")

                # the next bracket is stacked above this one
                current_height += significance_spacing
                max_height = max(max_height, current_height)

    max_height_all = max(max_height_all, max_height)

# adjust y limits
for l, label in enumerate(scores_mean["Labels"].values):
    axes[l].set_ylim(lower_lim, max_height_all)

# display label names
text_positions = [
    (5, max_height_all - 0.15, "right"),
    (0, max_height_all - 0.25, "left"),
    (5, max_height_all - 0.25, "right"),
    (0, max_height_all - 0.25, "left"),
    (0, max_height_all - 0.25, "left"),
]
for l, label in enumerate(scores_mean["Labels"].values):
    axes[l].text(text_positions[l][0], text_positions[l][1], label, ha=text_positions[l][2],
                 multialignment=text_positions[l][2])

plt.show()