# The Enigma of Task Decomposition in Neural Networks
Framing:
  - Teaching neural networks task decomposition remains largely unsolved (the subgoal model predicts subgoals with small errors that the synthesizer model disregards, yet the synthesizer model also knows how to decompose tasks)
  - Sampling is an alternative way to decompose tasks 

In [1]:
import pandas as pd
import json
import numpy as np
import re

import matplotlib.pyplot as plt 
import seaborn as sns
import matplotlib.patches as mpatches
from IPython.core.pylabtools import figsize
from matplotlib_inline.backend_inline import set_matplotlib_formats
import tikzplotlib

%matplotlib inline
set_matplotlib_formats('retina')  # high-resolution figure rendering inside the notebook

# Seaborn plotting context: 'paper', 'talk', or 'notebook'
context = 'paper'

# SVG keeps figures sharp when they are scaled up or down
save_settings = {'format': 'svg'}

# Base palette: only this choice differs between the talk and the other contexts
pal = sns.color_palette('deep' if context == 'talk' else 'colorblind')

# Colour per approach / legend label; identical for every context.
# Key order matters because the theme palette is built from the values in order.
PALETTE = {
    'TIIPS': '#115e17',
    'ExeDec': '#a2af9f',
    'Ground truth': '#6e7a6c',
    'No-Subgoal Ablation': '#0094c2',
    'Baseline': '#00618C',
    'Both': '#6e7a6c',
    'MLITS Only': '#115e17',
    'ExeDec Only': '#a2af9f',
    'No-Subgoal Ablation Only': '#0094c2',
    'Both - MLITS': '#115e17',
    'Both - ExeDec': '#a2af9f',
    'Both - No-Subgoal Ablation': '#0094c2',
}

sns.set_theme(
    context=context,
    style='whitegrid',
    palette=sns.color_palette(PALETTE.values()),
    font_scale=2,
)

In [2]:
# Random seeds of the evaluated runs
SEEDS = [10, 20, 30, 40, 50]
SYNTH_MODEL = "synthesizer_model"

# Experiment identifier -> human-readable label
EXP_LBL_MAP = {
    "NONE": "Test on training distribution",
    "LENGTH_GENERALIZATION": "Length generalization",
    "COMPOSE_DIFFERENT_CONCEPTS": "Compose different concepts",
    "SWITCH_CONCEPT_ORDER": "Switch concept order",
    "COMPOSE_NEW_OP": "Compose new operation",
    "ADD_OP_FUNCTIONALITY": "Add operation functionality",
}
# Experiment identifiers in the order they are listed above
EXPERIMENTS = list(EXP_LBL_MAP)

# Prediction type as used in result paths -> approach name used in plots
PT_MAPPING = {
    "separate": "ExeDec",
    "baseline": "Baseline",
    "tiips": "TIIPS",
}

# Dataset identifier -> domain name used in figure titles
DATASET = {
    "robustfill": "String manipulation",
    "deepcoder": "List manipulation",
}

# Legend patches for manually assembled legends. Each colour matches the
# PALETTE entry of the same label so the three approaches stay distinguishable
# (the Baseline patch previously reused the TIIPS green).
HANDLES = [
    mpatches.Patch(color="#115e17", label="TIIPS"),
    mpatches.Patch(color="#00618C", label="Baseline"),
    mpatches.Patch(color="#a2af9f", label="ExeDec"),
]

### Method Definition

In [12]:
def get_length(solution):
    """Return the number of non-input steps in a ' | '-separated program string.

    The program is split at ' | '; every segment that contains the substring
    'INPUT' is excluded from the count.
    """
    segments = solution.split(' | ')
    return len(segments) - sum('INPUT' in segment for segment in segments)

def load_decomposition_data(ds, key, task_index=True, threshold=1000):
    """Load per-task decomposition statistics from the JSON result files into a DataFrame.

    For every prediction type ("tiips", "baseline", "separate"), every
    experiment in EXPERIMENTS and every seed in SEEDS, the matching
    ``results-<prediction_type>.json`` file below ``./tiips_results/evaluation``
    is read. Missing files and files whose number of entries differs from
    ``threshold`` are skipped.

    Args:
        ds: Dataset identifier inserted into the result path (e.g. 'deepcoder').
        key: Task field holding the number of successful subgoals. It is read
            for every task, but only enters the result for the 'separate'
            prediction type.
        task_index: If true, the 'Task' column holds ``task['test_example_index']``;
            otherwise it holds the whole task record.
        threshold: Required number of entries in a result file.

    Returns:
        pandas.DataFrame with the columns 'Approach', 'Success',
        'Correct Subgoals' and 'Correct Subprograms' (both scaled by 100),
        'Experiment', 'Seed', 'Task' and 'Subgoal First Error'.
    """
    path = f"./tiips_results/evaluation/{ds}_e2e_predict_1/end_to_end_predict-{ds}-run-e2e_predict_1-"
    # approach -> seed -> experiment -> list of per-task records.
    # NOTE(review): `[] * 5` evaluates to an empty list; the `* 5` has no effect.
    results = {"TIIPS": {s: {k: [] * 5 for k in EXPERIMENTS} for s in SEEDS}, "ExeDec": {s: {k: [] * 5 for k in EXPERIMENTS} for s in SEEDS}, "Baseline": {s: {k: [] * 5 for k in EXPERIMENTS} for s in SEEDS}}
    
    for prediction_type in ["tiips", "baseline", 'separate']:
        # NOTE(review): `pt` (here and below) and `j` are assigned but never read in this function.
        pt = prediction_type
        for experiment in EXPERIMENTS:
            
            for j, seed in enumerate(SEEDS):
                pt = PT_MAPPING[prediction_type]

                try:
                    with open(path + prediction_type  + f'-/tb/hparams-dataset_type={ds},prediction_type={prediction_type},experiment={experiment},beam_size=10,seed={seed}/results-{prediction_type}.json') as f:
                        data = json.load(f)
                except FileNotFoundError:
                    # No result file for this configuration: nothing to record
                    continue    

                # Only files with exactly `threshold` entries are used
                if len(data) != threshold:
                    continue

                # A KeyError on any task stops processing of this file; records
                # appended before the failing task are kept.
                try:
                    for task in data:
                        r = task[key]
                        # Solved tasks whose solution has at most one step are marked NaN
                        if task['success'] and get_length(task['solution']) <= 1:
                            r = np.nan
                        if prediction_type == 'separate':
                            # NOTE(review): NaN is truthy, so an `r` set to NaN above can
                            # still be overwritten by the clamping below — confirm intent.
                            if r:
                                # Cap the count at the program length: the solution length for
                                # solved tasks, the ground-truth length for unsolved ones.
                                if task[key] > get_length(task['solution']) and task['success']:
                                    r = get_length(task['solution'])
                                elif not task['success'] and task[key] > get_length(task['ground_truth']):
                                    r = get_length(task['ground_truth'])
                                
                            # subgoal_success: r relative to the solution length (solved) or ground-truth length (unsolved).
                            # subprogram_success_rate: share of position-wise identical segments, relative to the
                            # number of solution segments (zip stops at the shorter program).
                            # NOTE(review): a program length of 0 makes the division raise ZeroDivisionError, which is not caught here.
                            results[PT_MAPPING[prediction_type]][seed][experiment].append({"success": task['success'], "subgoal_success": r / get_length(task['solution']) if task['success'] else r / get_length(task['ground_truth']), 'subprogram_success_rate': sum([1 if a == b else 0 for a, b in zip(task['solution'].split(' | '), task['ground_truth'].split(' | '))]) / len(task['solution'].split(' | ')), 'Task': task['test_example_index'] if task_index else task, 'Subgoal Error': task['subgoal_first_error']})
                        else:
                            # Other prediction types ignore `r` and read 'synth_subgoal_success' instead;
                            # 'Subgoal Error' is fixed to -1 for them.
                            results[PT_MAPPING[prediction_type]][seed][experiment].append({"success": task['success'], "subgoal_success": task['synth_subgoal_success'] / get_length(task['solution']) if task['success'] else task['synth_subgoal_success'] / get_length(task['ground_truth']), 'subprogram_success_rate': sum([1 if a == b else 0 for a, b in zip(task['solution'].split(' | '), task['ground_truth'].split(' | '))]) / len(task['solution'].split(' | ')), 'Task': task['test_example_index'] if task_index else task, 'Subgoal Error': -1})
                except KeyError:
                    continue
      
    # Flatten the nested dictionary into one row per task record
    df_list = []
    for pred_type, subdict in results.items():
        for s, subsubdict in subdict.items():
            for exp, values in subsubdict.items():
                for value in values:
                    # Ratios are converted to percentages here
                    df_list.append({'Approach': pred_type, 'Success': value["success"], 'Correct Subgoals': value['subgoal_success'] * 100, 'Correct Subprograms': value['subprogram_success_rate'] * 100, 'Experiment': exp, 'Seed': s, 'Task': value['Task'], 'Subgoal First Error': value['Subgoal Error']})
    
    df = pd.DataFrame(df_list)
    return df

import re

def load_performance_data(ds, threshold=1000):
    """Collect the end-to-end accuracy per approach, experiment and seed.

    For every prediction type ("tiips", "baseline", "separate"), every
    experiment in EXPERIMENTS and every seed in SEEDS, the matching
    ``results-<prediction_type>.json`` file below ``./tiips_results/evaluation``
    is read and the percentage of entries with a truthy ``"success"`` field
    is computed.

    Args:
        ds: Dataset identifier inserted into the result path (e.g. 'deepcoder').
        threshold: Required number of entries in a result file; files of any
            other size are treated as missing.

    Returns:
        dict: approach name (keys ordered 'Baseline', 'ExeDec', 'TIIPS') ->
        experiment -> list with one accuracy in percent per seed. Slots of
        missing or wrongly sized files hold NaN.
    """
    path = f"./tiips_results/evaluation/{ds}_e2e_predict_1/end_to_end_predict-{ds}-run-e2e_predict_1-"
    # The key order is kept as before: callers derive their hue order from it.
    accuracies = {"Baseline": {}, "ExeDec": {}, "TIIPS": {}}

    for prediction_type in ["tiips", "baseline", 'separate']:
        per_experiment = {k: [np.nan] * len(SEEDS) for k in EXPERIMENTS}
        for experiment in EXPERIMENTS:
            for j, seed in enumerate(SEEDS):
                try:
                    with open(path + prediction_type + f'-/tb/hparams-dataset_type={ds},prediction_type={prediction_type},experiment={experiment},beam_size=10,seed={seed}/results-{prediction_type}.json') as f:
                        data = json.load(f)
                except FileNotFoundError:
                    continue

                # Incomplete (or empty) result files leave the NaN placeholder in place
                if len(data) != threshold or not data:
                    continue

                num_solved = sum(1 for ele in data if ele["success"])
                per_experiment[experiment][j] = num_solved / len(data) * 100

        accuracies[PT_MAPPING[prediction_type]] = per_experiment
    return accuracies

def extract_operations(program):
    """
    Return the operations of a '|'-separated program with variable tokens stripped.

    Args:
        program (str): The program as a string.

    Returns:
        list: The cleaned segments in program order; segments that reduce to
            exactly 'INPUT' are left out.
    """
    # Drop variable tokens such as 'x0' / 'x1' and every '=' sign, then trim whitespace
    cleaned_segments = (
        re.sub(r'\bx\d+\b|=', '', segment).strip()
        for segment in program.split('|')
    )
    return [segment for segment in cleaned_segments if segment != 'INPUT']

def is_prefix_in_set(operation, operation_set):
    """
    Tell whether the operation begins with at least one entry of operation_set.

    Args:
        operation (str): The operation to check.
        operation_set (set): A set of operation prefixes.

    Returns:
        bool: True as soon as one prefix matches the start of the operation,
            False if none does (including for an empty set).
    """
    return any(operation.startswith(prefix) for prefix in operation_set)

def round_to_two_significant_digits(x):
    """Round ``x`` to two significant digits.

    Args:
        x: A real number.

    Returns:
        0 if ``x`` is zero or NaN, otherwise ``x`` rounded to two significant digits.
    """
    from math import floor, isnan, log10

    # NaN never compares equal to anything (not even to np.nan), so an
    # equality test cannot detect it; isnan() is required.
    if x == 0 or isnan(x):
        return 0
    # The position of the leading digit determines how many decimals to keep
    return round(x, -int(floor(log10(abs(x)))) + 1)

In [16]:
def replace(original_string, old, new, count):
    """Replace occurrences of ``old`` by ``new``, keeping the first one in longer strings.

    Callers in this file use it to wrap experiment labels: labels with fewer
    than three separators break at every separator, longer labels keep their
    first separator and break at all following ones.

    Args:
        original_string (str): Text to rewrite.
        old (str): Separator to look for.
        new (str): Text inserted in place of a replaced separator.
        count (int): Number of separators in ``original_string``; callers pass
            ``original_string.count(old)``.

    Returns:
        str: ``original_string`` with every ``old`` replaced when ``count < 3``;
        otherwise with every ``old`` except the first replaced. A string
        that does not contain ``old`` is returned unchanged.
    """
    if count < 3:
        return original_string.replace(old, new)

    first, *rest = original_string.split(old)
    if not rest:
        # Nothing to split on: avoid appending a dangling separator
        return original_string
    return first + old + new.join(rest)

# Normalize marginal KDE plots
def normalize_marginals_kde(g, data_d, x, y, hue, palette=PALETTE):
    """Draw one filled KDE per category on the joint and marginal axes of a joint grid.

    For every distinct value of ``data_d[hue]`` three densities are drawn in
    the colour ``palette[category]``: ``x`` against ``y`` on ``g.ax_joint``,
    ``y`` alone on ``g.ax_marg_y`` and ``x`` alone on ``g.ax_marg_x``. All
    densities are clipped to the range 0..100.

    Args:
        g: Object exposing the axes ``ax_joint``, ``ax_marg_x`` and ``ax_marg_y``.
        data_d (pandas.DataFrame): Data containing the columns ``x``, ``y`` and ``hue``.
        x (str): Column plotted on the horizontal axis.
        y (str): Column plotted on the vertical axis.
        hue (str): Column whose distinct values define the categories.
        palette (dict): Maps every category to a colour.

    Returns:
        The result of the last joint ``sns.kdeplot`` call, or ``g.ax_joint``
        if ``data_d`` contains no category.
    """
    # NOTE(review): every kdeplot call below draws a single distribution (no
    # `hue` is passed), so `common_norm=True` presumably has no effect here —
    # confirm against the seaborn documentation.
    joint = g.ax_joint
    for category in data_d[hue].unique():
        subset = data_d[data_d[hue] == category]
        colour = palette[category]

        # Joint density; uses the requested columns instead of fixed column names
        joint = sns.kdeplot(
            data=subset,
            x=x,
            y=y,
            ax=g.ax_joint,
            fill=True,
            alpha=0.6,
            label=category,
            clip=((0, 100), (0, 100)),
            common_norm=True,
            color=colour,
        )
        # Marginal Y
        sns.kdeplot(
            y=subset[y],
            ax=g.ax_marg_y,
            fill=True,
            label=f"{category} (Y)",
            common_norm=True,
            alpha=0.4,
            clip=(0, 100),
            color=colour,
        )
        # Marginal X
        sns.kdeplot(
            x=subset[x],
            ax=g.ax_marg_x,
            fill=True,
            label=f"{category} (X)",
            common_norm=True,
            alpha=0.4,
            clip=(0, 100),
            color=colour,
        )
    return joint

# Qualitative analysis
Plot intent match, i.e., the (post-hoc) overlap with subtask outputs, and syntactic overlap, i.e., the overlap with subprograms


In [None]:
dataset = 'deepcoder'
df = load_decomposition_data(dataset, "num_subgoal_success", threshold=10)

# One joint density figure per approach, restricted to the tasks that approach solved
for app in ('TIIPS', 'ExeDec'):
    filtered_tasks = df[df['Approach'] == app]
    solved_tasks = filtered_tasks[filtered_tasks['Success'] == True]

    g = sns.jointplot(
        solved_tasks,
        x="Correct Subgoals",
        y="Correct Subprograms",
        fill=True,
        kind='kde',
        clip=((0, 100), (0, 100)),
        color=PALETTE[app],
    )
    g.ax_marg_x.set_title(f'Density of solved tasks: {app}')
    # The pyplot calls below act on the current axes of the figure just created
    plt.ylabel('Syntactic overlap per task [%]')
    plt.xlabel('Intent match per task [%]')
    plt.ylim((0, 100))
    plt.xlim((0, 100))
    plt.tight_layout()


# Baseline Effect
When ExeDec's synthesizer model is invoked repeatedly, results similar to those reported for ExeDec can be achieved on the list manipulation domain.

In [14]:
dataset = 'deepcoder'
p = f"./tiips_results/evaluation/{dataset}_e2e_predict_1/end_to_end_predict-{dataset}-run-e2e_predict_1-"

# Accuracy in percent per experiment, one slot per seed.
# NOTE(review): all prediction types write to the same slot, so the value of
# the last prediction type whose file exists is the one that remains.
exp = {k: [np.nan] * len(SEEDS) for k in EXPERIMENTS}
# experiment -> approach -> {approach: solution lengths, "Ground truth": ground-truth lengths}, solved tasks only
num_steps = {e: {"ExeDec": {"ExeDec": [], "Ground truth": []}, "Baseline": {"Baseline": [], "Ground truth": []}, "TIIPS": {"TIIPS": [], "Ground truth": []}} for e in EXPERIMENTS}
# approach -> experiment -> per-file lists of match ratios ("Mismatches" is not filled in this cell)
sols = {"ExeDec": {e: {"Matches": [], "Mismatches": []} for e in EXPERIMENTS}, "Baseline": {e: {"Mismatches": [], "Matches": []} for e in EXPERIMENTS}, "TIIPS": {e: {"Mismatches": [], "Matches": []} for e in EXPERIMENTS}}

for experiment in EXPERIMENTS:
    # SEEDS replaces a hard-coded copy of the same seed list
    for j, seed in enumerate(SEEDS):
        for prediction_type in ["separate", "tiips", "baseline"]:
            pt = PT_MAPPING[prediction_type]
            try:
                with open(p + prediction_type + f'-/tb/hparams-dataset_type={dataset},prediction_type={prediction_type},experiment={experiment},beam_size=10,seed={seed}/results-{prediction_type}.json') as f:
                    data = json.load(f)
            except FileNotFoundError:
                continue

            # Filter the solved tasks once; every statistic below is based on them
            solved = [ele for ele in data if ele["success"]]

            exp[experiment][j] = len(solved) / len(data) * 100 if len(data) == 1000 else np.nan
            num_steps[experiment][pt][pt] += [get_length(ele["solution"]) for ele in solved]
            num_steps[experiment][pt]["Ground truth"] += [ele["ground_truth_length"] for ele in solved]

            if prediction_type in ("separate", "baseline"):
                # Share of position-wise identical segments between solution and
                # ground truth, relative to the number of solution segments.
                match_ratios = []
                for ele in solved:
                    s_parts = ele["solution"].split(' | ')
                    g_parts = ele["ground_truth"].split(' | ')
                    match_ratios.append(sum(1 for sp, gp in zip(s_parts, g_parts) if sp == gp) / len(s_parts))
                sols[pt][experiment]['Matches'].append(match_ratios)

In [None]:
from scipy.stats import norm

# Flatten num_steps (experiment -> approach -> legend entry -> values) into one row per value
rows = [
    {
        "Approach": approach,
        "Category": replace(EXP_LBL_MAP[experiment], ' ', '\n', EXP_LBL_MAP[experiment].count(' ')),
        "Legend": legend_key,
        "Value": value,
    }
    for experiment, per_approach in num_steps.items()
    for approach, per_legend in per_approach.items()
    for legend_key, values in per_legend.items()
    for value in values
]

overall_df = pd.DataFrame(rows)
grouped = overall_df.groupby(['Approach', 'Category', 'Legend'])
means = grouped.mean()
# Despite its name this is the standard error of the mean (std / sqrt(n))
stddevs = grouped.std() / np.sqrt(grouped.count())

fig_width = 20
z_value = norm.ppf(0.5 + 0.95 / 2)
# Common upper y-limit for all panels
y_upper = means.max()['Value'] + 1.5

# One figure per approach with one panel per experiment category
for app in overall_df['Approach'].unique():
    fig, axs = plt.subplots(1, 6, figsize=(fig_width, 6))
    df = overall_df[overall_df['Approach'] == app]
    for j, category in enumerate(df["Category"].unique()):
        panel = axs[j]
        sns.barplot(
            data=df[df["Category"] == category],
            x="Category",
            y="Value",
            hue="Legend",
            dodge=True,
            ax=panel,
            palette=PALETTE,
        )
        # Only the left-most panel carries the y-axis label
        panel.set_ylabel("Number of decompositions" if j == 0 else "")
        panel.set_xlabel("")
        panel.set_ylim((0, y_upper))
        # Only the last panel keeps its legend
        if j == 5:
            legend = panel.legend(loc="upper right", ncol=1, fontsize=16)
        else:
            panel.get_legend().remove()
        # Print each mean slightly above the upper end of its 95% interval
        for n, lgd in enumerate([app, 'Ground truth']):
            mean_value = means.loc[(app, category, lgd), 'Value']
            ci_upper = mean_value + z_value * stddevs.loc[(app, category, lgd), 'Value']
            x_text = -4 / fig_width if n == 0 else 4 / fig_width
            panel.text(x_text, ci_upper + 0.1, f"{mean_value:.1f}",
                       ha='center', va='bottom', fontweight='bold', color='black', fontsize=16)
    # Give legends that only expose `_ncols` a matching `_ncol` attribute
    # (presumably for export tools that still read `_ncol` — confirm).
    for ax in axs:
        legend = ax.get_legend()
        if legend is not None and not hasattr(legend, "_ncol"):
            legend._ncol = legend._ncols if hasattr(legend, "_ncols") else 1
    fig.suptitle(f"Number of decompositions by {app}")
    fig.tight_layout()


# Quantitative Performance
TIIPS outperforms ExeDec

In [None]:
dataset = 'deepcoder'
accuracies = load_performance_data(dataset, threshold=10)

# Long format for seaborn: one row per (approach, experiment, seed) accuracy value
df_list = []
for hue_key, subdict in accuracies.items():
    for category, values in subdict.items():
        wrapped_label = replace(EXP_LBL_MAP[category], ' ', '\n', EXP_LBL_MAP[category].count(' '))
        for value in values:
            df_list.append({'Category': wrapped_label, 'Value': value, 'Type': hue_key})

df = pd.DataFrame(df_list)
df = df[df['Type'] != 'No-Subgoal Ablation']

# The generalization average leaves out the in-distribution experiment
df_no_none = df[df['Category'] != 'Test on\ntraining\ndistribution']
# df_no_none = df_no_none[df_no_none['Category'] != 'Length\ngeneralization']
# df_no_none = df_no_none[df_no_none['Category'] != 'Compose\nnew\noperation']

# Mean over all remaining categories and seeds, per approach
overall_averages = df_no_none.groupby('Type')['Value'].mean().reset_index()
overall_averages['Category'] = 'Generalization\naverage'

# Shown as an additional category next to the individual experiments
df = pd.concat([df, overall_averages], ignore_index=True)

# Mean and standard deviation per bar; single-value bars get a deviation of 0
averages = df.groupby(['Category', 'Type'])['Value'].mean().reset_index()
averages['std'] = df.groupby(['Category', 'Type'])['Value'].std().reset_index()['Value']
averages['std'] = averages['std'].fillna(0)

# Create the barplot
fig, ax = plt.subplots(figsize=(12, 8))
ax = sns.barplot(data=df, x='Category', y='Value', hue='Type', palette=PALETTE, ax=ax)
for line in ax.lines:
    line.set_color("gray")

# Horizontal offset of every hue's bar relative to its category position;
# a single hue is drawn centred, so it needs no offset (and no division by zero).
hue_types = df['Type'].unique()
num_hues = len(hue_types)
hue_step = 0.545 / (num_hues - 1) if num_hues > 1 else 0.0
offsets = {
    hue_type: (-0.2725 + hue_step * i) if num_hues > 1 else 0.0
    for i, hue_type in enumerate(hue_types)
}

# Category positions on the x-axis follow their order of first appearance
category_order = list(df['Category'].unique())

# Annotate every bar with its mean (two significant digits), placed above mean + 1.1 * std
for _, row in averages.iterrows():
    x = category_order.index(row['Category'])
    plt.text(x + offsets[row['Type']], row['Value'] + 1.1 * row['std'], f"{row['Value']:.2g}",
             ha='center', va='bottom', fontweight='bold', color='black', fontsize=12)


plt.title(f"Out-of-distribution generalization results on the {DATASET[dataset].lower()} domain")
plt.xlabel("")
plt.ylabel("End-to-end test accuracy [%]")
ax.set_ylim(0, 102)
plt.tight_layout()
# Re-creating the legend drops the title seaborn added
ax.legend(loc='upper right', bbox_to_anchor=(0.91, 1.0))
legend = ax.get_legend()
# Give a legend that only exposes `_ncols` a matching `_ncol` attribute
# (presumably for export tools that still read `_ncol` — confirm).
if legend is not None and not hasattr(legend, "_ncol"):
    legend._ncol = legend._ncols if hasattr(legend, "_ncols") else 1

In [20]:
# Reload the per-task statistics of all approaches for the guidance-call plot.
dataset = 'deepcoder'

# Accuracy in percent per experiment, one slot per seed.
# NOTE(review): all prediction types write to the same slot, so the value of
# the last prediction type whose file exists is the one that remains.
exp = {k: [np.nan] * 5 for k in EXPERIMENTS}
# experiment -> approach -> {approach: step counts, "Ground truth": ground-truth lengths}, solved tasks only
num_steps = {e: {"ExeDec": {"ExeDec": [], "Ground truth": []}, "Baseline": {"Baseline": [], "Ground truth": []}, "TIIPS": {"TIIPS": [], "Ground truth": []}} for e in EXPERIMENTS}
# approach -> experiment -> per-file lists of match ratios ("Mismatches" is not filled in this cell)
sols = {"ExeDec": {e: {"Matches": [], "Mismatches": []} for e in EXPERIMENTS}, "Baseline": {e: {"Mismatches": [], "Matches": []} for e in EXPERIMENTS}, "TIIPS": {e: {"Mismatches": [], "Matches": []} for e in EXPERIMENTS}}
for experiment in EXPERIMENTS:
    # NOTE(review): this literal repeats the values of SEEDS.
    for j, seed in enumerate([10, 20, 30, 40, 50]):
        for prediction_type in ["separate", "tiips", "baseline"]:
            pt = PT_MAPPING[prediction_type]
            # Relative path without a leading "./" (other cells use "./tiips_results/...")
            p = f"tiips_results/evaluation/{dataset}_e2e_predict_1/end_to_end_predict-{dataset}-run-e2e_predict_1-"
            try:
                with open(p + prediction_type + f'-/tb/hparams-dataset_type={dataset},prediction_type={prediction_type},experiment={experiment},beam_size=10,seed={seed}/results-{prediction_type}.json') as f:
                    data = json.load(f)
            except FileNotFoundError:
                # No result file for this configuration
                continue
            
            # Accuracy is only computed for files with exactly 1000 entries
            exp[experiment][j] = len([ele for ele in data if ele["success"]]) / len(data) * 100 if len(data) == 1000 else np.nan
            # NOTE(review): prediction_type is never 'separate-' (the loop yields "separate",
            # "tiips", "baseline"), so ele['num_steps'] is never read and get_length() is used
            # for every approach. Confirm whether 'separate' was intended or the branch was
            # disabled on purpose.
            num_steps[experiment][pt][pt] += [ele['num_steps'] if prediction_type == 'separate-' else get_length(ele['solution']) for ele in data if ele["success"]]
            num_steps[experiment][pt]["Ground truth"] += [ele["ground_truth_length"] for ele in data if ele["success"]]

            if prediction_type == "separate" or prediction_type == "baseline":
                # Share of position-wise identical segments between solution and
                # ground truth, relative to the number of solution segments.
                match_ratios = []
                # NOTE(review): the loop variable `g` rebinds any earlier global named `g`.
                for s, g in zip([ele["solution"] for ele in data if ele["success"]], [ele["ground_truth"] for ele in data if ele["success"]]):
                    s_parts = s.split(' | ')
                    g_parts = g.split(' | ')
                    match_ratios.append(sum(1 for sp, gp in zip(s_parts, g_parts) if sp == gp) / len(s_parts))
                sols[pt][experiment]['Matches'].append(match_ratios)


In [None]:
from scipy.stats import norm

# Flatten num_steps (experiment -> approach -> legend entry -> values) into one row per value
rows = [
    {
        "Approach": approach,
        "Category": replace(EXP_LBL_MAP[experiment], ' ', '\n', EXP_LBL_MAP[experiment].count(' ')),
        "Legend": legend_key,
        "Value": value,
    }
    for experiment, per_approach in num_steps.items()
    for approach, per_legend in per_approach.items()
    for legend_key, values in per_legend.items()
    for value in values
]

overall_df = pd.DataFrame(rows)
# Ground-truth lengths are not shown in this figure
overall_df = overall_df[overall_df['Legend'] != 'Ground truth']
grouped = overall_df.groupby(['Approach', 'Category', 'Legend'])
means = grouped.mean()
# Despite its name this is the standard error of the mean (std / sqrt(n))
stddevs = grouped.std() / np.sqrt(grouped.count())

fig_width = 20
z_value = norm.ppf(0.5 + 0.95 / 2)
# Common upper y-limit for all panels
y_upper = means.max()['Value'] + 1.5

# A single figure holding all approaches, one panel per experiment category
fig, axs = plt.subplots(1, 6, figsize=(fig_width, 6))
df = overall_df
for j, category in enumerate(df["Category"].unique()):
    panel = axs[j]
    sns.barplot(
        data=df[df["Category"] == category],
        x="Category",
        y="Value",
        hue="Legend",
        dodge=True,
        ax=panel,
        palette=PALETTE,
    )
    # Only the left-most panel carries the y-axis label
    panel.set_ylabel("Number of guidance calls" if j == 0 else "")
    panel.set_xlabel("")
    panel.set_ylim((0, y_upper))
    # Only the last panel keeps its legend
    if j == 5:
        legend = panel.legend(loc="upper right", ncol=1, fontsize=16)
    else:
        panel.get_legend().remove()
    # Print the ExeDec and TIIPS means slightly above the upper end of their 95% interval
    for n, app in enumerate(['ExeDec', 'TIIPS']):
        mean_value = means.loc[(app, category, app), 'Value']
        ci_upper = mean_value + z_value * stddevs.loc[(app, category, app), 'Value']
        x_text = -4 / fig_width if n == 0 else 4 / fig_width
        panel.text(x_text, ci_upper + 0.1, f"{mean_value:.1f}",
                   ha='center', va='bottom', fontweight='bold', color='black', fontsize=16)
# Give legends that only expose `_ncols` a matching `_ncol` attribute
# (presumably for export tools that still read `_ncol` — confirm).
for ax in axs:
    legend = ax.get_legend()
    if legend is not None and not hasattr(legend, "_ncol"):
        legend._ncol = legend._ncols if hasattr(legend, "_ncols") else 1
fig.suptitle("Number of calls to the transductive guidance model")
fig.tight_layout()
