In [12]:
import pandas as pd
import matplotlib
matplotlib.use('TkAgg')  # Force the TkAgg backend on Windows
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


In [8]:
# Load the raw estimates for each participant group.
df1 = pd.read_csv("data/group1.csv")
df2 = pd.read_csv("data/group2_filtered.csv")

# Toggle: prepend a synthetic "task 0" row (both estimations fixed at 25)
# to each group.
add_task0 = True

if add_task0:
    task0_df = pd.DataFrame(
        {'task number': [0], 'estimation 1': [25], 'estimation 2': [25]}
    )
    # The synthetic row goes first; the index is rebuilt from 0.
    df1, df2 = [
        pd.concat([task0_df, group_df], ignore_index=True)
        for group_df in (df1, df2)
    ]

# Reduce every task label to the first run of digits it contains so tasks
# sort numerically. Entries with no extractable digits -- including the
# integer 0 of the synthetic row, which the string accessor cannot parse --
# come back as NaN and are mapped to task 0 by fillna(0).
for group_df in (df1, df2):
    task_digits = group_df["task number"].str.extract(r'(\d+)')
    group_df["task number"] = task_digits.fillna(0).astype(int)

In [9]:
def calculate_stats(df, column):
    """Compute the per-task mean and standard error of ``column``.

    Rows are grouped by the "task number" column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "task number" column and ``column``.
    column : str
        Name of the numeric column to summarise.

    Returns
    -------
    (means, errors) : tuple of pandas.Series
        Both indexed by task number. ``errors`` is the standard error of the
        mean, i.e. sample standard deviation divided by sqrt(n), where n is
        the number of non-null values in the group.

    Notes
    -----
    The standard error is undefined (NaN) for a group with a single sample.
    For task 0 only -- the baseline row that may hold a single value -- that
    undefined error is replaced by 0.0. When task 0 holds two or more
    samples, its computed standard error is kept as-is rather than being
    overwritten.
    """
    values = df.groupby("task number")[column]
    means = values.mean()
    counts = values.count()
    # Standard error of the mean = std / sqrt(n)
    errors = values.std() / np.sqrt(counts)

    # Only zero the task-0 error when it is genuinely undefined (fewer than
    # two samples); otherwise real variability at task 0 would be discarded.
    if 0 in errors.index and counts.loc[0] <= 1:
        errors.loc[0] = 0.0

    return means, errors

In [21]:
# Pool both groups so the third panel shows the combined sample.
df_joint = pd.concat([df1, df2], ignore_index=True)

sns.set_theme(style="whitegrid", palette="Set2")

# One stacked panel per data set, all sharing the task-number axis.
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(8, 9), sharex=True)
fig.suptitle("Estimates with Standard Error Bars", fontsize=14)

# (data, panel title) for each subplot, top to bottom.
panels = [
    (df1, "Group 1"),
    (df2, "Group 2"),
    (df_joint, "Group 1 + Group 2"),
]

# (column, line label, band label) for each series drawn in every panel.
series_specs = [
    ("estimation 1", "Estimation 1 mean", "Estimation 1 (95% CI)"),
    ("estimation 2", "Estimation 2 mean", "Estimation 2 (95% CI)"),
]

# Running y-range over all panels, so they can share identical limits.
global_ymin, global_ymax = float('inf'), float('-inf')

for ax, (df, title) in zip(axes, panels):
    # Compute every series first: mean and a band of +/- 1.96 standard errors.
    bands = []
    for column, line_label, band_label in series_specs:
        means, errors = calculate_stats(df, column)
        lower = means - 1.96 * errors
        upper = means + 1.96 * errors
        bands.append((means, lower, upper, line_label, band_label))

    # Fold this panel's extremes into the global range.
    local_min = min(lower.min() for _, lower, _, _, _ in bands)
    local_max = max(upper.max() for _, _, upper, _, _ in bands)
    global_ymin = min(global_ymin, local_min)
    global_ymax = max(global_ymax, local_max)

    # Draw each series: mean line followed by its shaded band.
    for means, lower, upper, line_label, band_label in bands:
        ax.plot(means.index, means.values, label=line_label)
        ax.fill_between(
            means.index, lower.values, upper.values, alpha=0.2, label=band_label
        )

    ax.set_title(title)

# Apply the common y-range and show ticks as whole-number percentages.
for ax in axes:
    ax.set_ylim(global_ymin, global_ymax)
    ax.yaxis.set_major_formatter(
        plt.FuncFormatter(lambda x, p: f"{int(x)}%")
    )

# Every panel draws the same series, so a single figure-level legend built
# from the bottom panel's handles is enough.
fig.legend(*axes[-1].get_legend_handles_labels(), loc="upper right")

# With a shared x-axis, only the bottom panel needs the label.
axes[-1].set_xlabel("Task Number")

plt.tight_layout()

# Write the figure to disk.
plt.savefig("raw_data_vizualisation.png")

#plt.show()


