In [None]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro, ttest_rel, mannwhitneyu
import matplotlib.pyplot as plt
import seaborn as sns
from cliffs_delta import cliffs_delta


# Read the per-run evaluation results of both models (LightGBM first, then RF).
lgb_results_df, rf_results_df = map(
    pd.read_csv,
    ("lgbKsatresults_all.csv", "rfKsatresults_all.csv"),
)

In [None]:
# Extract sampling method, dataset level, and set_id
def _with_run_metadata(results, model_name):
    """Return a copy of ``results`` with metadata parsed from its ``dataset`` column.

    Each ``dataset`` value is split on ``"_"``: the first token becomes
    ``sampling_method``, the third-from-last token ``dataset_level`` and the
    last token ``set_id`` (all kept as strings). ``model`` is set to
    ``model_name`` on every row.

    Raises
    ------
    ValueError
        If any ``dataset`` value is missing or has fewer than three tokens,
        because the third-from-last token would not exist.
    """
    tokens = results["dataset"].str.split("_")
    # A missing name yields NaN here, which also fails the comparison below.
    if not (tokens.str.len() >= 3).all():
        raise ValueError(
            "every 'dataset' value must contain at least three '_'-separated parts"
        )
    return results.assign(
        sampling_method=tokens.str[0],
        dataset_level=tokens.str[-3],
        set_id=tokens.str[-1],
        model=model_name,
    )


lgb_results_with_methods = _with_run_metadata(lgb_results_df, "LGB")
rf_results_with_methods = _with_run_metadata(rf_results_df, "RF")

# Combine both models (RF rows first) and keep only the columns used downstream.
combined_results_with_methods = pd.concat(
    [rf_results_with_methods, lgb_results_with_methods], ignore_index=True
)
combined_results_with_methods = combined_results_with_methods[
    ["sampling_method", "dataset_level", "set_id", "r2", "rmsle", "model"]
]

In [None]:
# Significance and effect size analysis: what to compare.
metrics = ["r2", "rmsle"]
models = ["RF", "LGB"]

# Every sampling method found in the data except FSCS, which is the reference
# each of the others is compared against.
sampling_methods = [
    name
    for name in combined_results_with_methods["sampling_method"].unique()
    if name != "FSCS"
]
dataset_levels = combined_results_with_methods["dataset_level"].unique()

# One dict per comparison is appended to this list by the analysis cell.
results = []

In [None]:
def _effect_sizes(fscs_values, other_values):
    """Run a two-sided Mann-Whitney U test and compute two effect sizes.

    Returns ``(U, p, r_rb, cd)`` where ``r_rb = 1 - 2U / (n1 * n2)`` and ``cd``
    is the first element returned by ``cliffs_delta(fscs_values, other_values)``.
    Any value whose computation raises is reported as NaN so that one failing
    comparison does not abort the whole sweep.
    """
    try:
        # The `other` sample is passed first.
        # NOTE(review): assumes SciPy reports U for the first sample, which
        # would make r_rb positive when FSCS values tend to be larger - confirm.
        stat, p_u = mannwhitneyu(other_values, fscs_values, alternative="two-sided")
        n1, n2 = len(fscs_values), len(other_values)
        r_rb = 1 - 2 * stat / (n1 * n2) if n1 > 0 and n2 > 0 else np.nan
    except Exception:
        stat, p_u, r_rb = np.nan, np.nan, np.nan
    try:
        cd, _ = cliffs_delta(fscs_values, other_values)
    except Exception:
        cd = np.nan
    return stat, p_u, r_rb, cd


def _fscs_comparison_rows(
    data, fscs_model, other_model, label, metrics, dataset_levels, methods
):
    """Compare FSCS runs of ``fscs_model`` with every other method of ``other_model``.

    For each metric, dataset level and method (in that nesting order) the FSCS
    rows and the other method's rows are inner-joined on
    ``dataset_level``/``set_id``; combinations with no joined rows are skipped.
    Returns a list of result dicts whose ``model`` field is ``label``.
    """
    rows = []
    for metric in metrics:
        for dataset_level in dataset_levels:
            at_level = data[data["dataset_level"] == dataset_level]
            fscs = at_level[
                (at_level["sampling_method"] == "FSCS")
                & (at_level["model"] == fscs_model)
            ]
            for method in methods:
                other = at_level[
                    (at_level["sampling_method"] == method)
                    & (at_level["model"] == other_model)
                ]
                merged = pd.merge(
                    fscs,
                    other,
                    on=["dataset_level", "set_id"],
                    suffixes=("_fscs", "_other"),
                )
                if len(merged) == 0:
                    continue
                stat, p_u, r_rb, cd = _effect_sizes(
                    merged[f"{metric}_fscs"], merged[f"{metric}_other"]
                )
                rows.append(
                    {
                        "model": label,
                        "metric": metric,
                        "dataset_level": dataset_level,
                        "method_vs": method,
                        "mw_U": stat,
                        "mw_p": p_u,
                        "r_rb": r_rb,
                        "CD": cd,
                    }
                )
    return rows


# Start from an empty list so that re-running this cell does not append
# duplicate rows to results collected by an earlier run.
results = []

# 1. FSCS vs other sampling methods within the same model
for model in models:
    results.extend(
        _fscs_comparison_rows(
            combined_results_with_methods,
            model,
            model,
            model,
            metrics,
            dataset_levels,
            sampling_methods,
        )
    )

# 2. RF+FSCS vs LGB+Other Sampling Methods
results.extend(
    _fscs_comparison_rows(
        combined_results_with_methods,
        "RF",
        "LGB",
        "RF_FSCS_vs_LGB_other",
        metrics,
        dataset_levels,
        sampling_methods,
    )
)

# 3. LGB+FSCS vs RF+Other Sampling Methods
results.extend(
    _fscs_comparison_rows(
        combined_results_with_methods,
        "LGB",
        "RF",
        "LGB_FSCS_vs_RF_other",
        metrics,
        dataset_levels,
        sampling_methods,
    )
)

# Summary table; the explicit column list keeps the schema even when no
# comparison produced a row.
results_df = pd.DataFrame(
    results,
    columns=[
        "model",
        "metric",
        "dataset_level",
        "method_vs",
        "mw_U",
        "mw_p",
        "r_rb",
        "CD",
    ],
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.lines import Line2D

# Base font size for everything drawn below.
plt.rcParams.update({"font.size": 22})

# Ascending magnitude cut-offs used to pick a marker per bar:
# `ur_thresholds` is passed for the r_rb panels, `cd_thresholds` for the CD panels.
ur_thresholds = [0.1, 0.3, 0.5]
cd_thresholds = [0.147, 0.33, 0.474]

# Methods shown on the x-axis (in this order) and their tick labels.
sampling_methods = ["BalancedSampling", "clhs", "SRS"]
sampling_labels = ["Balanced\nSampling", "CLHS", "SRS"]

# Proxy handles (markers only, no connecting line) for the effect-size legend.
effect_legend = [
    Line2D([0], [0], lw=0, **marker_spec)
    for marker_spec in (
        dict(marker="x", color="blue", markersize=14, label="Small effect"),
        dict(
            marker="o",
            color="orange",
            markerfacecolor="none",
            markersize=14,
            label="Medium effect",
        ),
        dict(
            marker="o",
            color="red",
            markerfacecolor="red",
            markersize=14,
            label="Large effect",
        ),
        dict(marker="*", color="red", markersize=18, label="p > 0.05"),
    )
]


def get_compare_df(results_df, model_name, metric, effect_col):
    """Select the rows of ``results_df`` for one comparison group and metric.

    Parameters
    ----------
    results_df : DataFrame with at least ``model``, ``metric`` and ``method_vs``.
    model_name : value to match in the ``model`` column.
    metric : value to match in the ``metric`` column.
    effect_col : accepted for call-site symmetry; not used here.

    Returns
    -------
    A new DataFrame (the input is left untouched) holding the matching rows
    plus a ``sampling_method`` column that duplicates ``method_vs``.
    """
    wanted = results_df["model"].eq(model_name) & results_df["metric"].eq(metric)
    return results_df.loc[wanted].assign(
        sampling_method=lambda frame: frame["method_vs"]
    )


def plot_effect_bar(ax, sub, effect_col, thresholds, invert_y=False, metric_type=None):
    """Draw grouped effect-size bars on ``ax`` and annotate each bar.

    Bars are grouped by the module-level ``sampling_methods`` list (x-axis) and
    coloured by ``dataset_level`` (hue). For every (method, level) row of
    ``sub`` the absolute effect is compared with ``thresholds``:

    * below ``thresholds[0]``: no effect marker
    * below ``thresholds[1]``: blue ``x``
    * below ``thresholds[2]``: hollow orange circle
    * otherwise: filled red circle

    A red star is added near the baseline whenever ``mw_p`` exceeds 0.05.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    sub : DataFrame with ``sampling_method``, ``dataset_level``, ``mw_p`` and
        ``effect_col`` columns; ``dataset_level`` values must be convertible
        with ``int``.
    effect_col : name of the column plotted as bar height.
    thresholds : indexable with at least three ascending cut-offs.
    invert_y : flip the y-axis and mirror the marker offsets.
    metric_type : accepted but not used by this function.
    """
    sns.barplot(
        data=sub,
        x="sampling_method",
        y=effect_col,
        hue="dataset_level",
        order=sampling_methods,
        ax=ax,
    )
    tick_text = [
        sampling_labels[sampling_methods.index(name)] for name in sampling_methods
    ]
    ax.set_xticklabels(tick_text, fontsize=20)

    if invert_y:
        ax.invert_yaxis()
    ax.grid(True, axis="y", linestyle="--", linewidth=0.5)

    # Vertical positions: the star sits just off the zero line, the effect
    # marker just beyond the bar tip; both are mirrored when the axis is inverted.
    star_y = -0.02 if invert_y else 0.02
    tip_offset = -0.015 if invert_y else 0.015

    for group_idx, method in enumerate(sampling_methods):
        for level in sorted(sub["dataset_level"].unique(), key=int):
            match = sub[
                (sub["sampling_method"] == method) & (sub["dataset_level"] == level)
            ]
            if match.empty:
                continue

            value = match[effect_col].values[0]
            pval = match["mw_p"].values[0]
            magnitude = abs(value)

            if magnitude < thresholds[0]:
                marker_style = None  # below the first cut-off: star only
            elif magnitude < thresholds[1]:
                marker_style = dict(
                    marker="x", color="blue", s=120, linewidths=3
                )  # Small effect
            elif magnitude < thresholds[2]:
                marker_style = dict(
                    marker="o",
                    facecolors="none",
                    edgecolors="orange",
                    s=120,
                    linewidths=3,
                )  # Medium effect
            else:
                marker_style = dict(marker="o", color="red", s=120)  # Large effect

            # Locate the bar(s) for this row: centre within 0.4 of the group
            # index and height equal to the plotted value.
            # NOTE(review): two bars of one group with near-equal heights both
            # match, so their annotations would be drawn on each other.
            for bar in ax.patches:
                center_x = bar.get_x() + bar.get_width() / 2.0
                if (abs(center_x - group_idx) < 0.4) and np.isclose(
                    bar.get_height(), value, atol=1e-4
                ):
                    if marker_style is not None:
                        ax.scatter(
                            center_x,
                            bar.get_height() + tip_offset,
                            **marker_style,
                            zorder=20,
                        )
                    if pval > 0.05:
                        ax.scatter(
                            center_x,
                            star_y,
                            marker="*",
                            color="red",
                            s=180,
                            zorder=21,
                        )


def _format_effect_axis(ax, title, effect_col, show_x_axis):
    """Apply the title, axis labels, grid and y-tick spacing shared by all panels.

    ``show_x_axis`` is True only for the bottom row, which keeps its tick labels
    and gets the "Sampling Method" axis label; other rows have both cleared.
    """
    ax.set_title(title, fontsize=22)
    if show_x_axis:
        ax.set_xlabel("Sampling Method", fontsize=22)
    else:
        ax.set_xticklabels([])
        ax.set_xlabel("")
    ax.set_ylabel("$r_{rb}$" if effect_col == "r_rb" else "$\\delta$")

    ax.grid(True, which="major", axis="y", linestyle="-", linewidth=0.6)
    ax.grid(True, which="minor", axis="y", linestyle="--", linewidth=0.25)
    ax.yaxis.set_major_locator(plt.MultipleLocator(0.2))
    ax.yaxis.set_minor_locator(plt.MultipleLocator(0.05))


# One entry per column: (metric, effect-size column, cut-offs, title suffix).
panel_specs = [
    ("r2", "r_rb", ur_thresholds, "R2 $r_{rb}$"),
    ("r2", "CD", cd_thresholds, "R2 $\\delta$"),
    ("rmsle", "r_rb", ur_thresholds, "RMSLE $r_{rb}$"),
    ("rmsle", "CD", cd_thresholds, "RMSLE $\\delta$"),
]
# One entry per row: (value of results_df["model"], title prefix).
# Rows 1-2: FSCS vs other sampling methods within the same model.
# Row 3: RF+FSCS vs LGB+Other.  Row 4: LGB+FSCS vs RF+Other.
row_specs = [
    ("RF", "FSCS vs Other in RF"),
    ("LGB", "FSCS vs Other in LGB"),
    ("RF_FSCS_vs_LGB_other", "RF+FSCS vs LGB+Other"),
    ("LGB_FSCS_vs_RF_other", "LGB+FSCS vs RF+Other"),
]

fig, axes = plt.subplots(4, 4, figsize=(28, 22), sharex=True)
bottom_row = len(row_specs) - 1
for r, (comparison, row_title) in enumerate(row_specs):
    for c, (metric, effect_col, thresholds, effect_label) in enumerate(panel_specs):
        df = get_compare_df(results_df, comparison, metric, effect_col)
        plot_effect_bar(
            axes[r, c],
            df,
            effect_col,
            thresholds,
            invert_y=("rmsle" in metric),
            metric_type=metric,
        )
        _format_effect_axis(
            axes[r, c],
            f"{row_title}\n{effect_label}",
            effect_col,
            show_x_axis=(r == bottom_row),
        )

# Legend handling
# Keep two legends only, on the first two panels of the last row:
# the data-size (hue) legend and the effect-size marker legend.
handles1, labels1 = axes[3, 0].get_legend_handles_labels()
legend1 = axes[3, 0].legend(
    handles=handles1,
    labels=labels1,
    title="Data Size",
    loc="lower left",
    prop={"size": 19},
    title_fontsize=19,
    framealpha=0.7,
)
# NOTE(review): `Axes.legend` presumably already attaches the legend, so the
# `add_artist` calls look redundant; kept so the rendered figure is unchanged.
axes[3, 0].add_artist(legend1)
legend2 = axes[3, 1].legend(
    handles=effect_legend,
    loc="lower right",
    title="Effect size",
    prop={"size": 19},
    title_fontsize=19,
    framealpha=0.6,
)
axes[3, 1].add_artist(legend2)

# Remove legends from other subplots
kept_legends = {(3, 0), (3, 1)}
for r in range(4):
    for c in range(4):
        if (r, c) in kept_legends:
            continue
        hue_legend = axes[r, c].get_legend()
        # A panel without a legend returns None; skip it instead of failing.
        if hue_legend is not None:
            hue_legend.remove()

plt.tight_layout()
plt.savefig("Ksat_effectsize_barplot_4x4.jpg", dpi=800, bbox_inches="tight")
plt.show()

The following cells repeat the same analysis for the forest cover results.

In [None]:
# Read the per-run evaluation results of both models (LightGBM first, then RF).
lgb_results_df, rf_results_df = map(
    pd.read_csv,
    ("lgbresults_icluROCAUC.csv", "rfresults_icluROCAUC.csv"),
)

In [None]:
# Extract sampling method, dataset level, and set_id
def _with_run_metadata(results, model_name):
    """Return a copy of ``results`` with metadata parsed from its ``dataset`` column.

    Each ``dataset`` value is split on ``"_"``: the first token becomes
    ``sampling_method``, the third-from-last token ``dataset_level`` and the
    last token ``set_id`` (all kept as strings). ``model`` is set to
    ``model_name`` on every row.

    Raises
    ------
    ValueError
        If any ``dataset`` value is missing or has fewer than three tokens,
        because the third-from-last token would not exist.
    """
    tokens = results["dataset"].str.split("_")
    # A missing name yields NaN here, which also fails the comparison below.
    if not (tokens.str.len() >= 3).all():
        raise ValueError(
            "every 'dataset' value must contain at least three '_'-separated parts"
        )
    return results.assign(
        sampling_method=tokens.str[0],
        dataset_level=tokens.str[-3],
        set_id=tokens.str[-1],
        model=model_name,
    )


lgb_results_with_methods = _with_run_metadata(lgb_results_df, "LGB")
rf_results_with_methods = _with_run_metadata(rf_results_df, "RF")

# Merge results (RF rows first) and keep only the columns used downstream.
combined_results_with_methods = pd.concat(
    [rf_results_with_methods, lgb_results_with_methods], ignore_index=True
)
combined_results_with_methods = combined_results_with_methods[
    [
        "sampling_method",
        "dataset_level",
        "set_id",
        "test_accuracy",
        "test_f1",
        "roc_auc",
        "model",
    ]
]

In [None]:
# Significance and effect size analysis: what to compare.
metrics = ["test_accuracy", "test_f1", "roc_auc"]
models = ["RF", "LGB"]

# Every sampling method found in the data except FSCS, which is the reference
# each of the others is compared against.
sampling_methods = [
    name
    for name in combined_results_with_methods["sampling_method"].unique()
    if name != "FSCS"
]
dataset_levels = combined_results_with_methods["dataset_level"].unique()

# One dict per comparison is appended to this list by the analysis cell.
results = []

In [None]:
def _effect_sizes(fscs_values, other_values):
    """Run a two-sided Mann-Whitney U test and compute two effect sizes.

    Returns ``(U, p, r_rb, cd)`` where ``r_rb = 1 - 2U / (n1 * n2)`` and ``cd``
    is the first element returned by ``cliffs_delta(fscs_values, other_values)``.
    Any value whose computation raises is reported as NaN so that one failing
    comparison does not abort the whole sweep.
    """
    try:
        # The `other` sample is passed first.
        # NOTE(review): assumes SciPy reports U for the first sample, which
        # would make r_rb positive when FSCS values tend to be larger - confirm.
        stat, p_u = mannwhitneyu(other_values, fscs_values, alternative="two-sided")
        n1, n2 = len(fscs_values), len(other_values)
        r_rb = 1 - 2 * stat / (n1 * n2) if n1 > 0 and n2 > 0 else np.nan
    except Exception:
        stat, p_u, r_rb = np.nan, np.nan, np.nan
    try:
        cd, _ = cliffs_delta(fscs_values, other_values)
    except Exception:
        cd = np.nan
    return stat, p_u, r_rb, cd


def _fscs_comparison_rows(
    data, fscs_model, other_model, label, metrics, dataset_levels, methods
):
    """Compare FSCS runs of ``fscs_model`` with every other method of ``other_model``.

    For each metric, dataset level and method (in that nesting order) the FSCS
    rows and the other method's rows are inner-joined on
    ``dataset_level``/``set_id``; combinations with no joined rows are skipped.
    Returns a list of result dicts whose ``model`` field is ``label``.
    """
    rows = []
    for metric in metrics:
        for dataset_level in dataset_levels:
            at_level = data[data["dataset_level"] == dataset_level]
            fscs = at_level[
                (at_level["sampling_method"] == "FSCS")
                & (at_level["model"] == fscs_model)
            ]
            for method in methods:
                other = at_level[
                    (at_level["sampling_method"] == method)
                    & (at_level["model"] == other_model)
                ]
                merged = pd.merge(
                    fscs,
                    other,
                    on=["dataset_level", "set_id"],
                    suffixes=("_fscs", "_other"),
                )
                if len(merged) == 0:
                    continue
                stat, p_u, r_rb, cd = _effect_sizes(
                    merged[f"{metric}_fscs"], merged[f"{metric}_other"]
                )
                rows.append(
                    {
                        "model": label,
                        "metric": metric,
                        "dataset_level": dataset_level,
                        "method_vs": method,
                        "mw_U": stat,
                        "mw_p": p_u,
                        "r_rb": r_rb,
                        "CD": cd,
                    }
                )
    return rows


# Start from an empty list so that re-running this cell does not append
# duplicate rows to results collected by an earlier run.
results = []

# 1. FSCS vs other sampling methods within the same model
for model in models:
    results.extend(
        _fscs_comparison_rows(
            combined_results_with_methods,
            model,
            model,
            model,
            metrics,
            dataset_levels,
            sampling_methods,
        )
    )

# 2. RF+FSCS vs LGB+Other Sampling Methods
results.extend(
    _fscs_comparison_rows(
        combined_results_with_methods,
        "RF",
        "LGB",
        "RF_FSCS_vs_LGB_other",
        metrics,
        dataset_levels,
        sampling_methods,
    )
)

# 3. LGB+FSCS vs RF+Other Sampling Methods
results.extend(
    _fscs_comparison_rows(
        combined_results_with_methods,
        "LGB",
        "RF",
        "LGB_FSCS_vs_RF_other",
        metrics,
        dataset_levels,
        sampling_methods,
    )
)

# Summary table; the explicit column list keeps the schema even when no
# comparison produced a row.
results_df = pd.DataFrame(
    results,
    columns=[
        "model",
        "metric",
        "dataset_level",
        "method_vs",
        "mw_U",
        "mw_p",
        "r_rb",
        "CD",
    ],
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.lines import Line2D

# Base font size for everything drawn below.
plt.rcParams.update({"font.size": 22})

# Ascending effect-size magnitude cut-offs used when choosing bar markers.
# NOTE(review): presumably `ur_thresholds` is for r_rb and `cd_thresholds` for
# Cliff's delta, as in the Ksat figure earlier in this notebook - confirm.
ur_thresholds = [0.1, 0.3, 0.5]
cd_thresholds = [0.147, 0.33, 0.474]

# Methods shown on the x-axis (in this order) and their tick labels.
sampling_methods = ["BalancedSampling", "clhs", "SRS"]
sampling_labels = ["Balanced\nSampling", "CLHS", "SRS"]

# Proxy handles (markers only, no connecting line) for the effect-size legend.
effect_legend = [
    Line2D([0], [0], lw=0, **marker_spec)
    for marker_spec in (
        dict(marker="x", color="blue", markersize=14, label="Small effect"),
        dict(
            marker="o",
            color="orange",
            markerfacecolor="none",
            markersize=14,
            label="Medium effect",
        ),
        dict(
            marker="o",
            color="red",
            markerfacecolor="red",
            markersize=14,
            label="Large effect",
        ),
        dict(marker="*", color="red", markersize=18, label="p > 0.05"),
    )
]


def get_compare_df(results_df, model_name, metric, effect_col):
    """Select the rows of ``results_df`` for one comparison group and metric.

    Rows are kept when ``model`` equals ``model_name`` and ``metric`` equals
    ``metric``. The returned frame is a new object with an extra
    ``sampling_method`` column copied from ``method_vs``; ``results_df`` itself
    is not modified. ``effect_col`` is accepted but not used.
    """
    wanted = results_df["model"].eq(model_name) & results_df["metric"].eq(metric)
    return results_df.loc[wanted].assign(
        sampling_method=lambda frame: frame["method_vs"]
    )


def plot_effect_bar(ax, sub, effect_col, thresholds, invert_y=False, metric_type=None):
    """Draw grouped effect-size bars on ``ax`` and annotate them.

    Bars show ``effect_col`` per sampling method (x) and dataset level (hue).
    A bar whose absolute effect reaches ``thresholds[0]`` gets a marker next
    to its tip (small / medium / large, split at ``thresholds[1]`` and
    ``thresholds[2]``); a red star near the zero line flags ``mw_p > 0.05``.

    Category order and tick labels come from the notebook-level
    ``sampling_methods`` / ``sampling_labels`` lists.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    sub : DataFrame with ``sampling_method``, ``dataset_level``, ``mw_p`` and
        ``effect_col`` columns; ``dataset_level`` values must be int-convertible.
    effect_col : name of the effect-size column to plot.
    thresholds : three ascending cut-offs.
    invert_y : invert the y-axis and subtract (instead of add) the annotation offsets.
    metric_type : accepted for call-site compatibility; not used.
    """
    sns.barplot(
        data=sub,
        x="sampling_method",
        y=effect_col,
        hue="dataset_level",
        order=sampling_methods,
        ax=ax,
    )
    tick_text = [
        sampling_labels[sampling_methods.index(name)] for name in sampling_methods
    ]
    ax.set_xticklabels(tick_text, fontsize=20)
    # The legend is arranged by the calling cell, not here.
    if invert_y:
        ax.invert_yaxis()
    ax.grid(True, axis="y", linestyle="--", linewidth=0.5)

    # +1 on an upright axis, -1 on an inverted one; scales both annotation offsets.
    direction = -1 if invert_y else 1
    star_y = 0.02 * direction

    for slot, method in enumerate(sampling_methods):
        for level in sorted(sub["dataset_level"].unique(), key=int):
            match = sub[
                (sub["sampling_method"] == method) & (sub["dataset_level"] == level)
            ]
            if match.empty:
                continue
            value = match[effect_col].values[0]
            magnitude = abs(value)
            pval = match["mw_p"].values[0]

            if magnitude < thresholds[0]:
                marker_style = None  # below the first cut-off: star only
            elif magnitude < thresholds[1]:
                marker_style = dict(marker="x", color="blue", s=120, linewidths=3)
            elif magnitude < thresholds[2]:
                marker_style = dict(
                    marker="o",
                    facecolors="none",
                    edgecolors="orange",
                    s=120,
                    linewidths=3,
                )
            else:
                marker_style = dict(marker="o", color="red", s=120)

            # NOTE(review): the bar is located by x-slot and height only, so two
            # dataset levels with the same effect value in one slot both match.
            for bar in ax.patches:
                center = bar.get_x() + bar.get_width() / 2.0
                on_target = (abs(center - slot) < 0.4) and np.isclose(
                    bar.get_height(), value, atol=1e-4
                )
                if not on_target:
                    continue
                if marker_style is not None:
                    ax.scatter(
                        center,
                        bar.get_height() + 0.015 * direction,
                        **marker_style,
                        zorder=20,
                    )
                if pval > 0.05:
                    ax.scatter(
                        center,
                        star_y,
                        marker="*",
                        color="red",
                        s=180,
                        zorder=21,
                    )


fig, axes = plt.subplots(4, 6, figsize=(28, 22), sharey=True, sharex=True)

# Row 1-2: FSCS vs other sampling methods within the same model.
# Panels are generated model-major, then by metric, then by effect size, so
# the first six fill row 0 (RF) and the next six fill row 1 (LGB).
within_model_panels = [
    (model_name, metric_name, column, cutoffs, f"{metric_title} {symbol}")
    for model_name in ("RF", "LGB")
    for metric_name, metric_title in (
        ("test_accuracy", "Accuracy"),
        ("test_f1", "F1 Score"),
        ("roc_auc", "ROC-AUC"),
    )
    for column, cutoffs, symbol in (
        ("r_rb", ur_thresholds, "$r_{rb}$"),
        ("CD", cd_thresholds, "$\\delta$"),
    )
]
for row, (model, metric, effect_col, thresholds, effect_label) in enumerate(
    within_model_panels
):
    r, c = divmod(row, 6)
    panel = axes[r, c]
    df = get_compare_df(results_df, model, metric, effect_col)
    plot_effect_bar(
        panel,
        df,
        effect_col,
        thresholds,
        invert_y=("rmsle" in metric),
        metric_type=metric,
    )
    panel.set_title(f"FSCS vs Other in {model}\n{effect_label}", fontsize=22)
    panel.set_xticklabels([])
    panel.set_xlabel("")
    # Even columns hold the r_rb panels, odd columns the Cliff's delta panels.
    panel.set_ylabel("$r_{rb}$" if c % 2 == 0 else "$\\delta$")

    # Solid major grid every 0.2, dashed minor grid every 0.05.
    panel.yaxis.set_major_locator(plt.MultipleLocator(0.2))
    panel.yaxis.set_minor_locator(plt.MultipleLocator(0.05))
    panel.grid(True, which="major", axis="y", linestyle="-", linewidth=0.6)
    panel.grid(True, which="minor", axis="y", linestyle="--", linewidth=0.25)

# Row 3: RF+FSCS vs LGB+Other
# One panel per (metric, effect size) pair, laid out left to right.
rf_fscs_panels = [
    (metric_name, column, cutoffs, f"{metric_title} {symbol}")
    for metric_name, metric_title in (
        ("test_accuracy", "Accuracy"),
        ("test_f1", "F1 Score"),
        ("roc_auc", "ROC-AUC"),
    )
    for column, cutoffs, symbol in (
        ("r_rb", ur_thresholds, "$r_{rb}$"),
        ("CD", cd_thresholds, "$\\delta$"),
    )
]
for i, (metric, effect_col, thresholds, effect_label) in enumerate(rf_fscs_panels):
    panel = axes[2, i]
    df = get_compare_df(results_df, "RF_FSCS_vs_LGB_other", metric, effect_col)
    plot_effect_bar(
        panel,
        df,
        effect_col,
        thresholds,
        invert_y=("rmsle" in metric),
        metric_type=metric,
    )
    panel.set_title(f"RF+FSCS vs LGB+Other\n{effect_label}", fontsize=22)
    panel.set_xticklabels([])
    panel.set_xlabel("")
    # Even columns hold the r_rb panels, odd columns the Cliff's delta panels.
    panel.set_ylabel("$r_{rb}$" if i % 2 == 0 else "$\\delta$")

    # Solid major grid every 0.2, dashed minor grid every 0.05.
    panel.yaxis.set_major_locator(plt.MultipleLocator(0.2))
    panel.yaxis.set_minor_locator(plt.MultipleLocator(0.05))
    panel.grid(True, which="major", axis="y", linestyle="-", linewidth=0.6)
    panel.grid(True, which="minor", axis="y", linestyle="--", linewidth=0.25)

# Row 4: LGB+FSCS vs RF+Other
# One panel per (metric, effect size) pair; this bottom row keeps its x tick
# labels and carries the shared x-axis title.
lgb_fscs_panels = [
    (metric_name, column, cutoffs, f"{metric_title} {symbol}")
    for metric_name, metric_title in (
        ("test_accuracy", "Accuracy"),
        ("test_f1", "F1 Score"),
        ("roc_auc", "ROC-AUC"),
    )
    for column, cutoffs, symbol in (
        ("r_rb", ur_thresholds, "$r_{rb}$"),
        ("CD", cd_thresholds, "$\\delta$"),
    )
]
for i, (metric, effect_col, thresholds, effect_label) in enumerate(lgb_fscs_panels):
    panel = axes[3, i]
    df = get_compare_df(results_df, "LGB_FSCS_vs_RF_other", metric, effect_col)
    plot_effect_bar(
        panel,
        df,
        effect_col,
        thresholds,
        invert_y=("rmsle" in metric),
        metric_type=metric,
    )
    panel.set_title(f"LGB+FSCS vs RF+Other\n{effect_label}", fontsize=22)
    panel.set_xlabel("Sampling Method", fontsize=22)
    # Even columns hold the r_rb panels, odd columns the Cliff's delta panels.
    panel.set_ylabel("$r_{rb}$" if i % 2 == 0 else "$\\delta$")

    # Solid major grid every 0.2, dashed minor grid every 0.05.
    panel.yaxis.set_major_locator(plt.MultipleLocator(0.2))
    panel.yaxis.set_minor_locator(plt.MultipleLocator(0.05))
    panel.grid(True, which="major", axis="y", linestyle="-", linewidth=0.6)
    panel.grid(True, which="minor", axis="y", linestyle="--", linewidth=0.25)

# Legend handling.
# Only the first two panels of the last row carry a legend: the data-size
# (hue) legend in axes[3, 0] and the effect-size marker key in axes[3, 1].
handles1, labels1 = axes[3, 0].get_legend_handles_labels()
legend1 = axes[3, 0].legend(
    handles=handles1,
    labels=labels1,
    title="Data Size",
    loc="center left",
    prop={"size": 19},
    title_fontsize=19,
    framealpha=0.7,
)
axes[3, 0].add_artist(legend1)
legend2 = axes[3, 1].legend(
    handles=effect_legend,
    loc="center right",
    title="Effect size",
    prop={"size": 19},
    title_fontsize=19,
    framealpha=0.6,
)
axes[3, 1].add_artist(legend2)
# Every other panel drops the hue legend that seaborn attached automatically.
for r in range(4):
    for c in range(6):
        if r == 3 and c in (0, 1):
            continue
        auto_legend = axes[r, c].get_legend()
        # get_legend() returns None for a panel that never received a legend
        # (for example when nothing was plotted); skip those instead of crashing.
        if auto_legend is not None:
            auto_legend.remove()

plt.tight_layout()
plt.savefig("forest_effectsize_barplot_2x2.jpg", dpi=800, bbox_inches="tight")
plt.show()

The following section compares LGB paired with each sampling method (balanced sampling, CLHS, FSCS, SRS) against RF paired with SRS, using the Ksat results.

In [None]:
# Read the per-run Ksat results: LightGBM first, then random forest.
lgb_results_df, rf_results_df = (
    pd.read_csv(csv_name)
    for csv_name in ("lgbKsatresults_all.csv", "rfKsatresults_all.csv")
)

In [None]:
# Extract sampling method, dataset level, and set_id.
# The `dataset` name is split on "_": first token = sampling method,
# third-from-last token = dataset level, last token = set id.
lgb_results_with_methods = lgb_results_df.copy()
rf_results_with_methods = rf_results_df.copy()
for tagged_results, model_label in (
    (lgb_results_with_methods, "LGB"),
    (rf_results_with_methods, "RF"),
):
    name_tokens = tagged_results["dataset"].apply(lambda name: name.split("_"))
    tagged_results["sampling_method"] = name_tokens.apply(lambda tokens: tokens[0])
    tagged_results["dataset_level"] = name_tokens.apply(lambda tokens: tokens[-3])
    tagged_results["set_id"] = name_tokens.apply(lambda tokens: tokens[-1])
    tagged_results["model"] = model_label

# Merge results: stack RF above LGB and keep only the columns used downstream.
combined_results_with_methods = pd.concat(
    [rf_results_with_methods, lgb_results_with_methods], ignore_index=True
)[["sampling_method", "dataset_level", "set_id", "r2", "rmsle", "model"]]

In [None]:
# Significance and effect size analysis: shared settings.
metrics = ["r2", "rmsle"]
models = ["RF", "LGB"]
# Every sampling method present in the data except FSCS.
sampling_methods = [
    name
    for name in combined_results_with_methods["sampling_method"].unique()
    if name != "FSCS"
]
dataset_levels = combined_results_with_methods["dataset_level"].unique()

# Accumulates one record (dict) per comparison.
results = []

In [None]:
# Compare LGB + BalancedSampling/CLHS/FSCS/SRS vs RF + SRS.
# For each (method, metric, dataset level) one record is appended to `results`
# holding the Mann-Whitney U statistic and p-value, r_rb, and Cliff's delta.
for method in ["BalancedSampling", "clhs", "FSCS", "SRS"]:
    for metric in metrics:
        for dataset_level in dataset_levels:
            lgb_method = combined_results_with_methods[
                (combined_results_with_methods["sampling_method"] == method)
                & (combined_results_with_methods["model"] == "LGB")
                & (combined_results_with_methods["dataset_level"] == dataset_level)
            ]
            rf_srs = combined_results_with_methods[
                (combined_results_with_methods["sampling_method"] == "SRS")
                & (combined_results_with_methods["model"] == "RF")
                & (combined_results_with_methods["dataset_level"] == dataset_level)
            ]
            # Keep only runs present on both sides (same dataset level and set id).
            merged = pd.merge(
                lgb_method,
                rf_srs,
                on=["dataset_level", "set_id"],
                suffixes=("_lgb", "_rf"),
            )
            if len(merged) == 0:
                continue
            rf_scores = merged[f"{metric}_rf"]
            lgb_scores = merged[f"{metric}_lgb"]
            try:
                stat, p_u = mannwhitneyu(
                    rf_scores,
                    lgb_scores,
                    alternative="two-sided",
                )
                n1, n2 = len(rf_scores), len(lgb_scores)
                # r_rb derived from U; U belongs to the RF sample, so a
                # positive value means LGB scores tend to be higher.
                r_rb = 1 - 2 * stat / (n1 * n2) if n1 > 0 and n2 > 0 else np.nan
            except Exception:
                # Best effort: record the comparison with NaNs rather than abort.
                # All three outputs are reset so no value from a previous
                # iteration leaks into this record.
                stat = np.nan
                p_u = np.nan
                r_rb = np.nan
            try:
                cd, _ = cliffs_delta(lgb_scores, rf_scores)
            except Exception:
                cd = np.nan
            results.append(
                {
                    "model": "LGB_method_vs_RF_SRS",
                    "metric": metric,
                    "dataset_level": dataset_level,
                    "method_vs": method,
                    "mw_U": stat,
                    "mw_p": p_u,
                    "r_rb": r_rb,
                    "CD": cd,
                }
            )

# Summary table: one row per comparison record.
results_df = pd.DataFrame(results)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.lines import Line2D

# Global font size for every figure drawn from here on.
plt.rcParams["font.size"] = 22

# Thresholds for effect size, consumed by plot_effect_bar: below the first
# value no effect marker is drawn, then small / medium / large in ascending order.
ur_thresholds = [0.1, 0.3, 0.5]  # used with the "r_rb" column
cd_thresholds = [0.147, 0.33, 0.474]  # used with the "CD" (Cliff's delta) column

# x-axis categories (as stored in `method_vs`) and their index-aligned display labels.
sampling_methods = ["BalancedSampling", "clhs", "FSCS", "SRS"]
sampling_labels = ["Balanced\nSampling", "CLHS", "FSCS", "SRS"]

# Legend for effect size markers: line-less Line2D proxies that only show a marker.
effect_legend = [
    Line2D([0], [0], lw=0, **marker_spec)
    for marker_spec in (
        dict(marker="x", color="blue", markersize=14, label="Small effect"),
        dict(
            marker="o",
            color="orange",
            markerfacecolor="none",
            markersize=14,
            label="Medium effect",
        ),
        dict(
            marker="o",
            color="red",
            markerfacecolor="red",
            markersize=14,
            label="Large effect",
        ),
        dict(marker="*", color="red", markersize=18, label="p > 0.05"),
    )
]


def get_compare_df(results_df, model_name, metric, effect_col):
    """Return the rows of ``results_df`` for one comparison group and metric.

    The result is a new frame (the input is left untouched) with an extra
    ``sampling_method`` column copied from ``method_vs`` so it can be used
    directly as the x-axis category when plotting.

    ``effect_col`` is accepted for call-site symmetry but is not used here.
    """
    wanted = results_df["model"].eq(model_name) & results_df["metric"].eq(metric)
    return results_df.loc[wanted].assign(
        sampling_method=lambda selected: selected["method_vs"]
    )


def plot_effect_bar(ax, sub, effect_col, thresholds, invert_y=False, metric_type=None):
    """Draw grouped effect-size bars on ``ax`` and annotate them.

    Bars show ``effect_col`` per sampling method (x) and dataset level (hue).
    A bar whose absolute effect reaches ``thresholds[0]`` gets a marker next
    to its tip (small / medium / large, split at ``thresholds[1]`` and
    ``thresholds[2]``); a red star near the zero line flags ``mw_p > 0.05``.
    An empty ``sub`` switches the axes off and draws nothing.

    Category order and tick labels come from the notebook-level
    ``sampling_methods`` / ``sampling_labels`` lists.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    sub : DataFrame with ``sampling_method``, ``dataset_level``, ``mw_p`` and
        ``effect_col`` columns; ``dataset_level`` values must be int-convertible.
    effect_col : name of the effect-size column to plot.
    thresholds : three ascending cut-offs.
    invert_y : invert the y-axis and subtract (instead of add) the annotation offsets.
    metric_type : accepted for call-site compatibility; not used.
    """
    if sub.empty:
        ax.axis("off")
        return

    sns.barplot(
        data=sub,
        x="sampling_method",
        y=effect_col,
        hue="dataset_level",
        order=sampling_methods,
        ax=ax,
    )
    tick_text = [
        sampling_labels[sampling_methods.index(name)] for name in sampling_methods
    ]
    ax.set_xticklabels(tick_text, fontsize=18)
    if invert_y:
        ax.invert_yaxis()
    ax.grid(True, axis="y", linestyle="--", linewidth=0.5)

    # +1 on an upright axis, -1 on an inverted one; scales both annotation offsets.
    direction = -1 if invert_y else 1
    star_y = 0.02 * direction

    # Add effect size and significance markers.
    for slot, method in enumerate(sampling_methods):
        for level in sorted(sub["dataset_level"].unique(), key=int):
            match = sub[
                (sub["sampling_method"] == method) & (sub["dataset_level"] == level)
            ]
            if match.empty:
                continue
            value = match[effect_col].values[0]
            magnitude = abs(value)
            pval = match["mw_p"].values[0]

            if magnitude < thresholds[0]:
                marker_style = None  # below the first cut-off: star only
            elif magnitude < thresholds[1]:
                marker_style = dict(marker="x", color="blue", s=120, linewidths=3)
            elif magnitude < thresholds[2]:
                marker_style = dict(
                    marker="o",
                    facecolors="none",
                    edgecolors="orange",
                    s=120,
                    linewidths=3,
                )
            else:
                marker_style = dict(marker="o", color="red", s=120)

            # NOTE(review): the bar is located by x-slot and height only, so two
            # dataset levels with the same effect value in one slot both match.
            for bar in ax.patches:
                center = bar.get_x() + bar.get_width() / 2.0
                on_target = (abs(center - slot) < 0.4) and np.isclose(
                    bar.get_height(), value, atol=1e-4
                )
                if not on_target:
                    continue
                if marker_style is not None:
                    ax.scatter(
                        center,
                        bar.get_height() + 0.015 * direction,
                        **marker_style,
                        zorder=20,
                    )
                if pval > 0.05:
                    ax.scatter(
                        center,
                        star_y,
                        marker="*",
                        color="red",
                        s=180,
                        zorder=21,
                    )


# Draw 2x2 subplots for effect size comparison.
# Rows are metrics (R2, RMSLE); columns are effect sizes (r_rb, Cliff's delta).
fig, axes = plt.subplots(2, 2, figsize=(16, 12), sharex=True)
plot_settings = [
    (metric_name, column, cutoffs, f"{metric_title} {symbol}")
    for metric_name, metric_title in (("r2", "R2"), ("rmsle", "RMSLE"))
    for column, cutoffs, symbol in (
        ("r_rb", ur_thresholds, "$r_{rb}$"),
        ("CD", cd_thresholds, "$\\delta$"),
    )
]
for idx, (panel, (metric, effect_col, thresholds, effect_label)) in enumerate(
    zip(axes.flat, plot_settings)
):
    df = get_compare_df(results_df, "LGB_method_vs_RF_SRS", metric, effect_col)
    plot_effect_bar(
        panel,
        df,
        effect_col,
        thresholds,
        invert_y=("rmsle" in metric),
        metric_type=metric,
    )
    panel.set_title(effect_label, fontsize=22)
    panel.set_xlabel("Sampling Method", fontsize=18)
    # Left column shows r_rb, right column shows Cliff's delta.
    panel.set_ylabel("$r_{rb}$" if idx % 2 == 0 else "$\\delta$")

    # Solid major grid every 0.2, dashed minor grid every 0.05.
    panel.yaxis.set_major_locator(plt.MultipleLocator(0.2))
    panel.yaxis.set_minor_locator(plt.MultipleLocator(0.05))
    panel.grid(True, which="major", axis="y", linestyle="-", linewidth=0.6)
    panel.grid(True, which="minor", axis="y", linestyle="--", linewidth=0.25)

# Add legends.
# The bottom row carries both legends: the data-size (hue) legend in
# axes[1, 0] and the effect-size marker key in axes[1, 1].
handles1, labels1 = axes[1, 0].get_legend_handles_labels()
legend1 = axes[1, 0].legend(
    handles=handles1,
    labels=labels1,
    title="Data Size",
    loc="center left",
    prop={"size": 16},
    title_fontsize=16,
    framealpha=0.7,
)
axes[1, 0].add_artist(legend1)
legend2 = axes[1, 1].legend(
    handles=effect_legend,
    loc="center left",
    title="Effect size",
    prop={"size": 16},
    title_fontsize=16,
    framealpha=0.6,
    # (x, y) in axes-fraction coordinates, measured from the lower-left corner
    # of the subplot; `loc` selects which point of the legend sits there.
    bbox_to_anchor=(0, 0.54),
)
axes[1, 1].add_artist(legend2)
# The top-row panels drop the hue legend that seaborn attached automatically.
for r in range(2):
    for c in range(2):
        if r == 1 and c in (0, 1):
            continue
        auto_legend = axes[r, c].get_legend()
        # plot_effect_bar switches empty panels off without drawing, so a panel
        # may have no legend at all; get_legend() then returns None.
        if auto_legend is not None:
            auto_legend.remove()

plt.suptitle("LGB + Sampling Methods vs RF + SRS", fontsize=26)
plt.tight_layout()
plt.savefig("Ksat_LGBvsRF_SRS_effectsize_barplot_2x2.jpg", dpi=800, bbox_inches="tight")
plt.show()

The following section compares LGB paired with each sampling method (balanced sampling, CLHS, FSCS, SRS) against RF paired with SRS, using the forest cover results.

In [None]:
# Read the per-run forest cover results: LightGBM first, then random forest.
lgb_results_df, rf_results_df = (
    pd.read_csv(csv_name)
    for csv_name in ("lgbresults_icluROCAUC.csv", "rfresults_icluROCAUC.csv")
)

In [None]:
# Extract sampling method, dataset level, and set_id.
# The `dataset` name is split on "_": first token = sampling method,
# third-from-last token = dataset level, last token = set id.
lgb_results_with_methods = lgb_results_df.copy()
rf_results_with_methods = rf_results_df.copy()
for tagged_results, model_label in (
    (lgb_results_with_methods, "LGB"),
    (rf_results_with_methods, "RF"),
):
    name_tokens = tagged_results["dataset"].apply(lambda name: name.split("_"))
    tagged_results["sampling_method"] = name_tokens.apply(lambda tokens: tokens[0])
    tagged_results["dataset_level"] = name_tokens.apply(lambda tokens: tokens[-3])
    tagged_results["set_id"] = name_tokens.apply(lambda tokens: tokens[-1])
    tagged_results["model"] = model_label

# Combine: stack RF above LGB and keep only the columns used downstream.
kept_columns = [
    "sampling_method",
    "dataset_level",
    "set_id",
    "test_accuracy",
    "test_f1",
    "roc_auc",
    "model",
]
combined_results_with_methods = pd.concat(
    [rf_results_with_methods, lgb_results_with_methods], ignore_index=True
)[kept_columns]

In [None]:
# Significance and effect size analysis: shared settings.
metrics = ["test_accuracy", "test_f1", "roc_auc"]
models = ["RF", "LGB"]
# Every sampling method present in the data except FSCS.
sampling_methods = [
    name
    for name in combined_results_with_methods["sampling_method"].unique()
    if name != "FSCS"
]
dataset_levels = combined_results_with_methods["dataset_level"].unique()

# Accumulates one record (dict) per comparison.
results = []

In [None]:
# Compare LGB + BalancedSampling/CLHS/FSCS/SRS vs RF + SRS.
# For each (method, metric, dataset level) one record is appended to `results`
# holding the Mann-Whitney U statistic and p-value, r_rb, and Cliff's delta.
for method in ["BalancedSampling", "clhs", "FSCS", "SRS"]:
    for metric in metrics:
        for dataset_level in dataset_levels:
            lgb_method = combined_results_with_methods[
                (combined_results_with_methods["sampling_method"] == method)
                & (combined_results_with_methods["model"] == "LGB")
                & (combined_results_with_methods["dataset_level"] == dataset_level)
            ]
            rf_srs = combined_results_with_methods[
                (combined_results_with_methods["sampling_method"] == "SRS")
                & (combined_results_with_methods["model"] == "RF")
                & (combined_results_with_methods["dataset_level"] == dataset_level)
            ]
            # Keep only runs present on both sides (same dataset level and set id).
            merged = pd.merge(
                lgb_method,
                rf_srs,
                on=["dataset_level", "set_id"],
                suffixes=("_lgb", "_rf"),
            )
            if len(merged) == 0:
                continue
            rf_scores = merged[f"{metric}_rf"]
            lgb_scores = merged[f"{metric}_lgb"]
            try:
                stat, p_u = mannwhitneyu(
                    rf_scores,
                    lgb_scores,
                    alternative="two-sided",
                )
                n1, n2 = len(rf_scores), len(lgb_scores)
                # r_rb derived from U; U belongs to the RF sample, so a
                # positive value means LGB scores tend to be higher.
                r_rb = 1 - 2 * stat / (n1 * n2) if n1 > 0 and n2 > 0 else np.nan
            except Exception:
                # Best effort: record the comparison with NaNs rather than abort.
                # All three outputs are reset so no value from a previous
                # iteration leaks into this record.
                stat = np.nan
                p_u = np.nan
                r_rb = np.nan
            try:
                cd, _ = cliffs_delta(lgb_scores, rf_scores)
            except Exception:
                cd = np.nan
            results.append(
                {
                    "model": "LGB_method_vs_RF_SRS",
                    "metric": metric,
                    "dataset_level": dataset_level,
                    "method_vs": method,
                    "mw_U": stat,
                    "mw_p": p_u,
                    "r_rb": r_rb,
                    "CD": cd,
                }
            )

# Summary table: one row per comparison record.
results_df = pd.DataFrame(results)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.lines import Line2D

# Global font size for every figure drawn from here on.
plt.rcParams["font.size"] = 22

# Thresholds for r_rb and Cliff's delta effect size, consumed by
# plot_effect_bar: below the first value no effect marker is drawn, then
# small / medium / large in ascending order.
ur_thresholds = [0.1, 0.3, 0.5]  # used with the "r_rb" column
cd_thresholds = [0.147, 0.33, 0.474]  # used with the "CD" (Cliff's delta) column

# x-axis categories (as stored in `method_vs`) and their index-aligned display labels.
sampling_methods = ["BalancedSampling", "clhs", "FSCS", "SRS"]
sampling_labels = ["Balanced\nSampling", "CLHS", "FSCS", "SRS"]

# Effect size legend: line-less Line2D proxies that only show a marker.
effect_legend = [
    Line2D([0], [0], lw=0, **marker_spec)
    for marker_spec in (
        dict(marker="x", color="blue", markersize=14, label="Small effect"),
        dict(
            marker="o",
            color="orange",
            markerfacecolor="none",
            markersize=14,
            label="Medium effect",
        ),
        dict(
            marker="o",
            color="red",
            markerfacecolor="red",
            markersize=14,
            label="Large effect",
        ),
        dict(marker="*", color="red", markersize=18, label="p > 0.05"),
    )
]


def get_compare_df(results_df, model_name, metric, effect_col):
    """Return the rows of ``results_df`` for one comparison group and metric.

    The result is a new frame (the input is left untouched) with an extra
    ``sampling_method`` column copied from ``method_vs`` so it can be used
    directly as the x-axis category when plotting.

    ``effect_col`` is accepted for call-site symmetry but is not used here.
    """
    wanted = results_df["model"].eq(model_name) & results_df["metric"].eq(metric)
    return results_df.loc[wanted].assign(
        sampling_method=lambda selected: selected["method_vs"]
    )


def _effect_marker_style(effect, thresholds):
    """Map an absolute effect size to scatter kwargs; None means negligible (no marker)."""
    if effect < thresholds[0]:
        return None
    if effect < thresholds[1]:
        return dict(marker="x", color="blue", s=120, linewidths=3)  # Small effect
    if effect < thresholds[2]:
        return dict(
            marker="o",
            facecolors="none",
            edgecolors="orange",
            s=120,
            linewidths=3,
        )  # Medium effect
    return dict(marker="o", color="red", s=120)  # Large effect


def plot_effect_bar(ax, sub, effect_col, thresholds, invert_y=False, metric_type=None):
    """Draw grouped effect-size bars on ``ax`` and annotate each bar.

    Bars are grouped by "sampling_method" (ordered by the module-level
    ``sampling_methods`` list, labelled with ``sampling_labels``) and coloured
    by "dataset_level". Each bar then receives:

    * an effect-size marker just beyond its tip (none below ``thresholds[0]``,
      blue "x" below ``thresholds[1]``, hollow orange circle below
      ``thresholds[2]``, filled red circle otherwise);
    * a red star near the zero line when its "mw_p" value is above 0.05.

    Parameters
    ----------
    ax : axes to draw on.
    sub : DataFrame with the columns "sampling_method", "dataset_level",
        "mw_p" and ``effect_col``. "dataset_level" values must be convertible
        with ``int`` (used for ordering). An empty frame just hides the axes.
    effect_col : name of the column holding the effect size to plot.
    thresholds : three ascending cut-offs applied to the absolute effect size.
    invert_y : flip the y-axis and mirror the annotation offsets.
    metric_type : accepted for call compatibility; not used.

    Returns
    -------
    None
    """
    # Nothing to show: hide the axes entirely
    if sub.empty:
        ax.axis("off")
        return

    sns.barplot(
        x="sampling_method",
        y=effect_col,
        hue="dataset_level",
        data=sub,
        order=sampling_methods,
        ax=ax,
    )
    ax.set_xticklabels(
        [sampling_labels[sampling_methods.index(m)] for m in sampling_methods],
        fontsize=18,
    )
    if invert_y:
        ax.invert_yaxis()
    ax.grid(True, axis="y", linestyle="--", linewidth=0.5)

    # Annotation offsets are mirrored when the axis is inverted
    star_y = -0.02 if invert_y else 0.02
    marker_offset = -0.015 if invert_y else 0.015
    # Loop-invariant: dataset levels in numeric order
    levels = sorted(sub["dataset_level"].unique(), key=int)

    for group_idx, method in enumerate(sampling_methods):
        in_method = sub["sampling_method"] == method
        for level in levels:
            row = sub[in_method & (sub["dataset_level"] == level)]
            if row.empty:
                continue
            value = row[effect_col].values[0]
            pval = row["mw_p"].values[0]
            # A NaN effect can never match a bar height below, so skip it
            # explicitly instead of letting it pass as a "large" effect.
            if np.isnan(value):
                continue
            marker_style = _effect_marker_style(abs(value), thresholds)

            # The bar belonging to this (method, level) pair is identified by
            # lying inside the method's x-group and having the row's height.
            for bar in ax.patches:
                center = bar.get_x() + bar.get_width() / 2.0
                in_group = abs(center - group_idx) < 0.4
                if not (in_group and np.isclose(bar.get_height(), value, atol=1e-4)):
                    continue
                if marker_style is not None:
                    ax.scatter(
                        center,
                        bar.get_height() + marker_offset,
                        **marker_style,
                        zorder=20,
                    )
                if pval > 0.05:
                    ax.scatter(
                        center,
                        star_y,
                        marker="*",
                        color="red",
                        s=180,
                        zorder=21,
                    )


# 3x2 figure: one row per metric, left column r_rb, right column Cliff's delta
fig, axes = plt.subplots(3, 2, figsize=(16, 18), sharex=True)
# Plot settings for each subplot: (metric, effect_col, thresholds, effect_label)
# NOTE(review): each metric name must occur in results_df["metric"]; a metric
# with no rows makes plot_effect_bar hide that panel. Confirm these names match
# the metrics used when `results` was filled.
plot_settings = [
    ("test_accuracy", "r_rb", ur_thresholds, "Accuracy $r_{rb}$"),
    ("test_accuracy", "CD", cd_thresholds, "Accuracy $\\delta$"),
    ("test_f1", "r_rb", ur_thresholds, "F1 Score $r_{rb}$"),
    ("test_f1", "CD", cd_thresholds, "F1 Score $\\delta$"),
    ("roc_auc", "r_rb", ur_thresholds, "ROC-AUC $r_{rb}$"),
    ("roc_auc", "CD", cd_thresholds, "ROC-AUC $\\delta$"),
]
for idx, (metric, effect_col, thresholds, effect_label) in enumerate(plot_settings):
    r, c = divmod(idx, 2)
    df = get_compare_df(results_df, "LGB_method_vs_RF_SRS", metric, effect_col)
    plot_effect_bar(
        axes[r, c],
        df,
        effect_col,
        thresholds,
        invert_y=("rmsle" in metric),
        metric_type=metric,
    )
    axes[r, c].set_title(effect_label, fontsize=22)
    axes[r, c].set_xlabel("Sampling Method", fontsize=18)
    if c == 0:
        axes[r, c].set_ylabel("$r_{rb}$")
    else:
        axes[r, c].set_ylabel("$\\delta$")

    # Solid major / dashed minor horizontal grid with fixed tick spacing
    axes[r, c].grid(True, which="major", axis="y", linestyle="-", linewidth=0.6)
    axes[r, c].grid(True, which="minor", axis="y", linestyle="--", linewidth=0.25)
    axes[r, c].yaxis.set_major_locator(plt.MultipleLocator(0.2))
    axes[r, c].yaxis.set_minor_locator(plt.MultipleLocator(0.05))

# legend
# Only the bottom row keeps a legend: data sizes on the left, effect-size key on
# the right. Each axes holds a single legend, so ax.legend() alone is enough;
# registering it again with add_artist would make it a second child of the axes.
handles1, labels1 = axes[2, 0].get_legend_handles_labels()
legend1 = axes[2, 0].legend(
    handles=handles1,
    labels=labels1,
    title="Data Size",
    loc="upper left",
    prop={"size": 16},
    title_fontsize=16,
    framealpha=0.7,
    bbox_to_anchor=(0, 0.93),  # (x, y) relative to the upper left of the subplot
)
legend2 = axes[2, 1].legend(
    handles=effect_legend,
    loc="upper left",
    title="Effect size",
    prop={"size": 16},
    title_fontsize=16,
    framealpha=0.6,
    bbox_to_anchor=(0, 0.93),  # (x, y) relative to the upper left of the subplot
)
# Strip the automatic hue legend from every panel above the bottom row.
# Row indices run 0..2, so the bottom row is excluded by slicing; panels that
# were hidden for lack of data have no legend, hence the None check.
for upper_ax in axes[:-1].ravel():
    auto_legend = upper_ax.get_legend()
    if auto_legend is not None:
        auto_legend.remove()

plt.suptitle("LGB + Sampling Methods vs RF + SRS", fontsize=26)
plt.tight_layout()
plt.savefig(
    "forest_LGBvsRF_SRS_effectsize_barplot_3x2.jpg", dpi=800, bbox_inches="tight"
)
plt.show()