In [1]:
import pandas as pd
from scipy.stats import wilcoxon

In [2]:



def run_wilcoxon_test(csv_before, csv_after, model_name, alpha=0.05):
    """Run a paired, two-sided Wilcoxon signed-rank test per metric and print a report.

    Both CSV files are loaded with ``pd.read_csv``. For every column of the
    "before" file, the column with the same name in the "after" file is
    passed together with it to ``scipy.stats.wilcoxon``; the values, the test
    statistic, the p-value and a significance verdict are printed.

    Parameters
    ----------
    csv_before : str or path-like
        CSV whose columns define the metrics to test (the BEFORE samples).
    csv_after : str or path-like
        CSV providing the AFTER samples; columns are matched by name.
    model_name : str
        Label used only in the report header.
    alpha : float, default 0.05
        Significance threshold; a metric is reported as significant when
        its p-value is strictly below this value.

    Returns
    -------
    None
        All results are written to stdout.

    Notes
    -----
    A metric is skipped (with a message) when its column is missing from
    ``csv_after`` or when the two columns differ in length. A ``ValueError``
    raised by ``wilcoxon`` is reported for that metric and the loop continues.
    """
    df_before = pd.read_csv(csv_before)
    df_after = pd.read_csv(csv_after)

    print(f"=== Wilcoxon Signed-Rank Test for {model_name} ===\n")

    for metric in df_before.columns:
        # Guard the lookup: indexing a missing column would raise KeyError
        # and abort the report for every remaining metric.
        if metric not in df_after.columns:
            print(f"Skipping {metric} – column not found in {csv_after}.")
            continue

        before = df_before[metric]
        after = df_after[metric]

        # The test is paired, so both samples need the same number of rows.
        if len(before) != len(after):
            print(f"Skipping {metric} – unequal lengths.")
            continue

        print(f"Metric: {metric}")
        print(f"  Values BEFORE: {before.values}")
        print(f"  Values AFTER : {after.values}")

        try:
            stat, p = wilcoxon(before, after, alternative='two-sided')
            print(f"  Test statistic: {stat:.4f}")
            print(f"  p-value: {p:.4f}")

            if p < alpha:
                print("  ✅ Statistically significant difference.")
            else:
                print("  ❌ No statistically significant difference.")
        except ValueError as e:
            # Keep going so one untestable metric does not hide the others.
            print(f"  Could not compute Wilcoxon for {metric}: {e}")
        print()



# Comparison of regression and classification (without resampling)

In [3]:
# SVM, no resampling: 2a results (SVC) as BEFORE, 3a results (SVR) as AFTER.

run_wilcoxon_test(
    csv_before="2a_classification_no_resample_SVC.csv",
    csv_after="3a_classification_no_resample_SVR.csv",
    model_name="SVM",
)

=== Wilcoxon Signed-Rank Test for SVM ===

Metric: acc
  Values BEFORE: [0.68333333 0.67083333 0.6875     0.63125    0.68125    0.68125
 0.68541667 0.67916667 0.69375    0.69375   ]
  Values AFTER : [0.65       0.64583333 0.65199161 0.62133891 0.60460251 0.63256785
 0.63522013 0.64091858 0.62761506 0.62083333]
  Test statistic: 0.0000
  p-value: 0.0020
  ✅ Statistically significant difference.

Metric: prec
  Values BEFORE: [0.46850683 0.38532401 0.37511091 0.3445748  0.42427792 0.42602384
 0.36801087 0.36980609 0.35715658 0.38193961]
  Values AFTER : [0.30444951 0.31206225 0.31158785 0.28111194 0.29894349 0.3720947
 0.29205011 0.27238501 0.3387095  0.28292851]
  Test statistic: 0.0000
  p-value: 0.0020
  ✅ Statistically significant difference.

Metric: rec
  Values BEFORE: [0.42560431 0.39289006 0.38364257 0.33538005 0.40103624 0.42771889
 0.41519201 0.3908697  0.38148296 0.37532626]
  Values AFTER : [0.34491727 0.32672438 0.32539116 0.30997857 0.30629193 0.37539878
 0.32676607 0.2994

In [4]:
# KNN, no resampling: 2a results as BEFORE, 3a results as AFTER.

run_wilcoxon_test(
    csv_before="2a_classification_no_resample_KNN.csv",
    csv_after="3a_classification_no_resample_KNN.csv",
    model_name="KNN",
)

=== Wilcoxon Signed-Rank Test for KNN ===

Metric: acc
  Values BEFORE: [0.575      0.56875    0.62291667 0.58125    0.55625    0.58125
 0.5625     0.58541667 0.6        0.60416667]
  Values AFTER : [0.61041667 0.63541667 0.64583333 0.61666667 0.57708333 0.61041667
 0.5875     0.64791667 0.61458333 0.62708333]
  Test statistic: 0.0000
  p-value: 0.0020
  ✅ Statistically significant difference.

Metric: prec
  Values BEFORE: [0.39468687 0.34621439 0.39503158 0.32724418 0.30643721 0.45298992
 0.31655537 0.33724756 0.32669492 0.37131077]
  Values AFTER : [0.43223636 0.44568498 0.44879825 0.46462483 0.3890391  0.42247089
 0.38156328 0.4854866  0.39230383 0.42920728]
  Test statistic: 1.0000
  p-value: 0.0039
  ✅ Statistically significant difference.

Metric: rec
  Values BEFORE: [0.34502789 0.31514886 0.3450748  0.32024551 0.28477988 0.3354685
 0.30402106 0.31407672 0.31053364 0.32621119]
  Values AFTER : [0.40452372 0.42742762 0.43501057 0.43124239 0.35470771 0.37437049
 0.37173787 0.4578

In [5]:
# DT, no resampling: 2a results as BEFORE, 3a results as AFTER.

run_wilcoxon_test(
    csv_before="2a_classification_no_resample_DT.csv",
    csv_after="3a_classification_no_resample_DT.csv",
    model_name="DT",
)

=== Wilcoxon Signed-Rank Test for DT ===

Metric: acc
  Values BEFORE: [0.62708333 0.68541667 0.65416667 0.64375    0.62916667 0.67083333
 0.68125    0.64375    0.6625     0.65833333]
  Values AFTER : [0.65416667 0.63958333 0.64166667 0.63958333 0.6125     0.67916667
 0.67083333 0.66041667 0.67291667 0.69583333]
  Test statistic: 26.0000
  p-value: 0.9023
  ❌ No statistically significant difference.

Metric: prec
  Values BEFORE: [0.40052189 0.45408946 0.42616583 0.41840351 0.411961   0.37676859
 0.40678991 0.38076014 0.39603301 0.42605216]
  Values AFTER : [0.4089952  0.39370561 0.39765225 0.36129274 0.36292735 0.43354421
 0.46330785 0.39920445 0.40080938 0.42442058]
  Test statistic: 24.0000
  p-value: 0.7695
  ❌ No statistically significant difference.

Metric: rec
  Values BEFORE: [0.39270738 0.45374814 0.44783125 0.39683203 0.36899766 0.36740798
 0.41518692 0.37702162 0.39364635 0.39251771]
  Values AFTER : [0.41491339 0.40139808 0.39476621 0.37413497 0.35970842 0.42204276
 0.4246

In [6]:
# MLP, no resampling: 2a results as BEFORE, 3a results as AFTER.
run_wilcoxon_test(
    csv_before="2a_classification_no_resample_MLP.csv",
    csv_after="3a_classification_no_resample_MLP.csv",
    model_name="MLP",
)

=== Wilcoxon Signed-Rank Test for MLP ===

Metric: acc
  Values BEFORE: [0.7        0.73333333 0.69583333 0.7125     0.6875     0.7125
 0.69166667 0.725      0.73125    0.73541667]
  Values AFTER : [0.69620253 0.70512821 0.65894737 0.70440252 0.68631579 0.67016807
 0.66880342 0.71368421 0.69684211 0.68498943]
  Test statistic: 0.0000
  p-value: 0.0020
  ✅ Statistically significant difference.

Metric: prec
  Values BEFORE: [0.39945686 0.39700082 0.34088972 0.38240271 0.41042568 0.46123985
 0.39817053 0.40338982 0.43706119 0.4554337 ]
  Values AFTER : [0.47226573 0.50093395 0.37535741 0.47826632 0.43783154 0.42344137
 0.36849025 0.45795474 0.42937679 0.47406619]
  Test statistic: 11.0000
  p-value: 0.1055
  ❌ No statistically significant difference.

Metric: rec
  Values BEFORE: [0.42706485 0.4454863  0.37931611 0.40716634 0.39103116 0.47477755
 0.43034933 0.41978532 0.45051478 0.44579267]
  Values AFTER : [0.471947   0.51247748 0.38837222 0.45962441 0.42018235 0.43239029
 0.41455978 0.

# Comparison of regression and classification (with simple resampling)


In [7]:
# SVM, simple resampling: 2b results (SVC) as BEFORE, 3b results (SVR) as AFTER.
run_wilcoxon_test(
    csv_before="2b_classification_simple_resample_SVC.csv",
    csv_after="3b_classification_simple_resample_SVR.csv",
    model_name="SVM",
)

=== Wilcoxon Signed-Rank Test for SVM ===

Metric: acc
  Values BEFORE: [0.66060904 0.66699411 0.66257367 0.65324165 0.66011788 0.66159136
 0.65962672 0.65618861 0.67387033 0.67829077]
  Values AFTER : [0.96414538 0.95088409 0.96218075 0.94744597 0.96119843 0.9518664
 0.96070727 0.95383104 0.95333988 0.96070727]
  Test statistic: 0.0000
  p-value: 0.0020
  ✅ Statistically significant difference.

Metric: prec
  Values BEFORE: [0.66605446 0.67585088 0.6711536  0.66296722 0.6752907  0.66528915
 0.66580718 0.6605424  0.6738258  0.68483931]
  Values AFTER : [0.96439907 0.95109733 0.96261357 0.9490023  0.96208804 0.95166325
 0.96207867 0.95475907 0.95358148 0.96175541]
  Test statistic: 0.0000
  p-value: 0.0020
  ✅ Statistically significant difference.

Metric: rec
  Values BEFORE: [0.6590192  0.66624517 0.66501143 0.65350707 0.66434812 0.66272088
 0.66140323 0.6560544  0.67006027 0.67634537]
  Values AFTER : [0.96434354 0.95192889 0.96166536 0.94759412 0.96119101 0.95194191
 0.9612926  0.9

In [8]:
# KNN, simple resampling: 2b results as BEFORE, 3b results as AFTER.
run_wilcoxon_test(
    csv_before="2b_classification_simple_resample_KNN.csv",
    csv_after="3b_classification_simple_resample_KNN.csv",
    model_name="KNN",
)

=== Wilcoxon Signed-Rank Test for KNN ===

Metric: acc
  Values BEFORE: [0.89833006 0.8845776  0.9002947  0.89882122 0.89931238 0.90766208
 0.89390963 0.89440079 0.89931238 0.91257367]
  Values AFTER : [0.96807466 0.9651277  0.96856582 0.95825147 0.96905697 0.97396857
 0.96463654 0.96168959 0.96119843 0.96660118]
  Test statistic: 0.0000
  p-value: 0.0020
  ✅ Statistically significant difference.

Metric: prec
  Values BEFORE: [0.90068461 0.88082071 0.89818292 0.89730693 0.89758435 0.90018354
 0.89318116 0.89189231 0.89542306 0.91418781]
  Values AFTER : [0.96958489 0.96418437 0.96911905 0.95937056 0.96980866 0.97268724
 0.96561094 0.96180282 0.95946845 0.9673363 ]
  Test statistic: 0.0000
  p-value: 0.0020
  ✅ Statistically significant difference.

Metric: rec
  Values BEFORE: [0.90008527 0.88381982 0.89919545 0.90058926 0.89957458 0.90232493
 0.89791131 0.8936896  0.89665605 0.91487365]
  Values AFTER : [0.96847166 0.96535085 0.96888272 0.95859202 0.9694446  0.97255357
 0.96572721 0.

In [9]:
# DT, simple resampling: 2b results as BEFORE, 3b results as AFTER.
run_wilcoxon_test(
    csv_before="2b_classification_simple_resample_DT.csv",
    csv_after="3b_classification_simple_resample_DT.csv",
    model_name="DT",
)

=== Wilcoxon Signed-Rank Test for DT ===

Metric: acc
  Values BEFORE: [0.97642436 0.97937132 0.97544204 0.97642436 0.98182711 0.98526523
 0.96856582 0.97789784 0.97937132 0.97789784]
  Values AFTER : [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
  Test statistic: 0.0000
  p-value: 0.0020
  ✅ Statistically significant difference.

Metric: prec
  Values BEFORE: [0.97749581 0.9791211  0.97532309 0.97780914 0.98232868 0.9844248
 0.96920043 0.97778357 0.98006477 0.97860322]
  Values AFTER : [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
  Test statistic: 0.0000
  p-value: 0.0020
  ✅ Statistically significant difference.

Metric: rec
  Values BEFORE: [0.9769004  0.97899291 0.97447417 0.97685598 0.98130971 0.98451084
 0.97006335 0.97766317 0.9789943  0.97849229]
  Values AFTER : [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
  Test statistic: 0.0000
  p-value: 0.0020
  ✅ Statistically significant difference.

Metric: f1
  Values BEFORE: [0.97688522 0.97899499 0.97452732 0.97695901 0.98142855 0.98439437
 0.96943067 0.9775897  0.9791127

In [10]:
# MLP, simple resampling: 2b results as BEFORE, 3b results as AFTER.
run_wilcoxon_test(
    csv_before="2b_classification_simple_resample_MLP.csv",
    csv_after="3b_classification_simple_resample_MLP.csv",
    model_name="MLP",
)

=== Wilcoxon Signed-Rank Test for MLP ===

Metric: acc
  Values BEFORE: [0.86394892 0.87573674 0.86886051 0.86444008 0.87131631 0.89685658
 0.87131631 0.87966601 0.89685658 0.89685658]
  Values AFTER : [0.95398318 0.95806611 0.95056846 0.93948413 0.95667159 0.94227923
 0.95647873 0.94103072 0.95002474 0.95261599]
  Test statistic: 0.0000
  p-value: 0.0020
  ✅ Statistically significant difference.

Metric: prec
  Values BEFORE: [0.86619385 0.8729429  0.87081574 0.86313239 0.8743764  0.89304277
 0.87073872 0.8786982  0.88961477 0.89846309]
  Values AFTER : [0.95370995 0.95844253 0.95066067 0.94137044 0.95804912 0.9434971
 0.9562755  0.94162555 0.95251716 0.95304113]
  Test statistic: 0.0000
  p-value: 0.0020
  ✅ Statistically significant difference.

Metric: rec
  Values BEFORE: [0.86617    0.87328998 0.87269896 0.86626297 0.87342414 0.89226039
 0.8739047  0.88002496 0.89079868 0.89790476]
  Values AFTER : [0.95362369 0.95859499 0.94988771 0.93994216 0.95710611 0.94373787
 0.95593281 0.9