# In [None]:
from scipy.stats import ttest_rel
import numpy as np

# === STEP 1: Define your macro-F1 results for each model across 5 seeds ===
# Replace these with your actual 5-seed performance values (same order of seeds)

# Each pair below is (single, multi) for one model/task combination.
# Within a pair, position i of both lists must come from the same seed,
# because the scores are compared with a paired test.

# --- BERTweet ---
bertweet_sent_single, bertweet_sent_multi = (
    [0.72, 0.73, 0.74, 0.71, 0.75],
    [0.76, 0.78, 0.77, 0.75, 0.78],
)
bertweet_emotion_single, bertweet_emotion_multi = (
    [0.68, 0.70, 0.69, 0.67, 0.71],
    [0.73, 0.74, 0.75, 0.72, 0.76],
)

# --- DistilRoBERTa ---
distil_sent_single, distil_sent_multi = (
    [0.70, 0.71, 0.72, 0.69, 0.73],
    [0.74, 0.75, 0.76, 0.73, 0.77],
)
distil_emotion_single, distil_emotion_multi = (
    [0.66, 0.67, 0.65, 0.64, 0.68],
    [0.70, 0.72, 0.71, 0.69, 0.73],
)

# === STEP 2: Helper function to run t-test and print results ===
def run_ttest(model_name, task, single_scores, multi_scores, *, alpha=0.05):
    """Run a paired t-test of multi vs. single scores and print a summary.

    The sequences are compared with ``scipy.stats.ttest_rel``, passing
    ``multi_scores`` first, so a positive t-statistic means the multi
    scores have the higher mean.

    Args:
        model_name: Label printed in the report header.
        task: Task label printed next to the model name.
        single_scores: Scores of the "single" variant, one per run.
        multi_scores: Scores of the "multi" variant, paired element-wise
            with ``single_scores`` (same length, same run order).
        alpha: Significance threshold the p-value is compared against.

    Returns:
        A ``(t_statistic, p_value)`` tuple of floats. Both are NaN when
        the test is undefined, e.g. when every paired difference is zero.

    Raises:
        ValueError: Propagated from ``ttest_rel`` when the two sequences
            have different lengths.
    """
    t_stat, p_value = ttest_rel(multi_scores, single_scores)
    t_stat, p_value = float(t_stat), float(p_value)

    print(f"\n📊 {model_name} - {task}")
    print(f"Mean (Single): {np.mean(single_scores):.4f}")
    print(f"Mean (Multi):  {np.mean(multi_scores):.4f}")
    print(f"T-statistic: {t_stat:.4f}")
    print(f"P-value:     {p_value:.4f}")

    if np.isnan(p_value):
        # NaN compares False against any threshold, so without this branch
        # an undefined test would be reported as "not significant".
        print("⚠️ T-test undefined (p-value is NaN); cannot assess significance")
    elif p_value < alpha:
        print(f"✅ Statistically significant difference (p < {alpha})")
    else:
        print(f"❌ No statistically significant difference (p ≥ {alpha})")

    return t_stat, p_value

# === STEP 3: Run t-tests for all comparisons ===
# One row per comparison: (model name, task, single scores, multi scores).
_comparisons = (
    ("BERTweet", "Sentiment", bertweet_sent_single, bertweet_sent_multi),
    ("BERTweet", "Emotion", bertweet_emotion_single, bertweet_emotion_multi),
    ("DistilRoBERTa", "Sentiment", distil_sent_single, distil_sent_multi),
    ("DistilRoBERTa", "Emotion", distil_emotion_single, distil_emotion_multi),
)

# Report each comparison in the order listed above.
for _comparison in _comparisons:
    run_ttest(*_comparison)
