In [21]:
import pandas as pd
import glob
import numpy as np
import json
import math
from natsort import natsorted
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import classification_report, f1_score, accuracy_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from scipy.stats import friedmanchisquare

In [16]:
# Performance data from the table for 5 models and 4 metrics (Accuracy, Precision, Recall, F1)
# Models: [LightGBM, fastText, Pretrained GPT-2, Pretrained RoBERTa, RoBERTa with Additional Pretraining]
# Metrics order: Accuracy, Precision, Recall, F1
data = np.array([
    [73.91, 75.57, 73.91, 74.13],   # LightGBM
    [75.86, 76.44, 75.90, 76.04],   # fastText
    [81.07, 81.07, 81.07, 81.05],   # Pretrained GPT-2
    [90.00, 89.98, 90.00, 89.97],   # Pretrained RoBERTa
    [90.28, 90.27, 90.27, 90.27]    # Pretrained RoBERTa with Additional Pretraining
])

# Run the Friedman test
stat, p_value = friedmanchisquare(data[:, 0], data[:, 1], data[:, 2], data[:, 3])
print(stat, p_value)

1.9500000000000028 0.58284897105255


In [17]:
import numpy as np
from scipy.stats import friedmanchisquare

# Performance data
data = np.array([
    [73.91, 75.57, 73.91, 74.13],   # LightGBM
    [75.86, 76.44, 75.90, 76.04],   # fastText
    [81.07, 81.07, 81.07, 81.05],   # Pretrained GPT-2
    [90.00, 89.98, 90.00, 89.97],   # Pretrained RoBERTa
    [90.28, 90.27, 90.27, 90.27]    # Pretrained RoBERTa with Additional Pretraining
])

# Extract F1 scores (last column)
f1_scores = data[:, 3]

# Since you need to compare across models, we need to reshape data to fit Friedman test format
f1_data = data[:, [0, 1, 2, 3]].T  # Transpose to have models in rows

# Run the Friedman test
stat, p_value = friedmanchisquare(*f1_data)

print(stat, p_value)

1.9500000000000028 0.58284897105255


In [20]:
import numpy as np
from scikit_posthocs import posthoc_nemenyi

# F1 scores data for each model
data = np.array([
    [74.13],  # LightGBM
    [76.04],  # fastText
    [81.05],  # Pretrained GPT-2
    [89.97],  # Pretrained RoBERTa
    [90.27]   # Pretrained RoBERTa with Additional Pretraining
]).T  # Transpose to fit the test format

# Run Nemenyi test for pairwise comparisons
posthoc_results = posthoc_nemenyi(data.T)  # Transpose back for the post-hoc test

print(posthoc_results)

          1         2         3         4         5
1  1.000000  0.995321  0.938448  0.772482  0.524931
2  0.995321  1.000000  0.995321  0.938448  0.772482
3  0.938448  0.995321  1.000000  0.995321  0.938448
4  0.772482  0.938448  0.995321  1.000000  0.995321
5  0.524931  0.772482  0.938448  0.995321  1.000000


In [13]:
from scipy.stats import f_oneway

for i, metric in enumerate(metrics):
    scores = [data[j, i] for j in range(data.shape[0])]  # scores for the i-th metric
    print(f"Scores for {metric}: {scores}")
    stat, p = f_oneway(*([scores]))  # Unpack the list correctly
    anova_results[metric] = (stat, p)

print(anova_results)


Scores for Accuracy: [73.91, 75.86, 81.07, 90.0, 90.28]


TypeError: at least two inputs are required; got 1.

In [7]:
import pandas as pd
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Convert data to a suitable format
model_names = ['LightGBM', 'fastText', 'Pretrained GPT-2', 'Pretrained RoBERTa', 'Pretrained RoBERTa with Additional Pretraining']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1']

# Create a DataFrame for each metric
df = pd.DataFrame(data, columns=model_names)

# Melt the DataFrame for Tukey's test
df_melted = df.melt(var_name='Model', value_name='Score')

# Perform Tukey's HSD for one metric (e.g., Accuracy)
tukey = pairwise_tukeyhsd(df_melted['Score'][df_melted['variable'] == 'Accuracy'], df_melted['Model'][df_melted['variable'] == 'Accuracy'])
print(tukey)

ValueError: Shape of passed values is (5, 4), indices imply (5, 5)