In [7]:
import pandas as pd
import numpy as np
from scipy import stats

# Fetch the score table of each model and order its rows by "ID" with a fresh
# 0..n-1 index.
# NOTE(review): later code reads the same column from both frames, which
# presumably relies on both files listing the same IDs -- confirm.
df_4_0 = (
    pd.read_csv("https://raw.githubusercontent.com/happy522/ChatGPT-as-a-CAT-tool-in-Easy-language-translation_Implemenatation/refs/heads/main/GPT_4_o/Matrix_Scores_GPT4_0.csv")
    .sort_values("ID")
    .reset_index(drop=True)
)
df_4_1 = (
    pd.read_csv("https://raw.githubusercontent.com/happy522/ChatGPT-as-a-CAT-tool-in-Easy-language-translation_Implemenatation/refs/heads/main/GPT_4_1/Matrix_Scores_GPT4_1.csv")
    .sort_values("ID")
    .reset_index(drop=True)
)

# Metric names are derived from the columns of the first table that end in
# "_Holistic", with that marker removed.
metrics = [
    column_name.replace("_Holistic", "")
    for column_name in df_4_0.columns
    if column_name.endswith("_Holistic")
]

# Orientation of each metric: a smaller value is the better one for these ...
lower_better = [
    "TTR", "LD", "WR", "MDD", "SI", "Cross", "WSF", "LIX", "Fog",
]
# ... and a larger value is the better one for these.
higher_better = ["FRE"]

# Function to compute statistical comparison
def compare_metrics(vals_0, vals_1):
    """Compare two paired samples with a t-test or a Wilcoxon test.

    The paired differences ``vals_0 - vals_1`` are checked for normality
    with the Shapiro-Wilk test.  When normality is not rejected
    (p > 0.05) a paired t-test is used; otherwise the Wilcoxon
    signed-rank test is used.

    Args:
        vals_0: First sample; any 1-D array-like of numbers (list, NumPy
            array, pandas Series, ...).
        vals_1: Second sample, same length as ``vals_0``.  Observations
            are paired by position.

    Returns:
        A tuple ``(p_val, effect, test, significance)``:

        * ``p_val`` -- p-value of the selected test.
        * ``effect`` -- for the t-test, the mean difference divided by the
          sample standard deviation of the differences (``ddof=1``); for
          Wilcoxon, ``(#positive - #negative differences) / n``.
        * ``test`` -- ``"t-test"`` or ``"Wilcoxon"``.
        * ``significance`` -- ``"Yes"`` if ``p_val < 0.05`` else ``"No"``.

    Raises:
        ValueError: If the two samples do not have the same shape.
    """
    # Work on plain float arrays so that pairing is positional everywhere.
    # Subtracting pandas Series directly aligns them by index label, which
    # can disagree with the positional pairing used by the scipy tests.
    sample_0 = np.asarray(vals_0, dtype=float)
    sample_1 = np.asarray(vals_1, dtype=float)
    if sample_0.shape != sample_1.shape:
        raise ValueError(
            "Paired samples must have the same shape, got "
            f"{sample_0.shape} and {sample_1.shape}"
        )

    diff = sample_0 - sample_1

    # Normality of the paired differences decides which test is applied.
    shapiro_p = stats.shapiro(diff).pvalue

    if shapiro_p > 0.05:
        p_val = stats.ttest_rel(sample_0, sample_1).pvalue
        effect = diff.mean() / np.std(diff, ddof=1)
        test = "t-test"
    else:
        p_val = stats.wilcoxon(sample_0, sample_1).pvalue
        # Share of positive minus share of negative differences.
        effect = (np.sum(diff > 0) - np.sum(diff < 0)) / len(diff)
        test = "Wilcoxon"

    significance = "Yes" if p_val < 0.05 else "No"
    return p_val, effect, test, significance

# Build results
def build_results(df0, df1):
    """Build per-method comparison tables for the two score frames.

    For every name in the module-level ``metrics`` list and for each of
    the methods "Holistic" and "Linguistic", the column
    ``f"{metric}_{method}"`` is read from both frames, the column means
    are taken and ``compare_metrics`` is run on the two columns.  The
    winner is chosen from the means alone: the smaller mean wins when the
    metric is listed in the module-level ``lower_better``, the larger mean
    wins otherwise, and "Tie" is reported when neither mean is smaller.

    Args:
        df0: Score frame reported under the "GPT-4.0" labels.
        df1: Score frame reported under the "GPT-4.1" labels.

    Returns:
        ``(holistic_df, linguistic_df)`` -- two DataFrames with one row
        per metric, in the order of ``metrics``.
    """

    def summarise(metric, method):
        # One result row for a single metric/method combination.
        column = f"{metric}_{method}"
        scores_a, scores_b = df0[column], df1[column]
        avg_a, avg_b = scores_a.mean(), scores_b.mean()

        p_val, effect, test, significance = compare_metrics(scores_a, scores_b)

        # Metrics not listed in lower_better are treated as higher-is-better.
        prefer_low = metric in lower_better
        if avg_a < avg_b:
            winner = "GPT-4.0" if prefer_low else "GPT-4.1"
        elif avg_a > avg_b:
            winner = "GPT-4.1" if prefer_low else "GPT-4.0"
        else:
            winner = "Tie"

        return {
            "Metric": metric,
            "Mean GPT-4.0": round(avg_a, 3),
            "Mean GPT-4.1": round(avg_b, 3),
            "Winner": winner,
            "Stat Test": test,
            "p-value": round(p_val, 4),
            "Significant": significance,
            "Effect Size": round(effect, 2),
        }

    holistic_df, linguistic_df = (
        pd.DataFrame([summarise(metric, method) for metric in metrics])
        for method in ("Holistic", "Linguistic")
    )
    return holistic_df, linguistic_df

# Compute both comparison tables from the two loaded score frames.
holistic_df, linguistic_df = build_results(df_4_0, df_4_1)

# Print each table under its banner line.
for banner, table in (
    ("\n===== Holistic Results =====\n", holistic_df),
    ("\n===== Linguistic Results =====\n", linguistic_df),
):
    print(banner)
    print(table)



===== Holistic Results =====

  Metric  Mean GPT-4.0  Mean GPT-4.1   Winner Stat Test  p-value Significant  \
0    TTR         0.545         0.484  GPT-4.1    t-test   0.0001         Yes   
1     LD         0.483         0.480  GPT-4.1    t-test   0.7855          No   
2     WR         0.437         0.419  GPT-4.1    t-test   0.0251         Yes   
3    MDD         2.420         2.428  GPT-4.0    t-test   0.8970          No   
4     SI         0.333         0.335  GPT-4.0    t-test   0.8981          No   
5  Cross         3.150         2.550  GPT-4.1  Wilcoxon   0.3986          No   
6    WSF         4.361         3.874  GPT-4.1    t-test   0.0035         Yes   
7    LIX        38.398        35.837  GPT-4.1    t-test   0.0250         Yes   
8    FRE        60.484        65.886  GPT-4.1    t-test   0.0009         Yes   
9    Fog        11.215         9.992  GPT-4.1    t-test   0.0030         Yes   

   Effect Size  
0         1.14  
1         0.06  
2         0.54  
3        -0.03  
4  