In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.multitest import multipletests

In [23]:
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.multitest import multipletests


# Direction flag for every metric that gets compared, in reporting order.
# True  -> a higher mean counts as the better result.
# False -> a lower mean counts as the better result.
metrics_info = dict(
    TTR=False,
    LD=False,
    WR=False,
    MDD=False,
    SI=False,
    Cross=False,
    WSF=False,
    LIX=False,
    FRE=True,   # the only metric where higher is better
    Fog=False,
)


# -----------------------------
# Welch t-test + Proper Cohen's d + 95% CI
# -----------------------------
def compare_columns(col1, col2):
    """Compare two independent samples with Welch's t-test.

    Parameters
    ----------
    col1, col2 : array-like of numbers
        The two samples. pandas Series, NumPy arrays and plain sequences
        (lists, tuples) are all accepted. Missing values are NOT removed
        here; drop them before calling.

    Returns
    -------
    tuple
        ``(p_val, cohen_d, diff, ci_low, ci_high)`` where

        * ``p_val``   - two-sided p-value of Welch's unequal-variance t-test,
        * ``cohen_d`` - Cohen's d using the sample-size-weighted pooled SD,
        * ``diff``    - ``mean(col1) - mean(col2)``,
        * ``ci_low``, ``ci_high`` - 95% confidence interval for ``diff``
          based on the Welch standard error and Welch-Satterthwaite
          degrees of freedom.
    """
    # Coerce once so that inputs without a `.mean()` method (lists, tuples)
    # work exactly like Series/ndarrays.
    sample1 = np.asarray(col1, dtype=float)
    sample2 = np.asarray(col2, dtype=float)

    # Welch's t-test (does not assume equal variances)
    t_stat, p_val = stats.ttest_ind(sample1, sample2, equal_var=False)

    # Sample sizes
    n1 = sample1.size
    n2 = sample2.size

    # Means
    mean1 = sample1.mean()
    mean2 = sample2.mean()

    # Unbiased sample variances (ddof=1)
    var1 = sample1.var(ddof=1)
    var2 = sample2.var(ddof=1)

    # -----------------------------
    # Pooled SD, weighted by each sample's degrees of freedom
    # -----------------------------
    pooled_sd = np.sqrt(
        ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)
    )

    cohen_d = (mean1 - mean2) / pooled_sd

    # -----------------------------
    # 95% confidence interval for the mean difference (Welch)
    # -----------------------------
    diff = mean1 - mean2

    # Per-sample squared standard errors; reused for SE and df below.
    se1_sq = var1 / n1
    se2_sq = var2 / n2
    se_diff = np.sqrt(se1_sq + se2_sq)

    # Welch-Satterthwaite degrees of freedom
    df = (se1_sq + se2_sq) ** 2 / (
        se1_sq ** 2 / (n1 - 1) +
        se2_sq ** 2 / (n2 - 1)
    )

    # Two-sided 95% interval -> 97.5th percentile of the t distribution
    t_crit = stats.t.ppf(0.975, df)
    ci_low = diff - t_crit * se_diff
    ci_high = diff + t_crit * se_diff

    return p_val, cohen_d, diff, ci_low, ci_high


# -----------------------------
# Main comparison function
# -----------------------------
def compare_holistic_files(df1, df2, nrows=20):
    """Compare the ``<metric>_Holistic`` columns of two score tables.

    For every metric listed in ``metrics_info`` the two columns are compared
    with ``compare_columns``; the resulting p-values are Holm-adjusted and a
    "Better" verdict is derived from the adjusted p-value and the metric's
    direction flag.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Scores reported under the "Administrative" label.
    df2 : pandas.DataFrame
        Scores reported under the "Ausgangstexte" label.
    nrows : int, default 20
        Only the first ``nrows`` rows of each frame are used.

    Returns
    -------
    pandas.DataFrame
        One row per metric with the columns ``Metric``,
        ``Administrative_mean``, ``Ausgangstexte_mean``, ``Mean_difference``,
        ``CI_lower``, ``CI_upper``, ``p_value``, ``Effect_size_d``,
        ``p_adjusted`` and ``Better`` (numeric columns rounded).

    Raises
    ------
    KeyError
        If a ``<metric>_Holistic`` column is missing from either frame.
    """
    df1 = df1.head(nrows)
    df2 = df2.head(nrows)

    results = []

    for metric in metrics_info:

        # Missing scores are dropped per column, so the two samples may
        # end up with different sizes.
        col1 = df1[f"{metric}_Holistic"].dropna()
        col2 = df2[f"{metric}_Holistic"].dropna()

        p_val, effect, diff, ci_low, ci_high = compare_columns(col1, col2)

        results.append({
            "Metric": metric,
            "Administrative_mean": col1.mean(),
            "Ausgangstexte_mean": col2.mean(),
            "Mean_difference": diff,
            "CI_lower": ci_low,
            "CI_upper": ci_high,
            "p_value": p_val,
            "Effect_size_d": effect
        })

    results_df = pd.DataFrame(results)

    # -----------------------------
    # Holm correction
    # -----------------------------
    # Only metrics with a finite p-value are real tests. An untestable metric
    # (NaN p-value) must not inflate the number of comparisons, and its
    # adjusted p-value stays NaN.
    testable = np.isfinite(results_df["p_value"].to_numpy(dtype=float))
    results_df["p_adjusted"] = np.nan
    if testable.any():
        results_df.loc[testable, "p_adjusted"] = multipletests(
            results_df.loc[testable, "p_value"].to_numpy(dtype=float),
            method="holm"
        )[1]

    # -----------------------------
    # Determine statistically supported "Better"
    # -----------------------------
    def determine_winner(row):
        # Written as `not (p < 0.05)` rather than `p >= 0.05` so that a NaN
        # adjusted p-value is treated as "not significant" instead of
        # falling through and naming a winner.
        if not row["p_adjusted"] < 0.05:
            return "No significant difference"

        if metrics_info[row["Metric"]]:  # higher is better
            return "Administrative" if row["Administrative_mean"] > row["Ausgangstexte_mean"] else "Ausgangstexte"
        else:  # lower is better
            return "Administrative" if row["Administrative_mean"] < row["Ausgangstexte_mean"] else "Ausgangstexte"

    results_df["Better"] = results_df.apply(determine_winner, axis=1)

    # -----------------------------
    # Rounding for clean output
    # -----------------------------
    results_df = results_df.round({
        "Administrative_mean": 3,
        "Ausgangstexte_mean": 3,
        "Mean_difference": 3,
        "CI_lower": 3,
        "CI_upper": 3,
        "p_value": 4,
        "p_adjusted": 4,
        "Effect_size_d": 3
    })

    return results_df


# Qwen

In [27]:

# Load the two qwen3-235b-a22b score tables (Administrative first,
# Ausgangstexte second) and run the holistic comparison on them.
df1, df2 = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/happy522/ChatGPT-as-a-CAT-tool-in-Easy-language-translation_Implemenatation/refs/heads/main/qwen3-235b-a22b/Matrix_Scores_qwen3-235b-a22b_Administrative.csv",
        "https://raw.githubusercontent.com/happy522/ChatGPT-as-a-CAT-tool-in-Easy-language-translation_Implemenatation/refs/heads/main/qwen3-235b-a22b/Matrix_Scores_qwen3-235b-a22b_Ausgangstexte.csv",
    )
)

results_df = compare_holistic_files(df1, df2)

# Bare last expression so the notebook renders the table.
results_df


Unnamed: 0,Metric,Administrative_mean,Ausgangstexte_mean,Mean_difference,CI_lower,CI_upper,p_value,Effect_size_d,p_adjusted,Better
0,TTR,0.491,0.308,0.183,0.138,0.227,0.0,2.37,0.0,Ausgangstexte
1,LD,0.406,0.477,-0.071,-0.108,-0.034,0.0006,-1.377,0.0046,Administrative
2,WR,0.656,0.618,0.038,-0.03,0.105,0.2569,0.401,1.0,No significant difference
3,MDD,2.639,2.459,0.18,0.071,0.29,0.0021,1.025,0.0145,Ausgangstexte
4,SI,0.319,0.326,-0.006,-0.042,0.029,0.7133,-0.109,1.0,No significant difference
5,Cross,5.3,16.267,-10.967,-15.841,-6.093,0.0,-1.132,0.0004,Administrative
6,WSF,4.196,3.945,0.251,-0.042,0.545,0.0911,0.502,0.5365,No significant difference
7,LIX,39.679,39.769,-0.09,-1.931,1.75,0.9216,-0.028,1.0,No significant difference
8,FRE,61.07,63.516,-2.446,-5.285,0.393,0.0894,-0.5,0.5365,No significant difference
9,Fog,10.52,10.232,0.288,-0.423,1.0,0.4181,0.236,1.0,No significant difference


# Llama

In [28]:

# Load the two llama-3.3-70b-instruct score tables (Administrative first,
# Ausgangstexte second) and run the holistic comparison on them.
df1, df2 = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/happy522/ChatGPT-as-a-CAT-tool-in-Easy-language-translation_Implemenatation/refs/heads/main/llama-3.3-70b-instruct/Matrix_Scores_llama-3.3-70b-instruct_Administrative.csv",
        "https://raw.githubusercontent.com/happy522/ChatGPT-as-a-CAT-tool-in-Easy-language-translation_Implemenatation/refs/heads/main/llama-3.3-70b-instruct/Matrix_Scores_llama-3.3-70b-instruct_Ausgangstexte.csv",
    )
)

results_df = compare_holistic_files(df1, df2)

# Bare last expression so the notebook renders the table.
results_df


Unnamed: 0,Metric,Administrative_mean,Ausgangstexte_mean,Mean_difference,CI_lower,CI_upper,p_value,Effect_size_d,p_adjusted,Better
0,TTR,0.504,0.513,-0.009,-0.043,0.025,0.5933,-0.143,1.0,No significant difference
1,LD,0.462,0.505,-0.043,-0.062,-0.024,0.0,-1.363,0.0004,Administrative
2,WR,0.42,0.439,-0.018,-0.041,0.004,0.1046,-0.448,0.7321,No significant difference
3,MDD,2.255,2.232,0.023,-0.104,0.149,0.7192,0.105,1.0,No significant difference
4,SI,0.319,0.295,0.025,-0.026,0.076,0.3279,0.285,1.0,No significant difference
5,Cross,2.55,2.6,-0.05,-1.685,1.585,0.951,-0.018,1.0,No significant difference
6,WSF,3.971,4.292,-0.321,-0.754,0.113,0.1431,-0.43,0.8584,No significant difference
7,LIX,36.456,40.035,-3.579,-6.024,-1.133,0.0051,-0.83,0.0458,Administrative
8,FRE,62.345,60.61,1.735,-3.074,6.545,0.4712,0.203,1.0,No significant difference
9,Fog,10.641,11.581,-0.94,-2.025,0.145,0.0877,-0.494,0.7014,No significant difference


# GPT 5.1

In [29]:
# Load the two GPT 5.1 score tables (Administrative first,
# Ausgangstexte second) and run the holistic comparison on them.
df1, df2 = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/happy522/ChatGPT-as-a-CAT-tool-in-Easy-language-translation_Implemenatation/refs/heads/main/GPT_5_1/Matrix_Scores_GPT5_1_Administrative.csv",
        "https://raw.githubusercontent.com/happy522/ChatGPT-as-a-CAT-tool-in-Easy-language-translation_Implemenatation/refs/heads/main/GPT_5_1/Matrix_Scores_GPT5_1_ausgangstexte.csv",
    )
)

results_df = compare_holistic_files(df1, df2)

# Bare last expression so the notebook renders the table.
results_df


Unnamed: 0,Metric,Administrative_mean,Ausgangstexte_mean,Mean_difference,CI_lower,CI_upper,p_value,Effect_size_d,p_adjusted,Better
0,TTR,0.466,0.44,0.026,-0.044,0.096,0.4564,0.23,1.0,No significant difference
1,LD,0.505,0.562,-0.057,-0.086,-0.028,0.0003,-1.136,0.003,Administrative
2,WR,0.449,0.471,-0.023,-0.053,0.007,0.1332,-0.442,1.0,No significant difference
3,MDD,2.526,2.541,-0.015,-0.259,0.229,0.9017,-0.033,1.0,No significant difference
4,SI,0.231,0.235,-0.004,-0.058,0.051,0.8887,-0.041,1.0,No significant difference
5,Cross,5.35,13.067,-7.717,-14.389,-1.044,0.0246,-0.571,0.2215,No significant difference
6,WSF,4.045,3.956,0.088,-0.443,0.619,0.7391,0.097,1.0,No significant difference
7,LIX,38.399,39.266,-0.867,-4.661,2.928,0.6472,-0.132,1.0,No significant difference
8,FRE,60.362,61.927,-1.565,-7.53,4.401,0.6004,-0.142,1.0,No significant difference
9,Fog,10.28,10.582,-0.302,-1.701,1.097,0.6651,-0.126,1.0,No significant difference
