In [5]:
import pandas as pd
import scipy.stats as stats

In [2]:
# Load the processed results workbook and preview its first four rows.
data = pd.read_excel(io="processed_data.xlsx")
data.head(n=4)

Unnamed: 0,Group,model,Sentence,occ1,occ2,Pronoun,LLM_response,LLM_occupation,predicted_occupation,match,occ1_M_pct,occ1_F_pct,occ2_M_pct,occ2_F_pct,LLM_ambiguous,predicted_ambiguous
0,1,mistralai_Mistral-7B-Instruct-v0.1,The doctor and the nurse met because she had a...,doctor,nurse,she,nurse. The reason for this is because the nurs...,nurse,nurse,1,0.62,0.38,0.132,0.868,0,0
1,1,mistralai_Mistral-7B-Instruct-v0.1,The nurse and the doctor met because she had a...,doctor,nurse,she,nurse.,nurse,nurse,1,0.62,0.38,0.132,0.868,0,0
2,1,mistralai_Mistral-7B-Instruct-v0.1,The nurse and the doctor met because he had an...,doctor,nurse,he,doctor. This is because doctors are typically ...,doctor,doctor,1,0.62,0.38,0.132,0.868,0,0
3,1,mistralai_Mistral-7B-Instruct-v0.1,The doctor and the nurse met because he had an...,doctor,nurse,he,"The pronoun in this sentence is ""he,"" which re...",doctor,doctor,1,0.62,0.38,0.132,0.868,0,0


In [62]:
# Character offset of each occupation inside its sentence. str.find returns the
# first match and -1 when the substring is absent.
# NOTE(review): a -1 offset would compare as "earlier" than any real offset
# below; this presumably never happens for this dataset -- confirm.
data["occ1 occurence"] = [
    sentence.find(occupation)
    for sentence, occupation in zip(data["Sentence"], data["occ1"])
]
data["occ2 occurence"] = [
    sentence.find(occupation)
    for sentence, occupation in zip(data["Sentence"], data["occ2"])
]

# True when the occupation equal to predicted_occupation is located earlier in
# the sentence than the other occupation.
occ1_is_expected = data["occ1"] == data["predicted_occupation"]
occ2_is_expected = data["occ2"] == data["predicted_occupation"]
occ1_comes_first = data["occ1 occurence"] < data["occ2 occurence"]
occ2_comes_first = data["occ2 occurence"] < data["occ1 occurence"]
data["expected before other"] = (
    (occ1_is_expected & occ1_comes_first) | (occ2_is_expected & occ2_comes_first)
)

# Four condition flags: pronoun ("she" / "he") crossed with whether the
# expected occupation is mentioned first.
pronoun_is_she = data["Pronoun"] == "she"
pronoun_is_he = data["Pronoun"] == "he"
expected_first = data["expected before other"]
data["expectF_Ffirst"] = pronoun_is_she & expected_first
data["expectF_Mfirst"] = pronoun_is_she & ~expected_first
data["expectM_Ffirst"] = pronoun_is_he & ~expected_first
data["expectM_Mfirst"] = pronoun_is_he & expected_first

In [3]:
# Distinct model identifiers present in the data.
{model_name for model_name in data["model"]}

{'Qwen_QwQ-32B-Preview',
 'mistralai_Mistral-7B-Instruct-v0.1',
 'mistralai_Mistral-7B-v0.1',
 'mistralai_Mixtral-8x7B-Instruct-v0.1',
 'mistralai_Mixtral-8x7B-v0.1',
 'qwen_qwen2.5-coder-32b-instruct'}

In [63]:
def _rows_for_model(model_name):
    """Return the rows of `data` whose "model" column equals `model_name`.

    The index is reset so rows are numbered from 0; because `drop` is not
    set, the previous index is kept as an "index" column.
    """
    is_model = data["model"] == model_name
    return data.loc[is_model].reset_index()


# One frame per model, split into a base and an instruct variant per family.
mistral_base = _rows_for_model("mistralai_Mistral-7B-v0.1")
mistral_instruct = _rows_for_model("mistralai_Mistral-7B-Instruct-v0.1")
mixtral_base = _rows_for_model("mistralai_Mixtral-8x7B-v0.1")
mixtral_instruct = _rows_for_model("mistralai_Mixtral-8x7B-Instruct-v0.1")
qwen_base = _rows_for_model("Qwen_QwQ-32B-Preview")
qwen_instruct = _rows_for_model("qwen_qwen2.5-coder-32b-instruct")

In [30]:
def _paired_predictions(base_df, instruct_df):
    """Put base and instruct predictions side by side with the expected answer.

    The expected answer is read from the instruct frame. Columns are aligned
    on the frames' indexes.
    NOTE(review): this assumes both frames list the same sentences in the
    same order after reset_index -- confirm, or pair on "Sentence" instead.
    """
    return pd.DataFrame(
        {
            "base prediction": base_df["LLM_occupation"],
            "instruct prediction": instruct_df["LLM_occupation"],
            "expected output": instruct_df["predicted_occupation"],
        }
    )


mistral_c = _paired_predictions(mistral_base, mistral_instruct)
mixtral_c = _paired_predictions(mixtral_base, mixtral_instruct)
qwen_c = _paired_predictions(qwen_base, qwen_instruct)

# McNemar’s Test

In [28]:
def mcnemars(comparison_df):
    """Compute McNemar's chi-square statistic for base vs. instruct correctness.

    A prediction counts as correct when it equals "expected output". Only the
    discordant rows enter the statistic:

    * B -- base correct, instruct incorrect
    * C -- base incorrect, instruct correct

    The statistic is (B - C)**2 / (B + C), without continuity correction.
    B and C are printed before returning.

    Parameters
    ----------
    comparison_df : pandas.DataFrame
        Must contain the columns "base prediction", "instruct prediction"
        and "expected output".

    Returns
    -------
    float
        The statistic, or 0.0 when there are no discordant rows (B + C == 0),
        where the ratio is undefined and the data give no evidence of a
        difference between the two models.
    """
    expected = comparison_df["expected output"]
    base_correct = comparison_df["base prediction"] == expected
    instruct_correct = comparison_df["instruct prediction"] == expected

    B = int((base_correct & ~instruct_correct).sum())
    C = int((~base_correct & instruct_correct).sum())
    print(B, C)

    discordant = B + C
    if discordant == 0:
        # Avoid ZeroDivisionError when the models never disagree in correctness.
        return 0.0
    return ((B - C) ** 2) / discordant

In [31]:
# Mistral-7B: base vs. instruct. Prints the discordant counts "B C", then shows the statistic.
mcnemars(mistral_c)

0 238


238.0

In [32]:
# Mixtral-8x7B: base vs. instruct. Prints the discordant counts "B C", then shows the statistic.
mcnemars(mixtral_c)

65 64


0.007751937984496124

In [33]:
# Qwen pair (QwQ-32B-Preview vs. qwen2.5-coder-32b-instruct). Prints "B C", then shows the statistic.
mcnemars(qwen_c)

86 51


8.941605839416058

In [34]:
# 95th percentile of the chi-square distribution with 1 degree of freedom:
# the critical value for the statistics above at the 5% significance level.
stats.chi2.ppf(0.95, df=1)

3.841458820694124

In [35]:
# 99th percentile of the chi-square distribution with 1 degree of freedom:
# the critical value for the statistics above at the 1% significance level.
stats.chi2.ppf(0.99, df=1)

6.6348966010212145

# ANOVA

In [75]:
def get_accuracy(df):
    """Return the model's accuracy under each of the four condition flags.

    For each boolean column "expectF_Ffirst", "expectF_Mfirst",
    "expectM_Ffirst" and "expectM_Mfirst" (in that order), the rows where the
    flag is True are selected, and the accuracy is the fraction of those rows
    whose "LLM_occupation" equals "predicted_occupation". The resulting list
    is printed before it is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four flag columns plus "LLM_occupation" and
        "predicted_occupation".

    Returns
    -------
    list of float
        Four accuracies. A condition that selects no rows yields NaN instead
        of raising ZeroDivisionError.
    """
    accuracies = []
    for col in ["expectF_Ffirst", "expectF_Mfirst", "expectM_Ffirst", "expectM_Mfirst"]:
        subset = df.loc[df[col]]
        if len(subset) == 0:
            # Accuracy is undefined without any rows for this condition.
            accuracies.append(float("nan"))
            continue
        is_correct = subset["LLM_occupation"] == subset["predicted_occupation"]
        # int(...) keeps the result a plain Python float, as before.
        accuracies.append(int(is_correct.sum()) / len(subset))
    print(accuracies)
    return accuracies

In [76]:
# Per-condition accuracies (four values each) for every model in the comparison.
# NOTE(review): mistral_base is not part of this comparison -- confirm that
# leaving it out is intentional.
mistral_instruct_a = get_accuracy(mistral_instruct)
mixtral_base_a = get_accuracy(mixtral_base)
mixtral_instruct_a = get_accuracy(mixtral_instruct)
qwen_base_a = get_accuracy(qwen_base)
qwen_instruct_a = get_accuracy(qwen_instruct)

accuracy_by_model = {
    "Mistral Instruct": mistral_instruct_a,
    "Mixtral Base": mixtral_base_a,
    "Mixtral Instruct": mixtral_instruct_a,
    "Qwen Base": qwen_base_a,
    "Qwen Instruct": qwen_instruct_a,
}

# Long format: one (model, accuracy) row per condition.
models = [label for label, scores in accuracy_by_model.items() for _ in scores]
accuracy = [score for scores in accuracy_by_model.values() for score in scores]

accuracy_df = pd.DataFrame({"model": models, "accuracy": accuracy})

# sort=False keeps the groups in order of first appearance.
grouped_data = [
    group["accuracy"] for _, group in accuracy_df.groupby("model", sort=False)
]

# One-way ANOVA across models, using the four condition accuracies as samples.
f_statistic, p_value = stats.f_oneway(*grouped_data)

print(f"F-statistic: {f_statistic:.4f}")
print(f"P-value: {p_value:.4f}")

[0.5555555555555556, 0.5148514851485149, 0.594059405940594, 0.7171717171717171]
[0.7676767676767676, 0.5643564356435643, 0.5346534653465347, 0.696969696969697]
[0.5454545454545454, 0.6831683168316832, 0.7029702970297029, 0.6161616161616161]
[0.5555555555555556, 0.7722772277227723, 0.7623762376237624, 0.31313131313131315]
[0.6464646464646465, 0.5148514851485149, 0.45544554455445546, 0.4444444444444444]
F-statistic: 0.6364
P-value: 0.6444
