In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu, ttest_ind
from statsmodels.stats.multitest import multipletests


# GPT 3.5 - Load all isolated results

In [3]:
data_path = Path('data')

# find files
files = list(data_path.glob('results_scenario*gpt-3.5*.csv'))

df_single = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)



In [4]:
df_single

Unnamed: 0,gen_id,answer,mfv_code,model
0,1,3,105,gpt-3.5-turbo
1,2,4,105,gpt-3.5-turbo
2,3,3,105,gpt-3.5-turbo
3,4,3,105,gpt-3.5-turbo
4,5,3,105,gpt-3.5-turbo
...,...,...,...,...
1890,104,5,808,gpt-3.5-turbo
1891,105,2,808,gpt-3.5-turbo
1892,106,4,808,gpt-3.5-turbo
1893,107,4,808,gpt-3.5-turbo


Checking correct-answer effect

In [47]:
df_single.groupby("mfv_code")["answer"].nunique()

mfv_code
105    4
111    4
201    3
203    3
205    3
206    3
408    3
409    3
501    5
503    3
509    4
602    5
610    4
702    5
704    5
808    5
Name: answer, dtype: int64

## Loads general data

In [15]:
df_in_series = pd.read_csv("data/results_original_gpt-3.5-turbo_2023-08-13_22-43.csv")

df_in_series.head(2)

Unnamed: 0,102,103,104,105,108,109,110,111,112,113,...,714,715,716,801,802,803,804,805,808,810
0,4,4,3,4,4,3,4,3,4,3,...,5,4,5,5,3,5,4,3,3,3
1,4,3,3,4,4,2,2,3,4,2,...,5,4,5,5,4,5,5,4,4,4


Run tests

In [71]:
from numpy import mean, std, median
from math import sqrt
def cohen_d(x,y):
        return (mean(x) - mean(y)) / sqrt((std(x, ddof=1) ** 2 + std(y, ddof=1) ** 2) / 2.0)


In [80]:
np.mad

AttributeError: module 'numpy' has no attribute 'mad'

In [82]:
import scipy.stats
test_results = list()

for code in df_single["mfv_code"].unique():
    single_answers = df_single[df_single["mfv_code"] == code]["answer"]

    series_answers = df_in_series[str(code)]
    print(
        f"Code: {code} had {df_in_series[str(code)].nunique()} different answers when generated for all vignettes"
    )

    test_results.append(
        (
            code,
            median(single_answers),
            scipy.stats.median_abs_deviation(single_answers),
            median(series_answers),
            scipy.stats.median_abs_deviation(series_answers),
            cohen_d(single_answers, series_answers),
            *mannwhitneyu(single_answers, series_answers),
        )
    )

df_tests = pd.DataFrame(
    test_results,
    columns=["mfv_code", "mediana_single", "MAD_single", "mediana_serial", "MAD_serial", "cohen_d", "u", "p-value"],
)


Code: 205 had 3 different answers when generated for all vignettes
Code: 111 had 3 different answers when generated for all vignettes
Code: 203 had 2 different answers when generated for all vignettes
Code: 105 had 4 different answers when generated for all vignettes
Code: 201 had 2 different answers when generated for all vignettes
Code: 602 had 3 different answers when generated for all vignettes
Code: 409 had 4 different answers when generated for all vignettes
Code: 808 had 5 different answers when generated for all vignettes
Code: 501 had 4 different answers when generated for all vignettes
Code: 509 had 4 different answers when generated for all vignettes
Code: 704 had 5 different answers when generated for all vignettes
Code: 702 had 4 different answers when generated for all vignettes
Code: 610 had 4 different answers when generated for all vignettes
Code: 503 had 3 different answers when generated for all vignettes
Code: 408 had 4 different answers when generated for all vigne

In [83]:
df_tests["rejected"], df_tests["adjusted_p-value"], _, _ = multipletests(
    df_tests["p-value"],
    alpha=0.05,
    method="bonferroni"
)

In [84]:
df_single.groupby("mfv_code").size()

mfv_code
105    106
111    179
201    100
203     99
205     87
206     96
408     90
409     97
501    100
503     99
509    202
602    100
610     91
702    174
704    103
808    172
dtype: int64

In [85]:
df_tests["median_dif"] = df_tests["mediana_single"] - df_tests["mediana_serial"]


df_tests

Unnamed: 0,mfv_code,mediana_single,MAD_single,mediana_serial,MAD_serial,cohen_d,u,p-value,rejected,adjusted_p-value,median_dif
0,205,4.0,0.0,4.0,0.0,-0.703878,3732.5,3.203127e-06,True,5.125003e-05,0.0
1,111,3.0,0.0,3.0,0.0,0.89069,15465.0,5.02797e-13,True,8.044752e-12,0.0
2,203,4.0,0.0,5.0,0.0,-1.837652,1917.0,7.403682e-28,True,1.184589e-26,-1.0
3,105,4.0,0.0,4.0,0.0,-0.264728,5709.5,0.02445954,False,0.3913526,0.0
4,201,5.0,0.0,5.0,0.0,-0.727063,4772.0,1.569465e-08,True,2.511144e-07,0.0
5,602,3.0,0.0,3.0,0.0,-0.368304,5279.5,0.006178656,False,0.09885849,0.0
6,409,4.0,0.0,4.0,0.0,-0.599308,4305.5,1.046135e-05,True,0.0001673816,0.0
7,808,4.0,1.0,4.0,1.0,-0.086182,10382.5,0.5125071,False,1.0,0.0
8,501,3.0,0.0,3.0,0.0,0.171528,6662.5,0.351101,False,1.0,0.0
9,509,3.0,0.0,4.0,0.0,-2.241929,2136.0,4.309372e-46,True,6.894995e-45,-1.0
