In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, median_abs_deviation, ttest_ind
from statsmodels.stats.multitest import multipletests


# GPT-3.5 - Load all isolated results

In [2]:
data_path = Path('data')

# Collect every per-scenario GPT-3.5 result file. Path.glob yields entries in
# filesystem order, which is not guaranteed to be stable, so sort the paths to
# keep the concatenated row order (and anything derived from it) reproducible.
files = sorted(data_path.glob('single_scenario/results_scenario*gpt-3.5*.csv'))

# Stack all files into one frame with a fresh 0..n-1 index.
df_single = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)



In [3]:
# Display the concatenated isolated-scenario results as a sanity check.
df_single

Unnamed: 0,gen_id,answer,mfv_code,model
0,1,3,105,gpt-3.5-turbo
1,2,4,105,gpt-3.5-turbo
2,3,3,105,gpt-3.5-turbo
3,4,3,105,gpt-3.5-turbo
4,5,3,105,gpt-3.5-turbo
...,...,...,...,...
1701,104,5,808,gpt-3.5-turbo
1702,105,2,808,gpt-3.5-turbo
1703,106,4,808,gpt-3.5-turbo
1704,107,4,808,gpt-3.5-turbo


Checking correct-answer effect

In [4]:
# Number of distinct answers observed for each scenario code in the isolated runs.
df_single.groupby("mfv_code")["answer"].nunique()

mfv_code
105    4
111    3
201    3
203    3
205    3
206    3
408    3
409    3
501    5
503    3
509    4
602    5
610    4
702    5
704    5
808    5
Name: answer, dtype: int64

## Load the in-series (aggregated) results

In [5]:
# Load the results file that is compared against the isolated runs below.
# The preview shows one column per scenario code and one row per generation.
df_in_series = pd.read_csv("data/results_original_gpt-3.5-turbo_2023-08-13_22-43.csv")

df_in_series.head(2)

Unnamed: 0,102,103,104,105,108,109,110,111,112,113,...,714,715,716,801,802,803,804,805,808,810
0,4,4,3,4,4,3,4,3,4,3,...,5,4,5,5,3,5,4,3,3,3
1,4,3,3,4,4,2,2,3,4,2,...,5,4,5,5,4,5,5,4,4,4


Run tests

In [6]:
def cohen_d(x, y):
    """Return Cohen's d effect size between two samples.

    The difference of the sample means is divided by the root of the average
    of the two sample variances (``ddof=1``).

    Parameters
    ----------
    x, y : array-like
        The two samples to compare.

    Returns
    -------
    float
        The effect size, or ``nan`` when both samples have zero variance
        (the effect size is undefined in that case).
    """
    pooled_sd = np.sqrt((np.std(x, ddof=1) ** 2 + np.std(y, ddof=1) ** 2) / 2.0)
    # numpy floats do not raise ZeroDivisionError on division by zero; they
    # yield +/-inf or nan with a RuntimeWarning. Check the denominator
    # explicitly so the undefined case consistently maps to nan.
    if pooled_sd == 0:
        return np.nan
    return (np.mean(x) - np.mean(y)) / pooled_sd



In [7]:
def compare_single_series(df_single, df_in_series, alpha=0.05, method="bonferroni"):
    """Compare isolated answers against in-series answers for every scenario code.

    For each distinct ``mfv_code`` in ``df_single``, the ``answer`` values for
    that code are compared with the column of ``df_in_series`` named
    ``str(code)`` using a Mann-Whitney U test (SciPy defaults). The resulting
    p-values are then corrected for multiple comparisons.

    A notice is printed for every code where either sample has fewer than
    three distinct answers; the test is still run for those codes.

    Parameters
    ----------
    df_single : pandas.DataFrame
        Long-format frame with at least the columns ``mfv_code`` and ``answer``.
    df_in_series : pandas.DataFrame
        Wide-format frame with one column per scenario code; column labels are
        the codes as strings.
    alpha : float, default 0.05
        Family-wise error rate passed to ``multipletests``.
    method : str, default "bonferroni"
        Correction method passed to ``multipletests``.

    Returns
    -------
    pandas.DataFrame
        One row per code with the median and median absolute deviation of each
        sample, Cohen's d, the U statistic, the raw p-value, the rejection
        flag, the adjusted p-value (rounded to 4 decimals) and the difference
        of medians (single minus serial).

    Raises
    ------
    KeyError
        If ``df_in_series`` has no column for one of the codes.
    """
    columns = [
        "mfv_code",
        "mediana_single",
        "MAD_single",
        "mediana_serial",
        "MAD_serial",
        "cohen_d",
        "u",
        "p-value",
    ]
    records = []

    for code in df_single["mfv_code"].unique():
        single_answers = df_single.loc[df_single["mfv_code"] == code, "answer"]
        series_answers = df_in_series[str(code)]

        # Flag low-variability samples; results for them deserve extra care.
        n_single = single_answers.nunique()
        n_series = series_answers.nunique()
        if n_single < 3 or n_series < 3:
            print(
                f"Code: {code} had {n_single} different answers when generated isolated"
            )
            print(
                f"\tand {n_series} different answers when generated aggregated"
            )

        u_stat, p_value = mannwhitneyu(single_answers, series_answers)
        records.append(
            {
                "mfv_code": code,
                "mediana_single": np.median(single_answers),
                "MAD_single": median_abs_deviation(single_answers),
                "mediana_serial": np.median(series_answers),
                "MAD_serial": median_abs_deviation(series_answers),
                "cohen_d": cohen_d(single_answers, series_answers),
                "u": u_stat,
                "p-value": p_value,
            }
        )

    df_tests = pd.DataFrame(records, columns=columns)

    rejected, adjusted, _, _ = multipletests(
        df_tests["p-value"],
        alpha=alpha,
        method=method,
    )
    df_tests["rejected"] = rejected
    # Round the adjusted p-value to 4 decimals for display.
    df_tests["adjusted_p-value"] = np.round(adjusted, 4)

    df_tests["median_dif"] = df_tests["mediana_single"] - df_tests["mediana_serial"]
    return df_tests

In [8]:
# Number of isolated answers (rows) available per scenario code.
df_single.groupby("mfv_code").size()

mfv_code
105    106
111     91
201    100
203     99
205     87
206     96
408     90
409     97
501    100
503     99
509    101
602    100
610     91
702    174
704    103
808    172
dtype: int64

In [9]:
# Run the isolated-vs-series comparison for GPT-3.5 and show the per-code table.
tests_gpt_35 = compare_single_series(df_single, df_in_series)

tests_gpt_35

Code: 201 had 3 different answers when generated isolated
	and 2 different answers when generated aggregated
Code: 203 had 3 different answers when generated isolated
	and 2 different answers when generated aggregated


Unnamed: 0,mfv_code,mediana_single,MAD_single,mediana_serial,MAD_serial,cohen_d,u,p-value,rejected,adjusted_p-value,median_dif
0,105,4.0,0.0,4.0,0.0,-0.264728,5709.5,0.02445954,False,0.3914,0.0
1,111,3.0,0.0,3.0,0.0,1.007794,8132.5,2.721096e-11,True,0.0,0.0
2,201,5.0,0.0,5.0,0.0,-0.727063,4772.0,1.569465e-08,True,0.0,0.0
3,203,4.0,0.0,5.0,0.0,-1.837652,1917.0,7.403682e-28,True,0.0,-1.0
4,205,4.0,0.0,4.0,0.0,-0.703878,3732.5,3.203127e-06,True,0.0001,0.0
5,206,4.0,0.0,4.0,0.0,0.158092,6541.5,0.225642,False,1.0,0.0
6,408,3.0,0.0,3.0,0.0,0.686282,7325.0,4.766079e-06,True,0.0001,0.0
7,409,4.0,0.0,4.0,0.0,-0.599308,4305.5,1.046135e-05,True,0.0002,0.0
8,501,3.0,0.0,3.0,0.0,0.171528,6662.5,0.351101,False,1.0,0.0
9,503,3.0,0.0,4.0,0.0,-1.489023,2345.0,1.6348489999999999e-19,True,0.0,-1.0


In [10]:
# Shape of the subset of tests flagged as rejected after correction;
# the first element is the number of such scenario codes.
tests_gpt_35.query("rejected == True").shape

(9, 11)

# GPT-4

In [11]:
data_path = Path('data')

# Collect every per-scenario GPT-4 result file. Path.glob yields entries in
# filesystem order, which is not guaranteed to be stable, so sort the paths to
# keep the concatenated row order (and anything derived from it) reproducible.
files = sorted(data_path.glob('single_scenario/results_scenario*gpt-4*.csv'))

# Stack all files into one frame with a fresh 0..n-1 index.
single_gpt4 = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)



Checking correct-answer effect

In [12]:
# Number of distinct answers per scenario code in the isolated GPT-4 runs.
# Uses the same groupby form as the GPT-3.5 check; unlike pivot, it does not
# raise when a (gen_id, mfv_code) pair occurs more than once.
single_gpt4.groupby("mfv_code")["answer"].nunique()

mfv_code
105    1
111    1
201    1
203    1
205    1
206    1
408    1
409    1
501    1
503    1
509    1
602    1
610    1
702    1
704    1
808    1
dtype: int64

In [13]:
# Read every in-series GPT-4 result file. Paths are sorted so the row order is
# reproducible, and ignore_index=True gives the combined frame a unique
# 0..n-1 index instead of repeating each file's own row labels.
series_gpt4 = pd.concat(
    [pd.read_csv(f) for f in sorted(data_path.glob("results_original_gpt-4*.csv"))],
    ignore_index=True,
)

series_gpt4.head(2)

Unnamed: 0,102,103,104,105,108,109,110,111,112,113,...,714,715,716,801,802,803,804,805,808,810
0,4,4,4,5,5,3,5,4,5,3,...,2,2,4,5,4,5,5,4,4,4
1,4,4,4,5,5,3,5,3,5,4,...,5,3,4,5,5,5,5,4,5,5


In [14]:
# Number of distinct answers in each column of the GPT-4 in-series frame.
series_gpt4.nunique()

102    2
103    3
104    3
105    2
108    1
      ..
803    1
804    2
805    4
808    2
810    4
Length: 68, dtype: int64

In [15]:
# Run the isolated-vs-series comparison for GPT-4, then report the shape of the
# subset flagged as rejected (first element = number of such scenario codes).
tests_gpt4 = compare_single_series(single_gpt4, series_gpt4)
tests_gpt4.query("rejected == True").shape

Code: 105 had 1 different answers when generated isolated
	and 2 different answers when generated aggregated
Code: 111 had 1 different answers when generated isolated
	and 4 different answers when generated aggregated
Code: 201 had 1 different answers when generated isolated
	and 1 different answers when generated aggregated
Code: 203 had 1 different answers when generated isolated
	and 1 different answers when generated aggregated
Code: 205 had 1 different answers when generated isolated
	and 2 different answers when generated aggregated
Code: 206 had 1 different answers when generated isolated
	and 2 different answers when generated aggregated
Code: 408 had 1 different answers when generated isolated
	and 4 different answers when generated aggregated
Code: 409 had 1 different answers when generated isolated
	and 3 different answers when generated aggregated
Code: 501 had 1 different answers when generated isolated
	and 3 different answers when generated aggregated
Code: 503 had 1 dif

(12, 11)

In [16]:
# Show the GPT-4 scenario codes whose test was rejected after correction.
tests_gpt4.query("rejected == True")

Unnamed: 0,mfv_code,mediana_single,MAD_single,mediana_serial,MAD_serial,cohen_d,u,p-value,rejected,adjusted_p-value,median_dif
0,105,4.0,0.0,5.0,0.0,-3.727876,918.0,7.622527999999999e-42,True,0.0,-1.0
1,111,3.0,0.0,4.0,1.0,-1.170618,3726.0,4.728379e-16,True,0.0,-1.0
5,206,5.0,0.0,5.0,0.0,0.635678,8586.0,7.478582e-06,True,0.0001,0.0
6,408,3.0,0.0,4.0,0.5,-1.401548,3348.0,1.6193429999999998e-19,True,0.0,-1.0
7,409,4.0,0.0,5.0,0.0,-2.447448,1620.0,1.230372e-33,True,0.0,-1.0
8,501,3.0,0.0,4.5,0.5,-3.842984,162.0,3.914587e-45,True,0.0,-1.5
9,503,3.0,0.0,5.0,0.0,-6.193507,54.0,3.671654e-49,True,0.0,-2.0
10,509,4.0,0.0,5.0,0.0,-0.984179,3942.0,1.914391e-13,True,0.0,-1.0
11,602,3.0,0.0,4.0,0.0,-3.500006,270.0,4.25992e-44,True,0.0,-1.0
12,610,3.0,0.0,4.0,1.0,-1.11635,3618.0,3.004363e-16,True,0.0,-1.0
