In [1]:
import random

import openai
import numpy as np
import pandas as pd
from getpass import getpass

from helpers import run_random_order

# openai.api_key = getpass()

In [2]:
# NOTE(review): generation step left disabled; the following cell reads a saved
# results CSV instead - presumably the output of a call like this, TODO confirm.
# df = run_random_order(128, "gpt-3.5-turbo")

# df.head(2)

In [3]:
# Load the saved gpt-3.5-turbo results stored under data/random_order/.
# This frame is later passed to compare_randomized as the randomized condition.
df_randomized_order = pd.read_csv("data/random_order/results_gpt-3.5-turbo_2023-08-30_12-45.csv")

In [4]:
# Display the loaded frame; pandas truncates the rendering of long/wide frames.
df_randomized_order

Unnamed: 0,102,103,104,105,108,109,110,111,112,113,...,714,715,716,801,802,803,804,805,808,810
0,3,3,3,3,4,3,3,3,3,3,...,4,3,2,5,5,5,5,5,5,5
1,2,3,2,3,4,2,3,4,3,3,...,4,4,3,5,5,5,5,3,5,5
2,2,2,3,2,2,2,2,4,2,2,...,3,2,2,1,1,1,1,2,5,1
3,2,3,3,3,5,3,3,4,3,3,...,3,4,2,5,4,4,5,3,4,3
4,1,1,1,2,1,1,2,1,2,1,...,3,5,4,5,5,5,5,2,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,2,5,4,5,5,3,4,4,3,3,...,1,5,4,5,5,5,5,5,5,4
123,2,4,3,3,4,4,3,4,3,4,...,4,5,4,5,5,5,5,4,5,4
124,3,4,2,5,5,3,3,3,3,4,...,4,3,3,5,5,5,5,3,5,5
125,2,1,2,3,1,1,3,4,2,2,...,2,1,2,5,5,4,5,4,4,2


In [6]:
# Load the saved results that are later passed to compare_randomized as the
# in-order condition (the baseline of the comparison).
df_ordered = pd.read_csv("data/results_original_gpt-3.5-turbo_2023-08-13_22-43.csv")

# Preview the first two rows only.
df_ordered.head(2)

Unnamed: 0,102,103,104,105,108,109,110,111,112,113,...,714,715,716,801,802,803,804,805,808,810
0,4,4,3,4,4,3,4,3,4,3,...,5,4,5,5,3,5,4,3,3,3
1,4,3,3,4,4,2,2,3,4,2,...,5,4,5,5,4,5,5,4,4,4


In [7]:
def cohen_d(x, y):
    """Return Cohen's d effect size between the samples ``x`` and ``y``.

    The difference of the means (``x`` minus ``y``) is divided by the square
    root of the unweighted average of the two sample variances, each computed
    with ``ddof=1``.
    """
    from math import sqrt

    mean_gap = np.mean(x) - np.mean(y)
    variance_x = np.std(x, ddof=1) ** 2
    variance_y = np.std(y, ddof=1) ** 2
    pooled_sd = sqrt((variance_x + variance_y) / 2.0)
    return mean_gap / pooled_sd


def compare_randomized(df_ordered, df_randomized):
    """Compare, code by code, the in-order answers with the randomized-order answers.

    For every column label of ``df_ordered`` the same-labelled column is read
    from ``df_randomized``. Both columns are summarised (median and median
    absolute deviation), their effect size is computed with ``cohen_d`` and a
    Mann-Whitney U test is run. The resulting p-values are Bonferroni-corrected
    across all codes at ``alpha=0.05``.

    As a side effect, a two-line notice is printed for every code where either
    run produced fewer than 3 distinct answers.

    Parameters
    ----------
    df_ordered : pandas.DataFrame
        One column per code; its column labels drive the comparison.
    df_randomized : pandas.DataFrame
        Must contain a column for every column label of ``df_ordered``.

    Returns
    -------
    pandas.DataFrame
        One row per code with the columns ``mfv_code``, ``mediana_ordered``,
        ``MAD_ordered``, ``mediana_randomized``, ``MAD_randomized``,
        ``cohen_d``, ``u``, ``p-value``, ``rejected``, ``adjusted_p-value``
        and ``median_dif`` (ordered median minus randomized median).

    Raises
    ------
    KeyError
        If a column label of ``df_ordered`` is missing from ``df_randomized``.
    """
    from scipy.stats import mannwhitneyu, median_abs_deviation
    from statsmodels.stats.multitest import multipletests

    test_results = []

    for code in df_ordered.columns:
        # Index with the label exactly as it appears in ``columns`` so that
        # non-string labels work too (the previous ``str(code)`` lookups did not).
        ordered_answers = df_ordered[code]
        randomized_answers = df_randomized[code]

        # Count distinct answers once per run and reuse them for check and message.
        n_ordered = ordered_answers.nunique()
        n_randomized = randomized_answers.nunique()
        if n_randomized < 3 or n_ordered < 3:
            print(
                f"Code: {code} had {n_ordered} different answers when generated in order"
            )
            print(
                f"Code: {code} had {n_randomized} different answers when generated randomized"
            )

        test_results.append(
            (
                code,
                np.median(ordered_answers),
                median_abs_deviation(ordered_answers),
                np.median(randomized_answers),
                median_abs_deviation(randomized_answers),
                cohen_d(ordered_answers, randomized_answers),
                # Unpacks into the U statistic and its p-value.
                *mannwhitneyu(ordered_answers, randomized_answers),
            )
        )

    df_tests = pd.DataFrame(
        test_results,
        columns=["mfv_code", "mediana_ordered", "MAD_ordered", "mediana_randomized", "MAD_randomized", "cohen_d", "u", "p-value"],
    )
    # Bonferroni correction across all codes; the two trailing return values
    # of multipletests are not needed here.
    df_tests["rejected"], df_tests["adjusted_p-value"], _, _ = multipletests(
        df_tests["p-value"],
        alpha=0.05,
        method="bonferroni"
    )
    # Round the adjusted p-value to 4 decimals.
    df_tests["adjusted_p-value"] = df_tests["adjusted_p-value"].round(4)

    df_tests["median_dif"] = df_tests["mediana_ordered"] - df_tests["mediana_randomized"]
    return df_tests

In [8]:
# Run the per-code comparison. compare_randomized prints a notice for every
# code where either run has fewer than 3 distinct answers.
df_comparison = compare_randomized(df_ordered, df_randomized_order)

# Show the resulting table of test statistics.
df_comparison

Code: 201 had 2 different answers when generated in order
Code: 201 had 3 different answers when generated randomized
Code: 202 had 2 different answers when generated in order
Code: 202 had 2 different answers when generated randomized
Code: 203 had 2 different answers when generated in order
Code: 203 had 3 different answers when generated randomized
Code: 208 had 4 different answers when generated in order
Code: 208 had 2 different answers when generated randomized
Code: 406 had 2 different answers when generated in order
Code: 406 had 4 different answers when generated randomized
Code: 410 had 2 different answers when generated in order
Code: 410 had 4 different answers when generated randomized
Code: 804 had 2 different answers when generated in order
Code: 804 had 3 different answers when generated randomized


Unnamed: 0,mfv_code,mediana_ordered,MAD_ordered,mediana_randomized,MAD_randomized,cohen_d,u,p-value,rejected,adjusted_p-value,median_dif
0,102,4.0,0.0,3.0,0.0,1.725088,13871.5,8.453435e-27,True,0.0000,1.0
1,103,3.0,0.0,4.0,1.0,-0.193066,6707.0,1.531834e-02,False,1.0000,-1.0
2,104,3.0,0.0,3.0,1.0,-0.225565,7088.0,8.163854e-02,False,1.0000,0.0
3,105,4.0,0.0,4.0,1.0,0.280471,8994.0,6.026773e-02,False,1.0000,0.0
4,108,4.0,0.0,4.0,0.0,0.900326,11350.5,2.502544e-10,True,0.0000,0.0
...,...,...,...,...,...,...,...,...,...,...,...
63,803,5.0,0.0,5.0,0.0,-0.071815,7500.5,5.378434e-02,False,1.0000,0.0
64,804,5.0,0.0,5.0,0.0,-0.354680,6741.0,3.891344e-05,True,0.0026,0.0
65,805,4.0,1.0,4.0,1.0,0.841311,11219.5,5.077599e-09,True,0.0000,0.0
66,808,4.0,1.0,5.0,0.0,-0.979403,4371.5,1.409224e-13,True,0.0000,-1.0


In [9]:
# Count the codes by their `rejected` flag (set from the Bonferroni-corrected test).
df_comparison["rejected"].value_counts()

True     44
False    24
Name: rejected, dtype: int64

In [12]:
# Among the rejected codes only, tally the median difference
# (ordered median minus randomized median).
df_comparison.query("rejected == True").median_dif.value_counts()

 0.0    15
 1.0    14
-1.0    12
 2.0     3
Name: median_dif, dtype: int64