# Data Wrangling
Starting point:
- 848 datasets (18 tasks per model, minus the NON_IDEAL_OUTPUTS), each containing the log-probabilities of all answer alternatives of each subtask, for all ~1,500 tasks.

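What this script does:
- Reads, for each task, all `LLM_data/*_<TASK>_prompting_results.csv` files and stacks them into one DataFrame.
- Converts the log-probabilities of the valid answer options of each item into probabilities that sum to one.
- Picks the probability assigned to the recorded answer (`prob_pred`) and flips answers back where they were flipped.
- Computes one probability-weighted score per experiment × model × item, adds subscale categories where defined, and saves the result to `processed_data/items_per_LLM.csv`.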

Goal:
- First, obtain one value per item per model.
- Then transform those values into "outcomes" for each subscale (as Frey did).
- End result: 36 values per model (one per (sub)scale).

## Packages & Helpers

In [97]:
# packages
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt

In [98]:
# --------------------------------------------
# 2. Compute numerator and denominator per model × item
# --------------------------------------------
# - Numerator   = sum of (answer * probability)
# - Denominator = sum of (probabilities)

def compute_weighted_score(group):
    """Return the probability-weighted mean answer of one group of rows.

    ``group`` must provide the columns ``human_number`` (the answer value)
    and ``prob_pred`` (its weight). Returns ``None`` when the weights do not
    sum to a positive number.
    """
    weights = group["prob_pred"]
    weight_total = weights.sum()
    if not weight_total > 0:
        return None
    weighted_answers = group["human_number"] * weights
    return weighted_answers.sum() / weight_total


# produce df with one value per model per item --------------------------------------------------
def get_LLM_value_per_item(data):
    """Collapse answer-level rows into one probability-weighted score per item.

    For every (experiment, model, item) group the score is
    ``sum(human_number * prob_pred) / sum(prob_pred)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``experiment``, ``model``, ``item``,
        ``human_number`` and ``prob_pred``.

    Returns
    -------
    pandas.DataFrame
        Columns ``experiment``, ``model``, ``item``, ``score``; one row per
        group, ordered by the group keys. A group whose ``prob_pred`` values
        sum to zero gets a non-finite score (NaN/inf), not an exception.
    """
    group_keys = ["experiment", "model", "item"]

    # Work on plain arrays so the row-wise product is positional and does not
    # depend on the index being unique.
    probs = data["prob_pred"].to_numpy()
    terms = data[group_keys].assign(
        _weighted=data["human_number"].to_numpy() * probs,
        _weight=probs,
    )

    # One vectorised pass: sum numerator and denominator per group, then divide.
    sums = terms.groupby(group_keys)[["_weighted", "_weight"]].sum()
    score = sums["_weighted"] / sums["_weight"]
    return score.reset_index(name="score")


# Loading all data files of one task ------------------------------------------------------------
def load_dataframes(task_name, path = "LLM_data"):
    """Read every result CSV of one task and stack them into one DataFrame.

    Parameters
    ----------
    task_name : str
        Task identifier used in the file names
        (``*_<task_name>_prompting_results.csv``).
    path : str
        Folder that contains the CSV files with the LLM answers.

    Returns
    -------
    pandas.DataFrame
        All matching files concatenated row-wise with a fresh integer index.
        The frame is expected to contain a ``model`` column (used for the
        summary that is printed).

    Raises
    ------
    ValueError
        If no file in ``path`` matches the task.
    """
    pattern = os.path.join(path, f"*_{task_name}_prompting_results.csv")

    # Sort so the row order does not depend on the file system's listing order.
    files = sorted(glob.glob(pattern))
    if not files:
        raise ValueError(f"No result files found for pattern {pattern!r}")

    # Read each CSV and concatenate everything into one big DataFrame
    merged_data = pd.concat([pd.read_csv(file) for file in files], ignore_index=True)

    print(f"Merged DataFrame shape: {merged_data.shape}")
    print(f"Total models: {merged_data['model'].nunique()}")

    return merged_data


# filter out probability LLM assigned to real item answer  ------------------------------------------
def filter_pred_prob(data):
    """Add a ``prob_pred`` column: per row, the value of ``prob_<human_number>``.

    The lookup is done once per distinct answer value instead of once per row,
    and whole-number answers stored as floats (e.g. ``1.0``) still resolve to
    ``prob_1``. ``human_number`` is assumed to be numeric.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``human_number`` and one ``prob_<k>`` column for every
        answer value ``k`` that occurs. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the new ``prob_pred`` column.

    Raises
    ------
    KeyError
        If ``human_number`` has missing values or an answer has no matching
        ``prob_<k>`` column.
    """
    answers = data["human_number"]
    if answers.isna().any():
        raise KeyError("human_number contains missing values; no prob_<answer> column can be selected")

    picked = np.full(len(data), np.nan)
    for answer in answers.unique():
        # 1.0 -> 1 so the column name is "prob_1", not "prob_1.0"
        label = int(answer) if answer == int(answer) else answer
        rows = (answers == answer).to_numpy()
        picked[rows] = data[f"prob_{label}"].to_numpy()[rows]

    data["prob_pred"] = picked
    return data


## AUDIT SCALE

In [99]:
# Read every AUDIT result file and stack them into one DataFrame
AUDIT_data = load_dataframes("AUDIT", path="LLM_data")

Merged DataFrame shape: (712264, 11)
Total models: 46


In [100]:
# Renormalise so the probabilities of an item's answer options sum to one.
# (We act as if we had a very good prompt under which the LLM would only choose
# among the possible answer alternatives; the renormalisation simulates that.)
# Each entry: (item numbers, number of answer options "1".."k").
for item_ids, n_options in (
    ([1], 2),
    ([10, 11], 3),
    ([2, 3, 4, 5, 6, 7, 8, 9], 5),
):
    mask = AUDIT_data["item"].isin(item_ids)
    option_cols = [str(k) for k in range(1, n_options + 1)]
    exp_logprobs = [np.exp(AUDIT_data.loc[mask, col]) for col in option_cols]

    # add the terms left to right, option by option
    denominator = exp_logprobs[0]
    for term in exp_logprobs[1:]:
        denominator = denominator + term

    for col, term in zip(option_cols, exp_logprobs):
        AUDIT_data.loc[mask, f"prob_{col}"] = term / denominator


In [101]:
# Add "prob_pred": the probability the LLM assigned to the recorded answer
AUDIT_data = filter_pred_prob(data=AUDIT_data)

In [102]:
# Mirror flipped answers back (rows with flipped == "yes"): x -> (k + 1) - x
# for an item with k answer options.
# Each entry: (item numbers, k + 1).
for item_ids, mirror_sum in (
    ([1], 3),
    ([10, 11], 4),
    ([2, 3, 4, 5, 6, 7, 8, 9], 6),
):
    mask = AUDIT_data["flipped"].eq("yes") & AUDIT_data["item"].isin(item_ids)
    flipped_answers = AUDIT_data.loc[mask, "human_number"]
    AUDIT_data.loc[mask, "human_number"] = mirror_sum - flipped_answers


In [103]:
# Reduce to one probability-weighted score per experiment × model × item
model_item_scores_AUDIT = get_LLM_value_per_item(data=AUDIT_data)

## BARRAT SCALE

In [104]:
# Read every BARRAT result file and stack them into one DataFrame
BARRAT_data = load_dataframes("BARRAT", path="LLM_data")

Merged DataFrame shape: (2082420, 10)
Total models: 46


In [105]:
# Renormalise so the probabilities of the four answer options sum to one
option_cols = ["1", "2", "3", "4"]
exp_logprobs = [np.exp(BARRAT_data[col]) for col in option_cols]

# add the terms left to right, option by option
denominator = exp_logprobs[0]
for term in exp_logprobs[1:]:
    denominator = denominator + term

for col, term in zip(option_cols, exp_logprobs):
    BARRAT_data[f"prob_{col}"] = term / denominator


In [106]:
# Add "prob_pred": the probability the LLM assigned to the recorded answer
BARRAT_data = filter_pred_prob(data=BARRAT_data)

In [107]:
# Mirror flipped answers back on the four-option scale: x -> 5 - x
mask = BARRAT_data["flipped"].eq(True)
flipped_answers = BARRAT_data.loc[mask, "human_number"]
BARRAT_data.loc[mask, "human_number"] = 5 - flipped_answers


In [108]:
# Reduce to one probability-weighted score per experiment × model × item
model_item_scores_BARRAT = get_LLM_value_per_item(data=BARRAT_data)


In [109]:
# Task-specific item metadata that is stored alongside the scores

# subscale -> items that belong to it
items_by_category = {
    "BISn": [1, 7, 8, 10, 12, 13, 14, 15, 18, 27, 29],
    "BISm": [2, 3, 4, 16, 17, 19, 21, 22, 23, 25, 30],
    "BISa": [5, 6, 9, 11, 20, 24, 26, 28],
}
item_to_category = {
    item: category
    for category, items in items_by_category.items()
    for item in items
}

# items flagged as reverse coded; every other item from 1 to 30 is flagged False
reverse_coded_items = {1, 7, 8, 9, 10, 12, 13, 15, 20, 29, 30}
reverse_coded = {item: item in reverse_coded_items for item in range(1, 31)}

model_item_scores_BARRAT["category"] = model_item_scores_BARRAT["item"].map(item_to_category)
model_item_scores_BARRAT["reverse_coded"] = model_item_scores_BARRAT["item"].map(reverse_coded)


In [110]:
# Start the combined table with the AUDIT and BARRAT item scores
all_data = pd.concat(
    objs=[model_item_scores_AUDIT, model_item_scores_BARRAT],
    ignore_index=True,
)


## CARE TASK

In [111]:
# Read every CARE result file and stack them into one DataFrame
CARE_data = load_dataframes("CARE", path="LLM_data")

Merged DataFrame shape: (1320614, 106)
Total models: 46


In [112]:
# Turn the log-probabilities of answers "0".."99" into probabilities that sum to one

cols = list(map(str, range(100)))

# exponentiate, then divide each row by its row total
exp_vals = np.exp(CARE_data[cols])
row_totals = exp_vals.sum(axis=1)
prob_vals = exp_vals.div(row_totals, axis=0)

# "0" -> "prob_0", ..., "99" -> "prob_99"
prob_vals = prob_vals.add_prefix("prob_")

# Attach all probability columns to the original frame in one step
CARE_data = pd.concat([CARE_data, prob_vals], axis=1).copy()

In [113]:
# Add "prob_pred": the probability the LLM assigned to the recorded answer
CARE_data = filter_pred_prob(data=CARE_data)

In [114]:
# Reduce to one probability-weighted score per experiment × model × item
model_item_scores_CARE = get_LLM_value_per_item(data=CARE_data)


In [115]:
# Task-specific item metadata that is stored alongside the scores
# item number -> subscale (items 1-9, 10-15 and 16-19 form the three subscales)
item_to_category = {
    **dict.fromkeys(range(1, 10), "CAREa"),
    **dict.fromkeys(range(10, 16), "CAREs"),
    **dict.fromkeys(range(16, 20), "CAREw"),
}

model_item_scores_CARE["category"] = model_item_scores_CARE["item"].map(item_to_category)


In [116]:
# Append the CARE item scores to the combined table
all_data = pd.concat(
    objs=[all_data, model_item_scores_CARE],
    ignore_index=True,
)


## DAST SCALE

In [117]:
# Read every DAST result file and stack them into one DataFrame
DAST_data = load_dataframes("DAST")

Merged DataFrame shape: (1391040, 8)
Total models: 46


In [118]:
# Renormalise so the probabilities of the two answer options sum to one
exp_option_1 = np.exp(DAST_data["1"])
exp_option_2 = np.exp(DAST_data["2"])
denominator = exp_option_1 + exp_option_2
DAST_data["prob_1"] = exp_option_1 / denominator
DAST_data["prob_2"] = exp_option_2 / denominator

In [119]:
# Add "prob_pred": the probability the LLM assigned to the recorded answer
DAST_data = filter_pred_prob(data=DAST_data)

In [120]:
# Mirror flipped answers back on the two-option scale: x -> 3 - x
mask = DAST_data["flipped"].eq(True)
flipped_answers = DAST_data.loc[mask, "human_number"]
DAST_data.loc[mask, "human_number"] = 3 - flipped_answers

In [121]:
# Reduce to one probability-weighted score per experiment × model × item
model_item_scores_DAST = get_LLM_value_per_item(data=DAST_data)


In [122]:
# Append the DAST item scores to the combined table
all_data = pd.concat(
    objs=[all_data, model_item_scores_DAST],
    ignore_index=True,
)


## DM SCALE

In [123]:
# Read every DM result file and stack them into one DataFrame
DM_data = load_dataframes("DM")

Merged DataFrame shape: (1318866, 10)
Total models: 46


In [124]:
# Renormalise so the probabilities of the four answer options sum to one
option_cols = ["1", "2", "3", "4"]
exp_logprobs = [np.exp(DM_data[col]) for col in option_cols]

# add the terms left to right, option by option
denominator = exp_logprobs[0]
for term in exp_logprobs[1:]:
    denominator = denominator + term

for col, term in zip(option_cols, exp_logprobs):
    DM_data[f"prob_{col}"] = term / denominator


In [125]:
# Add "prob_pred": the probability the LLM assigned to the recorded answer
DM_data = filter_pred_prob(data=DM_data)

In [126]:
# flip back human answers where they were flipped
# The DM answers are normalised over four options ("1".."4"), so a flipped
# answer x maps back to 5 - x. The previous 3 - x is the two-option formula
# and would produce the values 2, 1, 0, -1 here.
mask = (DM_data["flipped"] == True)
DM_data.loc[mask, "human_number"] = 5 - DM_data.loc[mask, "human_number"]

In [127]:
# Reduce to one probability-weighted score per experiment × model × item
model_item_scores_DM = get_LLM_value_per_item(data=DM_data)


In [128]:
# Append the DM item scores to the combined table
all_data = pd.concat(
    objs=[all_data, model_item_scores_DM],
    ignore_index=True,
)


## DOSPERT SCALE

In [129]:
# Read every DOSPERT result file and stack them into one DataFrame
DOSPERT_data = load_dataframes("DOSPERT")

Merged DataFrame shape: (2780240, 11)
Total models: 46


In [130]:
# Renormalise so the probabilities of the five answer options sum to one
option_cols = ["1", "2", "3", "4", "5"]
exp_logprobs = [np.exp(DOSPERT_data[col]) for col in option_cols]

# add the terms left to right, option by option
denominator = exp_logprobs[0]
for term in exp_logprobs[1:]:
    denominator = denominator + term

for col, term in zip(option_cols, exp_logprobs):
    DOSPERT_data[f"prob_{col}"] = term / denominator


In [131]:
# Add "prob_pred": the probability the LLM assigned to the recorded answer
DOSPERT_data = filter_pred_prob(data=DOSPERT_data)

In [132]:
# Mirror flipped answers back on the five-option scale: x -> 6 - x
mask = DOSPERT_data["flipped"].eq('yes')
flipped_answers = DOSPERT_data.loc[mask, "human_number"]
DOSPERT_data.loc[mask, "human_number"] = 6 - flipped_answers

In [133]:
# Reduce to one probability-weighted score per experiment × model × item
model_item_scores_DOSPERT = get_LLM_value_per_item(data=DOSPERT_data)


In [134]:
# Task-specific item metadata that is stored alongside the scores

# domain -> items that belong to it
items_by_category = {
    "Social": [1, 10, 16, 19, 23, 26, 34, 35],
    "Recreational": [2, 6, 15, 17, 21, 31, 37, 38],
    "Gambling": [3, 11, 22, 33],
    "Health": [4, 8, 27, 29, 32, 36, 39, 40],
    "Ethical": [5, 9, 12, 13, 14, 20, 25, 28],
    "Investment": [7, 18, 24, 30],
}
item_to_category = {
    item: category
    for category, items in items_by_category.items()
    for item in items
}

model_item_scores_DOSPERT["category"] = model_item_scores_DOSPERT["item"].map(item_to_category)


In [135]:
# Append the DOSPERT item scores to the combined table
all_data = pd.concat(
    objs=[all_data, model_item_scores_DOSPERT],
    ignore_index=True,
)


## FTND SCALE

In [136]:
# Read every FTND result file and stack them into one DataFrame
FTND_data = load_dataframes("FTND")

Merged DataFrame shape: (163162, 10)
Total models: 46


In [137]:
# Renormalise so the probabilities of an item's answer options sum to one.
# (We act as if we had a very good prompt under which the LLM would only choose
# among the possible answer alternatives; the renormalisation simulates that.)
# Each entry: (item numbers, number of answer options "1".."k").
for item_ids, n_options in (
    ([1], 3),
    ([3, 4, 6, 7], 2),
    ([2, 5], 4),
):
    mask = FTND_data["item"].isin(item_ids)
    option_cols = [str(k) for k in range(1, n_options + 1)]
    exp_logprobs = [np.exp(FTND_data.loc[mask, col]) for col in option_cols]

    # add the terms left to right, option by option
    denominator = exp_logprobs[0]
    for term in exp_logprobs[1:]:
        denominator = denominator + term

    for col, term in zip(option_cols, exp_logprobs):
        FTND_data.loc[mask, f"prob_{col}"] = term / denominator


In [138]:
# Add "prob_pred": the probability the LLM assigned to the recorded answer
FTND_data = filter_pred_prob(data=FTND_data)

In [139]:
# Mirror flipped answers back (rows with flipped == True): x -> (k + 1) - x
# for an item with k answer options.
# Each entry: (item numbers, k + 1).
for item_ids, mirror_sum in (
    ([1], 4),
    ([3, 4, 6, 7], 3),
    ([2, 5], 5),
):
    mask = FTND_data["flipped"].eq(True) & FTND_data["item"].isin(item_ids)
    flipped_answers = FTND_data.loc[mask, "human_number"]
    FTND_data.loc[mask, "human_number"] = mirror_sum - flipped_answers


In [140]:
# Reduce to one probability-weighted score per experiment × model × item
model_item_scores_FTND = get_LLM_value_per_item(data=FTND_data)


In [141]:
# Append the FTND item scores to the combined table
all_data = pd.concat(
    objs=[all_data, model_item_scores_FTND],
    ignore_index=True,
)


## GABS SCALE

In [142]:
# Read every GABS result file and stack them into one DataFrame
GABS_data = load_dataframes("GABS")

Merged DataFrame shape: (581210, 10)
Total models: 46


In [143]:
# Renormalise so the probabilities of an item's answer options sum to one

# log-probability columns of the four answer options
answer_cols = ["1", "2", "3", "4"]

# work on a copy to avoid SettingWithCopy warnings
GABS_data = GABS_data.copy()

# item 1 offers only options 1 and 2
mask_item1 = GABS_data["item"].eq(1)
exp_vals_item1 = np.exp(GABS_data.loc[mask_item1, ["1", "2"]])
probs_item1 = exp_vals_item1.div(exp_vals_item1.sum(axis=1), axis=0)
probs_item1 = probs_item1.add_prefix("prob_")

# items 2-17 offer options 1-4
mask_item2plus = GABS_data["item"].between(2, 17)
exp_vals_item2plus = np.exp(GABS_data.loc[mask_item2plus, answer_cols])
probs_item2plus = exp_vals_item2plus.div(exp_vals_item2plus.sum(axis=1), axis=0)
probs_item2plus = probs_item2plus.add_prefix("prob_")

# stack both parts and attach them to the original rows via the index;
# prob_3 / prob_4 stay NaN for item 1
all_probs = pd.concat([probs_item1, probs_item2plus])
GABS_data = GABS_data.join(all_probs)



In [144]:
# Add "prob_pred": the probability the LLM assigned to the recorded answer
GABS_data = filter_pred_prob(data=GABS_data)

In [145]:
# flip back human answers where they were flipped
# item 1 has two options (x -> 3 - x); items 2-17 have four options (x -> 5 - x)
mask = (GABS_data["flipped"] == True) & (GABS_data["item"] == 1)
GABS_data.loc[mask, "human_number"] = 3 - GABS_data.loc[mask, "human_number"]
# range(2, 18) so that item 17 is included, matching the items 2-17 that were
# normalised over four options (range(2, 17) stopped at item 16)
mask = (GABS_data["flipped"] == True) & (GABS_data["item"].isin(range(2, 18)))
GABS_data.loc[mask, "human_number"] = 5 - GABS_data.loc[mask, "human_number"]


In [146]:
# Reduce to one probability-weighted score per experiment × model × item
model_item_scores_GABS = get_LLM_value_per_item(data=GABS_data)


In [147]:
# Append the GABS item scores to the combined table
all_data = pd.concat(
    objs=[all_data, model_item_scores_GABS],
    ignore_index=True,
)


## PG SCALE

In [148]:
# Read every PG result file and stack them into one DataFrame
PG_data = load_dataframes("PG")

Merged DataFrame shape: (1127322, 13)
Total models: 46


In [149]:
# Renormalise so the probabilities of an item's answer options sum to one.
# Each entry: (item numbers, log-probability columns of the valid options).
for item_ids, option_cols in (
    ([1, 26], ["1", "2"]),
    (range(2, 21), ["1", "2", "3", "4", "5"]),
    ([25], ["1", "2", "3", "4", "5", "6"]),
    ([21, 22, 23, 24, 27, 28, 29, 30, 31, 32], ["0", "1"]),
):
    mask = PG_data["item"].isin(item_ids)
    exp_logprobs = [np.exp(PG_data.loc[mask, col]) for col in option_cols]

    # add the terms left to right, option by option
    denominator = exp_logprobs[0]
    for term in exp_logprobs[1:]:
        denominator = denominator + term

    for col, term in zip(option_cols, exp_logprobs):
        PG_data.loc[mask, f"prob_{col}"] = term / denominator


In [150]:
# Add "prob_pred": the probability the LLM assigned to the recorded answer
PG_data = filter_pred_prob(data=PG_data)

In [151]:
# Mirror flipped answers back (rows with flipped == True): x -> (lowest + highest option) - x.
# Each entry: (item numbers, lowest + highest option value).
for item_ids, mirror_sum in (
    ([1, 26], 3),
    (range(2, 21), 6),
    ([25], 7),
    ([21, 22, 23, 24, 27, 28, 29, 30, 31, 32], 1),
):
    mask = PG_data["flipped"].eq(True) & PG_data["item"].isin(item_ids)
    flipped_answers = PG_data.loc[mask, "human_number"]
    PG_data.loc[mask, "human_number"] = mirror_sum - flipped_answers



In [152]:
# Reduce to one probability-weighted score per experiment × model × item
model_item_scores_PG = get_LLM_value_per_item(data=PG_data)


In [153]:
# Append the PG item scores to the combined table
all_data = pd.concat(
    objs=[all_data, model_item_scores_PG],
    ignore_index=True,
)


## PRI SCALE

In [154]:
# Read every PRI result file and stack them into one DataFrame
PRI_data = load_dataframes("PRI")

Merged DataFrame shape: (1110624, 13)
Total models: 46


In [155]:
# Renormalise so the probabilities of an item's answer options sum to one.
# Odd items have two options, even items have seven.
# Each entry: (item numbers, number of answer options "1".."k").
for item_ids, n_options in (
    ([1, 3, 5, 7, 9, 11, 13, 15], 2),
    ([2, 4, 6, 8, 10, 12, 14, 16], 7),
):
    mask = PRI_data["item"].isin(item_ids)
    option_cols = [str(k) for k in range(1, n_options + 1)]
    exp_logprobs = [np.exp(PRI_data.loc[mask, col]) for col in option_cols]

    # add the terms left to right, option by option
    denominator = exp_logprobs[0]
    for term in exp_logprobs[1:]:
        denominator = denominator + term

    for col, term in zip(option_cols, exp_logprobs):
        PRI_data.loc[mask, f"prob_{col}"] = term / denominator

In [156]:
# Add "prob_pred": the probability the LLM assigned to the recorded answer
PRI_data = filter_pred_prob(data=PRI_data)

In [157]:
# Mirror flipped answers back (rows with flipped == True): x -> (k + 1) - x
# for an item with k answer options.
# Each entry: (item numbers, k + 1).
for item_ids, mirror_sum in (
    ([1, 3, 5, 7, 9, 11, 13, 15], 3),
    ([2, 4, 6, 8, 10, 12, 14, 16], 8),
):
    mask = PRI_data["flipped"].eq(True) & PRI_data["item"].isin(item_ids)
    flipped_answers = PRI_data.loc[mask, "human_number"]
    PRI_data.loc[mask, "human_number"] = mirror_sum - flipped_answers


In [158]:
# Reduce to one probability-weighted score per experiment × model × item
model_item_scores_PRI = get_LLM_value_per_item(data=PRI_data)


In [159]:
# Append the PRI item scores to the combined table
all_data = pd.concat(
    objs=[all_data, model_item_scores_PRI],
    ignore_index=True,
)


## SOEP SCALE

In [160]:
# Read every SOEP result file and stack them into one DataFrame
SOEP_data = load_dataframes("SOEP")

Merged DataFrame shape: (486542, 17)
Total models: 46


In [161]:
# Turn the log-probabilities of answers "1".."11" into probabilities that sum to one

cols = list(map(str, range(1, 12)))

# exponentiate, then divide each row by its row total
exp_vals = np.exp(SOEP_data[cols])
row_totals = exp_vals.sum(axis=1)
prob_vals = exp_vals.div(row_totals, axis=0)

# "1" -> "prob_1", ..., "11" -> "prob_11"
prob_vals = prob_vals.add_prefix("prob_")

# Attach all probability columns to the original frame in one step
SOEP_data = pd.concat([SOEP_data, prob_vals], axis=1).copy()

In [162]:
# Add "prob_pred": the probability the LLM assigned to the recorded answer
SOEP_data = filter_pred_prob(data=SOEP_data)

In [163]:
# Mirror flipped answers back on the eleven-option scale: x -> 12 - x
mask = SOEP_data["flipped"].eq("yes")
flipped_answers = SOEP_data.loc[mask, "human_number"]
SOEP_data.loc[mask, "human_number"] = 12 - flipped_answers


In [164]:
# Reduce to one probability-weighted score per experiment × model × item
model_item_scores_SOEP = get_LLM_value_per_item(data=SOEP_data)


In [165]:
# Task-specific item metadata that is stored alongside the scores

# category labels in item order: item 1 -> first label, ..., item 7 -> last label
category_labels = ["SOEP", "SOEPdri", "SOEPfin", "SOEPrec", "SOEPocc", "SOEPhea", "SOEPsoc"]
item_to_category = dict(enumerate(category_labels, start=1))

model_item_scores_SOEP["category"] = model_item_scores_SOEP["item"].map(item_to_category)


In [166]:
# Append the SOEP item scores to the combined table
all_data = pd.concat(
    objs=[all_data, model_item_scores_SOEP],
    ignore_index=True,
)


## SSSV SCALE

In [167]:
# Read every SSSV result file and stack them into one DataFrame
SSSV_data = load_dataframes("SSSV")

Merged DataFrame shape: (2776560, 8)
Total models: 46


In [168]:
# Renormalise so the probabilities of the two answer options sum to one
exp_option_1 = np.exp(SSSV_data["1"])
exp_option_2 = np.exp(SSSV_data["2"])
denominator = exp_option_1 + exp_option_2
SSSV_data["prob_1"] = exp_option_1 / denominator
SSSV_data["prob_2"] = exp_option_2 / denominator

In [169]:
# Add "prob_pred": the probability the LLM assigned to the recorded answer
SSSV_data = filter_pred_prob(data=SSSV_data)

In [170]:
# Mirror flipped answers back on the two-option scale: x -> 3 - x
mask = SSSV_data["flipped"].eq(True)
flipped_answers = SSSV_data.loc[mask, "human_number"]
SSSV_data.loc[mask, "human_number"] = 3 - flipped_answers


In [171]:
# Reduce to one probability-weighted score per experiment × model × item
model_item_scores_SSSV = get_LLM_value_per_item(data=SSSV_data)


In [172]:
# Task-specific item metadata that is stored alongside the scores

# subscale -> items that belong to it
items_by_category = {
    "SStas": [3, 11, 16, 17, 20, 21, 23, 28, 38, 40],
    "SSexp": [4, 6, 9, 10, 14, 18, 19, 22, 26, 37],
    "SSdis": [1, 12, 13, 25, 29, 30, 32, 33, 35, 36],
    "SSbor": [2, 5, 7, 8, 15, 24, 27, 31, 34, 39],
}
item_to_category = {
    item: category
    for category, items in items_by_category.items()
    for item in items
}

model_item_scores_SSSV["category"] = model_item_scores_SSSV["item"].map(item_to_category)


In [173]:
# Append the SSSV item scores to the combined table, then display the result
all_data = pd.concat(
    objs=[all_data, model_item_scores_SSSV],
    ignore_index=True,
)
all_data

Unnamed: 0,experiment,model,item,score,category,reverse_coded
0,AUDIT scale,Apertus-70B-Instruct-2509,1,1.072054,,
1,AUDIT scale,Apertus-70B-Instruct-2509,2,3.674954,,
2,AUDIT scale,Apertus-70B-Instruct-2509,3,1.656189,,
3,AUDIT scale,Apertus-70B-Instruct-2509,4,2.163460,,
4,AUDIT scale,Apertus-70B-Instruct-2509,5,1.134567,,
...,...,...,...,...,...,...
11817,SSSV scale,zephyr-7b-beta,36,1.577790,SSdis,
11818,SSSV scale,zephyr-7b-beta,37,1.791699,SSexp,
11819,SSSV scale,zephyr-7b-beta,38,1.661893,SStas,
11820,SSSV scale,zephyr-7b-beta,39,1.429377,SSbor,


# Saving new processed dataframe

In [174]:
# save data
# Create the output folder first so the export also works on a fresh checkout
# where 'processed_data' does not exist yet.
os.makedirs('processed_data', exist_ok=True)
all_data.to_csv('processed_data/items_per_LLM.csv', index=False)