# Data Wrangling with Behavioural Tasks
Starting point:
- 848 datasets (18 tasks per model (minus the NON_IDEAL_OUTPUTS)) with all logprobs for all answer alternatives of each subtask for all ~1,500 tasks.

What does this script do
- Read data sets: Which?
- ...

Goal:
- first have one value per item per model
- then transform those values into "outcomes" for each subscale (like Frey did)
- Have 36 values per model! (one per (sub-) scale).

## Packages & Helpers

In [191]:
# packages
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
from utils import load_dataframes

# load overall df
all_data = pd.read_csv("processed_data/items_per_LLM.csv")

In [192]:
# Helpers

# filter out probability LLM assigned to real item answer  ------------------------------------------
def filter_pred_prob(data, key1_column="box_1_key", key2_column="box_2_key"):
    """Add a ``prob_pred`` column with the probability of the human's answer.

    For every row, ``prob_pred`` is ``prob_1`` when ``human_decision`` equals
    the value in ``key1_column``; otherwise it is ``prob_2`` when
    ``human_decision`` equals the value in ``key2_column``; otherwise it is
    missing.

    The column is written into ``data`` itself (the caller's frame is
    modified) and that same frame is returned.
    """
    human_choice = data["human_decision"]
    chose_first = human_choice == data[key1_column]
    chose_second = human_choice == data[key2_column]

    # Resolve the second option first: it is the fallback for every row in
    # which the first option was not the one the human picked
    second_or_missing = data["prob_2"].where(chose_second, None)
    data["prob_pred"] = data["prob_1"].where(chose_first, second_or_missing)

    return data


## BART TASK

Question:
- Wie soll ich hier über verschiedene Menschen zusammenfassen, wenn die Situation und Entscheidung für jeden Menschen individuell ist?
- same bei DFE (unterschiedliche Experience bei Karten samplen durch randomness) und CCT (auch wieder randomness involviert)
- bei DFE muss man die sampling decision, die ja wesentlich die choice decision beeinflussen sollte, ohnehin noch aus den menschlichen Daten aussuchen und irgendwie hinzufügen

In [193]:
# load data
BART_data = load_dataframes(task_name="BART", path = "LLM_data")

Merged DataFrame shape: (74346712, 13)
Total models: 44


In [194]:
#BART_data.head(60)

## CCT TASK

In [195]:
# load data
CCT_data = load_dataframes(task_name="CCT")

Merged DataFrame shape: (23305362, 16)
Total models: 38


In [196]:
#CCT_data.head(30)

## DFD TASK

In [197]:
# load data
DFD_data = load_dataframes(task_name="DFD")
DFD_human_data = pd.read_csv("orig_human_data/dfd_perprob.csv")

Merged DataFrame shape: (554576, 9)
Total models: 46


In [198]:
# normalise the two answer options so their probabilities sum to one
# (two-way softmax over the log-probs). Subtracting logaddexp in log space
# avoids exp() underflowing to 0/0 = NaN when both log-probs are very negative.
DFD_data["prob_1"] = np.exp(DFD_data["log_prob_box_1"] - np.logaddexp(DFD_data["log_prob_box_1"], DFD_data["log_prob_box_2"]))
DFD_data["prob_2"] = np.exp(DFD_data["log_prob_box_2"] - np.logaddexp(DFD_data["log_prob_box_1"], DFD_data["log_prob_box_2"]))

In [199]:
# filter out probability LLM assigned to real item answer 
DFD_data=filter_pred_prob(DFD_data)

In [200]:
# Attach the selected DFD_human_data columns to every LLM row, matching
# (participant, round) against (partid, gamble_ind). It is a left join, so
# LLM rows without a human match are kept. The right-hand key columns are
# dropped straight away because they only repeat participant/round.
DFD_data = pd.merge(
    DFD_data,
    DFD_human_data[["partid", "gamble_ind", "gamble_lab", "H", "R"]],
    how="left",
    left_on=["participant", "round"],
    right_on=["partid", "gamble_ind"],
).drop(columns=["partid", "gamble_ind"])


In [201]:
# produce df with one value per model per item -------------------------------------------------------
def get_LLM_value_per_item(data):
    """Return one probability-weighted H and R score per model and gamble.

    Rows are grouped by ``experiment``, ``model`` and ``gamble_lab``. Within
    each group the score is the mean of the ``H`` (resp. ``R``) column weighted
    by ``prob_pred``.

    A row only contributes to a score when both its value and its weight are
    present. In particular, the weight of a row whose ``H``/``R`` is missing
    (e.g. an unmatched row from a left merge) is left out of the denominator,
    so missing values no longer pull the score towards zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``experiment``, ``model``, ``gamble_lab``, ``H``, ``R``
        and ``prob_pred``.

    Returns
    -------
    pandas.DataFrame
        Columns ``experiment``, ``model``, ``gamble_lab``, ``H_score``,
        ``R_score``; one row per group, sorted by the group keys. A score is
        NaN when the group has no usable weight.
    """
    group_keys = ["experiment", "model", "gamble_lab"]
    score_names = {"H": "H_score", "R": "R_score"}

    weights = data["prob_pred"]
    terms = data[group_keys].copy()
    for value_col in score_names:
        values = data[value_col]
        # Blank the weight where the value is missing so numerator and
        # denominator are summed over exactly the same rows
        usable_weight = weights.where(values.notna())
        terms[f"{value_col}_weighted"] = values * usable_weight
        terms[f"{value_col}_weight"] = usable_weight

    # One vectorised pass; groupby sums skip NaN entries
    totals = terms.groupby(group_keys).sum()

    result = pd.DataFrame(index=totals.index)
    for value_col, score_col in score_names.items():
        result[score_col] = totals[f"{value_col}_weighted"] / totals[f"{value_col}_weight"]

    return result.reset_index()


model_item_scores_DFD = get_LLM_value_per_item(DFD_data)

In [202]:
# add_new_df
# all_data is loaded from the same CSV it is later saved to, so a plain concat
# would append a second copy of these rows on every re-run. Drop any rows
# already stored for the experiment(s) being added, then append the new scores.
all_data = pd.concat(
    [
        all_data[~all_data["experiment"].isin(model_item_scores_DFD["experiment"].unique())],
        model_item_scores_DFD,
    ],
    ignore_index=True,
)


## DFE TASK

In [203]:
# load data
DFE_data = load_dataframes(task_name="DFE")

Merged DataFrame shape: (458052, 9)
Total models: 38


In [204]:
#DFE_data.head(30)

## LOT TASK

In [205]:
# load data
LOT_data = load_dataframes(task_name="LOT")
LOT_human_data = pd.read_csv("orig_human_data/lotteries.csv")

Merged DataFrame shape: (1733050, 9)
Total models: 46


In [206]:
LOT_data.head(30)

Unnamed: 0,human_decision,log_prob_box_1,log_prob_box_2,model,round,participant,experiment,box_1_key,box_2_key
0,P,-19.125,-15.875,Falcon-3-10B-Instruct,1,64000401,LOT task,V,P
1,V,-24.375,-20.625,Falcon-3-10B-Instruct,2,64000401,LOT task,V,P
2,V,-25.125,-23.0,Falcon-3-10B-Instruct,3,64000401,LOT task,V,P
3,P,-30.25,-30.375,Falcon-3-10B-Instruct,4,64000401,LOT task,V,P
4,P,-28.75,-27.0,Falcon-3-10B-Instruct,5,64000401,LOT task,V,P
5,V,-29.25,-27.75,Falcon-3-10B-Instruct,6,64000401,LOT task,V,P
6,P,-32.75,-31.25,Falcon-3-10B-Instruct,7,64000401,LOT task,V,P
7,P,-30.25,-30.75,Falcon-3-10B-Instruct,8,64000401,LOT task,V,P
8,V,-34.25,-31.875,Falcon-3-10B-Instruct,9,64000401,LOT task,V,P
9,V,-22.625,-22.125,Falcon-3-10B-Instruct,10,64000401,LOT task,V,P


In [207]:
# normalise the two answer options so their probabilities sum to one
# (two-way softmax over the log-probs). Subtracting logaddexp in log space
# avoids exp() underflowing to 0/0 = NaN when both log-probs are very negative.
LOT_data["prob_1"] = np.exp(LOT_data["log_prob_box_1"] - np.logaddexp(LOT_data["log_prob_box_1"], LOT_data["log_prob_box_2"]))
LOT_data["prob_2"] = np.exp(LOT_data["log_prob_box_2"] - np.logaddexp(LOT_data["log_prob_box_1"], LOT_data["log_prob_box_2"]))

In [208]:
# filter out probability LLM assigned to real item answer 
LOT_data=filter_pred_prob(LOT_data)

In [209]:
LOT_human_data

Unnamed: 0,partid,Dec_ID,Stage,Substage,External_Dec_ID,V_Decision,X1,X2,PX1,Z1,...,Maxstage,Presentation_Order,Presentation_XZ,Decision_Time,Decision_X,Inconsistent,Change_X,Threshold_Up,Threshold_Lo,R
0,64000401,1,2,2,122,1,50,20,70,90,...,3,44,0,5150,0,0.0,1,90,20,1
1,64000401,1,3,0,130,1,70,20,70,90,...,3,82,1,5581,0,,1,90,50,1
2,64000401,2,3,3,233,1,110,70,70,160,...,4,26,0,9912,1,0.0,1,160,70,0
3,64000401,2,4,0,240,1,90,70,70,160,...,4,47,0,4453,1,,1,110,70,0
4,64000401,3,2,2,322,1,50,10,75,80,...,3,48,0,13371,1,0.0,1,80,20,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75345,68051401,23,4,0,2340,0,-50,-120,30,-40,...,4,74,0,62147,0,,0,-80,-120,0
75346,68051401,24,2,2,2422,0,-50,-120,55,-40,...,3,38,1,17423,1,0.0,0,-50,-120,1
75347,68051401,24,3,0,2430,0,-50,-120,30,-40,...,3,49,0,19239,0,,0,-80,-120,0
75348,68051401,25,2,2,2522,0,-10,-100,80,-20,...,3,29,1,15140,1,0.0,0,-10,-100,1


In [210]:
# Merge only selected columns from LOT_human_data: a left join that matches
# (participant, round) in LOT_data against (partid, Dec_ID).
# NOTE(review): the LOT_human_data preview printed above shows several rows
# sharing the same (partid, Dec_ID) pair (different Stage values, and R can
# differ between them, e.g. Dec_ID 24 of participant 68051401). This merge may
# therefore duplicate LOT_data rows -- confirm which stage's R is intended and
# whether round really corresponds to Dec_ID before aggregating.
LOT_data = LOT_data.merge(
    LOT_human_data[["partid", "Dec_ID", "R"]],
    left_on=["participant", "round"],
    right_on=["partid", "Dec_ID"],
    how="left"
)

# Drop the right-hand key columns; they only repeat participant/round
LOT_data = LOT_data.drop(columns=["partid",  "Dec_ID"])


In [211]:
# produce df with one value per model per item -------------------------------------------------------
def get_LLM_value_per_item(data):
    """Return one probability-weighted R score per model and round.

    Rows are grouped by ``experiment``, ``model`` and ``round``. Within each
    group the score is the mean of ``R`` weighted by ``prob_pred``.

    A row only contributes when both its ``R`` and its weight are present.
    The weight of a row whose ``R`` is missing (e.g. an unmatched row from a
    left merge) is left out of the denominator, so missing values no longer
    pull the score towards zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``experiment``, ``model``, ``round``, ``R`` and
        ``prob_pred``.

    Returns
    -------
    pandas.DataFrame
        Columns ``experiment``, ``model``, ``round``, ``R_score``; one row per
        group, sorted by the group keys. The score is NaN when the group has
        no usable weight.
    """
    group_keys = ["experiment", "model", "round"]

    values = data["R"]
    # Blank the weight where R is missing so numerator and denominator are
    # summed over exactly the same rows
    usable_weight = data["prob_pred"].where(values.notna())

    terms = data[group_keys].copy()
    terms["weighted"] = values * usable_weight
    terms["weight"] = usable_weight

    # One vectorised pass; groupby sums skip NaN entries
    totals = terms.groupby(group_keys).sum()
    R_score = totals["weighted"] / totals["weight"]

    return R_score.reset_index(name="R_score")



model_item_scores_LOT = get_LLM_value_per_item(LOT_data)



In [212]:
# add_new_df
# all_data is loaded from the same CSV it is later saved to, so a plain concat
# would append a second copy of these rows on every re-run. Drop any rows
# already stored for the experiment(s) being added, then append the new scores.
all_data = pd.concat(
    [
        all_data[~all_data["experiment"].isin(model_item_scores_LOT["experiment"].unique())],
        model_item_scores_LOT,
    ],
    ignore_index=True,
)


LOT unvollständig!!!

## MPL TASK

In [213]:
# load data
MPL_data = load_dataframes(task_name="MPL")
MPL_human_data = pd.read_csv("orig_human_data/mpl.csv")

Merged DataFrame shape: (4569180, 10)
Total models: 46


In [214]:
# normalise the two answer options so their probabilities sum to one
# (two-way softmax over the log-probs). Subtracting logaddexp in log space
# avoids exp() underflowing to 0/0 = NaN when both log-probs are very negative.
MPL_data["prob_1"] = np.exp(MPL_data["log_prob_lot_1"] - np.logaddexp(MPL_data["log_prob_lot_1"], MPL_data["log_prob_lot_2"]))
MPL_data["prob_2"] = np.exp(MPL_data["log_prob_lot_2"] - np.logaddexp(MPL_data["log_prob_lot_1"], MPL_data["log_prob_lot_2"]))

In [215]:
# filter out probability LLM assigned to real item answer 
MPL_data=filter_pred_prob(MPL_data, "lot_1_key", "lot_2_key")

In [216]:
# Attach the selected MPL_human_data columns to every LLM row, matching
# (participant, problem, decision) against (partid, dp, decision). It is a
# left join, so LLM rows without a human match are kept. The right-hand key
# columns are dropped straight away because they only repeat
# participant/problem.
MPL_data = pd.merge(
    MPL_data,
    MPL_human_data[["partid", "dp", "decision", "choice", "R"]],
    how="left",
    left_on=["participant", "problem", "decision"],
    right_on=["partid", "dp", "decision"],
).drop(columns=["partid", "dp"])


In [217]:
# produce df with one value per model per item -------------------------------------------------------
def get_LLM_value_per_item(data):
    """Return probability-weighted choice and R scores per model and decision.

    Rows are grouped by ``experiment``, ``model``, ``problem`` and
    ``decision``. Within each group the score is the mean of the ``choice``
    (resp. ``R``) column weighted by ``prob_pred``.

    A row only contributes to a score when both its value and its weight are
    present. The weight of a row whose ``choice``/``R`` is missing (e.g. an
    unmatched row from a left merge) is left out of the denominator, so
    missing values no longer pull the score towards zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``experiment``, ``model``, ``problem``, ``decision``,
        ``choice``, ``R`` and ``prob_pred``.

    Returns
    -------
    pandas.DataFrame
        Columns ``experiment``, ``model``, ``problem``, ``decision``,
        ``scoreChoice``, ``R_score``; one row per group, sorted by the group
        keys. A score is NaN when the group has no usable weight.
    """
    group_keys = ["experiment", "model", "problem", "decision"]
    score_names = {"choice": "scoreChoice", "R": "R_score"}

    weights = data["prob_pred"]
    terms = data[group_keys].copy()
    for value_col in score_names:
        values = data[value_col]
        # Blank the weight where the value is missing so numerator and
        # denominator are summed over exactly the same rows
        usable_weight = weights.where(values.notna())
        terms[f"{value_col}_weighted"] = values * usable_weight
        terms[f"{value_col}_weight"] = usable_weight

    # One vectorised pass; groupby sums skip NaN entries
    totals = terms.groupby(group_keys).sum()

    result = pd.DataFrame(index=totals.index)
    for value_col, score_col in score_names.items():
        result[score_col] = totals[f"{value_col}_weighted"] / totals[f"{value_col}_weight"]

    return result.reset_index()


model_item_scores_MPL = get_LLM_value_per_item(MPL_data)

In [218]:
# add_new_df
# all_data is loaded from the same CSV it is later saved to, so a plain concat
# would append a second copy of these rows on every re-run. Drop any rows
# already stored for the experiment(s) being added, then append the new scores.
all_data = pd.concat(
    [
        all_data[~all_data["experiment"].isin(model_item_scores_MPL["experiment"].unique())],
        model_item_scores_MPL,
    ],
    ignore_index=True,
)


# Save final complete data

In [219]:
# save data
# NOTE(review): this writes to the same path all_data was read from at the top
# of the notebook, so the input file is overwritten with the task rows
# concatenated above included.
all_data.to_csv('processed_data/items_per_LLM.csv', index=False)

In [220]:
all_data

Unnamed: 0,experiment,model,item,score,category,reverse_coded,gamble_lab,H_score,R_score,round,problem,decision,scoreChoice
0,AUDIT scale,Apertus-70B-Instruct-2509,1.0,1.072054,,,,,,,,,
1,AUDIT scale,Apertus-70B-Instruct-2509,2.0,3.674954,,,,,,,,,
2,AUDIT scale,Apertus-70B-Instruct-2509,3.0,1.656189,,,,,,,,,
3,AUDIT scale,Apertus-70B-Instruct-2509,4.0,2.163460,,,,,,,,,
4,AUDIT scale,Apertus-70B-Instruct-2509,5.0,1.134567,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16371,MPL task,zephyr-7b-beta,,,,,,,0.081935,,7.0,2.0,0.081935
16372,MPL task,zephyr-7b-beta,,,,,,,0.319287,,7.0,3.0,0.319287
16373,MPL task,zephyr-7b-beta,,,,,,,0.728582,,7.0,4.0,0.728582
16374,MPL task,zephyr-7b-beta,,,,,,,0.924115,,7.0,5.0,0.924115
