# Pull data from OSF and preprocess

In [1]:
import os
import re
import ast
import json
import uuid
import numpy as np
import pandas as pd
from functools import reduce

from osf_data_handler import OSFDataHandler

In [2]:
# Metadata values identifying which uploaded CSVs belong to this analysis.
criteria = dict(
    project='design_inference',
    experiment='exp1',
    iteration_name='pilot_local_2',
)

# Handler bound to OSF node 'ajz2t'; it loads the CSVs matching `criteria`
# (presumably concatenated into a single dataframe -- see OSFDataHandler).
osf_handler = OSFDataHandler('ajz2t')
df = osf_handler.load_filtered_csvs(criteria)

Loaded 3 CSV files from project 'design-inference-exp1' (OSF node ajz2t).


## Make session-level dataframe
A row in this dataframe corresponds to one session (one participant's worth of data).

The primary key is `game_id`.

In [3]:
# Raw field name -> column name used in the session-level dataframe.
# Fields that are not listed here are ignored when a session record is built.
session_rename_map = {
    # identifiers / metadata
    "gameID": "game_id",
    # "condition": "condition",   (currently disabled)
    "iteration_name": "iteration",
    "dev_mode": "dev_mode",
    "project": "project",
    "experiment": "experiment",
    # timestamps
    "startExperimentTS": "start_experiment_ts",
    "endExperimentTS": "end_experiment_ts",
    # demographics and self-report
    "participantYears": "age",
    "participantGender": "gender",
    "participantRace": "race",
    "participantEthnicity": "ethnicity",
    "participantComments": "feedback",
    "TechnicalDifficultiesFreeResp": "technical_difficulties",
    "comprehensionAttempts": "comprehension_attempts",
    "participantEffort": "judged_effort",
    "judgedDifficulty": "judged_difficulty",
    "inputDevice": "input_device",
    "lottery": "lottery",
    # browser information
    "width": "browser_width",
    "height": "browser_height",
    "browser": "browser",
    "mobile": "is_mobile_device",
}

# Column order of the session-level dataframe ("condition" is currently disabled).
session_order = [
    # identifiers / metadata
    "game_id",
    "project",
    "experiment",
    "iteration",
    "dev_mode",
    # browser information
    "browser",
    "browser_width",
    "browser_height",
    "is_mobile_device",
    # timing
    "start_experiment_ts",
    "end_experiment_ts",
    "experiment_duration_ms",
    # task bookkeeping
    "comprehension_attempts",
    "lottery",
    # demographics
    "age",
    "gender",
    "race",
    "ethnicity",
    # self-report
    "judged_difficulty",
    "judged_effort",
    "input_device",
    "feedback",
    "technical_difficulties",
]

In [4]:
# Extract session data
def _rename_session_fields(fields):
    """Keep the entries of `fields` named in `session_rename_map`, keyed by their new names.

    `fields` is anything exposing `.items()` (a dict or a pandas Series).
    Keys that are not in `session_rename_map` are dropped.
    """
    return {session_rename_map[k]: v for k, v in fields.items() if k in session_rename_map}


browser_fields = ['width', 'height', 'browser', 'mobile']

# One dict per retained session; turned into a dataframe once, after the loop.
session_records = []
for game_id, group in df.groupby('gameID'):
    # Sessions that contain no 'survey' trial are skipped.
    survey_rows = group[group.trial_type == 'survey']
    if survey_rows.empty:
        continue

    # Session-level fields come from the first row without a trial_type; NaN entries
    # are dropped so they do not appear in the record.
    session_rows = group[group.trial_type.isna()]
    session = session_rows.iloc[0].dropna() if not session_rows.empty else pd.Series(dtype=object)
    session_data = _rename_session_fields(session)
    # Use .get(): a missing timestamp was removed by dropna() above, so attribute
    # access would raise. A missing start or end yields a NaN duration instead.
    session_data['experiment_duration_ms'] = (
        session.get('endExperimentTS', np.nan) - session.get('startExperimentTS', np.nan)
    )

    # The survey answers are stored as a JSON object in the `response` column.
    survey_data = _rename_session_fields(json.loads(survey_rows.iloc[0].response))

    # Browser details come from the first 'browser-check' trial, when there is one.
    browser_rows = group[group.trial_type == 'browser-check']
    if browser_rows.empty:
        browser_data = {}
    else:
        browser_data = _rename_session_fields(browser_rows.iloc[0][browser_fields])

    session_records.append({**session_data, **survey_data, **browser_data})

# reindex (rather than [...]) so a column that no session provided becomes NaN
# instead of raising KeyError; this also covers the case of zero retained sessions.
session_df = (
    pd.DataFrame(session_records)
    .reindex(columns=session_order)
    .reset_index(drop=True)
)

print(session_df.game_id.nunique(), '\n')
if not session_df.empty:
    print(session_df.iloc[0])

3 

game_id                           1070-5f4cab90-27e4-4dac-88fe-8499ae78c1d7
project                                                    design_inference
experiment                                                             exp1
iteration                                                     pilot_local_2
dev_mode                                                              False
browser                                                              chrome
browser_width                                                        1424.0
browser_height                                                        786.0
is_mobile_device                                                      False
start_experiment_ts                                         1748763133146.0
end_experiment_ts                                           1748763336727.0
experiment_duration_ms                                             203581.0
comprehension_attempts                                                  0.0
lottery 

## Make trial-level dataframe

A row in this dataframe corresponds to one `survey-slider` trial (the two slider responses given for a single stimulus).

The primary key is `trial_id`.

In [5]:
# Give every row of `df` a unique id.
# NOTE: uuid4 ids are random, so re-running this cell assigns new trial_ids.
df['trial_id'] = [str(uuid.uuid4()) for _ in range(len(df))]

# Keep only the slider trials. `.copy()` makes the column assignments below act on
# an independent frame rather than on a slice of `df` (avoids SettingWithCopyWarning).
trial_df = df.loc[
    df.trial_type == 'survey-slider',
    ['trial_id', 'gameID', 'rt', 'trial_index', 'response', 'preamble']
].copy()

# 0-based position of each trial within its session, ordered by trial_index
# (rank() returns floats, so trial_num is a float column).
trial_df['trial_num'] = trial_df.groupby('gameID').trial_index.rank(method='first') - 1

# Each response is a JSON object holding both slider values; parse it once per row.
slider_responses = trial_df.response.apply(json.loads)
trial_df['slider_num_agents'] = slider_responses.apply(lambda r: r['intent_agents'])
trial_df['slider_recipe'] = slider_responses.apply(lambda r: r['intent_recipe'])

# The preamble is a JSON-encoded string that references the stimulus image
# 'stims/<name>.png'; <name> is kept as the stimulus id.
trial_df['trial_stim'] = trial_df.preamble.apply(
    lambda x: re.search(r'stims/([^\.]+)\.png', json.loads(x)).group(1))
# Stimulus ids look like '<num_agents>-<recipe>'. n=1 splits on the first '-' only,
# so an extra hyphen in the name cannot produce more than two columns.
trial_df[['true_num_agents', 'true_recipe']] = trial_df.trial_stim.str.split('-', n=1, expand=True)

# Labels of the two slider endpoints (constant across trials).
trial_df['slider_min_agents'] = '1agent'
trial_df['slider_max_agents'] = '2agent'
trial_df['slider_min_recipe'] = 'tomato'
trial_df['slider_max_recipe'] = 'onion'

# Drop the raw columns that have been unpacked above and use snake_case for the key.
trial_df = (
    trial_df
    .drop(columns=['trial_index', 'response', 'preamble'])
    .rename(columns={'gameID': 'game_id'})
)

In [6]:
# Show the first row of the trial-level dataframe.
trial_df.iloc[0]

trial_id                  8cc432a7-e907-4aa8-9763-7d55344fd05e
game_id              1070-5f4cab90-27e4-4dac-88fe-8499ae78c1d7
rt                                                      7870.9
trial_num                                                  0.0
slider_num_agents                                           12
slider_recipe                                               75
trial_stim                                        1agent-onion
true_num_agents                                         1agent
true_recipe                                              onion
slider_min_agents                                       1agent
slider_max_agents                                       2agent
slider_min_recipe                                       tomato
slider_max_recipe                                        onion
Name: 7, dtype: object

#### Misc stats

In [7]:
# Mean session duration in minutes (experiment_duration_ms divided by 1000 * 60).
(session_df.experiment_duration_ms / (1000*60)).mean()

3.344711111111111

In [8]:
# Print each session's reported technical difficulties followed by its free-text feedback.
for difficulties, comments in zip(session_df.technical_difficulties, session_df.feedback):
    print(difficulties)
    print(comments, '\n')

None
The amount of space between the cutting boards; the closeness of the tomato/onion to the cutting boards 

None
None 

None
you cant use both chopping boards in some kitchens (based on overcook logic, at least lol) 



In [15]:
# Show the absolute path that '../../' resolves to from the current working directory.
os.path.abspath('../../')

'/Users/justyang/Code/design-inference'

In [16]:
# Output location: <project_dir>/data/behavioral_results/<experiment>/<iteration_name>,
# where project_dir is two levels above the current working directory.
project_dir = os.path.abspath('../../')
save_dir = os.path.join(project_dir,
                        'data',
                        'behavioral_results',
                        criteria['experiment'],
                        criteria['iteration_name'])
session_path = os.path.join(save_dir, 'session_data.csv')
trial_path = os.path.join(save_dir, 'trial_data.csv')

# True: write the dataframes built above. False: reload previously saved copies instead.
save_data = True
if save_data:
    # exist_ok avoids the separate existence check.
    os.makedirs(save_dir, exist_ok=True)

    session_df.to_csv(session_path, index=True)
    trial_df.to_csv(trial_path, index=True)
    print(f'saved data to {save_dir}...')
else:
    # The files were written with index=True, so the first column is the saved index;
    # index_col=0 restores it instead of loading it as an 'Unnamed: 0' data column.
    session_df = pd.read_csv(session_path, index_col=0)
    trial_df = pd.read_csv(trial_path, index_col=0)


saved data to /Users/justyang/Code/design-inference/data/behavioral_results/exp1/pilot_local_2...
