# Data validation and descriptives

In [22]:
# Description: Data validation and descriptive statistics for `/data` and `/outputs` data

import pickle
import pandas as pd
import numpy as np
import os

# All inputs are resolved relative to the current working directory
current_dir = os.getcwd()

# Preprocessed data: one pickle that unpacks into the participants object and the
# four trial objects (learning, prediction, control, explanation).
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
data_path = os.path.join(current_dir, '../data/imported_clean_data.pickle')
with open(data_path, 'rb') as f:
    (
        participants,
        trials_learning,
        trials_prediction,
        trials_control,
        trials_explanation,
    ) = pickle.load(f)

# Output data: two CSV files read from `../outputs`
predicted_accuracy_path = os.path.join(current_dir, '../outputs/predicted_accuracy.csv')
trials_with_Ri_path = os.path.join(current_dir, '../outputs/all_trials_with_Ri.csv')

predicted_accuracy = pd.read_csv(predicted_accuracy_path)
trials_with_Ri = pd.read_csv(trials_with_Ri_path)

In [25]:
# Check that the participant counts agree across all dataframes.
# The prediction, control and explanation tables are not compared one by one: their
# unique-participant counts are added up and that total is compared with the other counts
# (presumably each participant completed exactly one of these tasks — TODO confirm).
n_participants = participants.shape[0]
n_participants_learning = trials_learning['participant_id'].nunique()
n_participants_prediction = trials_prediction['participant_id'].nunique()
n_participants_control = trials_control['participant_id'].nunique()
n_participants_explanation = trials_explanation['participant_id'].nunique()
n_participants_predicted_accuracy = predicted_accuracy['participant_id'].nunique()
n_participants_trial_with_Ri = trials_with_Ri['participant_id'].nunique()

participant_counts = {
    'participants': n_participants,
    'trials_learning': n_participants_learning,
    'trials_prediction + trials_control + trials_explanation': sum(
        [n_participants_prediction, n_participants_control, n_participants_explanation]
    ),
    'predicted_accuracy': n_participants_predicted_accuracy,
    'trials_with_Ri': n_participants_trial_with_Ri,
}

# Report every count on failure so the mismatching dataframe can be identified
assert len(set(participant_counts.values())) == 1, \
    f'Participant counts differ across dataframes: {participant_counts}'

## Check `predicted_accuracy.csv`

In [28]:
# Collect the distinct levels of each grouping column in `predicted_accuracy`
# (`task`, `condition`, `fsm_type`, `learning_condition`)
task_values, condition_values, fsm_type_values, learning_condition_values = (
    predicted_accuracy[column].unique()
    for column in ('task', 'condition', 'fsm_type', 'learning_condition')
)

# Print each set of levels next to its label
for label, levels in (
    ('tasks: ', task_values),
    ('conditions: ', condition_values),
    ('fsm types: ', fsm_type_values),
    ('learning conditions: ', learning_condition_values),
):
    print(label, levels)

tasks:  ['prediction' 'explanation' 'control']
conditions:  ['visible' 'hidden' 'hidden_an' 'hidden_normative_subset'
 'hidden_an_subset']
fsm types:  ['easy' 'hard']
learning conditions:  ['Experiment 2 (test preview)' 'Experiment 1 (no preview)']


In [38]:
# Descriptive statistics of the numeric columns (predicted_accuracy, mm_accuracy, count_nan),
# grouped by task, condition, fsm_type and learning_condition, saved to a pandas dataframe.
# Every combination of the four factors gets one row, including combinations that have no
# data (their statistics are NaN and their NaN counts are 0).


def _summarise(values, prefix, count_missing=True):
    """Return mean/std/min/max of `values` (NaN skipped), keyed as `<prefix>_<stat>`.

    When `count_missing` is true, the number of NaN entries is added as `<prefix>_nan`.
    """
    summary = {
        f'{prefix}_mean': values.mean(skipna=True),
        f'{prefix}_std': values.std(skipna=True),
        f'{prefix}_min': values.min(skipna=True),
        f'{prefix}_max': values.max(skipna=True),
    }
    if count_missing:
        summary[f'{prefix}_nan'] = values.isna().sum()
    return summary


# Collect one record per combination and build the dataframe once at the end, instead of
# concatenating onto a growing dataframe on every iteration.
descriptive_rows = []

for task in task_values:
    for condition in condition_values:
        for fsm_type in fsm_type_values:
            for learning_condition in learning_condition_values:
                in_group = (
                    (predicted_accuracy['task'] == task) &
                    (predicted_accuracy['condition'] == condition) &
                    (predicted_accuracy['fsm_type'] == fsm_type) &
                    (predicted_accuracy['learning_condition'] == learning_condition)
                )
                group = predicted_accuracy[in_group]
                descriptive_rows.append({
                    'task': task,
                    'condition': condition,
                    'fsm_type': fsm_type,
                    'learning_condition': learning_condition,
                    **_summarise(group['predicted_accuracy'], 'predicted_accuracy'),
                    **_summarise(group['mm_accuracy'], 'mm_accuracy'),
                    # count_nan gets no `_nan` column, matching the original column set
                    **_summarise(group['count_nan'], 'count_nan', count_missing=False),
                })

# Rows are labelled 0..n-1, so each row has a unique index label
descriptives = pd.DataFrame(descriptive_rows)