In [1]:
import pandas as pd
from scipy import stats
import numpy as np
import json
from sharedcontrolpaper.force_sensitive_stopping_task_utils import print_means_t_test, print_effect_size_and_ci, calc_stats_ind, convert_formats_back, rename_index_column

In [2]:
with open('force_sensitive_data.json', 'r') as f:
    loaded_data = json.load(f)

force_sensitive_stopping_task_ssrt = pd.DataFrame.from_records(loaded_data['force_sensitive_stopping_task_ssrt'])
go_task_accuracy_before_stop_onset = pd.DataFrame.from_records(loaded_data['go_task_accuracy_before_stop_onset'])
go_task_accuracy_after_stop_onset = pd.DataFrame.from_records(loaded_data['go_task_accuracy_after_stop_onset'])
duration_of_inhibition = pd.DataFrame.from_records(loaded_data['duration_of_inhibition'])
ssrt_first_half = pd.DataFrame.from_records(loaded_data['ssrt_first_half'])
ssrt_second_half = pd.DataFrame.from_records(loaded_data['ssrt_second_half'])
shared_control_metrics = convert_formats_back(loaded_data['shared_control_metrics'])
first_non_zero_pressure_timestamp = pd.DataFrame.from_records(loaded_data['first_non_zero_pressure_timestamp'])
first_full_pressure_timestamp = pd.DataFrame.from_records(loaded_data['first_full_pressure_timestamp'])

with open('simple_stop_data.json', 'r') as f:
    loaded_data = json.load(f)

simple_stop_metrics = pd.DataFrame.from_records(loaded_data['simple_stop_metrics'])

with open('ai_survey_data.json', 'r') as f:
    loaded_data = json.load(f)

survey_scores = convert_formats_back(loaded_data['survey_scores'])


### Remove the Mean and SD rows for Simple Stop

In [3]:
simple_stop_ssrt = simple_stop_metrics.iloc[:-2]

### Merge Simple Stop and Force Sensitive Stopping Task SSRTs

In [4]:
merged_df = pd.merge(force_sensitive_stopping_task_ssrt, simple_stop_metrics[['ssrt', 'ssrt_without_short_ssd_trials', 'ssrt_without_short_ssd_subs']], 
                    left_index=True, right_index=True, how='left')
merged_df.rename(columns={
    'ssrt': 'simple_stop_ssrt',
    'ssrt_without_short_ssd_trials': 'simple_stop_ssrt_without_short_ssd_trials',
    'ssrt_without_short_ssd_subs': 'simple_stop_ssrt_without_short_ssd_subs'
}, inplace=True)

rename_index_column(merged_df)

## Planned Statistical Tests

### AI-failed vs Non-AI SSRT

In [5]:
print_means_t_test(merged_df, 'ai_failed', 'non_ai')

Mean ai_failed: 298.51
Mean non_ai: 276.53
T-statistic: 5.90, p-value: 0.00
Significant difference (ai_failed vs non_ai)? Yes


In [6]:
print_effect_size_and_ci(merged_df,'ai_failed', 'non_ai')

Cohen's d: 1.32
Mean difference (ai_failed - non_ai): 21.98 ms
95% CI: [14.44, 29.52] ms


### AI-assisted vs AI-failed SSRT

In [7]:
print_means_t_test(merged_df, 'ai_assisted', 'ai_failed')

Mean ai_assisted: 331.50
Mean ai_failed: 298.51
T-statistic: 3.84, p-value: 0.00
Significant difference (ai_assisted vs ai_failed)? Yes


In [8]:
print_effect_size_and_ci(merged_df, 'ai_assisted', 'ai_failed')

Cohen's d: 0.86
Mean difference (ai_assisted - ai_failed): 33.00 ms
95% CI: [15.61, 50.38] ms


### Non-AI SSRT vs Simple Stop SSRT

In [9]:
print_means_t_test(merged_df, 'non_ai', 'simple_stop_ssrt')
non_ai_vs_simple_corr = np.corrcoef(merged_df['non_ai'], merged_df['simple_stop_ssrt'])[1][0]
print(f"Correlation: {non_ai_vs_simple_corr:.2f}")

Mean non_ai: 276.53
Mean simple_stop_ssrt: 210.57
T-statistic: 12.83, p-value: 0.00
Significant difference (non_ai vs simple_stop_ssrt)? Yes
Correlation: 0.46


In [10]:
print_effect_size_and_ci(merged_df, 'non_ai', 'simple_stop_ssrt')

Cohen's d: 2.87
Mean difference (non_ai - simple_stop_ssrt): 65.96 ms
95% CI: [55.56, 76.36] ms


## Order Effects

### Calculate order effects between subjects who had the AI block first vs. the Non-AI block first

In [11]:
# Initialize subjects who had the Non-AI and AI Blocks first
non_ai_first_subs = ['s004', 's009', 's008', 's011', 's012', 's015', 's016', 's019', 's020', 's023', 's024', 's027', 's028', 's031', 's032', 's035', 's036', 's039', 's040', 's043']
ai_first_subs = ['s005', 's006', 's007', 's010', 's013', 's014', 's017', 's018', 's021', 's022', 's025', 's026', 's029', 's030', 's033', 's034', 's037', 's038', 's041', 's042']

In [12]:
ssrt_non_ai_first_non_ai = force_sensitive_stopping_task_ssrt.loc[non_ai_first_subs, "non_ai"].values
ssrt_non_ai_first_ai_failed = force_sensitive_stopping_task_ssrt.loc[non_ai_first_subs, "ai_failed"].values
ssrt_ai_first_non_ai = force_sensitive_stopping_task_ssrt.loc[ai_first_subs, "non_ai"].values
ssrt_ai_first_ai_failed = force_sensitive_stopping_task_ssrt.loc[ai_first_subs, "ai_failed"].values

# Calculate differences
diff_non_ai_first = ssrt_non_ai_first_ai_failed - ssrt_non_ai_first_non_ai
diff_ai_first = ssrt_ai_first_ai_failed - ssrt_ai_first_non_ai

# Create DataFrame
diff_df = pd.DataFrame({
    'Non-AI First': diff_non_ai_first,
    'AI First': diff_ai_first,
})

print_means_t_test(diff_df, 'AI First', 'Non-AI First')
print_effect_size_and_ci(diff_df, 'AI First', 'Non-AI First')

Mean AI First: 25.85
Mean Non-AI First: 18.11
T-statistic: 0.86, p-value: 0.40
Significant difference (AI First vs Non-AI First)? No
Cohen's d: 0.27
Mean difference (AI First - Non-AI First): 7.73 ms
95% CI: [-11.12, 26.59] ms


### Calculate AI-failed vs. Non-AI SSRT depending on whether they occurred in the first half or second half of the block

#### First half

In [13]:
print_means_t_test(ssrt_first_half, 'ai_failed', 'non_ai')
print_effect_size_and_ci(ssrt_first_half, 'ai_failed', 'non_ai')

Mean ai_failed: 296.03
Mean non_ai: 273.70
T-statistic: 4.06, p-value: 0.00
Significant difference (ai_failed vs non_ai)? Yes
Cohen's d: 0.91
Mean difference (ai_failed - non_ai): 22.33 ms
95% CI: [11.22, 33.44] ms


#### Second Half

In [14]:
print_means_t_test(ssrt_second_half, 'ai_failed', 'non_ai')
print_effect_size_and_ci(ssrt_second_half, 'ai_failed', 'non_ai')

Mean ai_failed: 303.34
Mean non_ai: 279.36
T-statistic: 5.32, p-value: 0.00
Significant difference (ai_failed vs non_ai)? Yes
Cohen's d: 1.19
Mean difference (ai_failed - non_ai): 23.98 ms
95% CI: [14.86, 33.11] ms


## Exploratory T-tests and Confidence Intervals

### Non-AI vs AI-failed Duration of Inhibition

In [15]:
print_means_t_test(duration_of_inhibition, 'ai_failed', 'non_ai')
print_effect_size_and_ci(duration_of_inhibition, 'ai_failed', 'non_ai')

Mean ai_failed: 47.72
Mean non_ai: 41.79
T-statistic: 2.87, p-value: 0.01
Significant difference (ai_failed vs non_ai)? Yes
Cohen's d: 0.64
Mean difference (ai_failed - non_ai): 5.93 ms
95% CI: [1.76, 10.10] ms


### Non-AI vs AI-failed Go Task Accuracy Before Stop Onset

In [16]:
print_means_t_test(go_task_accuracy_before_stop_onset, 'ai_failed', 'non_ai')
print_effect_size_and_ci(go_task_accuracy_before_stop_onset, 'ai_failed', 'non_ai')

Mean ai_failed: 0.90
Mean non_ai: 0.87
T-statistic: 2.81, p-value: 0.01
Significant difference (ai_failed vs non_ai)? Yes
Cohen's d: 0.63
Mean difference (ai_failed - non_ai): 0.03 ms
95% CI: [0.01, 0.06] ms


### Non-AI vs AI-failed Go Task Accuracy After Stop Onset (aka Stop Success Rate)

In [17]:
print_means_t_test(go_task_accuracy_after_stop_onset, 'ai_failed', 'non_ai')
print_effect_size_and_ci(go_task_accuracy_after_stop_onset, 'ai_failed', 'non_ai')

Mean ai_failed: 0.41
Mean non_ai: 0.60
T-statistic: -8.88, p-value: 0.00
Significant difference (ai_failed vs non_ai)? Yes
Cohen's d: -1.98
Mean difference (ai_failed - non_ai): -0.19 ms
95% CI: [-0.24, -0.15] ms


### Non-AI vs AI-failed SSRT in subjects who did not show proactive slowing

In [18]:
non_proactive_slowing_subs = go_task_accuracy_before_stop_onset[
    go_task_accuracy_before_stop_onset['non_ai'] > go_task_accuracy_before_stop_onset['ai_failed']
].index.tolist()

ssrt_data = force_sensitive_stopping_task_ssrt.copy() # Avoid modifying original dataframe
ssrt_data = ssrt_data.rename_axis('subject_id').reset_index() #add subject_id column
ssrt_data = ssrt_data[ssrt_data['subject_id'] != 'mean'] #remove mean row

# Calculate difference and assign slowing type using vectorized operations
ssrt_data['difference_ms'] = ssrt_data['ai_failed'] - ssrt_data['non_ai']
ssrt_data['slowing_type'] = np.where(ssrt_data['subject_id'].isin(non_proactive_slowing_subs), 'Non-Proactive', 'Proactive')

# Filter for non-proactive slowing data
non_proactive_slowing_data = ssrt_data[ssrt_data['slowing_type'] == 'Non-Proactive']

In [19]:
print_means_t_test(non_proactive_slowing_data, 'ai_failed', 'non_ai')
print_effect_size_and_ci(non_proactive_slowing_data, 'ai_failed', 'non_ai')

Mean ai_failed: 296.86
Mean non_ai: 267.57
T-statistic: 2.84, p-value: 0.02
Significant difference (ai_failed vs non_ai)? Yes
Cohen's d: 1.27
Mean difference (ai_failed - non_ai): 29.28 ms
95% CI: [5.95, 52.62] ms


### T-test comparing AI-failed minus Non-AI SSRT in subjects who proactively slowed and did not proactively slow

In [20]:
non_proactive_differences = ssrt_data[ssrt_data['slowing_type'] == 'Non-Proactive']['difference_ms'].dropna()
proactive_differences = ssrt_data[ssrt_data['slowing_type'] == 'Proactive']['difference_ms'].dropna()

calc_stats_ind(non_proactive_differences, proactive_differences)

print(f"Mean difference (Non-Proactive): {ssrt_data[ssrt_data['slowing_type'] == 'Non-Proactive']['difference_ms'].mean():.2f} ms")
print(f"Mean difference (Proactive): {ssrt_data[ssrt_data['slowing_type'] == 'Proactive']['difference_ms'].mean():.2f} ms")

Independent samples t-test:
  t-statistic = 1.14
  p-value = 0.263
  Cohen's d = 0.43
Mean difference (Non-Proactive): 29.28 ms
Mean difference (Proactive): 19.55 ms


### T-test comparing the first non-zero pressure timestamp between the AI and Non-AI Blocks

In [21]:
print_means_t_test(first_non_zero_pressure_timestamp, 'ai_failed', 'non_ai') # Both AI-failed and AI-assisted columns represent the enter AI Block
print_effect_size_and_ci(first_non_zero_pressure_timestamp, 'ai_failed', 'non_ai')

Mean ai_failed: 49.29
Mean non_ai: 50.66
T-statistic: -0.30, p-value: 0.76
Significant difference (ai_failed vs non_ai)? No
Cohen's d: -0.07
Mean difference (ai_failed - non_ai): -1.38 ms
95% CI: [-10.52, 7.77] ms


### T-test comparing the first full pressure timestamp between the AI and Non-AI Blocks

In [22]:
print_means_t_test(first_full_pressure_timestamp, 'ai_failed', 'non_ai') # Both AI-failed and AI-assisted columns represent the enter AI Block
print_effect_size_and_ci(first_full_pressure_timestamp, 'ai_failed', 'non_ai')

Mean ai_failed: 148.96
Mean non_ai: 177.99
T-statistic: -2.33, p-value: 0.03
Significant difference (ai_failed vs non_ai)? Yes
Cohen's d: -0.52
Mean difference (ai_failed - non_ai): -29.03 ms
95% CI: [-54.27, -3.79] ms


## Correlate survey scores with SSRT

In [23]:
survey_scores_df = pd.DataFrame(survey_scores)
avg_df = merged_df.merge(survey_scores_df, on='subject_id', how='left')
avg_df['difference_ai_failed_and_non_ai_ssrt'] = avg_df['ai_failed'] - avg_df['non_ai']
correlation, pval = stats.pearsonr(avg_df['average_score'], avg_df["difference_ai_failed_and_non_ai_ssrt"])
print(f"Correlation and p-value between SSRT and survey scores: {correlation:.2f}, {pval:.3f}")

Correlation and p-value between SSRT and survey scores: -0.05, 0.738


## Violations of Context Independence in Simple Stop Data

### a) Non-AI SSRT vs Simple Stop SSRT Excluding trials with SSD < 200 ms

In [24]:
print_means_t_test(merged_df, 'non_ai', 'simple_stop_ssrt_without_short_ssd_trials')
print_effect_size_and_ci(merged_df, 'non_ai', 'simple_stop_ssrt_without_short_ssd_trials')

Mean non_ai: 276.53
Mean simple_stop_ssrt_without_short_ssd_trials: 207.72
T-statistic: 12.28, p-value: 0.00
Significant difference (non_ai vs simple_stop_ssrt_without_short_ssd_trials)? Yes
Cohen's d: 2.74
Mean difference (non_ai - simple_stop_ssrt_without_short_ssd_trials): 68.80 ms
95% CI: [57.46, 80.14] ms


### b) Non-AI SSRT vs Simple Stop SSRT Excluding subjects with average SSDs < 200ms

In [25]:
# Remove rows with NaN in simple_stop_ssrt_without_short_ssd_subs
filtered_df = merged_df.dropna(subset=['simple_stop_ssrt_without_short_ssd_subs'])

print_means_t_test(filtered_df, 'non_ai', 'simple_stop_ssrt_without_short_ssd_subs')
print_effect_size_and_ci(filtered_df, 'non_ai', 'simple_stop_ssrt_without_short_ssd_subs')

Mean non_ai: 279.43
Mean simple_stop_ssrt_without_short_ssd_subs: 208.59
T-statistic: 13.34, p-value: 0.00
Significant difference (non_ai vs simple_stop_ssrt_without_short_ssd_subs)? Yes
Cohen's d: 3.19
Mean difference (non_ai - simple_stop_ssrt_without_short_ssd_subs): 70.84 ms
95% CI: [60.05, 81.64] ms


## Store merged_df to be accessed later

In [26]:
data_to_save = {
    'merged_df': merged_df.to_dict(),
}

with open('merged_df.json', 'w') as f:
    json.dump(data_to_save, f, indent=4)