In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%load_ext autoreload
%autoreload 2
from delay_discounting_mvpa.config_loader import load_config
from delay_discounting_mvpa.design_utils import create_design_matrices
from delay_discounting_mvpa.io_utils import get_subids, load_tsv_data, resolve_file


# Purpose

This notebook generates `subject_status.csv`, which describes:

1. Subjects to omit (with reasons)
2. Subjects to include (good subjects)

Subjects are excluded based on the following hierarchy of criteria:

* **Behavioral data missing**  
* **Previously defined exclusion criteria** (from `BIDS/suggested_exclusion.csv`)  
* **Always responded with the same choice**  
* **BOLD data missing** (no subjects in the current run were excluded for this reason)  
* **Trial onset beyond the scan duration** (two subjects):  
    * s491: max onset = 870.99 s, scan duration = 724.20 s  
    * s495: max onset = 735.04 s, scan duration = 724.20 s  

> Note: Because this is a hierarchical system, if a subject was excluded for one criterion (e.g., always responding with the same choice), later criteria (e.g., missing BOLD data) were not checked.

All of this logic is implemented in `create_design_matrices()`. The output of that function has been used to create a CSV (`subject_status.csv`) that can be referenced when running analyses.  

## Summary of subject counts

| Reason                                           | Number of subjects |
|-------------------------------------------------|-----------------|
| Passed all checks                                | 77              |
| Behavioral data missing                          | 7               |
| Met exclusion criteria (`suggested_exclusion.csv`) | 13              |
| Always gave the same response                    | 9               |
| Onset beyond the scan duration                   | 2               |


In [2]:
# Location of the analysis config. The default is the shared cluster copy; set
# the DELAY_DISCOUNTING_MVPA_CONFIG environment variable to point the notebook
# at a different checkout or filesystem without editing this cell.
config_file = os.environ.get(
    'DELAY_DISCOUNTING_MVPA_CONFIG',
    '/oak/stanford/groups/russpold/data/uh2/aim1/analysis_code/delay_discounting_mvpa/configs/config.yaml',
)
cfg = load_config(config_file)


In [3]:
# Inputs for the design-matrix step: `tr` is read straight from the config and
# the subject list comes from get_subids().
tr = cfg.tr
subids = get_subids(cfg)

# High-pass filter cutoff passed to create_design_matrices().
# NOTE(review): presumably in Hz (i.e. a 450 s period) — confirm against design_utils.
hp_filter_period = 450
hp_filter_cutoff = 1 / hp_filter_period

# create_design_matrices() applies the exclusion hierarchy described in the
# Purpose section and logs one line per subject while it runs.
design_outputs = create_design_matrices(cfg, subids, tr, hp_filter_cutoff)
model_subids, bold_paths, design_matrices, status_df = design_outputs

Processing s061...
Processing s130...
Processing s144...
Skipping s144 (behav missing: No files found for 'behav' with pattern: /oak/stanford/groups/russpold/data/uh2/aim1/behavioral_data/event_files/s144_discountFix_events.tsv)
Processing s172...
Processing s192...
Processing s234...
Processing s251...
Processing s358...
Processing s373...
Skipping s373 (met suggested_exclusion.csv criteria: missing_more_than_half_the_tasks)
Processing s445...
Skipping s445 (met suggested_exclusion.csv criteria: missing_more_than_half_the_tasks, MRIQC_fail)
Processing s465...
Skipping s465 (singular response: 0 smaller sooner / 120 larger later)
Processing s471...
Processing s483...
Processing s491...
Skipping s491: onset beyond scan duration: max onset=870.99s, scan duration=724.20s
Processing s495...
Skipping s495: onset beyond scan duration: max onset=735.04s, scan duration=724.20s
Processing s497...
Processing s499...
Skipping s499 (behav missing: No files found for 'behav' with pattern: /oak/stan

In [4]:
# Split the free-text `reason` column into a short `conclusion` label (the text
# before the first colon) and optional `details` (everything after it).
#
# The guard makes the cell safe to re-run: once `reason` has been dropped there
# is nothing left to split, so a second execution is a no-op instead of a KeyError.
if 'reason' in status_df.columns:
    # str.partition always yields three columns (before, separator, after), so
    # this also works when no reason contains a colon; str.split(expand=True)
    # would return a single column in that case and the two-column assignment
    # would fail.
    reason_parts = status_df['reason'].str.partition(':')

    status_df['conclusion'] = reason_parts[0].str.strip()
    # Rows without a colon get an empty string from partition; fillna covers
    # rows whose reason itself is missing.
    status_df['details'] = reason_parts[2].str.strip().fillna('')

    status_df = status_df.drop(columns=['reason'])

In [5]:
# Tally how many subjects fall under each conclusion label; the cell displays
# the (labels, counts) pair.
conclusion_labels, conclusion_counts = np.unique(
    status_df['conclusion'].to_numpy(), return_counts=True
)
(conclusion_labels, conclusion_counts)

(array(['Passed all checks', 'behav missing',
        'met suggested_exclusion.csv criteria',
        'onset beyond scan duration', 'singular response'], dtype=object),
 array([77,  7, 13,  2,  9]))

In [6]:
# Persist the per-subject status table (one row per subject, no index column)
# to the current working directory.
status_df.to_csv(path_or_buf='subject_status.csv', index=False)

In [9]:
# Slurm jobs are easier to drive from a plain text file, so the IDs of the
# subjects flagged `include` are also written one per line, with no header or
# index.
include_mask = status_df['include']
good_subids = status_df['sub_id'][include_mask]
good_subids.to_csv('good_subids.txt', header=False, index=False)
