In [13]:
# We will need the RBCPath type from the rbclib package to load data from the RBC.
from rbclib import RBCPath

# We'll also want to load some data directly from the filesystem.
from pathlib import Path

# We'll want to load/process some of the data using pandas and numpy.
import pandas as pd
import numpy as np

import plotly.express as px
from ipywidgets import IntProgress

### Getting the Participant Lists and Metadata

We have pre-sorted the participants in the PNC study into a training and a
test dataset. Basic metadata about each participant can be found in TSV files
in the `shared` directory in your home directory.

In [55]:
# Participant metadata for each study generally sits in its BIDS repository;
# here the pre-split participant tables live under the shared RBC data folder.
rbcdata_path = Path('/home/jovyan/shared/data/RBC')
train_filepath = rbcdata_path.joinpath('train_participants.tsv')
test_filepath = rbcdata_path.joinpath('test_participants.tsv')

# Read the two PNC participant tables (tab-separated), training first.
participant_tables = []
for tsv_file in (train_filepath, test_filepath):
    with tsv_file.open('r') as f:
        participant_tables.append(pd.read_csv(f, sep='\t'))
train_data, test_data = participant_tables

# Stack both tables into one frame covering every study participant. The
# original row labels of each table are kept as-is (no re-indexing).
all_data = pd.concat(participant_tables)


### Step 1. Collect Data

In [5]:
def load_fsdata(participant_id, local_cache_dir=(Path.home() / 'cache')):
    """Load and return the dataframe of a PNC participant's FreeSurfer data.

    Parameters
    ----------
    participant_id
        Participant identifier without the ``sub-`` prefix; it is formatted
        into the remote path as ``sub-{participant_id}``.
    local_cache_dir : path-like or None, optional
        Directory handed to ``RBCPath`` as its local cache. When not ``None``
        it is created if missing, including any missing parent directories.
        ``None`` is passed through to ``RBCPath`` unchanged. The default is
        computed once, when this function is defined.

    Returns
    -------
    pandas.DataFrame
        The participant's ``sub-<id>_regionsurfacestats.tsv`` file parsed as
        tab-separated values.

    Notes
    -----
    Errors raised while opening or parsing the file (for example when the
    participant has no such file) are not caught here; they propagate to the
    caller.
    """
    # Make sure the cache directory exists. `parents=True` lets callers pass
    # a nested location whose parent directories have not been created yet;
    # without it, mkdir fails with FileNotFoundError in that case.
    if local_cache_dir is not None:
        local_cache_dir = Path(local_cache_dir)
        local_cache_dir.mkdir(parents=True, exist_ok=True)

    # Make the RBCPath and find the appropriate file. The cache directory is
    # given to the root RBCPath so the paths derived from it below are built
    # from an object that carries it.
    pnc_freesurfer_path = RBCPath(
        'rbc://PNC_FreeSurfer/freesurfer',
        local_cache_dir=local_cache_dir)
    participant_path = pnc_freesurfer_path / f'sub-{participant_id}'
    tsv_path = participant_path / f'sub-{participant_id}_regionsurfacestats.tsv'

    # Use pandas to read in the TSV file:
    with tsv_path.open('r') as f:
        data = pd.read_csv(f, sep='\t')

    # Return the loaded data:
    return data

In [47]:
# Unique participant IDs across the combined participant table.
subjects = all_data.participant_id.unique()
n_subj = subjects.shape[0]

# Progress bar widget; it advances once per subject whether or not that
# subject's data could be loaded.
prog = IntProgress(min=0, max=n_subj)
display(prog)

# Collect each subject's frame in a list and concatenate once after the loop.
# Calling pd.concat inside the loop re-copies every previously loaded row on
# each iteration, which gets slower and slower as subjects accumulate.
sub_frames = []
for num_sub, sub_id in enumerate(subjects):
    print(f"Loading subject ({num_sub + 1}/{n_subj}) ...", end='\r')
    try:
        sub_frames.append(load_fsdata(sub_id))
    except Exception as e:
        # Deliberately best-effort: a subject whose data cannot be loaded
        # (e.g. the stats file is missing) is reported and skipped so one bad
        # subject does not abort the whole run.
        print(f"Skipping subject {sub_id} due to error: {e}")
    prog.value += 1

# pd.concat raises on an empty list, so fall back to an empty frame when no
# subject could be loaded.
all_sub_brain_data = (
    pd.concat(sub_frames, ignore_index=True) if sub_frames else pd.DataFrame()
)

IntProgress(value=0, max=1601)

Skipping subject 1342487188 due to error: [Errno 2] No such file or directory: '/home/jovyan/shared/data/RBC/PNC_FreeSurfer/freesurfer/sub-1342487188/sub-1342487188_regionsurfacestats.tsv'
Skipping subject 1649551035 due to error: [Errno 2] No such file or directory: '/home/jovyan/shared/data/RBC/PNC_FreeSurfer/freesurfer/sub-1649551035/sub-1649551035_regionsurfacestats.tsv'
Skipping subject 2003542642 due to error: [Errno 2] No such file or directory: '/home/jovyan/shared/data/RBC/PNC_FreeSurfer/freesurfer/sub-2003542642/sub-2003542642_regionsurfacestats.tsv'
Skipping subject 219325366 due to error: [Errno 2] No such file or directory: '/home/jovyan/shared/data/RBC/PNC_FreeSurfer/freesurfer/sub-219325366/sub-219325366_regionsurfacestats.tsv'
Skipping subject 2249226316 due to error: [Errno 2] No such file or directory: '/home/jovyan/shared/data/RBC/PNC_FreeSurfer/freesurfer/sub-2249226316/sub-2249226316_regionsurfacestats.tsv'
Skipping subject 4184549693 due to error: [Errno 2] No suc

In [54]:
# Write the combined table as a gzip-compressed, tab-separated file. Missing
# values are written as the literal text 'NaN' and the row index is omitted.
tsv_fn = '~/projects/Neurohack_Group8_Miniproject/all_subject_brainData.tsv.gz'
all_sub_brain_data.to_csv(
    tsv_fn,
    sep="\t",
    na_rep='NaN',
    index=False,
    compression='gzip',
)

In [56]:
# Also write the combined table as a snappy-compressed Parquet file, again
# without the row index.
parquet_fn = '~/projects/Neurohack_Group8_Miniproject/all_subject_brainData.parquet'
all_sub_brain_data.to_parquet(
    parquet_fn,
    index=False,
    compression='snappy',
)