In [1]:
# set up libraries
    # We will need the RBCPath type from the rbclib package to load data from the RBC.
from rbclib import RBCPath
    # We'll also want to load some data directly from the filesystem.
from pathlib import Path
    # We'll want to load/process some of the data using pandas and numpy.
import pandas as pd
import numpy as np

In [2]:
# functions form the given analysis function 
def load_fsdata(participant_id, local_cache_dir=(Path.home() / 'cache')):
    "Loads and returns the dataframe of a PNC participant's FreeSurfer data."
    # from Neurohakademy 2025 organizers

    # Check that the local_cache_dir exists and make it if it doesn't.
    if local_cache_dir is not None:
        local_cache_dir = Path(local_cache_dir)
        local_cache_dir.mkdir(exist_ok=True)
    
    # Make the RBCPath and find the appropriate file:
    pnc_freesurfer_path = RBCPath(
        'rbc://PNC_FreeSurfer/freesurfer',
        # We provide the local_cache_dir to the RBCPath object; all paths made
        # from this object will use the same cache directory.
        local_cache_dir=local_cache_dir)
    participant_path = pnc_freesurfer_path / f'sub-{participant_id}'
    tsv_path = participant_path / f'sub-{participant_id}_regionsurfacestats.tsv'

    # Use pandas to read in the TSV file:
    with tsv_path.open('r') as f:
        data = pd.read_csv(f, sep='\t')

    # Return the loaded data:
    return data

In [3]:
# My functions 
def flatten_RBC_participant(participant_id):
    "Flatten the data from RBC for a given participant."
    
    # Load Data
    df = load_fsdata(participant_id)
    
    # Summarize Feature Info
    df['feature_info'] = df[['atlas', 'hemisphere', 'StructName']].agg('__'.join, axis=1)
    df = df.drop(columns=['atlas', 'hemisphere', 'StructName'])
    
    # Make a place to store the data
    output_df = pd.DataFrame(columns = ['measure_info', 'value'])
    
    # Iterate through the measures flattening them and adding them to the new df
    allowed_measures = ['NumVert', 'SurfArea', 'GrayVol', 'ThickAvg', 'ThickStd', 'MeanCurv', 'GausCurv', 'FoldInd', 'CurvInd', 
                        'Index', 'SegId', 'Mean_wgpct', 'StdDev_wgpct', 'Min_wgpct', 'Max_wgpct', 'Range_wgpct', 'SNR_wgpct', 
                        'Mean_piallgi', 'StdDev_piallgi', 'Min_piallgi', 'Max_piallgi', 'Range_piallgi']
    for measure in allowed_measures:
        temp_df = pd.DataFrame(columns = ['measure_info', 'value'])
        labels = df['feature_info'].apply(lambda x : f'{x}__{measure}')
        values = df[measure]
        temp_df['measure_info'] = labels
        temp_df['value'] = values
        output_df = pd.concat([output_df, temp_df])
    
    return output_df


def make_RBC_data_into_a_table(participant_id_list):
    '''Make a dataframe of all RBC participants in the given list'''

    # Initialize the df with the first participant
    output_flattened_df = flatten_RBC_participant(participant_id_list[0])
    output_flattened_df = output_flattened_df.rename(columns={'value':participant_id_list[0]}).set_index('measure_info').T

    # add in all the other participants
    for participant_id in participant_id_list[1:]:
        temp_df = flatten_RBC_participant(participant_id)
        temp_df = temp_df.rename(columns={'value':participant_id}).set_index('measure_info').T
        output_flattened_df = pd.concat([output_flattened_df, temp_df])
    
    return output_flattened_df
        
    

In [4]:
example_participant_id = 1000393599

# Load Data
example_df = load_fsdata(example_participant_id)

# Remove session id because it is empty
example_df = example_df.drop(columns=['session_id'])

# Summarize Feature Info
example_df['feature_info'] = example_df[['atlas', 'hemisphere', 'StructName']].agg('__'.join, axis=1)
example_df = example_df.drop(columns=['atlas', 'hemisphere', 'StructName'])

# Make a place to store the data
output_df = pd.DataFrame(columns = ['measure_info', 'value'])

# Iterate through the measures flattening them and adding them to the new df
allowed_measures = ['NumVert', 'SurfArea', 'GrayVol', 'ThickAvg', 'ThickStd', 'MeanCurv', 'GausCurv', 'FoldInd', 'CurvInd', 
                    'Index', 'SegId', 'Mean_wgpct', 'StdDev_wgpct', 'Min_wgpct', 'Max_wgpct', 'Range_wgpct', 'SNR_wgpct', 
                    'Mean_piallgi', 'StdDev_piallgi', 'Min_piallgi', 'Max_piallgi', 'Range_piallgi']
for measure in allowed_measures:
    temp_df = pd.DataFrame(columns = ['measure_info', 'value'])
    labels = example_df['feature_info'].apply(lambda x : f'{x}__{measure}')
    values = example_df[measure]
    temp_df['measure_info'] = labels
    temp_df['value'] = values
    output_df = pd.concat([output_df, temp_df])
    
output_df

Unnamed: 0,measure_info,value
0,aparc.DKTatlas__lh__caudalanteriorcingulate__N...,1668
1,aparc.DKTatlas__lh__caudalmiddlefrontal__NumVert,3308
2,aparc.DKTatlas__lh__cuneus__NumVert,4102
3,aparc.DKTatlas__lh__entorhinal__NumVert,737
4,aparc.DKTatlas__lh__fusiform__NumVert,4115
...,...,...
13735,Yeo2011_7Networks_N1000__rh__7Networks_3__Rang...,2.25
13736,Yeo2011_7Networks_N1000__rh__7Networks_4__Rang...,3.2703
13737,Yeo2011_7Networks_N1000__rh__7Networks_5__Rang...,2.7538
13738,Yeo2011_7Networks_N1000__rh__7Networks_6__Rang...,3.055


In [5]:
participant_id = 1000393599
flatten_RBC_participant(participant_id).rename(columns={'value':participant_id}).set_index('measure_info').T

measure_info,aparc.DKTatlas__lh__caudalanteriorcingulate__NumVert,aparc.DKTatlas__lh__caudalmiddlefrontal__NumVert,aparc.DKTatlas__lh__cuneus__NumVert,aparc.DKTatlas__lh__entorhinal__NumVert,aparc.DKTatlas__lh__fusiform__NumVert,aparc.DKTatlas__lh__inferiorparietal__NumVert,aparc.DKTatlas__lh__inferiortemporal__NumVert,aparc.DKTatlas__lh__isthmuscingulate__NumVert,aparc.DKTatlas__lh__lateraloccipital__NumVert,aparc.DKTatlas__lh__lateralorbitofrontal__NumVert,...,Yeo2011_7Networks_N1000__lh__7Networks_6__Range_piallgi,Yeo2011_7Networks_N1000__lh__7Networks_7__Range_piallgi,Yeo2011_7Networks_N1000__rh__FreeSurfer_Defined_Medial_Wall__Range_piallgi,Yeo2011_7Networks_N1000__rh__7Networks_1__Range_piallgi,Yeo2011_7Networks_N1000__rh__7Networks_2__Range_piallgi,Yeo2011_7Networks_N1000__rh__7Networks_3__Range_piallgi,Yeo2011_7Networks_N1000__rh__7Networks_4__Range_piallgi,Yeo2011_7Networks_N1000__rh__7Networks_5__Range_piallgi,Yeo2011_7Networks_N1000__rh__7Networks_6__Range_piallgi,Yeo2011_7Networks_N1000__rh__7Networks_7__Range_piallgi
1000393599,1668,3308,4102,737,4115,7381,4828,1578,10035,4309,...,3.3341,3.4807,2.4321,1.2908,3.3515,2.25,3.2703,2.7538,3.055,2.759


In [6]:
make_RBC_data_into_a_table([1000393599,1317462,11407866])

measure_info,aparc.DKTatlas__lh__caudalanteriorcingulate__NumVert,aparc.DKTatlas__lh__caudalmiddlefrontal__NumVert,aparc.DKTatlas__lh__cuneus__NumVert,aparc.DKTatlas__lh__entorhinal__NumVert,aparc.DKTatlas__lh__fusiform__NumVert,aparc.DKTatlas__lh__inferiorparietal__NumVert,aparc.DKTatlas__lh__inferiortemporal__NumVert,aparc.DKTatlas__lh__isthmuscingulate__NumVert,aparc.DKTatlas__lh__lateraloccipital__NumVert,aparc.DKTatlas__lh__lateralorbitofrontal__NumVert,...,Slab__rh__region00878__StdDev_piallgi,CC200__lh__region00047__Min_piallgi,CC400__rh__region00012__Min_piallgi,Slab__rh__region00878__Min_piallgi,CC200__lh__region00047__Max_piallgi,CC400__rh__region00012__Max_piallgi,Slab__rh__region00878__Max_piallgi,CC200__lh__region00047__Range_piallgi,CC400__rh__region00012__Range_piallgi,Slab__rh__region00878__Range_piallgi
1000393599,1668,3308,4102,737,4115,7381,4828,1578,10035,4309,...,,,,,,,,,,
1317462,1931,4304,3388,619,4521,6401,5000,1747,8199,4887,...,,,,,,,,,,
11407866,2014,4213,3230,787,4320,7388,6058,1574,7431,4829,...,0.0,2.4648,2.4339,3.1556,2.4648,2.4339,3.1556,0.0,0.0,0.0


In [7]:
# example_df.columns

In [8]:
example_df.isna().sum(axis=0)

subject_id        0
NumVert           0
SurfArea          0
GrayVol           0
ThickAvg          0
ThickStd          0
MeanCurv          0
GausCurv          0
FoldInd           0
CurvInd           0
Index             0
SegId             0
Mean_wgpct        0
StdDev_wgpct      0
Min_wgpct         0
Max_wgpct         0
Range_wgpct       0
SNR_wgpct         0
Mean_piallgi      0
StdDev_piallgi    0
Min_piallgi       0
Max_piallgi       0
Range_piallgi     0
feature_info      0
dtype: int64

In [9]:
# example_df['StructName'].tolist()
example_df['hemisphere'].value_counts()

KeyError: 'hemisphere'

In [None]:
example_df['StructName'].value_counts().value_counts()

In [None]:
count_data = example_df['StructName'].value_counts()
count_data[count_data == 40]

In [None]:
count_data = example_df['StructName'].value_counts()
count_data[count_data == 4]

In [None]:
count_data = example_df[['StructName','atlas']].value_counts()
count_data.value_counts()

In [None]:
count_data[count_data == 2]

In [None]:
example_df['atlas'].value_counts()

In [None]:
example_df[example_df['atlas'] == 'Yeo2011_17Networks_N1000']