Description: The purpose of this script is to summarize the content of a BIDS input directory in order to quantify the number of T1w, fieldmap, and functional files present and later remove those participants without the desired files from the fmriprep outputs. 

In [1]:
#imports
import argparse
import os
import glob
import pandas as pd

In [8]:
#to run .py from command line

#parser = argparse.ArgumentParser(description='Sorts a BIDS data directory and gives a summary of T1w, fieldmap, and fMRI data.')

#parser.add_argument('--bids_dir', help='Path to the bids directory', type=str)
#parser.add_argument('--pp_out', help='Output path where you want the summary tables to be saved', type=str)
                    
#args = parser.parse_args()
#bids_dir = args.bids_dir
#pp_out = args.pp_out

In [2]:
#define directories for testing

#input: unzipped files in BIDS format
bids_dir = '/cifs/butler/HBN_data/TD_test_set'

#output: the summary .csv files are saved here
pp_out = '/cifs/butler/HBN_data/preprocessing/postprocessing_outputs'

In [4]:
#define functions

def list_directories(path, *keywords):
    """Return the names of directories inside ``path`` that match glob patterns.

    Args:
        path: Directory to search.
        *keywords: Glob patterns (e.g. ``"sub*"``), each matched relative to
            ``path``.

    Returns:
        Base names of the matching directories, grouped in keyword order and
        sorted alphabetically within each keyword. A directory matched by
        several keywords appears once per match.

    Raises:
        FileNotFoundError: ``path`` does not exist.
        NotADirectoryError: ``path`` exists but is not a directory.
    """
    path = os.fsdecode(path)
    if not os.path.exists(path):
        raise FileNotFoundError(f"No such directory: {path!r}")
    if not os.path.isdir(path):
        raise NotADirectoryError(f"Not a directory: {path!r}")

    # Anchor each pattern at ``path`` instead of calling os.chdir(): changing
    # the working directory is a process-wide side effect that breaks any
    # relative path used afterwards. glob.escape() keeps special characters
    # in ``path`` itself from being interpreted as wildcards.
    root = glob.escape(path)
    names = []
    for keyword in keywords:
        # glob returns matches in arbitrary order; sort for reproducible output
        matches = sorted(glob.glob(os.path.join(root, keyword)))
        names.extend(os.path.basename(m) for m in matches if os.path.isdir(m))
    return names

def list_files(path, *keywords): #keyword is type of file (e.g., "*T1w.nii.gz*", "*bold.nii.gz*", "*fMRI_epi.nii.gz*")
    """Return the names of entries inside ``path`` that match glob patterns.

    Args:
        path: Directory to search.
        *keywords: Glob patterns, each matched relative to ``path``.

    Returns:
        Base names of the matching entries, grouped in keyword order and
        sorted alphabetically within each keyword. Matches are not filtered
        by type, so a directory whose name matches a pattern is included too.
        An entry matched by several keywords appears once per match.

    Raises:
        FileNotFoundError: ``path`` does not exist.
        NotADirectoryError: ``path`` exists but is not a directory.
    """
    path = os.fsdecode(path)
    if not os.path.exists(path):
        raise FileNotFoundError(f"No such directory: {path!r}")
    if not os.path.isdir(path):
        raise NotADirectoryError(f"Not a directory: {path!r}")

    # Anchor each pattern at ``path`` instead of calling os.chdir(): changing
    # the working directory is a process-wide side effect that breaks any
    # relative path used afterwards. glob.escape() keeps special characters
    # in ``path`` itself from being interpreted as wildcards.
    root = glob.escape(path)
    names = []
    for keyword in keywords:
        # glob returns matches in arbitrary order; sort for reproducible output
        matches = sorted(glob.glob(os.path.join(root, keyword)))
        names.extend(os.path.basename(m) for m in matches)
    return names


In [5]:
#Build one summary dictionary per subject directory found in bids_dir.
sub_directories = list_directories(bids_dir, "sub*")

#data folder name -> (summary column prefix, glob pattern of the files to count)
#Iterating this fixed table (rather than the folders in whatever order they are
#listed) gives every subject dictionary the same key order, so the columns of
#the resulting table do not depend on the filesystem listing order.
MODALITIES = {
    'anat': ('t1', '*T1w.nii.gz*'),
    'func': ('func', '*bold.nii.gz*'),
    'fmap': ('fmap', '*fMRI_epi.nii.gz*'),
}

dict_list = [] #create list to contain individual subject dictionaries

for i, sub in enumerate(sub_directories): #for each subject directory
    sub_path = os.path.join(bids_dir, sub)
    content = list_directories(sub_path, '*') #gets data folders in each subject directory
    sub_dict = {'number': i, 'id': sub} #initialize dictionary for each subject

    for folder, (label, pattern) in MODALITIES.items():
        #a missing folder counts the same as a folder with no matching files
        if folder in content:
            n_files = len(list_files(os.path.join(sub_path, folder), pattern))
        else:
            n_files = 0
        sub_dict[label] = 'yes' if n_files else 'no'
        sub_dict[label + '_files'] = n_files

    dict_list.append(sub_dict)

In [6]:
#Write the full summary plus the exclude/include subsets as .csv files in pp_out.

#Pin the columns so the table has the expected layout even when dict_list is
#empty (no subject directories found); without this the column lookups below
#raise KeyError on an empty frame.
summary_columns = ['number', 'id', 't1', 't1_files', 'func', 'func_files', 'fmap', 'fmap_files']
df = pd.DataFrame(dict_list, columns=summary_columns)
df.to_csv(pp_out+'/BIDS-count_all.csv', sep=',', index=False)

#get df of participants with no T1 OR no fmaps to exclude
missing_required = (df["t1_files"] == 0) | (df["fmap_files"] == 0)
df_exclude = df[missing_required]
df_exclude.to_csv(pp_out+'/BIDS-count_exclude.csv', sep=',', index=False)

#get included (every row that is not in df_exclude)
df_include = df[~missing_required]
df_include.to_csv(pp_out+'/BIDS-count_include.csv', sep=',', index=False)