Description: This script checks the size of the .out files in the output/code/jobs directory to determine whether fMRIPrep completed for a given participant. A file size < 10 kB means fMRIPrep did not start, > 400 kB means it likely completed, and ~200-300 kB means it partially completed or hit an error. Incomplete participants can be diagnosed and processed again at a later date.

In [1]:
#imports
import argparse
import os
import glob
import pandas as pd
import numpy as np

In [8]:
#to run .py from command line

#parser = argparse.ArgumentParser(description='Sorts a BIDS data directory and give a summary of T1w, fieldmap, and fMRI data.')

#parser.add_argument('--bids_dir', help='Path to the bids directory', type=str)
#parser.add_argument('--out_dir', help='Output path where you want the summary tables to be saved', type=str)
                    
#args = parser.parse_args()
#bids_dir = args.bids_dir
#out_dir = args.out_dir

In [2]:
#define directories for testing
#NOTE(review): both are hard-coded absolute paths; adjust them to your environment before running

jobs_dir = '/cifs/butler/HBN_data/TD_test_set_output/code/jobs' #contains .out files; scanned below for "*.out*" matches
pp_out = '/cifs/butler/HBN_data/preprocessing/postprocessing_outputs' #save .csv here; the out-size_*.csv tables are written to this directory

In [3]:
#define functions

def list_files(path, *keywords): #keyword is type of file (e.g., "*T1w.nii.gz*", "*bold.nii.gz*", "*fMRI_epi.nii.gz*")
    """Return the base names of entries in *path* that match any glob pattern.

    The search is done against the joined path instead of calling
    ``os.chdir``, so the caller's working directory is left untouched.

    Args:
        path: Directory to search.
        *keywords: One or more glob patterns (e.g. "*.out*"). Results are
            concatenated in the order the patterns are given; an entry that
            matches several patterns appears once per matching pattern.

    Returns:
        List of base names (no directory part). Empty if nothing matches or
        no pattern is given.

    Raises:
        FileNotFoundError: If *path* is not an existing directory.
    """
    if not os.path.isdir(path):
        raise FileNotFoundError(f"No such directory: {path!r}")
    # Escape the directory part so glob metacharacters in it ("[", "*", "?")
    # are taken literally; only the keyword is treated as a pattern.
    base = glob.escape(os.fspath(path))
    files = []
    for keyword in keywords:
        files.extend(glob.glob(os.path.join(base, keyword)))
    return [os.path.basename(file) for file in files] #returns list of files in given directory


In [4]:
#build one row per .out file: file name, participant ID found in its text, size in kB
out_list = list_files(jobs_dir, "*.out*") #get .out filenames (base names only)

out_ids = [] #participant ID parsed from each file
out_size = [] #size of each .out file in kB
for file in out_list:
    out_path = os.path.join(jobs_dir, file)
    #errors="replace": one undecodable byte in a log should not abort the whole scan
    with open(out_path, errors="replace") as f:
        data = f.read() #read .out file contents
    #the ID is the 12 characters after "participant_label "; when the marker is
    #missing partition() yields "" and the row is dropped by the NDA filter below
    out_ids.append(data.partition("participant_label ")[2][:12])
    out_size.append(os.stat(out_path).st_size / 1000) #convert bytes to kB

#combine lists into df
size_df = pd.DataFrame({'file_name': out_list, 'p_id': out_ids, 'size_kb': out_size})

#clean df
mask = size_df["p_id"].str.startswith("NDA") #remove non-IDs (does not start with NDA)
clean_df = size_df[mask]

#keep, for each participant, the row of their largest .out file.
#sorting by size (descending) within each ID and keeping the first row keeps
#file_name and size_kb paired; a column-wise groupby().max() would instead pair
#the largest size with the alphabetically last file name.
max_df = (
    clean_df.sort_values(['p_id', 'size_kb'], ascending=[True, False])
    .drop_duplicates('p_id')
    .reset_index(drop=True)[['p_id', 'size_kb', 'file_name']]
)

#add corresponding categories based on description above
#the first matching condition wins; 200 and 400 kB are included in the
#partial/error band so boundary sizes are not left as 'unknown'.
#sizes from 10 kB up to (but not including) 200 kB stay 'unknown'.
status_rules = [
    (max_df['size_kb'] < 10, 'not started'),
    (max_df['size_kb'] > 400, 'likely complete'),
    (max_df['size_kb'].between(200, 400), 'partial/error'),
]
max_df['status'] = np.select(
    [rule.to_numpy() for rule, _ in status_rules],
    [label for _, label in status_rules],
    default='unknown',
)

In [5]:
#full table: one row per participant
max_df.to_csv(pp_out + '/out-size_all.csv', sep=',', index=False)

#participants that still need attention (anything not flagged as likely complete by status label)
needs_rerun = max_df["status"].isin(["not started", "unknown", "partial/error"])
df_incomp = max_df[needs_rerun]
df_incomp.to_csv(pp_out + '/out-size_incomp.csv', sep=',', index=False)

#participants whose largest .out file suggests fmriprep finished
finished = max_df["status"].eq("likely complete")
df_comp = max_df[finished]
df_comp.to_csv(pp_out + '/out-size_comp.csv', sep=',', index=False)