In [4]:
import pandas as pd
import os
import glob
from pathlib import Path
#from dotenv import load_dotenv
import numpy as np
# import dask.dataframe as dd
# import dask.bag as db

import matplotlib.pyplot as plt
import seaborn as sns

## generate merged csv with all of the data demographics, diagnoses etc

In [3]:
nat_img_dir='/nese/mit/group/sig/projects/naturalistic/nat_img/'

all_qc = pd.read_csv(  f"../data/all_qc.csv"  )
all_qc = all_qc[all_qc['task'].isin(['movieDM', 'movieTP'])]

def get_imaging_site(identifier):
    path = f'/nese/mit/group/sig/projects/hbn/hbn_bids/sub-{identifier}/'
    if os.path.isdir(path + 'ses-HBNsiteRU'):
        return 'HBNsiteRU'
    elif os.path.isdir(path + 'ses-HBNsiteCBIC'):
        return 'HBNsiteCBIC'
    elif os.path.isdir(path + 'ses-HBNsiteSI'):
        return 'HBNsiteSI'
    elif os.path.isdir(path + 'ses-HBNsiteCUNY'):
        return 'HBNsiteCUNY'
    else:
        return None
        
dx_final=pd.read_csv(f'{nat_img_dir}sourcedata/data/HBN/phenotype/parsed/dx_onehot.csv')
dx_final['imaging_site'] = dx_final['Identifiers'].apply(get_imaging_site)

filtered_df = dx_final[dx_final['Autism Spectrum Disorder'] == 1]
asd_df = filtered_df[['Identifiers','imaging_site']].rename(columns={'Identifiers': 'subj_id'})
asd_df['subj_id'] = 'sub-' + asd_df['subj_id'].astype(str)

filtered_df = dx_final[dx_final['No Diagnosis Given'] == 1]
nt_df = filtered_df[['Identifiers','imaging_site']].rename(columns={'Identifiers': 'subj_id'})
nt_df['subj_id'] = 'sub-' + nt_df['subj_id'].astype(str)

filtered_df = dx_final[(dx_final['No Diagnosis Given'] == 0) & (dx_final['Autism Spectrum Disorder'] == 0)]
pilot_df = filtered_df[['Identifiers','imaging_site']].rename(columns={'Identifiers': 'subj_id'})
pilot_df['subj_id'] = 'sub-' + pilot_df['subj_id'].astype(str)


KeyboardInterrupt



In [None]:
merged_qc = all_qc.merge(dx_final, left_on='sub', right_on='Identifiers', how='inner')

In [None]:
import pandas as pd

# Initialize an empty list to store the max FD values
max_fd_values = []

for index, row in merged_qc.iterrows():
    try:
        # Generate the confounds_file path based on the row's sub, ses, and task columns
        confounds_file = f'/nese/mit/group/sig/projects/hbn/hbn_bids/derivatives/fmriprep_23.2.0/sub-{row["sub"]}/ses-{row["ses"]}/func/sub-{row["sub"]}_ses-{row["ses"]}_task-{row["task"]}_desc-confounds_timeseries.tsv'
        
        # Load the confounds file and extract the max FD value
        confounds_df = pd.read_csv(confounds_file, sep='\t')
        max_fd_value = confounds_df['framewise_displacement'].max()
        #print(max_fd_value)
        # Append the max FD value to the list
        max_fd_values.append(max_fd_value)
    except:
        max_fd_values.append('nan')
# Assign the list as a new column in the DataFrame
merged_qc['max_fd'] = max_fd_values


In [23]:
(merged_qc['max_fd'] == 'nan').sum()

690

690 have 'nan' so they do not have the confounds file available...

In [6]:
#make a column for if the t1 exists
t1_exists = []

for index, row in merged_qc.iterrows():
    # Generate the confounds_file path based on the row's sub, ses, and task columns
    t1_file=f'/nese/mit/group/sig/projects/hbn/hbn_bids/sub-{row["sub"]}/ses-{row["ses"]}/anat/sub-{row["sub"]}_ses-{row["ses"]}_*T1w.nii.gz'
    matching_files = glob.glob(t1_file)

    if matching_files:
        #print(f"{t1_file} exists")
        t1_exists.append(1)
    else:
        #print(f"{t1_file} does not exist")
        t1_exists.append(0)
# Assign the list as a new column in the DataFrame
merged_qc['t1_exists'] = t1_exists

In [8]:
merged_qc.to_csv('../data/merged_qc.csv', index=False)

## do stuff with the merged_qc data

In [5]:
merged_qc = pd.read_csv('../data/merged_qc.csv', index_col=False)

In [6]:
pd.set_option('display.max_columns', None)
print(merged_qc)

           aor       aqi                                          bids_meta  \
0     0.001634  0.011370  {'AcquisitionMatrixPE': 84, 'BandwidthPerPixel...   
1     0.030871  0.069020  {'AcquisitionMatrixPE': 84, 'BandwidthPerPixel...   
2     0.000461  0.004402  {'AcquisitionMatrixPE': 84, 'BandwidthPerPixel...   
3     0.014249  0.051829  {'AcquisitionMatrixPE': 84, 'BandwidthPerPixel...   
4     0.001283  0.010898  {'AcquisitionMatrixPE': 84, 'BandwidthPerPixel...   
...        ...       ...                                                ...   
4259  0.006022  0.011833  {'AcquisitionMatrixPE': 84, 'BandwidthPerPixel...   
4260  0.001287  0.017197  {'AcquisitionMatrixPE': 84, 'BandwidthPerPixel...   
4261  0.009926  0.018417  {'AcquisitionMatrixPE': 84, 'BandwidthPerPixel...   
4262  0.002157  0.013886  {'AcquisitionMatrixPE': 84, 'BandwidthPerPixel...   
4263  0.004356  0.013524  {'AcquisitionMatrixPE': 84, 'BandwidthPerPixel...   

      dummy_trs  dvars_nstd  dvars_std  dvars_vstd 

### how many subjects are lost to no existing t1?

In [7]:
other_qc = merged_qc[(merged_qc['No Diagnosis Given'] == 0) & (merged_qc['Autism Spectrum Disorder'] == 0)]
other_qc_DM=other_qc[(other_qc['task']=='movieDM')]
other_qc_TP=other_qc[(other_qc['task']=='movieTP')]

asd_qc = merged_qc[(merged_qc['Autism Spectrum Disorder'] == 1)]
asd_qc_DM=asd_qc[(asd_qc['task']=='movieDM')]
asd_qc_TP=asd_qc[(asd_qc['task']=='movieTP')]

control_qc = merged_qc[(merged_qc['No Diagnosis Given'] == 1)]
control_qc_DM=control_qc[(control_qc['task']=='movieDM')]
control_qc_TP=control_qc[(control_qc['task']=='movieTP')]


#total subjects

In [8]:
for df in [merged_qc, asd_qc, control_qc, other_qc]:
#    print(f'{df}')
    df_name = [name for name in globals() if globals()[name] is df][0]
    print(f"{df_name}")

    print( 'all listed:  ',len(df))
    print( 'with t1:     ', len(df[(df['t1_exists'] == 1)]) )
    print( 'max FD <2.4: ', df[df['max_fd'] < 2.4].shape[0])
    print( '\n')


merged_qc
all listed:   4264
with t1:      4229
max FD <2.4:  1519


asd_qc
all listed:   577
with t1:      570
max FD <2.4:  200


control_qc
all listed:   321
with t1:      320
max FD <2.4:  121


other_qc
all listed:   3366
with t1:      3339
max FD <2.4:  1198




In [9]:
unique_sites = merged_qc['ses'].unique()
print(unique_sites)

['HBNsiteRU' 'HBNsiteCBIC' 'HBNsiteCUNY']


In [10]:
leest=['asd','asd_DM','asd_TP',
'control','control_DM','control_TP',
'other','other_DM','other_TP',]
for i,df in enumerate([asd_qc, asd_qc_DM, asd_qc_TP, control_qc, control_qc_DM, control_qc_TP, other_qc, other_qc_DM, other_qc_TP]):
#    print(f'{df}')
    
    print(f'{leest[i]}')
    print( 'all listed:  ',len(df))
    print( 'with t1:     ', len(df[(df['t1_exists'] == 1)]) )
    print( 'max FD <2.4: ', df[df['max_fd'] < 2.4].shape[0])
    print( '\n')


asd
all listed:   577
with t1:      570
max FD <2.4:  200


asd_DM
all listed:   288
with t1:      283
max FD <2.4:  78


asd_TP
all listed:   289
with t1:      287
max FD <2.4:  122


control
all listed:   321
with t1:      320
max FD <2.4:  121


control_DM
all listed:   162
with t1:      161
max FD <2.4:  51


control_TP
all listed:   159
with t1:      159
max FD <2.4:  70


other
all listed:   3366
with t1:      3339
max FD <2.4:  1198


other_DM
all listed:   1660
with t1:      1645
max FD <2.4:  506


other_TP
all listed:   1706
with t1:      1694
max FD <2.4:  692




### pick another pilot cohort

In [12]:
#pick 30 new random subjects
with open('pilots_ru_dm_list.txt', 'r') as file:
    exclude_list = file.read().splitlines()

# Step 2: Filter the DataFrame to exclude rows where 'sub' is in the exclude_list
filtered_df = other_qc[~other_qc['sub'].isin(exclude_list)]

# Step 3: Select 26 random rows from the filtered DataFrame
random_rows = filtered_df.sample(n=26)

# Display the selected random rows
sub_column = random_rows['sub']

# Step 5: Save the 'sub' column to a new text file
sub_column.to_csv('pilots_ru_dm_list2.txt', index=False, header=False)

# Verify the output by printing the saved list (optional)
print(sub_column)

1476    NDARTF079JGU
896     NDARJZ526HN3
2439    NDARDE502TEU
3063    NDARKG859AGN
221     NDARCH795JD4
632     NDARGH425GB9
2478    NDARDR254KXZ
2947    NDARJB501MJL
1128    NDARMP745EAC
980     NDARKZ519FBT
2534    NDAREC648WEL
2241    NDARBF183RFB
864     NDARJR437DJH
1755    NDARWB149BL1
434     NDAREK255DEE
3811    NDARVB151GXF
228     NDARCJ348YVB
2779    NDARGL586HN9
2447    NDARDF568GL5
2203    NDARAV554TP2
2236    NDARBE641DGZ
1945    NDARYF272EDC
1536    NDARTX795AKR
1747    NDARWA351ZE2
494     NDARFB263RJK
3933    NDARWG820RKE
Name: sub, dtype: object
