# Notebook to take a first look at the HBN data and classify subjects w/ and w/out ADHD

In [1]:
import numpy as np
import pandas as pd
from glob import glob
import os

In [2]:
# Root directory that is globbed for per-subject sub-directories.
# The trailing '/' matters: later code appends patterns to this string directly.
brain_dir='/nobackup/scratch/Mon/jsmentch/hbn_cifti_cleaned/smoothed/'

### List all subjects to see who has a diagnosis (dx) and who does not

In [3]:
#load data
# Three arrays saved by an earlier parsing step (not shown in this notebook):
#   all_dx   - later used as the column labels of the diagnosis table
#   sub_list - later searched by subject id to find a subject's position
#   dx_array - later indexed with that position to fetch the subject's row
# NOTE(review): the row order of dx_array is assumed to match sub_list, and the
# column order to match all_dx - confirm against the script that wrote them.
all_dx = np.load('../sourcedata/data/HBN/phenotype/parsed/dx_list.npy')
sub_list = np.load('../sourcedata/data/HBN/phenotype/parsed/sub_list.npy')
dx_array = np.load('../sourcedata/data/HBN/phenotype/parsed/dx_onehot.npy')

In [4]:
# Convert to a plain Python list so that .index() can be used for the
# subject lookup further down. Re-running this cell is harmless.
sub_list = list(sub_list)

In [5]:
# Every directory directly under brain_dir (the pattern ends in '/', so only
# directories match and each result carries a trailing separator).
subdirs = glob(f"{brain_dir}*/")
# Drop that trailing separator, then keep only the final path component.
stripped = [os.path.basename(folder[:-1]) for folder in subdirs]
# Discard the first four characters of each directory name to get the bare id.
sub_ids = [dirname[4:] for dirname in stripped]

### How many subjects have dx data?

In [6]:
# Match every subject found on disk (sub_ids) against the phenotype subject
# list and collect the diagnosis row of each subject that is present.
dx_count = 0   # subjects from sub_ids that were found in sub_list
dne_count = 0  # subjects from sub_ids that were not found in sub_list
dx_array_RU = []  # not filled in this cell; kept in case another cell reads it

sub_list_w_dx = [] # ru subjects with diagnostic data available
dx_list_w_dx = [] # corresponding list of dx one hot data

for s in sub_ids:
    try:
        ind = sub_list.index(s)
    except ValueError:
        # list.index raises ValueError when the id is absent. Only that case
        # means "no dx data"; any other error (e.g. sub_list and dx_array being
        # out of step) is left to propagate instead of being miscounted.
        dne_count = dne_count+1
    else:
        # Fetch the row before touching any list so the two output lists
        # always stay the same length.
        dx_row = dx_array[ind]
        dx_count = dx_count+1
        sub_list_w_dx.append(sub_list[ind])
        dx_list_w_dx.append(dx_row)
print(f"no dx data for: {dne_count}")
print(f"dx data for: {dx_count}")

no dx data for: 513
dx data for: 391


Less than half at the moment: 391/904 ≈ 43% (as of 7/13/21).

In [7]:
# One row per matched subject, one column per diagnosis label in all_dx.
dx_df = pd.DataFrame(dx_list_w_dx).set_axis(all_dx, axis=1)

In [8]:
# Column-wise total: one value per diagnosis label.
dx_sum = dx_df.sum()
# to_string() renders the Series as plain text for printing.
print(dx_sum.to_string())

ADHD-Combined Type                                                              90.0
ADHD-Hyperactive/Impulsive Type                                                 12.0
ADHD-Inattentive Type                                                           95.0
Acute Stress Disorder                                                            0.0
Adjustment Disorders                                                             9.0
Agoraphobia                                                                      1.0
Alcohol Use Disorder                                                             0.0
Autism Spectrum Disorder                                                        49.0
Avoidant/Restrictive Food Intake Disorder                                        0.0
Binge-Eating Disorder                                                            1.0
Bipolar I Disorder                                                               0.0
Bipolar II Disorder                                              

### Get subjects with no dx and subjects with ASD, since the two groups are about balanced

In [9]:
# Look up the column positions of two diagnosis labels.
# Only the value of the last expression in a cell is displayed, so the result
# of the first lookup is computed and then discarded.
list(all_dx).index("Autism Spectrum Disorder")
list(all_dx).index("No Diagnosis Given")

35

In [20]:
# Split the matched subjects into groups according to their one-hot dx row.
# Column positions are resolved from the label names in all_dx rather than
# hard-coded, so a change in label order cannot silently select another column.
dx_labels = list(all_dx)
nt_col = dx_labels.index("No Diagnosis Given")
asd_col = dx_labels.index("Autism Spectrum Disorder")
adhd_c_col = dx_labels.index("ADHD-Combined Type")

asd_sub_list = [] # list of RU subject ids w asd
asd_dx_list = []  # list of RU subjects w asd full dx
nt_sub_list = []  # list of RU subjects w no dx

adhd_c_sub_list = [] # list of RU subject ids w ADHD-Combined Type
adhd_c_dx_list = []  # list of RU subjects w ADHD-Combined Type full dx

# The checks are independent `if`s, not `elif`s: a subject whose row has more
# than one of these columns set is added to every matching group.
for i,s in enumerate(dx_list_w_dx):
    if s[nt_col]==1:
        # no diagnosis given
        nt_sub_list.append(sub_list_w_dx[i])
    if s[asd_col]==1:
        # autism spectrum disorder
        asd_sub_list.append(sub_list_w_dx[i])
        asd_dx_list.append(s)
    if s[adhd_c_col]==1:
        # ADHD, combined type
        adhd_c_sub_list.append(sub_list_w_dx[i])
        adhd_c_dx_list.append(s)
    

In [22]:
# Number of subjects collected in the no-diagnosis group.
len(nt_sub_list)

48

### Load the ASD subjects and the NT subjects, then compare them (what to compare is still to be decided)

In [11]:
# Only the last expression of a cell is displayed, so of the three lists
# below just adhd_c_sub_list is shown; the first two lines have no visible effect.
asd_sub_list
nt_sub_list
adhd_c_sub_list

['NDARVD635FX8',
 'NDARAC853DTE',
 'NDARUZ206DRV',
 'NDARZW262ZLV',
 'NDARXL023NG9',
 'NDARYN595JMA',
 'NDARML406ZB8',
 'NDARCF247TDJ',
 'NDARPW915RGD',
 'NDARFR095UJK',
 'NDARZE850WXD',
 'NDARMZ200GVD',
 'NDARLN778RYN',
 'NDAREF164ZUJ',
 'NDARKP815KPZ',
 'NDARXK076XU8',
 'NDARFA402LMW',
 'NDARWA351ZE2',
 'NDARPV303LAX',
 'NDARRA717GYV',
 'NDARUX284GGB',
 'NDARYZ909VND',
 'NDARVP135ZGE',
 'NDARTX795AKR',
 'NDARAP782TVC',
 'NDARUN221VCJ',
 'NDARDR458MR7',
 'NDARPP622WV4',
 'NDARJW373UE3',
 'NDARJZ526HN3',
 'NDARPL201YL4',
 'NDAREK801BPB',
 'NDARYG391PMU',
 'NDARXY745NXJ',
 'NDARDF568GL5',
 'NDARKJ322ELL',
 'NDARHT095YB4',
 'NDARAD232HVV',
 'NDARDC987BMU',
 'NDARXD388TTE',
 'NDARCK647MU6',
 'NDARLL790WLF',
 'NDARNR734JZH',
 'NDARFR820KFF',
 'NDARAA948VFH',
 'NDARUB231LHN',
 'NDARRL685WB7',
 'NDARTP313AGH',
 'NDARAE358VBE',
 'NDARGA499CKF',
 'NDARHN749RW4',
 'NDARLF142AF5',
 'NDARGU395RFP',
 'NDARFT881VT6',
 'NDARRV837BZQ',
 'NDARCZ915NC1',
 'NDARKW521EMY',
 'NDAREC647MKW',
 'NDARNZ792HBN

In [12]:
# How many subjects have data for The Present and for Despicable Me?

In [26]:
import os.path

# Common leading part of every subject directory; the subject id is appended directly.
prefix='/nobackup/scratch/Mon/jsmentch/hbn_cifti_cleaned/smoothed/sub-'

# For each group (ASD, no dx, ADHD-Combined - in that order) report how many
# subjects it holds and how many of them have a file on disk for each movie task.
for s in [asd_sub_list, nt_sub_list, adhd_c_sub_list]:
    total_count = DM_count = TP_count = 0
    for a in s:
        total_count += 1
        dm_file = f'{prefix}{a}/sub-{a}_clean_task-movieDM_space-fsLR_den-91k_bold.dtseries.nii'
        tp_file = f'{prefix}{a}/sub-{a}_clean_task-movieTP_space-fsLR_den-91k_bold.dtseries.nii'
        # isfile returns a bool; adding it increments the counter by 0 or 1.
        DM_count += os.path.isfile(dm_file)
        TP_count += os.path.isfile(tp_file)
    print(f'total={total_count}, the present={TP_count}, despicable me={DM_count}')

total=49, the present=46, despicable me=35
total=48, the present=46, despicable me=43
total=90, the present=84, despicable me=71
