In [1]:
import pandas as pd
import nibabel as nib
import os
import numpy as np
import matplotlib.pyplot as plt
from nilearn import image
import ants
from datetime import datetime, timedelta
import pydicom as dicom
import pickle
import matplotlib.image as mpimg
from collections import defaultdict, Counter
from scipy import stats
from glob import glob

# Lift pandas' display limits on column width and column count so the long
# file paths in the tables below are rendered in full.
for option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(option, None)

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-7h8i4fw3 because the default path (/gpfs/home/lc3424/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
Matplotlib is building the font cache; this may take a moment.


In [2]:
# Roots of the two preprocessed dataset parts; each is expected to contain
# sub-<id>/ses-<id>/t1/spm/segmentation/normalized_space/<files>.
dataset_path_list = [
    '/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_a/subjects/',
    '/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_b/subjects/',
]


def parse_subject_session(file_path):
    """Return ``(subject_id, session_id)`` taken from the directories of ``file_path``.

    The IDs are read from the first directory components named ``sub-<id>`` and
    ``ses-<id>``. Looking the components up by prefix (instead of by a fixed
    position in the split path) keeps the parsing correct when the dataset root
    sits at a different depth. Only the first hyphen is treated as the separator,
    so an ID that itself contains a hyphen is returned whole.

    Raises:
        ValueError: if the path has no ``sub-*`` or no ``ses-*`` directory.
    """
    # The file name also starts with "sub-", so only directories are inspected.
    directories = file_path.split('/')[:-1]
    subject_dir = next((d for d in directories if d.startswith('sub-')), None)
    session_dir = next((d for d in directories if d.startswith('ses-')), None)
    if subject_dir is None or session_dir is None:
        raise ValueError(f'Cannot find sub-*/ses-* directories in path: {file_path}')
    return subject_dir.split('-', 1)[1], session_dir.split('-', 1)[1]


# Collected (subject, session, file_path) tuples across all dataset parts.
dataset_file_path = []

for dataset_path in dataset_path_list:
    candidates = glob(dataset_path + '*/*/t1/spm/segmentation/normalized_space/*')
    for f in candidates:
        # condition for a T1 image as input for the model
        if 'Space_T1w.nii.gz' not in f:
            continue
        sub, ses = parse_subject_session(f)
        # Print any file whose session directory name contains an underscore.
        if '_' in ses:
            print(f)
        dataset_file_path.append((sub, ses, f))


In [27]:
# Tabulate the discovered volumes; Session is cast to int so that it can be
# joined against the integer session ids of the label tables.
file_df = pd.DataFrame(
    dataset_file_path,
    columns=['Subject', 'Session', 'Path'],
).astype({'Session': int})
file_df

Unnamed: 0,Subject,Session,Path
0,26a0b76ad3804709969fc4ee26bb6a35,146241246534,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_a/subjects/sub-26a0b76ad3804709969fc4ee26bb6a35/ses-146241246534/t1/spm/segmentation/normalized_space/sub-26a0b76ad3804709969fc4ee26bb6a35_ses-146241246534_3_20100111-Head_SAG_3D_MPR_PRE_COG-21_space-Ixi549Space_T1w.nii.gz
1,26a0b76ad3804709969fc4ee26bb6a35,146241246534,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_a/subjects/sub-26a0b76ad3804709969fc4ee26bb6a35/ses-146241246534/t1/spm/segmentation/normalized_space/sub-26a0b76ad3804709969fc4ee26bb6a35_ses-146241246534_4_20100111-SAG_MPR_RECON-22_space-Ixi549Space_T1w.nii.gz
2,23d448d44b6d405db15b2d7c39530bf1,620076860576,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_a/subjects/sub-23d448d44b6d405db15b2d7c39530bf1/ses-620076860576/t1/spm/segmentation/normalized_space/sub-23d448d44b6d405db15b2d7c39530bf1_ses-620076860576_20150804-AX_MPR_RECON-100_space-Ixi549Space_T1w.nii.gz
3,e11038a612204528b0797519337fd5a8,151767910304,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_a/subjects/sub-e11038a612204528b0797519337fd5a8/ses-151767910304/t1/spm/segmentation/normalized_space/sub-e11038a612204528b0797519337fd5a8_ses-151767910304_3_20080714-SAG_MPR_3MM-10_space-Ixi549Space_T1w.nii.gz
4,4f5e247c8ca940f59eef24ce0cee04a4,129557817883,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_a/subjects/sub-4f5e247c8ca940f59eef24ce0cee04a4/ses-129557817883/t1/spm/segmentation/normalized_space/sub-4f5e247c8ca940f59eef24ce0cee04a4_ses-129557817883_3_20081226-SAG_MPR_RECON-11_space-Ixi549Space_T1w.nii.gz
...,...,...,...
6703,63ab908c9047411998a221cc438bcf3d,837643063570,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_b/subjects/sub-63ab908c9047411998a221cc438bcf3d/ses-837643063570/t1/spm/segmentation/normalized_space/sub-63ab908c9047411998a221cc438bcf3d_ses-837643063570_20080321-SAG_3D_MPR-20_space-Ixi549Space_T1w.nii.gz
6704,63ab908c9047411998a221cc438bcf3d,308915443481,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_b/subjects/sub-63ab908c9047411998a221cc438bcf3d/ses-308915443481/t1/spm/segmentation/normalized_space/sub-63ab908c9047411998a221cc438bcf3d_ses-308915443481_20080407-AX_MPR_RECON-17_space-Ixi549Space_T1w.nii.gz
6705,63ab908c9047411998a221cc438bcf3d,168336444002,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_b/subjects/sub-63ab908c9047411998a221cc438bcf3d/ses-168336444002/t1/spm/segmentation/normalized_space/sub-63ab908c9047411998a221cc438bcf3d_ses-168336444002_3_20071228-SAG_MPR_RECON-103_space-Ixi549Space_T1w.nii.gz
6706,92a60c6773744442bf8a5384d245a4a4,227760717173,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_b/subjects/sub-92a60c6773744442bf8a5384d245a4a4/ses-227760717173/t1/spm/segmentation/normalized_space/sub-92a60c6773744442bf8a5384d245a4a4_ses-227760717173_20140521-SAG_MPR_ISO-3_space-Ixi549Space_T1w.nii.gz


In [69]:
# Build a (Subject, Session, Age) table where Age is the age at scan time:
#     scan_age = Age - (visit_date - scan_date_time) expressed in 365-day years
# NOTE(review): this presumes 'Age' is the age recorded at visit_date -- confirm
# against the data dictionary of the CSV.
#
# Columns are addressed by name throughout. The previous row-wise apply read
# them by integer position, which depends on column order and relies on a
# positional Series lookup that pandas has deprecated.
orig_label_file = (
    pd.read_csv(
        '/gpfs/home/lc3424/capstone/de_id_MRIs_Specialists_diag.csv',
        usecols=['random_pat_id', 'de-identified acc', 'Age', 'scan_date_time', 'visit_date'],
    )
    # Rows missing any of the five fields cannot yield a scan age.
    .dropna(axis=0)
    .astype({'de-identified acc': int})
    .assign(
        scan_date_time=lambda df: pd.to_datetime(df['scan_date_time']),
        visit_date=lambda df: pd.to_datetime(df['visit_date']),
    )
    .assign(
        scan_age=lambda df: df['Age']
        - (df['visit_date'] - df['scan_date_time']) / timedelta(days=365)
    )
    # Several rows may exist per (patient, accession); average their scan ages.
    .groupby(['random_pat_id', 'de-identified acc'], as_index=False)['scan_age']
    .mean()
    # Report the age in whole years.
    .assign(scan_age=lambda df: df['scan_age'].round().astype(int))
    .rename(columns={'random_pat_id': 'Subject', 'de-identified acc': 'Session', 'scan_age': 'Age'})
)
orig_label_file

Unnamed: 0,Subject,Session,Age
0,000639ac567846828f696df38fe1e260,109626762488,24
1,000639ac567846828f696df38fe1e260,189102882698,24
2,000639ac567846828f696df38fe1e260,207888400400,24
3,000639ac567846828f696df38fe1e260,327622431994,25
4,0049a2c99f4f47438db63c10bb5884ab,471741632715,74
...,...,...,...
10311,ff94c507d9e242ec81789acde8278c84,103395921560,67
10312,ff970108931c41829c901c6d8c150ede,223196674637,78
10313,ffa41e1903c74318ab119ee5dcbed437,843988142885,85
10314,ffb0a53017ca44e7b8602383fb8b1cfb,530894228721,43


In [70]:
# Load the per-session diagnosis labels and give the three columns fixed names.
label_df = pd.read_csv(
    '/gpfs/home/lc3424/capstone/2021_dementia/lc3424_workspace/experiments/20211102/label_20211102.tsv',
    sep='\t',
)
label_df.columns = ['Session', 'Subject', 'Label']
print(label_df.shape)

# Attach the scan age by session id. Only Session and Age are taken from the
# age table, so the Subject column of label_df is kept as-is and no suffixed
# duplicate has to be dropped afterwards.
label_df = pd.merge(
    label_df,
    orig_label_file[['Session', 'Age']],
    how='left',
    on='Session',
)
label_df

(3260, 3)


Unnamed: 0,Session,Subject,Label,Age
0,100027089657,f37b1d2e3cec40ba88ec39be79577f65,1,84
1,100039817943,210909b9725245c5a09e052b931447f4,0,52
2,100086429574,ad32654d20f345b0bab70ffba08df770,1,63
3,100166373876,bcc51523e7014bdda6b5867de8e7a6a5,2,69
4,100197260038,d8200ab1eb464d72998f497732d921a3,1,70
...,...,...,...,...
3255,997728775571,d71ac2a05099410b82d8a070b59ec48d,0,54
3256,998819920173,bbc73dd2480442108e5307511a1b9b75,1,54
3257,999155686841,b3ddd61266c842a9b98597579f1054df,0,56
3258,999370949215,4813b0173c2544f3b0bd987300cb8021,2,57


In [71]:
# Number of labelled sessions per class.
label_df['Label'].value_counts()

0    1475
1    1429
2     356
Name: Label, dtype: int64

In [72]:
# Left-join the labels with the discovered volumes: labelled sessions without a
# matching file keep a missing Path, and a session with several files
# contributes one row per file.
temp_df = pd.merge(label_df, file_df, how='left', on=['Subject', 'Session'])
temp_df

Unnamed: 0,Session,Subject,Label,Age,Path
0,100027089657,f37b1d2e3cec40ba88ec39be79577f65,1,84,
1,100039817943,210909b9725245c5a09e052b931447f4,0,52,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_b/subjects/sub-210909b9725245c5a09e052b931447f4/ses-100039817943/t1/spm/segmentation/normalized_space/sub-210909b9725245c5a09e052b931447f4_ses-100039817943_20120107-AX_MPR_RECON-13_space-Ixi549Space_T1w.nii.gz
2,100086429574,ad32654d20f345b0bab70ffba08df770,1,63,
3,100166373876,bcc51523e7014bdda6b5867de8e7a6a5,2,69,
4,100197260038,d8200ab1eb464d72998f497732d921a3,1,70,
...,...,...,...,...,...
4767,998819920173,bbc73dd2480442108e5307511a1b9b75,1,54,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_b/subjects/sub-bbc73dd2480442108e5307511a1b9b75/ses-998819920173/t1/spm/segmentation/normalized_space/sub-bbc73dd2480442108e5307511a1b9b75_ses-998819920173_2_20080408-AX_MPR_1MM_POST-102_space-Ixi549Space_T1w.nii.gz
4768,999155686841,b3ddd61266c842a9b98597579f1054df,0,56,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_a/subjects/sub-b3ddd61266c842a9b98597579f1054df/ses-999155686841/t1/spm/segmentation/normalized_space/sub-b3ddd61266c842a9b98597579f1054df_ses-999155686841_2_20130130-SAG_3D_T1___MS_P_MPR_Ax_mpr_recon-19_space-Ixi549Space_T1w.nii.gz
4769,999155686841,b3ddd61266c842a9b98597579f1054df,0,56,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_b/subjects/sub-b3ddd61266c842a9b98597579f1054df/ses-999155686841/t1/spm/segmentation/normalized_space/sub-b3ddd61266c842a9b98597579f1054df_ses-999155686841_20130130-SAG_3D_T1___MS_P_MPR_Ax_mpr_recon-16_space-Ixi549Space_T1w.nii.gz
4770,999370949215,4813b0173c2544f3b0bd987300cb8021,2,57,


In [85]:
# Keep only the rows in which every column is populated, i.e. drop any row
# with a missing value in at least one field.
final_df = temp_df.dropna(axis=0, how='any')
final_df.head()

Unnamed: 0,Session,Subject,Label,Age,Path
1,100039817943,210909b9725245c5a09e052b931447f4,0,52,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_b/subjects/sub-210909b9725245c5a09e052b931447f4/ses-100039817943/t1/spm/segmentation/normalized_space/sub-210909b9725245c5a09e052b931447f4_ses-100039817943_20120107-AX_MPR_RECON-13_space-Ixi549Space_T1w.nii.gz
5,100402864124,fc9d3132547b40d88b14b49f22059a7d,0,58,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_a/subjects/sub-fc9d3132547b40d88b14b49f22059a7d/ses-100402864124/t1/spm/segmentation/normalized_space/sub-fc9d3132547b40d88b14b49f22059a7d_ses-100402864124_20111201-AX_3D_MPR-15_space-Ixi549Space_T1w.nii.gz
6,100402864124,fc9d3132547b40d88b14b49f22059a7d,0,58,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_a/subjects/sub-fc9d3132547b40d88b14b49f22059a7d/ses-100402864124/t1/spm/segmentation/normalized_space/sub-fc9d3132547b40d88b14b49f22059a7d_ses-100402864124_2_20111201-AX_3D_MPR_MPR_3mm_sag_mpr-17_space-Ixi549Space_T1w.nii.gz
9,100561623079,19a7816500184206baae665fb54d9486,1,64,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_b/subjects/sub-19a7816500184206baae665fb54d9486/ses-100561623079/t1/spm/segmentation/normalized_space/sub-19a7816500184206baae665fb54d9486_ses-100561623079_20111115-SAG_3D_MPR-10_space-Ixi549Space_T1w.nii.gz
11,100705385639,4c1833793b70449da1017a8eb7592b12,2,77,/gpfs/data/razavianlab/data/mri/nyu/barlow_bids_t1_preprocess_A_part_b/subjects/sub-4c1833793b70449da1017a8eb7592b12/ses-100705385639/t1/spm/segmentation/normalized_space/sub-4c1833793b70449da1017a8eb7592b12_ses-100705385639_20100307-SAG_MPR_ISO-3_space-Ixi549Space_T1w.nii.gz


In [83]:
# Renumber the rows from 0 after filtering, then export as TSV (the new index
# is written as the first, unnamed column).
final_df = final_df.reset_index(drop=True)
final_df.to_csv(
    '/gpfs/home/lc3424/capstone/2021_dementia/lc3424_workspace/experiments/20211102/volume_label_with_file_path_with_age_20211102.tsv',
    sep='\t',
)