### MR files are nested in directories; flatten the structure

In [2]:
import glob
import os
from os import write

import pandas as pd

In [106]:
# Root directory that holds every dataset artefact used in this notebook
dataset_base_path = "/media/jfallmann/T9/University/master_thesis/dataset"

# MRI data: raw files, BIDS-organised tree and FastSurfer output all live below <base>/mri
mri_base_path = dataset_base_path + "/mri"
mri_raw_path = mri_base_path + "/raw"
mri_bids_path = mri_base_path + "/bids"
mri_fastsurfer_out = mri_base_path + "/processed"

# Raw SNP (.vcf) files and tabular CSV data
snp_raw_path = dataset_base_path + "/snp/raw"
tables_path = dataset_base_path + "/tables"

In [107]:
# Recursively collect every .nii file below the raw MRI directory
nii_pattern = f'{mri_raw_path}/**/*.nii'
mr_input_files = glob.glob(nii_pattern, recursive=True)
print(f"Found {len(mr_input_files)} MR files")

Found 2248 MR files


In [108]:
## are there any directories with multiple files?
from collections import defaultdict

# Map each parent directory to the MR files found directly inside it
dir_files = defaultdict(list)
for nii_path in mr_input_files:
    parent_dir = os.path.dirname(nii_path)
    dir_files[parent_dir].append(nii_path)

# Only directories holding more than one file are reported
for directory, contents in dir_files.items():
    if len(contents) <= 1:
        continue
    print(directory, contents)

print("Done")

Done


In [109]:
def clean_id(id: str) -> str:
    """Return ``id`` lower-cased with every ``_``, ``.`` and ``-`` removed."""
    # The third argument of maketrans lists the characters to delete
    strip_separators = str.maketrans("", "", "_.-")
    return id.translate(strip_separators).lower()

In [110]:
import re
# Get all subject ids where genome data is available
genome_files = glob.glob(f'{snp_raw_path}/**/*.vcf', recursive=True)
genome_split_regex = r'(.*)_SNP'

genome_subjects = []
for vcf_path in genome_files:
    # the (greedy) capture group holds everything in the file name before "_SNP"
    raw_subject = re.search(genome_split_regex, os.path.basename(vcf_path)).group(1)
    genome_subjects.append(clean_id(raw_subject))

print(f"Found {len(genome_subjects)} subjects with genome data")

Found 809 subjects with genome data


In [113]:
banned_subjects = []

# banned_subjects.txt holds one subject id per line; surrounding whitespace is stripped
with open(f'{dataset_base_path}/banned_subjects.txt') as subjects_file:
    banned_subjects.extend(entry.strip() for entry in subjects_file)

# Keep the genome subjects, in their original order, that are not banned
allowed_subjects = []
for subject in genome_subjects:
    if subject in banned_subjects:
        continue
    allowed_subjects.append(subject)

print(f"Found {len(allowed_subjects)} subjects after removing banned subjects")

Found 806 subjects after removing banned subjects


In [114]:
# files have the structure ADNI/<subject-id>/<preprocessing-name>/<session-id>/<some-id>/<filename>.nii
# For every allowed subject exactly one file is kept:
#   1. the file from the most recent session wins; session ids are date strings of the form
#      YYYY-MM-DD_HH_MM_SS.0, so comparing them as strings orders them chronologically
#   2. within the same session, the file with the longest preprocessing folder name wins
#   3. on a complete tie the file seen first is kept

def _adni_path_parts(path):
    """Split an ADNI file path into (subject_id, preprocessing_name, session_id).

    The components are located relative to the 'ADNI' directory of *this* path, so
    the result does not depend on where any other file lives. The subject id is
    normalised with clean_id; the other two components are returned verbatim.
    Raises ValueError if the path has no 'ADNI' component.
    """
    parts = path.split('/')
    adni_index = parts.index('ADNI')
    return clean_id(parts[adni_index + 1]), parts[adni_index + 2], parts[adni_index + 3]


# plain dict: a missing subject must not silently materialise as an empty string
files_per_subject = {}

for file in mr_input_files:
    subject_id, preprocessing_name, session_id = _adni_path_parts(file)

    # skip if subject is not allowed (no genome data, or listed in banned_subjects.txt)
    if subject_id not in allowed_subjects:
        print(f"Skipping {subject_id} as no genome data is available")
        continue

    # get current file stored for subject
    current_file = files_per_subject.get(subject_id)
    if current_file is None:
        files_per_subject[subject_id] = file
        continue

    # parse the stored file with its own ADNI index rather than reusing the new file's
    _, current_preprocessing_name, current_session_id = _adni_path_parts(current_file)

    if session_id > current_session_id:
        files_per_subject[subject_id] = file
    elif session_id == current_session_id:
        if len(preprocessing_name) > len(current_preprocessing_name):
            files_per_subject[subject_id] = file

print(f"Found {len(files_per_subject)} subjects with MR data")

Skipping 099s4124 as no genome data is available
Skipping 153s4165 as no genome data is available
Skipping 003s4142 as no genome data is available
Skipping 137s4227 as no genome data is available
Skipping 094s4295 as no genome data is available
Skipping 109s4378 as no genome data is available
Skipping 041s4375 as no genome data is available
Skipping 941s4377 as no genome data is available
Skipping 941s4377 as no genome data is available
Skipping 109s4380 as no genome data is available
Skipping 041s4495 as no genome data is available
Skipping 109s4471 as no genome data is available
Skipping 016s4601 as no genome data is available
Skipping 137s4623 as no genome data is available
Skipping 137s4587 as no genome data is available
Skipping 109s4594 as no genome data is available
Skipping 109s4594 as no genome data is available
Skipping 021s4633 as no genome data is available
Skipping 003s4644 as no genome data is available
Skipping 024s4674 as no genome data is available
Skipping 029s4652 as

In [115]:
# create a BIDS dataset below mri_bids_path
# resulting layout:
#   <mri_bids_path>/sub-<subject_id>/ses-<session_id>/anat/sub-<subject_id>_ses-<session_id>_T1w.nii
#
# NOTE: os.rename MOVES each selected file out of the raw directory. After this cell has
# run, the raw tree no longer contains those files, so re-running the notebook from the
# top will select different files for the same subjects.

os.makedirs(mri_bids_path, exist_ok=True)

for subject_id, filepath in files_per_subject.items():
    file_splits = filepath.split('/')

    adni_index = file_splits.index('ADNI')

    # clean_id strips "-", "_" and "." so the session label is purely alphanumeric
    session_id = clean_id(file_splits[adni_index + 3])

    folder = f'{mri_bids_path}/sub-{subject_id}/ses-{session_id}/anat'
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(folder, exist_ok=True)

    new_file = f'{folder}/sub-{subject_id}_ses-{session_id}_T1w.nii'
    os.rename(filepath, new_file)

print("Done, created BIDS dataset")

Done, created BIDS dataset


In [116]:
# get all subject ids in the bids dataset
subject_dirs = glob.glob(f'{mri_bids_path}/sub-*')

bids_subjects = []
for subject_dir in subject_dirs:
    # directory names look like sub-<subject_id>; drop the "sub-" marker
    dir_name = os.path.basename(subject_dir)
    bids_subjects.append(dir_name.replace("sub-",""))

print(f"Found {len(bids_subjects)} subjects in BIDS dataset")

Found 410 subjects in BIDS dataset


In [117]:
# write list of subjects to dataset folder, one subject id per line
subjects_file_path = f"{dataset_base_path}/subjects.txt"
with open(subjects_file_path, "w") as f:
    f.writelines(f"{subject}\n" for subject in bids_subjects)

## Create batch file for FastSurfer

In [119]:
# find all subjects not already processed
# every entry directly below the FastSurfer output directory is named after a subject
processed_folders = glob.glob(f"{mri_fastsurfer_out}/*")

processed_subjects = []
for processed_folder in processed_folders:
    processed_subjects.append(processed_folder.split("/")[-1])

# (disabled) move all unused processed files to tmp folder
#for processed_file, processed_subject in zip(processed_folders, processed_subjects):
#    if processed_subject not in genome_subjects:
#        os.rename(processed_file, f"{mri_fastsurfer_out}_temp/{processed_subject}")


In [120]:
# fastsurfer batchfile format is as follows:
# <subject-id>=<path-to-mr-file>

mr_files = glob.glob(f'{mri_bids_path}/**/*.nii', recursive=True)
docker_filepath = "/data"

with open(f"{mri_bids_path}/fastsurfer_batch.txt", "w") as f:
    for filepath in mr_files:
        # replace current filesystem path with docker path
        docker_file = filepath.replace(mri_bids_path, docker_filepath)
        # docker_file looks like /data/sub-<id>/..., so path component 2 is the subject folder
        subject_folder = docker_file.split('/')[2]
        subject_id = clean_id(subject_folder.replace("sub-", ""))
        # only subjects without an existing output folder are queued
        if subject_id not in processed_subjects:
            f.write(f"{subject_id}={docker_file}\n")

In [None]:
# Run preprocessing pipeline
#! smriprep-docker ./data/mri/adni_thesis ./data_processed/mri participant --fs-license-file ./license.txt --ncpus 4 --skull-strip-mode force

In [None]:
# create fastsurfer output directory
# exist_ok makes this a no-op when the directory is already there and avoids the
# check-then-create race of a separate os.path.exists test
os.makedirs(mri_fastsurfer_out, exist_ok=True)

In [247]:
# Run the deepmi/fastsurfer:latest image with /fastsurfer/brun_fastsurfer.sh as entrypoint.
# Host directories are mounted as:
#   .../dataset/mri/bids      -> /data        (holds fastsurfer_batch.txt, used as --subject_list)
#   .../dataset/mri/processed -> /output      (passed as --sd)
#   .../master_thesis/license -> /fs_license  (license.txt, passed as --fs_license)
# The container runs as the calling user (--user $(id -u):$(id -g)).
# NOTE(review): the host paths repeat the values of mri_bids_path / mri_fastsurfer_out
# literally; keep them in sync with the path configuration cell.
# NOTE(review): --seg_only / --no_cereb / --no_hypothal presumably limit the run to the
# segmentation step without cerebellum and hypothalamus sub-segmentation — confirm
# against the FastSurfer documentation.
!docker run --privileged --gpus all -v /media/jfallmann/T9/University/master_thesis/dataset/mri/bids:/data -v /media/jfallmann/T9/University/master_thesis/dataset/mri/processed:/output -v /media/jfallmann/T9/University/master_thesis/license:/fs_license --entrypoint "/fastsurfer/brun_fastsurfer.sh" --rm --user $(id -u):$(id -g) deepmi/fastsurfer:latest --fs_license /fs_license/license.txt --subject_list /data/fastsurfer_batch.txt --sd /output --parallel --3T --seg_only --no_cereb --no_hypothal

/fastsurfer/brun_fastsurfer.sh --fs_license /fs_license/license.txt --subject_list /data/fastsurfer_batch.txt --sd /output --parallel --3T --seg_only --no_cereb --no_hypothal
Tue, 07 Jan 2025 13:30:00 +0000

INFO: run_fastsurfer not explicitly specified, using $FASTSURFER_HOME/run_fastsurfer.sh.
098s4050=/data/sub-098s4050/ses-201105271516560/anat/sub-098s4050_ses-201105271516560_T1w.nii 011s4075=/data/sub-011s4075/ses-201206221215290/anat/sub-011s4075_ses-201206221215290_T1w.nii 041s4051=/data/sub-041s4051/ses-201207121118550/anat/sub-041s4051_ses-201207121118550_T1w.nii 068s4134=/data/sub-068s4134/ses-201210151304380/anat/sub-068s4134_ses-201210151304380_T1w.nii 123s4127=/data/sub-123s4127/ses-201208030941490/anat/sub-123s4127_ses-201208030941490_T1w.nii 005s4168=/data/sub-005s4168/ses-201209060947300/anat/sub-005s4168_ses-201209060947300_T1w.nii 023s4122=/data/sub-023s4122/ses-201303051114060/anat/sub-023s4122_ses-201303051114060_T1w.nii 023s4115=/data/sub-023s4115/ses-201210021

In [122]:
# Same FastSurfer invocation as the previous docker cell, but without --privileged and
# with /data/fastsurfer_batch2.txt as the subject list.
# NOTE(review): no cell in this notebook writes fastsurfer_batch2.txt — presumably it was
# created by hand (e.g. a split of fastsurfer_batch.txt); confirm before re-running.
!docker run --gpus all -v /media/jfallmann/T9/University/master_thesis/dataset/mri/bids:/data -v /media/jfallmann/T9/University/master_thesis/dataset/mri/processed:/output -v /media/jfallmann/T9/University/master_thesis/license:/fs_license --entrypoint "/fastsurfer/brun_fastsurfer.sh" --rm --user $(id -u):$(id -g) deepmi/fastsurfer:latest --fs_license /fs_license/license.txt --subject_list /data/fastsurfer_batch2.txt --sd /output --parallel --3T --seg_only --no_cereb --no_hypothal

/fastsurfer/brun_fastsurfer.sh --fs_license /fs_license/license.txt --subject_list /data/fastsurfer_batch2.txt --sd /output --parallel --3T --seg_only --no_cereb --no_hypothal
Tue, 07 Jan 2025 02:28:33 +0000

INFO: run_fastsurfer not explicitly specified, using $FASTSURFER_HOME/run_fastsurfer.sh.
072s4610=/data/sub-072s4610/ses-201304090730570/anat/sub-072s4610_ses-201304090730570_T1w.nii 099s4104=/data/sub-099s4104/ses-201207110807380/anat/sub-099s4104_ses-201207110807380_T1w.nii 128s4609=/data/sub-128s4609/ses-201204041828370/anat/sub-128s4609_ses-201204041828370_T1w.nii 041s4629=/data/sub-041s4629/ses-201304221158430/anat/sub-041s4629_ses-201304221158430_T1w.nii 070s4692=/data/sub-070s4692/ses-201205081035540/anat/sub-070s4692_ses-201205081035540_T1w.nii 037s4030=/data/sub-037s4030/ses-201205141108430/anat/sub-037s4030_ses-201205141108430_T1w.nii 014s4080=/data/sub-014s4080/ses-201206211033430/anat/sub-014s4080_ses-201206211033430_T1w.nii 016s4009=/data/sub-016s4009/ses-20120718

In [248]:
# Structure names whose volumes are extracted from each subject's aseg+DKT.stats file.
# Each entry is matched against the "StructName" column of the stats table, and the list
# order defines the column order of the final volume matrix — do not reorder.
structs_of_interest = ['Left-Cerebral-White-Matter',
 'Left-Lateral-Ventricle',
 'Left-Inf-Lat-Vent',
 'Left-Cerebellum-White-Matter',
 'Left-Cerebellum-Cortex',
 'Left-Thalamus',
 'Left-Caudate',
 'Left-Putamen',
 'Left-Pallidum',
 '3rd-Ventricle',
 '4th-Ventricle',
 'Brain-Stem',
 'Left-Hippocampus',
 'Left-Amygdala',
 'CSF',
 'Left-Accumbens-area',
 'Left-VentralDC',
 'Left-choroid-plexus',
 'Right-Cerebral-White-Matter',
 'Right-Lateral-Ventricle',
 'Right-Inf-Lat-Vent',
 'Right-Cerebellum-White-Matter',
 'Right-Cerebellum-Cortex',
 'Right-Thalamus',
 'Right-Caudate',
 'Right-Putamen',
 'Right-Pallidum',
 'Right-Hippocampus',
 'Right-Amygdala',
 'Right-Accumbens-area',
 'Right-VentralDC',
 'Right-choroid-plexus',
 'WM-hypointensities',
 'ctx-lh-caudalanteriorcingulate',
 'ctx-lh-caudalmiddlefrontal',
 'ctx-lh-cuneus',
 'ctx-lh-entorhinal',
 'ctx-lh-fusiform',
 'ctx-lh-inferiorparietal',
 'ctx-lh-inferiortemporal',
 'ctx-lh-isthmuscingulate',
 'ctx-lh-lateraloccipital',
 'ctx-lh-lateralorbitofrontal',
 'ctx-lh-lingual',
 'ctx-lh-medialorbitofrontal',
 'ctx-lh-middletemporal',
 'ctx-lh-parahippocampal',
 'ctx-lh-paracentral',
 'ctx-lh-parsopercularis',
 'ctx-lh-parsorbitalis',
 'ctx-lh-parstriangularis',
 'ctx-lh-pericalcarine',
 'ctx-lh-postcentral',
 'ctx-lh-posteriorcingulate',
 'ctx-lh-precentral',
 'ctx-lh-precuneus',
 'ctx-lh-rostralanteriorcingulate',
 'ctx-lh-rostralmiddlefrontal',
 'ctx-lh-superiorfrontal',
 'ctx-lh-superiorparietal',
 'ctx-lh-superiortemporal',
 'ctx-lh-supramarginal',
 'ctx-lh-transversetemporal',
 'ctx-lh-insula',
 'ctx-rh-caudalanteriorcingulate',
 'ctx-rh-caudalmiddlefrontal',
 'ctx-rh-cuneus',
 'ctx-rh-entorhinal',
 'ctx-rh-fusiform',
 'ctx-rh-inferiorparietal',
 'ctx-rh-inferiortemporal',
 'ctx-rh-isthmuscingulate',
 'ctx-rh-lateraloccipital',
 'ctx-rh-lateralorbitofrontal',
 'ctx-rh-lingual',
 'ctx-rh-medialorbitofrontal',
 'ctx-rh-middletemporal',
 'ctx-rh-parahippocampal',
 'ctx-rh-paracentral',
 'ctx-rh-parsopercularis',
 'ctx-rh-parsorbitalis',
 'ctx-rh-parstriangularis',
 'ctx-rh-pericalcarine',
 'ctx-rh-postcentral',
 'ctx-rh-posteriorcingulate',
 'ctx-rh-precentral',
 'ctx-rh-precuneus',
 'ctx-rh-rostralanteriorcingulate',
 'ctx-rh-rostralmiddlefrontal',
 'ctx-rh-superiorfrontal',
 'ctx-rh-superiorparietal',
 'ctx-rh-superiortemporal',
 'ctx-rh-supramarginal',
 'ctx-rh-transversetemporal',
 'ctx-rh-insula']

In [249]:
# Load the UCSFFSX7 CSV from the tables directory and display it
fs_table_path = f"{tables_path}/UCSFFSX7_06Jan2025.csv"
fs_table = pd.read_csv(fs_table_path)
fs_table

Unnamed: 0,PHASE,PTID,RID,VISCODE,VISCODE2,IMAGEUID,EXAMDATE,RUNDATE,STATUS,FSVER,...,ST147SV,ST148SV,ST149SV,ST150SV,ST151SV,ST152SV,ST153SV,ST154SV,ST155SV,update_stamp
0,ADNI3,002_S_4213,4213,init,m72,888008,2017-08-14,2022-07-23,partial,7.2.0,...,217434.885092,217173.802571,434608.687663,175240.0,171111.0,346351.0,47936.0,576920.687663,874725.0,2024-12-02 08:27:54.0
1,ADNI2,116_S_4453,4453,v02,scmri,281425,2012-01-18,2022-10-05,partial,7.2.0,...,224522.552452,221805.186019,446327.738471,225291.0,221178.0,446469.0,50711.0,599707.738471,984072.0,2024-12-02 08:27:54.0
2,ADNI2,099_S_4104,4104,v11,m12,322667,2012-07-11,2022-09-26,partial,7.2.0,...,219930.656696,224579.475317,444510.132013,199702.0,200555.0,400257.0,52008.0,596412.132013,928837.0,2024-12-02 08:27:54.0
3,ADNI2,073_S_2153,2153,v21,m36,417092,2014-03-03,2022-08-14,partial,7.2.0,...,200725.779310,201273.901030,401999.680340,174186.0,176432.0,350618.0,49603.0,538118.680340,846706.0,2024-12-02 08:27:54.0
4,ADNI2,137_S_4303,4303,v04,m03,288921,2012-02-27,2022-10-05,partial,7.2.0,...,198949.362921,200313.596282,399262.959202,202949.5,196986.5,399936.0,47355.0,543084.959202,902306.0,2024-12-02 08:27:54.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6326,ADNI2,021_S_5237,5237,v02,scmri,379938,2013-06-27,2022-09-11,partial,7.2.0,...,212252.638922,208494.814443,420747.453366,211465.5,212177.5,423643.0,51856.0,584021.453366,937760.0,2024-12-02 08:27:54.0
6327,ADNI2,100_S_4556,4556,v02,scmri,305408,2012-04-26,2022-08-14,partial,7.2.0,...,224566.873931,229790.009048,454356.882979,223665.5,227203.5,450869.0,53580.0,603257.882979,999649.0,2024-12-02 08:27:54.0
6328,ADNI3,005_S_6093,6093,sc,sc,924228,2017-10-27,2022-07-23,partial,7.2.0,...,237455.319821,235727.874558,473183.194379,214672.5,213593.5,428266.0,50758.0,630561.194379,967388.0,2024-12-02 08:27:54.0
6329,ADNI3,129_S_6852,6852,y2,m24,1547010,2022-02-21,2023-08-29,partial,7.2.0,...,202401.632540,203515.760212,405917.392752,189276.5,185867.5,375144.0,48543.0,573948.392752,885358.0,2024-12-02 08:27:54.0


In [250]:
subjects_in_order = []

# subjects.txt holds one subject id per line; the file order is preserved
with open(f'{dataset_base_path}/subjects.txt') as subjects_file:
    subjects_in_order.extend(entry.strip() for entry in subjects_file)

In [251]:
from datetime import datetime
bids_files = glob.glob(f"{mri_bids_path}/**/*.nii", recursive=True)

# subject id -> date of the session found in the BIDS tree
session_dates = {}

for file in bids_files:
    # re.search returns the first occurrence of each pattern in the full path
    subject_id = re.search(r'sub-(\w+)', file).group(1)
    session_id = re.search(r'ses-(\w+)', file).group(1)

    # the first eight characters of the session label are parsed as YYYYMMDD
    date_part = session_id[:8]
    session_dates[subject_id] = datetime.strptime(date_part, '%Y%m%d')

In [252]:
fs_table['EXAMDATE'] = pd.to_datetime(fs_table['EXAMDATE'], format='%Y-%m-%d')

# ignore all rows where ST10CV (the value used as eTIV below) is nan
fs_table = fs_table[fs_table['ST10CV'].notna()]

# ignore all rows where the exam date is nan
fs_table = fs_table[fs_table['EXAMDATE'].notna()]

# Normalise the PTID column once instead of once per subject inside the loop
ptid_clean = fs_table['PTID'].map(clean_id)

etiv_per_subject = {}

# for each subject, take ST10CV from the row whose exam date is closest to the session date
for subject_id in subjects_in_order:
    etiv = fs_table[ptid_clean == subject_id]
    if etiv.empty:
        print(f"Subject {subject_id} has no ETIV")
        continue

    # absolute time difference between every exam of this subject and its MR session
    date_distance = (etiv['EXAMDATE'] - session_dates[subject_id]).abs()
    # argmin returns the position of the first minimum, so ties resolve deterministically
    closest_pos = date_distance.to_numpy().argmin()
    etiv_per_subject[subject_id] = etiv['ST10CV'].iloc[closest_pos]

In [253]:
# collect all processed files and generate feature vectors with volume information
#
# For every subject folder in the FastSurfer output directory, read stats/aseg+DKT.stats
# and build one vector holding, for each entry of structs_of_interest (in that order),
# the structure volume divided by the subject's eTIV.

processed_folders = glob.glob(f"{mri_fastsurfer_out}/*")

# column names assigned to the whitespace-separated data rows of the stats file
aseg_columns = ["Index", "SegId", "NVoxels", "Volume_mm3", "StructName", "normMean", "normStdDev", "normMin", "normMax", "normRange"]

volumes_per_subject = {}
for processed_folder in processed_folders:
    subject_id = processed_folder.split("/")[-1]
    stats_path = os.path.join(processed_folder, "stats/aseg+DKT.stats")
    if not os.path.exists(stats_path):
        print(f"Stats folder does not exist for {processed_folder}")
        continue

    # cheap check first: without an eTIV the volumes cannot be normalised
    if subject_id not in etiv_per_subject:
        print(f"No etiv found for subject {subject_id}")
        continue
    etiv_volume = etiv_per_subject[subject_id]

    # header lines of the stats file start with '#'; skipping them as comments avoids
    # depending on a fixed header length (previously a hard-coded skiprows=65)
    aseg_csv = pd.read_csv(stats_path, sep=r'\s+', comment='#', names=aseg_columns)

    volumes = []
    for structure in structs_of_interest:
        matches = aseg_csv.loc[aseg_csv["StructName"] == structure, "Volume_mm3"]
        if matches.empty:
            print(f"Volume not found for structure {structure} in {subject_id}")
            # NaN placeholder keeps the vector aligned with structs_of_interest; a shorter
            # vector could not be assigned to a row of the final volume matrix
            volumes.append(float("nan"))
            continue

        volume = matches.iloc[0]
        if volume > etiv_volume:
            print(f"Volume for structure {structure} and subject {subject_id} is larger than etiv {etiv_volume}")
        volumes.append(volume / etiv_volume)
    volumes_per_subject[subject_id] = volumes


Stats folder does not exist for /media/jfallmann/T9/University/master_thesis/dataset/mri/processed/024s4169
Stats folder does not exist for /media/jfallmann/T9/University/master_thesis/dataset/mri/processed/137s4331
No etiv found for subject 023s4035
No etiv found for subject 003s4373


In [254]:
# create a numpy array in order of subjects with all volumes
import numpy as np

# one row per subject (in subjects.txt order), one column per structure of interest
final_volumes = np.zeros((len(subjects_in_order), len(structs_of_interest)))

for row_index, subject in enumerate(subjects_in_order):
    if subject in volumes_per_subject:
        final_volumes[row_index] = np.asarray(volumes_per_subject[subject])
    else:
        # the row of a subject without volumes stays all-zero
        print(f"subject {subject} not found")

subject 024s4169 not found
subject 137s4331 not found


In [255]:
# Display the assembled volume matrix (one row per subject, one column per structure)
final_volumes

array([[0.1527678 , 0.0123619 , 0.00023059, ..., 0.00552149, 0.00054639,
        0.00358952],
       [0.07733025, 0.00861479, 0.00097355, ..., 0.00307814, 0.00037573,
        0.00244975],
       [0.1372052 , 0.02000124, 0.00037388, ..., 0.00546184, 0.00047201,
        0.00430054],
       ...,
       [0.1456701 , 0.01478447, 0.00153851, ..., 0.00553334, 0.0007862 ,
        0.0036883 ],
       [0.16584519, 0.01207906, 0.00039125, ..., 0.00491391, 0.00052085,
        0.00389304],
       [0.17378866, 0.00530221, 0.00031831, ..., 0.007226  , 0.0005726 ,
        0.0040541 ]])

In [256]:
# Persist the volume matrix as a .npy file in the MRI base directory
volumes_out_path = f"{mri_base_path}/processed_volumes.npy"
np.save(volumes_out_path, final_volumes)