In [1]:
import os
from pathlib import Path
import json

In [3]:
## CONSTANTS
DATA_DIR = '/red/ruogu.fang/UKB/data/Brain'
T1_FILE_PATH = '20252_T1_NIFTI'
T2_FILE_PATH = '20253_T2_NIFTI'

In [4]:
data_path = Path(DATA_DIR)

In [5]:
def split_id_from_file(filename: str) -> str: 
    '''Split patient ID from a filename. Note that this function
    does not take into account the visit number, nor number of s
    '''
    return filename.split('_')[0]  # First section is id.

In [6]:
def ids_to_json(ids: set, outdir: os.PathLike, 
                name='GatorBrain_matched_subjects.json') -> None:
    outdir = Path(outdir)
    with open(outdir / name, 'w') as json_file:
        json.dump(list(ids), json_file)
    print(f'Saved matching subject IDs to {outdir / name}')

In [7]:
t1_path = data_path / T1_FILE_PATH
t2_path = data_path / T2_FILE_PATH / 'T2_unzip'

# T1 data is stored in two different areas
new_t1_path = t1_path / 'T1_new_unzip'
old_t1_path = t1_path / 'T1_unzip'

In [8]:
old_t1_ids = [split_id_from_file(filename.name) 
              for filename in old_t1_path.iterdir()]

new_t1_ids = [split_id_from_file(filename.name) 
              for filename in new_t1_path.iterdir()]

t1_ids = set(old_t1_ids).union(set(new_t1_ids))

In [9]:
t2_ids = [split_id_from_file(filename.name)
          for filename in t2_path.iterdir()]
t2_ids = set(t2_ids)

In [10]:
matching_ids = t1_ids.intersection(t2_ids)

In [11]:
print(f'Total of {len(t1_ids)} unique T1 subjects')
print(f'Total of {len(t2_ids)} unique T2 subjects')
print(f'Total of {len(matching_ids)} matching subjects found')
ids_to_json(matching_ids, outdir='.')

Total of 44172 unique T1 subjects
Total of 43369 unique T2 subjects
Total of 43367 matching subjects found
Saved matching subject IDs to GatorBrain_matched_subjects.json


In [13]:
datalist = list(matching_ids)

In [104]:
print(old_t1_path)
print(new_t1_path)

/red/ruogu.fang/UKB/data/Brain/20252_T1_NIFTI/T1_unzip
/red/ruogu.fang/UKB/data/Brain/20252_T1_NIFTI/T1_new_unzip


/red/ruogu.fang/UKB/data/Brain/20252_T1_NIFTI/T1_unzip
/red/ruogu.fang/UKB/data/Brain/20252_T1_NIFTI/T1_new_unzip


In [None]:
train_datalist=[]
for subject in datalist[:-2000]:
    old_image_t1 = str(old_t1_path) + '/' + subject + "_20252_2_0/T1_brain_to_MNI.nii.gz"
    new_image_t1 = str(new_t1_path) + '/' + subject + "_20252_2_0/T1/T1_brain_to_MNI.nii.gz"
    if os.path.exists(old_image_t1):
        train_datalist.append({"image":old_image_t1}) 
    elif os.path.exists(new_image_t1):
        train_datalist.append({"image":new_image_t1}) 
    else:
        print(f"subject:{subject} T1 not found")
    
    
    image_t2 = str(t2_path) + '/' + subject + "_20253_2_0/T2_FLAIR/T2_FLAIR_brain_to_MNI.nii.gz"
    
    if os.path.exists(image_t2):
        train_datalist.append({"image":image_t2})
    else:
        print(f"subject:{subject} T2 not found")


In [106]:
len(train_datalist)

79599

In [111]:
val_datalist=[]
for subject in datalist[-2000:]:
    old_image_t1 = str(old_t1_path) + '/' + subject + "_20252_2_0/T1_brain_to_MNI.nii.gz"
    new_image_t1 = str(new_t1_path) + '/' + subject + "_20252_2_0/T1/T1_brain_to_MNI.nii.gz"
    if os.path.exists(old_image_t1):
        val_datalist.append({"image":old_image_t1}) 
    elif os.path.exists(new_image_t1):
        val_datalist.append({"image":new_image_t1}) 
    else:
        print(f"subject:{subject} T1 not found")
    
    image_t2 = str(t2_path) + '/' + subject + "_20253_2_0/T2_FLAIR/T2_FLAIR_brain_to_MNI.nii.gz"
    if os.path.exists(image_t2):
        val_datalist.append({"image":image_t2})
    else:
        print(f"subject:{subject} T2 not found")


subject:4351722 T1 not found
subject:4351722 T2 not found
subject:1908048 T2 not found
subject:1557943 T1 not found
subject:5928357 T1 not found
subject:5928357 T2 not found
subject:3769383 T2 not found
subject:4552556 T2 not found
subject:4689269 T1 not found
subject:4689269 T2 not found
subject:2132060 T2 not found
subject:2870212 T1 not found
subject:2870212 T2 not found
subject:3317194 T1 not found
subject:3317194 T2 not found
subject:5521801 T2 not found
subject:2280469 T1 not found
subject:2280469 T2 not found
subject:4985859 T1 not found
subject:4985859 T2 not found
subject:1949762 T1 not found
subject:1949762 T2 not found
subject:2686310 T1 not found
subject:2686310 T2 not found
subject:3618516 T2 not found
subject:2174921 T1 not found
subject:2174921 T2 not found
subject:5082007 T1 not found
subject:5082007 T2 not found
subject:3257416 T1 not found
subject:3257416 T2 not found
subject:3559430 T2 not found
subject:3646722 T1 not found
subject:3646722 T2 not found
subject:566627

In [112]:
len(val_datalist)

3841

In [113]:
data = {"training":train_datalist,
        "validation":val_datalist}

In [114]:
len(data["training"])

83465

In [115]:
len(data["validation"])

3841

In [None]:
data["validation"]

In [117]:
with open("GBR_T1_T2_matched.json", 'w') as json_file:
    json.dump(data, json_file)


In [30]:
def load_T1T2mixed_datalist(args, input_file: str):
    with open(input_file, 'r') as f:
        fold = json.load(f)
    print(fold.keys())
    training_images = fold['training'] # Should be list
    validation_images = fold['validation']
    t1_path = args['t1_path'] #'/red/ruogu.fang/UKB/data/Brain/20252_T1_NIFTI/T1_unzip'
    t2_path = args['t2_path'] #'/red/ruogu.fang/UKB/data/Brain/20253_T2_NIFTI/T2_unzip'
    training = {}
    for i, image in enumerate(training_images):
        image_t1 = t1_path + '/' + image + "_20252_2_0/T1_brain_to_MNI.nii.gz"
        image_t2 = t2_path + '/' + image + "_20253_2_0/T2_FLAIR/T2_FLAIR_brain_to_MNI.nii.gz"
        training[i] = {"T1_image": image_t1,
                       "T2_image": image_t2}

    validation = {}
    for i, image in enumerate(validation_images):
        image_t1 = t1_path + '/' + image + "_20252_2_0/T1_brain_to_MNI.nii.gz"
        image_t2 = t2_path + '/' + image + "_20253_2_0/T2_FLAIR/T2_FLAIR_brain_to_MNI.nii.gz"
        validation[i] = {"T1_image": image_t1,
                         "T2_image": image_t2}
    return {'training': training,
            'validation': validation}


In [32]:
args={
    't1_path':'/red/ruogu.fang/UKB/data/Brain/20252_T1_NIFTI/T1_unzip',
    't2_path':'/red/ruogu.fang/UKB/data/Brain/20253_T2_NIFTI/T2_unzip'
    }
input_file="GBR_T1_T2_matched.json"
data = load_T1T2mixed_datalist(args, input_file)

dict_keys(['training', 'validation'])


In [34]:
data['training']

In [None]:
data['validation']

In [76]:
def load_T1T2mixed_datalist(args, input_file: str):
    with open(input_file, 'r') as f:
        fold = json.load(f)
    print(fold.keys())
    training_images = fold['training'] # Should be list
    validation_images = fold['validation']
    #training = {}
    #for i, image in enumerate(training_images):
    #    training[i] = image # here is pair
    #training[i] = image # here is pair
    training = {i: image for i, image in enumerate(training_images)}
    #validation = {}
    #for i, image in enumerate(validation_images):
    #    validation[i] = image 
    validation = {i: image for i, image in enumerate(validation_images)}
    return {'training': training,
            'validation': validation}

In [77]:
args={
    't1_path':'/red/ruogu.fang/UKB/data/Brain/20252_T1_NIFTI/T1_unzip',
    't2_path':'/red/ruogu.fang/UKB/data/Brain/20253_T2_NIFTI/T2_unzip'
    }
input_file="GBR_T1_T2_matched.json"
data = load_T1T2mixed_datalist(args, input_file)

dict_keys(['training', 'validation'])


In [78]:
len(data['training'])

82734

In [79]:
len(data['validation'])

4000

In [None]:
data['validation']