In [19]:
import os
from pathlib import Path
import json

In [20]:
## CONSTANTS
DATA_DIR = '/red/ruogu.fang/share/UKB/data/Brain'
T1_FILE_PATH = '20252_T1_NIFTI'
T2_FILE_PATH = '20253_T2_NIFTI'

In [21]:
data_path = Path(DATA_DIR)

In [22]:
def split_id_from_file(filename: str) -> str: 
    '''Split patient ID from a filename. Note that this function
    does not take into account the visit number, nor number of s
    '''
    return filename.split('_')[0]  # First section is id.

In [23]:
def ids_to_json(ids: set, outdir: os.PathLike, 
                name='GatorBrain_matched_subjects.json') -> None:
    outdir = Path(outdir)
    with open(outdir / name, 'w') as json_file:
        json.dump(list(ids), json_file)
    print(f'Saved matching subject IDs to {outdir / name}')

In [24]:
t1_path = data_path / T1_FILE_PATH
t2_path = data_path / T2_FILE_PATH / 'T2_unzip'

# T1 data is stored in two different areas
new_t1_path = t1_path / 'T1_new_unzip'
old_t1_path = t1_path / 'T1_unzip'

In [25]:
old_t1_ids = [split_id_from_file(filename.name) 
              for filename in old_t1_path.iterdir()]

new_t1_ids = [split_id_from_file(filename.name) 
              for filename in new_t1_path.iterdir()]

t1_ids = set(old_t1_ids).union(set(new_t1_ids))

In [26]:
t2_ids = [split_id_from_file(filename.name)
          for filename in t2_path.iterdir()]
t2_ids = set(t2_ids)

In [27]:
matching_ids = t1_ids.intersection(t2_ids)


In [28]:
print(f'Total of {len(t1_ids)} unique T1 subjects')
print(f'Total of {len(t2_ids)} unique T2 subjects')
print(f'Total of {len(matching_ids)} matching subjects found')
ids_to_json(matching_ids, outdir='.')

Total of 44172 unique T1 subjects
Total of 43369 unique T2 subjects
Total of 43367 matching subjects found
Saved matching subject IDs to GatorBrain_matched_subjects.json


In [29]:
datalist = list(matching_ids)

In [30]:
print(old_t1_path)
print(new_t1_path)

/red/ruogu.fang/share/UKB/data/Brain/20252_T1_NIFTI/T1_unzip
/red/ruogu.fang/share/UKB/data/Brain/20252_T1_NIFTI/T1_new_unzip


## Format 1

In [31]:
train_datalist=[]
unused_T1 = []
unused_T2 = []
for subject in datalist[:-2000]:
    old_image_t1 = str(old_t1_path) + '/' + subject + "_20252_2_0/T1_brain_to_MNI.nii.gz"
    new_image_t1 = str(new_t1_path) + '/' + subject + "_20252_2_0/T1/T1_brain_to_MNI.nii.gz"
    image_t2 = str(t2_path) + '/' + subject + "_20253_2_0/T2_FLAIR/T2_FLAIR_brain_to_MNI.nii.gz"

    usable=True
    
    if os.path.exists(old_image_t1):
        image_t1 = old_image_t1 
    elif os.path.exists(new_image_t1):
        image_t1 = new_image_t1 
    else:
        #print(f"subject:{subject} T1 not found")
        usable=False
        unused_T1.append(subject)
        
    if os.path.exists(image_t2):
        pass
    else:
        #print(f"subject:{subject} T2 not found")
        usable=False
        unused_T2.append(subject)
    
    if usable:
        train_datalist.append({"image_T1": image_t1, "image_T2": image_t2})

print(f"number of unused T1 images = {len(unused_T1)}")
print(f"number of unused T2 images = {len(unused_T2)}")
print(f"number of usable subjects with both T1 + T2 images = {len(train_datalist)}")


number of unused T1 images = 1293
number of unused T2 images = 1848
number of usable subjects with both T1 + T2 images = 39491


In [36]:
unused_T1

In [36]:
unused_T2

In [32]:
val_datalist=[]
unused_T1 = []
unused_T2 = []
for subject in datalist[-2000:]:
    old_image_t1 = str(old_t1_path) + '/' + subject + "_20252_2_0/T1_brain_to_MNI.nii.gz"
    new_image_t1 = str(new_t1_path) + '/' + subject + "_20252_2_0/T1/T1_brain_to_MNI.nii.gz"
    image_t2 = str(t2_path) + '/' + subject + "_20253_2_0/T2_FLAIR/T2_FLAIR_brain_to_MNI.nii.gz"

    usable=True
    
    if os.path.exists(old_image_t1):
        image_t1 = old_image_t1 
    elif os.path.exists(new_image_t1):
        image_t1 = new_image_t1 
    else:
        #print(f"subject:{subject} T1 not found")
        usable=False
        unused_T1.append(subject)
        
    if os.path.exists(image_t2):
        pass
    else:
        #print(f"subject:{subject} T2 not found")
        usable=False
        unused_T2.append(subject)
    
    if usable:
        val_datalist.append({"image_T1": image_t1, "image_T2": image_t2})

print(f"number of unused T1 images = {len(unused_T1)}")
print(f"number of unused T2 images = {len(unused_T2)}")
print(f"number of usable subjects with both T1 + T2 images = {len(val_datalist)}")

number of unused T1 images = 63
number of unused T2 images = 90
number of usable subjects with both T1 + T2 images = 1909


In [51]:
data = {"training":train_datalist,
        "validation":val_datalist}

In [52]:
with open("GBR_T1T2_matched.json", 'w') as json_file:
    json.dump(data, json_file)

## Format 2

In [13]:
train_datalist=[]
unused_T1 = []
unused_T2 = []
for subject in datalist[:-2000]:
    old_image_t1 = str(old_t1_path) + '/' + subject + "_20252_2_0/T1_brain_to_MNI.nii.gz"
    new_image_t1 = str(new_t1_path) + '/' + subject + "_20252_2_0/T1/T1_brain_to_MNI.nii.gz"
    image_t2 = str(t2_path) + '/' + subject + "_20253_2_0/T2_FLAIR/T2_FLAIR_brain_to_MNI.nii.gz"

    usable=True
    
    if os.path.exists(old_image_t1):
        image_t1 = old_image_t1 
    elif os.path.exists(new_image_t1):
        image_t1 = new_image_t1 
    else:
        #print(f"subject:{subject} T1 not found")
        usable=False
        unused_T1.append(subject)
        
    if os.path.exists(image_t2):
        pass
    else:
        #print(f"subject:{subject} T2 not found")
        usable=False
        unused_T2.append(subject)
    
    if usable:
        train_datalist.append({"image": [image_t1, image_t2]})

print(f"Training")
print(f"number of unused T1 images = {len(unused_T1)}")
print(f"number of unused T2 images = {len(unused_T2)}")
print(f"number of usable subjects with both T1 + T2 images = {len(train_datalist)}")


val_datalist=[]
unused_T1 = []
unused_T2 = []
for subject in datalist[-2000:]:
    old_image_t1 = str(old_t1_path) + '/' + subject + "_20252_2_0/T1_brain_to_MNI.nii.gz"
    new_image_t1 = str(new_t1_path) + '/' + subject + "_20252_2_0/T1/T1_brain_to_MNI.nii.gz"
    image_t2 = str(t2_path) + '/' + subject + "_20253_2_0/T2_FLAIR/T2_FLAIR_brain_to_MNI.nii.gz"

    usable=True
    
    if os.path.exists(old_image_t1):
        image_t1 = old_image_t1 
    elif os.path.exists(new_image_t1):
        image_t1 = new_image_t1 
    else:
        #print(f"subject:{subject} T1 not found")
        usable=False
        unused_T1.append(subject)
        
    if os.path.exists(image_t2):
        pass
    else:
        #print(f"subject:{subject} T2 not found")
        usable=False
        unused_T2.append(subject)
    
    if usable:
        val_datalist.append({"image": [image_t1, image_t2]})

print(f"Validation")
print(f"number of unused T1 images = {len(unused_T1)}")
print(f"number of unused T2 images = {len(unused_T2)}")
print(f"number of usable subjects with both T1 + T2 images = {len(val_datalist)}")

data = {"training":train_datalist,
        "validation":val_datalist}

with open("GBR_T1T2_matched_image.json", 'w') as json_file:
    json.dump(data, json_file)

In [4]:
def load_T1T2matched_datalist(args, input_file: str):
    with open(input_file, 'r') as f:
        fold = json.load(f)
    print(fold.keys())
    training_images = fold['training'] # Should be list
    validation_images = fold['validation']
    training = {i: image for i, image in enumerate(training_images)}
    validation = {i: image for i, image in enumerate(validation_images)}
    return {'training': training,
            'validation': validation}

def get_T1T2_dataloaders(args, modality="T1", num_workers = 4):
  
    datalist = load_T1T2matched_datalist(args, args["splits"])

    if modality == "T1":
        training_datalist   = [{"image":subject[0]} for subject in datalist['training']] 
        validation_datalist = [{"image":subject[0]} for subject in datalist['validation']] 
    elif modality == "T2":
        training_datalist   = [{"image":subject[1]} for subject in datalist['training']] 
        validation_datalist = [{"image":subject[1]} for subject in datalist['validation']] 
    elif modality == "T1_T2":
        training_datalist, validation_datalist = datalist['training'], datalist['validation']
    else:
        raise ValueError("Unsupported modality")
    return training_datalist, validation_datalist


In [5]:
args={
'splits':'GBR_T1T2_matched_image.json'
}
data = load_T1T2matched_datalist(args,'GBR_T1T2_matched_image.json') 
print(data["training"])

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



## Format Mixed

In [30]:
def load_T1T2mixed_datalist(args, input_file: str):
    with open(input_file, 'r') as f:
        fold = json.load(f)
    print(fold.keys())
    training_images = fold['training'] # Should be list
    validation_images = fold['validation']
    t1_path = args['t1_path'] #'/red/ruogu.fang/UKB/data/Brain/20252_T1_NIFTI/T1_unzip'
    t2_path = args['t2_path'] #'/red/ruogu.fang/UKB/data/Brain/20253_T2_NIFTI/T2_unzip'
    training = {}
    for i, image in enumerate(training_images):
        image_t1 = t1_path + '/' + image + "_20252_2_0/T1_brain_to_MNI.nii.gz"
        image_t2 = t2_path + '/' + image + "_20253_2_0/T2_FLAIR/T2_FLAIR_brain_to_MNI.nii.gz"
        training[i] = {"T1_image": image_t1,
                       "T2_image": image_t2}

    validation = {}
    for i, image in enumerate(validation_images):
        image_t1 = t1_path + '/' + image + "_20252_2_0/T1_brain_to_MNI.nii.gz"
        image_t2 = t2_path + '/' + image + "_20253_2_0/T2_FLAIR/T2_FLAIR_brain_to_MNI.nii.gz"
        validation[i] = {"T1_image": image_t1,
                         "T2_image": image_t2}
    return {'training': training,
            'validation': validation}


In [32]:
args={
    't1_path':'/red/ruogu.fang/UKB/data/Brain/20252_T1_NIFTI/T1_unzip',
    't2_path':'/red/ruogu.fang/UKB/data/Brain/20253_T2_NIFTI/T2_unzip'
    }
input_file="GBR_T1_T2_matched.json"
data = load_T1T2mixed_datalist(args, input_file)

dict_keys(['training', 'validation'])


In [34]:
data['training']

In [None]:
data['validation']

In [41]:
def load_T1T2matched_datalist(args, input_file: str):
    with open(input_file, 'r') as f:
        fold = json.load(f)
    print(fold.keys())
    training_images = fold['training'] # Should be list
    validation_images = fold['validation']
    training = {i: image for i, image in enumerate(training_images)}
    validation = {i: image for i, image in enumerate(validation_images)}
    return {'training': training,
            'validation': validation}

In [44]:
args={
    't1_path':'/red/ruogu.fang/UKB/data/Brain/20252_T1_NIFTI/T1_unzip',
    't2_path':'/red/ruogu.fang/UKB/data/Brain/20253_T2_NIFTI/T2_unzip'
    }
input_file="GBR_T1T2_matched.json"
data = load_T1T2matched_datalist(args, input_file)

In [47]:
print(len(data["training"]))
print(len(data["validation"]))

39504
1896


In [76]:
def load_T1T2mixed_datalist(args, input_file: str):
    with open(input_file, 'r') as f:
        fold = json.load(f)
    print(fold.keys())
    training_images = fold['training'] # Should be list
    validation_images = fold['validation']
    #training = {}
    #for i, image in enumerate(training_images):
    #    training[i] = image # here is pair
    #training[i] = image # here is pair
    training = {i: image for i, image in enumerate(training_images)}
    #validation = {}
    #for i, image in enumerate(validation_images):
    #    validation[i] = image 
    validation = {i: image for i, image in enumerate(validation_images)}
    return {'training': training,
            'validation': validation}

In [77]:
args={
    't1_path':'/red/ruogu.fang/UKB/data/Brain/20252_T1_NIFTI/T1_unzip',
    't2_path':'/red/ruogu.fang/UKB/data/Brain/20253_T2_NIFTI/T2_unzip'
    }
input_file="GBR_T1_T2_matched.json"
data = load_T1T2mixed_datalist(args, input_file)

dict_keys(['training', 'validation'])


In [78]:
len(data['training'])

82734

In [79]:
len(data['validation'])

4000

In [None]:
data['validation']