In [1]:
import os
from pathlib import Path
import json

In [2]:
## CONSTANTS
# Root directory of the brain-imaging data, relative to the notebook's
# working directory.
DATA_DIR = '../UKB/data/Brain'
# Sub-directory names (joined onto DATA_DIR further down) holding the T1 and
# T2 NIfTI archives.
# NOTE(review): the numeric prefixes look like UK Biobank data-field IDs
# (20252 / 20253) -- confirm against the data dictionary.
T1_FILE_PATH = '20252_T1_NIFTI'
T2_FILE_PATH = '20253_T2_NIFTI'

In [3]:
# Wrap the root directory string in a Path so the modality sub-directories
# can be appended with the `/` operator in later cells.
data_path = Path(DATA_DIR)

In [4]:
def split_id_from_file(filename: str) -> str:
    '''Return the subject ID encoded at the start of a filename.

    The ID is everything before the first underscore. If the name contains
    no underscore, the whole string is returned unchanged. Anything after
    the first underscore (the original note mentions the visit number) is
    ignored, so several files of one subject map to the same ID.

    Args:
        filename: File name to parse, expected to look like ``<id>_<rest>``.

    Returns:
        The leading underscore-delimited token of ``filename``.
    '''
    subject_id, _sep, _rest = filename.partition('_')
    return subject_id

In [5]:
def ids_to_json(ids: set, outdir: os.PathLike,
                name='GatorBrain_matched_subjects.json') -> None:
    '''Write subject IDs to a JSON file as a sorted list.

    Args:
        ids: Collection of subject IDs to save (a set of strings in this
            notebook).
        outdir: Directory in which the file is created. It must already
            exist; it is not created here.
        name: File name of the JSON file inside ``outdir``.

    Returns:
        None. The IDs are written to ``outdir / name`` and a confirmation
        line with that path is printed.

    Raises:
        OSError: If the output file cannot be opened for writing (for
            example, ``outdir`` does not exist).
        TypeError: If an ID is not JSON-serialisable.
    '''
    out_path = Path(outdir) / name
    # Iteration order of a set of strings changes between interpreter runs
    # (string hashing is randomised), so dumping the set directly makes the
    # output file differ from run to run. Sorting makes it reproducible.
    # key=str keeps the sort well-defined even if the IDs are not all of
    # one type; for string IDs it is plain lexicographic order.
    ordered_ids = sorted(ids, key=str)
    with open(out_path, 'w') as json_file:
        json.dump(ordered_ids, json_file)
    print(f'Saved matching subject IDs to {out_path}')

In [6]:
# Modality roots underneath the brain-imaging data directory.
t1_path = data_path.joinpath(T1_FILE_PATH)
t2_path = data_path.joinpath(T2_FILE_PATH, 'T2_unzip')

# T1 data is split across two unzip directories.
old_t1_path = t1_path.joinpath('T1_unzip')
new_t1_path = t1_path.joinpath('T1_new_unzip')

In [7]:
# Subject IDs found in each of the two T1 locations, one entry per directory
# item (so a subject can appear more than once in these lists).
old_t1_ids = [split_id_from_file(entry.name) for entry in old_t1_path.iterdir()]
new_t1_ids = [split_id_from_file(entry.name) for entry in new_t1_path.iterdir()]

# Merge both locations into a single set of unique T1 subject IDs.
t1_ids = {*old_t1_ids, *new_t1_ids}

In [8]:
# Unique subject IDs that have at least one entry in the T2 directory.
t2_ids = {split_id_from_file(entry.name) for entry in t2_path.iterdir()}

In [9]:
# Subjects present in both the T1 and the T2 ID sets.
matching_ids = t1_ids & t2_ids

In [10]:
# Summarise the three ID sets, one line each, then persist the overlap to
# the current working directory.
print(
    f'Total of {len(t1_ids)} unique T1 subjects',
    f'Total of {len(t2_ids)} unique T2 subjects',
    f'Total of {len(matching_ids)} matching subjects found',
    sep='\n',
)
ids_to_json(ids=matching_ids, outdir='.')

Total of 44172 unique T1 subjects
Total of 43369 unique T2 subjects
Total of 43367 matching subjects found
Saved matching subject IDs to GatorBrain_matched_subjects.json
