## Structuring the task directory

In this section, we will copy and rename our images according to the structure required by nn-UNet. Both train and validation images will be transferred to imagesTr and their corresponding labels to labelsTr, whereas the test images will be stored in imagesTs (with their labels in labelsTs).

<!-- - Task501 - Old preproc NCCT APIS
- Task502 - Old preproc ADC APIS
- Task503 - NCCT APIS
- Task504 - ADC APIS
- Task505 - NCCT AISD
- Task506 - DWI AISD -->

In [1]:
import os
import json
import shutil
import subprocess
import yaml

import multiprocessing as mp

from functools import partial
from tqdm import tqdm
from pathlib import Path

repo_root = Path().resolve().parent
import sys ; sys.path.insert(0, str(repo_root))
from utils.utils import get_datasets, hardcore_fix_images

# MACRO AND ENV VARIABLES
# Placeholder locations: point these at the real data and project folders.
datapath = Path('<DATA_PATH>')
base_path = Path('PATH_TO_PROJECT/nnunetv2')

# Export the nnU-Net folder layout in one go; this happens before the
# nnunetv2 imports that follow in this cell.
os.environ.update({
    'nnUNet_raw': str(base_path / 'nnUNet_raw'),
    'nnUNet_preprocessed': str(base_path / 'preprocessed'),
    'nnUNet_results': str(base_path / 'nnUNet_trained_models'),
})

from nnunetv2.dataset_conversion.generate_dataset_json import generate_dataset_json
from nnunetv2.paths import nnUNet_raw, nnUNet_preprocessed
from batchgenerators.utilities.file_and_folder_operations import join, save_json, load_json


In [2]:
# CONFIGS
# Ids of the dataset built by this notebook and of the reference SSL dataset
# whose fingerprint / training config are reused further down.
task_id = '056'
ref_ssl_task_id = '036'
foldername = 'Dataset' + task_id + '_AIS'
ref_ssl_foldername = 'Dataset' + ref_ssl_task_id + '_AIS'

# Arguments forwarded to `get_datasets`; the SSL setting selects more
# datasets and pathologies than the supervised one.
ssl = False
dataset_args = dict(
    datapath=datapath,
    datasets=['tum', 'apis', 'aisd', 'tbi'] if ssl else ['tum'],
    standard=['gold', 'silver', '-'],
    pathology=['ais', 'normal'] if ssl else ['ais'],
    ssl=ssl,
    fold=0,
)

# Switches read by the copy step.
crop = False               # take filenames from sample['crop'] instead of sample['clean']
use_diff = False           # add the 'diff-pp' image as a second channel
use_contralateral = False  # add the 'ncct-pp-flip' image as a second channel
use_bm_as_msk = False      # use the 'bm' file as label instead of 'msk-pp'
cases_to_exclude = []      # subject ids filtered out of the train/val lists
ssl_pretrained = True      # reuse the SSL dataset fingerprint and config file

# cfg file
num_epochs = 100
unfreeze_epoch = None
unfreeze_lr = None
save_at_epochs = list(range(10, 100, 10))

# Folder generation
preproc_gt_path = join(nnUNet_preprocessed, foldername, 'gt_segmentations')
out_base = join(nnUNet_raw, foldername)
imagestr = join(out_base, "imagesTr")
labelstr = join(out_base, "labelsTr")
imagests = join(out_base, 'imagesTs')
labelsts = join(out_base, "labelsTs")
for folder in (imagestr, labelstr, imagests, labelsts):
    Path(folder).mkdir(parents=True, exist_ok=True)

## Copying the data to the specific dataset

In this section we will copy the raw ISBR data to the corresponding directories as previously explained.

In [23]:
def _require_path(names, key, sample_id, sample):
    """Return ``names[key]``, raising a descriptive ``KeyError`` when it is absent.

    The message carries the subject id and the whole sample so the failing
    case can be identified from the traceback alone.
    """
    try:
        return names[key]
    except KeyError as err:
        raise KeyError(
            f'Sample {sample_id!r} has no {key!r} file entry. Sample: {sample}'
        ) from err


def copy_fn(i, set_dset, use_diff, set_dir, lab_dir, use_bm_as_msk):
    """Copy one sample's image(s) and label into the nnU-Net raw folders.

    Args:
        i: Index of the sample inside ``set_dset``.
        set_dset: Indexable dataset; ``set_dset[i]`` must provide ``'subject'``
            and the ``'crop'`` / ``'clean'`` filename mappings.
        use_diff: If true, also write the ``'diff-pp'`` image as ``_0001``.
        set_dir: ``Path`` of the destination image folder.
        lab_dir: ``Path`` of the destination label folder.
        use_bm_as_msk: If true, the ``'bm'`` file is used as the label instead
            of ``'msk-pp'``.

    Raises:
        KeyError: If a required filename entry is missing from the sample.

    Notes:
        The module-level flags ``crop`` and ``use_contralateral`` are read
        directly. ``use_diff`` and ``use_contralateral`` both write the
        ``_0001`` channel; when both are set the contralateral image is
        written last and therefore wins.
    """
    sample = set_dset[i]
    sample_id = sample['subject']

    # original filenames
    names = sample['crop'] if crop else sample['clean']

    # Resolve every required input before writing anything, so a missing entry
    # does not leave a half-copied sample behind.
    ncct_path = _require_path(names, 'ncct-pp', sample_id, sample)
    msk_key = 'bm' if use_bm_as_msk else 'msk-pp'
    msk_path = _require_path(names, msk_key, sample_id, sample)
    diff_path = _require_path(names, 'diff-pp', sample_id, sample) if use_diff else None
    flip_path = (
        _require_path(names, 'ncct-pp-flip', sample_id, sample)
        if use_contralateral else None
    )

    # new filenames
    new_ncct_name = f'{sample_id}_0000.nii.gz'
    new_mask_name = f'{sample_id}.nii.gz'
    second_channel_name = f'{sample_id}_0001.nii.gz'

    # The NCCT is copied verbatim; every other image goes through
    # `hardcore_fix_images` with the NCCT as second argument.
    # NOTE(review): presumably this aligns the image metadata with the NCCT
    # reference -- confirm in utils.utils.
    shutil.copyfile(ncct_path, set_dir/new_ncct_name)
    hardcore_fix_images(msk_path, ncct_path, lab_dir/new_mask_name)

    if use_diff:
        hardcore_fix_images(diff_path, ncct_path, set_dir/second_channel_name)
    if use_contralateral:
        hardcore_fix_images(flip_path, ncct_path, set_dir/second_channel_name)

# Load the train / validation / test partitions for the configured fold.
train, validation, test = get_datasets(**dataset_args)

# Train and validation share the *Tr folders; test goes to the *Ts folders.
dsets = [train, validation, test]
data_dirs = [imagestr, imagestr, imagests]
labels_dirs = [labelstr, labelstr, labelsts]
parttns = ['train', 'val', 'test']

# Copy files
print('Copying files...\n')
for ptn, set_dset, set_dir, lab_dir in zip(parttns, dsets, data_dirs, labels_dirs):
    print(f'Copying {ptn} files...')
    set_dir = Path(set_dir)
    lab_dir = Path(lab_dir)
    n_samples = len(set_dset)
    # Bind everything except the sample index so the pool only ships integers.
    parallel_copy = partial(
        copy_fn,
        set_dset=set_dset,
        use_diff=use_diff,
        set_dir=set_dir,
        lab_dir=lab_dir,
        use_bm_as_msk=use_bm_as_msk,
    )
    with mp.Pool(mp.cpu_count()) as pool:
        copied = pool.imap(parallel_copy, range(n_samples))
        # Drain the iterator; tqdm only provides the progress bar.
        for _ in tqdm(copied, total=n_samples):
            pass

# A second channel file exists only when a diff or contralateral image is written.
file_endings = ['_0000.nii.gz']
if use_diff or use_contralateral:
    file_endings.append('_0001.nii.gz')

# Creating the dataset.json file required by nn-UNet which contains information
# about our custom dataset.
print('Creating dataset.json file...\n')
train_list = [subject for subject in train.df.subject.tolist()
              if subject not in cases_to_exclude]
val_list = [subject for subject in validation.df.subject.tolist()
            if subject not in cases_to_exclude]

# The contralateral setting takes precedence over the diff setting when both are on.
if use_contralateral:
    channel_names = {0: 'CT', 1: 'CT'}
elif use_diff:
    channel_names = {0: 'CT', 1: 'rescale_to_0_1'}
else:
    channel_names = {0: 'CT'}

generate_dataset_json(
    out_base,
    channel_names=channel_names,
    labels={'background': 0, 'ais': 1},
    num_training_cases=len(val_list) + len(train_list),
    file_ending='.nii.gz',
)

# The last step of this section is to check if the structure of our dataset directory is
# compatible with nn-UNet's requirements, then extract the fingerprint, plan and preprocess.
command = (
    'nnUNetv2_plan_and_preprocess -pl ExperimentPlannerSSL -overwrite_plans_name nnUNetPlansSSL '
    f'-d {task_id} -np 8 -c 3d_fullres --verify_dataset_integrity'
)
# check=True: stop the cell if the command exits with a non-zero status,
# instead of continuing with the steps that depend on its output files.
subprocess.run(command, shell=True, check=True)

## Creating Train-Val split
print('Creating splits file...\n')
raw_splits = []
splits = []
for fold in range(5):
    # Query each fold through a copy so the shared `dataset_args` keeps the
    # fold chosen in the config cell and re-running this cell stays idempotent.
    fold_args = {**dataset_args, 'fold': fold}
    fold_train, fold_val, _ = get_datasets(**fold_args)

    # Filter on the subject id itself (the fold index was tested before, which
    # meant `cases_to_exclude` was never applied to the splits).
    fold_train_list = [subject for subject in fold_train.df.subject.tolist()
                       if subject not in cases_to_exclude]
    fold_val_list = [subject for subject in fold_val.df.subject.tolist()
                     if subject not in cases_to_exclude]

    # One list of validation image paths per channel file ending.
    raw_split = [
        [f'{imagestr}/{subject}{fend}' for subject in fold_val_list]
        for fend in file_endings
    ]
    raw_splits.append(raw_split)
    splits.append({'train': fold_train_list, 'val': fold_val_list})
save_json(splits, f'{nnUNet_preprocessed}/{foldername}/splits_final.json')
save_json(raw_splits, f'{nnUNet_preprocessed}/{foldername}/raw_splits_final.json')


# Configuration files paths
base_file = Path(f'{nnUNet_preprocessed}/{foldername}/nnunet_cfg_base.yml')
run_file = Path(f'{nnUNet_preprocessed}/{foldername}/nnunet_cfg.yml')
cfg_dict = {}

## Overwrite dataset fingerprint to normalize the images according to SSL values
if ssl_pretrained:
    print('Updating dataset fingerprints file...\n')
    # Get dataset fingerprint from ssl dataset
    ssl_dfing_path = Path(f'{nnUNet_preprocessed}/{ref_ssl_foldername}/dataset_fingerprint.json')
    with open(ssl_dfing_path, 'r') as jfile:
        ssl_dfing = json.load(jfile)
    intensities_key = 'foreground_intensity_properties_per_channel'

    # Modify supervised training files: both the fingerprint and the plans
    # get their intensity properties replaced by the SSL dataset's values.
    files_to_change = ['dataset_fingerprint.json', 'nnUNetPlansSSL.json']
    for fname in files_to_change:
        supvs_file_path = Path(f'{nnUNet_preprocessed}/{foldername}/{fname}')
        with open(supvs_file_path, 'r') as jfile:
            supvs_file = json.load(jfile)
        supvs_file[intensities_key].update(ssl_dfing[intensities_key])
        save_json(supvs_file, supvs_file_path)

    # Reprocess dataset
    print('Preprocessing dataset with new fingerprints file...\n')
    # Remove the outputs of the first preprocessing pass. A missing folder is
    # fine; any other removal failure raises instead of being ignored.
    for stale_name in ('nnUNetPlansSSL_3d_fullres', 'gt_segmentations'):
        stale_dir = Path(f'{nnUNet_preprocessed}/{foldername}/{stale_name}')
        if stale_dir.exists():
            shutil.rmtree(stale_dir)
    # check=True: do not write config files for a dataset whose preprocessing failed.
    subprocess.run(
        f'nnUNetv2_preprocess -plans_name nnUNetPlansSSL -d {task_id} -np 8 -c 3d_fullres',
        shell=True, check=True,
    )
    # Start from the SSL run's configuration file.
    src = f'{nnUNet_preprocessed}/{ref_ssl_foldername}/nnunet_cfg.yml'
    shutil.copyfile(src, run_file)

    # Write configuration file
    with open(run_file, 'r') as yfile:
        # safe_load returns None for an empty document; fall back to an empty dict.
        cfg_dict = yaml.safe_load(yfile) or {}

print('Saving configuration files...\n')
cfg_dict.update({
    'num_epochs': num_epochs,
    'unfreeze_epoch': unfreeze_epoch,
    'unfreeze_lr': unfreeze_lr,
    'ssl_pretrained': ssl_pretrained,
    'save_at_epochs': save_at_epochs
})

# The same settings are written to both the base and the run configuration.
for cfg_path in (base_file, run_file):
    with open(cfg_path, 'w') as yfile:
        yaml.dump(cfg_dict, yfile)

print('Finished')



Copying files...

Copying train files...


100%|██████████| 111/111 [00:09<00:00, 11.89it/s]

Copying val files...



100%|██████████| 29/29 [00:02<00:00,  9.78it/s]

Copying test files...



100%|██████████| 19/19 [00:02<00:00,  8.16it/s]


Creating dataset.json file...

Fingerprint extraction...
Dataset055_AIS
Using <class 'nnunetv2.imageio.simpleitk_reader_writer.SimpleITKIO'> as reader/writer

####################
verify_dataset_integrity Done. 
If you didn't see any error messages then your dataset is most likely OK!
####################

Using <class 'nnunetv2.imageio.simpleitk_reader_writer.SimpleITKIO'> as reader/writer


100%|██████████| 140/140 [00:12<00:00, 11.37it/s]


Experiment planning...
Using <class 'nnunetv2.imageio.simpleitk_reader_writer.SimpleITKIO'> as reader/writer
3D fullres U-Net configuration:
{'data_identifier': 'nnUNetPlansSSL_3d_fullres', 'preprocessor_name': 'DefaultPreprocessor', 'batch_size': 2, 'patch_size': array([ 96, 160, 160]), 'median_image_size_in_voxels': array([140., 250., 219.]), 'spacing': array([1., 1., 1.]), 'normalization_schemes': ['CTNormalization'], 'use_mask_for_norm': [False], 'UNet_class_name': 'PlainConvUNet', 'UNet_base_num_features': 32, 'n_conv_per_stage_encoder': (2, 2, 2, 2, 2, 2), 'n_conv_per_stage_decoder': (2, 2, 2, 2, 2), 'num_pool_per_axis': [4, 5, 5], 'pool_op_kernel_sizes': [[1, 1, 1], [2, 2, 2], [2, 2, 2], [2, 2, 2], [2, 2, 2], [1, 2, 2]], 'conv_kernel_sizes': [[3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3]], 'unet_max_num_features': 320, 'resampling_fn_data': 'resample_data_or_seg_to_shape', 'resampling_fn_seg': 'resample_data_or_seg_to_shape', 'resampling_fn_data_kwargs': {'is_

100%|██████████| 140/140 [00:21<00:00,  6.58it/s]


Creating splits file...





Updating dataset fingerprints file...

Preprocessing dataset with new fingerprints file...

Preprocessing dataset Dataset055_AIS
Configuration: 3d_fullres...


100%|██████████| 140/140 [00:22<00:00,  6.16it/s]


Saving configuration files...

Finished
