In [1]:
import os, glob, shutil
import numpy as np
import json

In [2]:
# Root folder of the source dataset; later cells glob its train/, validation/
# and test/ subfolders for "*_0000.nii.gz" images.
raw_data_dir = "../UHN-MedImg3D-ML-quiz/"
# Destination dataset folder; imagesTr/, labelsTr/, imagesTs/ and the
# generated CSV/JSON metadata files are all written beneath it.
preprocessed_data_dir = "../nnUNet_raw/Dataset001_Pancreas/"

In [3]:
def _find_images(*subdirs):
    """Return the sorted "*_0000.nii.gz" image paths under raw_data_dir/<subdirs...>.

    glob.glob yields matches in arbitrary (filesystem-dependent) order, so the
    result is sorted to keep everything derived from these lists (CSV rows,
    split files) reproducible between runs and machines.
    """
    pattern = os.path.join(raw_data_dir, *subdirs, "*_0000.nii.gz")
    return sorted(glob.glob(pattern))


# Labelled cases are stored per subtype folder; the test set is a flat folder.
train_0_files = _find_images("train", "subtype0")
train_1_files = _find_images("train", "subtype1")
train_2_files = _find_images("train", "subtype2")
val_0_files = _find_images("validation", "subtype0")
val_1_files = _find_images("validation", "subtype1")
val_2_files = _find_images("validation", "subtype2")
test_files = _find_images("test")

In [4]:
# Report how many images were found per subtype (0, 1, 2) for each labelled
# split, followed by the size of the unlabelled test set.
_labelled_groups = (
    ("Training", (train_0_files, train_1_files, train_2_files)),
    ("Validation", (val_0_files, val_1_files, val_2_files)),
)
for _split_name, _per_subtype in _labelled_groups:
    _counts = ", ".join(str(len(_paths)) for _paths in _per_subtype)
    print(f"{_split_name} files: {_counts}")
print(f"Test files: {len(test_files)}")

Training files: 62, 106, 84
Validation files: 9, 15, 12
Test files: 72


In [5]:
# Resolve the three output subfolders of the dataset directory, then make sure
# each one exists (already-existing folders are left untouched).
imagesTr_dir, labelsTr_dir, imagesTs_dir = (
    os.path.join(preprocessed_data_dir, _subfolder)
    for _subfolder in ("imagesTr", "labelsTr", "imagesTs")
)
for _output_dir in (imagesTr_dir, labelsTr_dir, imagesTs_dir):
    os.makedirs(_output_dir, exist_ok=True)

In [6]:
def _copy_labelled_cases(image_files, subtype, image_suffix="_0000.nii.gz"):
    """Copy each image and its label into imagesTr/labelsTr.

    For every path in ``image_files`` the label is expected to sit next to the
    image, named like the image with ``image_suffix`` replaced by ".nii.gz".

    Parameters:
        image_files: iterable of image paths ending in ``image_suffix``.
        subtype: class value recorded for every case in this group.
        image_suffix: filename suffix that marks an image file.

    Returns:
        A list of ``[case_id, subtype]`` rows, one per copied case, in the
        order of ``image_files``.

    Raises:
        FileNotFoundError: propagated from shutil.copy when an image or its
            label file does not exist.
    """
    rows = []
    for image_path in image_files:
        image_name = os.path.basename(image_path)
        case_id = image_name.replace(image_suffix, "")
        label_name = case_id + ".nii.gz"
        # Derive the label path from the basename only, so a directory name
        # that happens to contain the suffix can never be rewritten.
        label_path = os.path.join(os.path.dirname(image_path), label_name)
        shutil.copy(image_path, os.path.join(imagesTr_dir, image_name))
        shutil.copy(label_path, os.path.join(labelsTr_dir, label_name))
        rows.append([case_id, subtype])
    return rows


# Both the train and the validation folders are copied into imagesTr/labelsTr;
# the subtype of every case is recorded alongside its case id.
_labelled_sources = [
    (train_0_files, 0),
    (train_1_files, 1),
    (train_2_files, 2),
    (val_0_files, 0),
    (val_1_files, 1),
    (val_2_files, 2),
]

train_subtypes = []
for _group_files, _group_subtype in _labelled_sources:
    train_subtypes.extend(_copy_labelled_cases(_group_files, _group_subtype))

# Force a (n_cases, 2) string array so that savetxt also works when no case
# was found (an empty list would otherwise become a 1-D array and fail).
train_subtypes = np.array(train_subtypes, dtype=str).reshape(-1, 2)
np.savetxt(
    os.path.join(preprocessed_data_dir, "train_subtypes.csv"),
    train_subtypes,
    fmt="%s",
    delimiter=",",
    header="file_id,class",
    comments="",  # write the header line without a leading "# "
)

# Test images have no labels; copy them unchanged into imagesTs.
for _test_image in test_files:
    shutil.copy(_test_image, os.path.join(imagesTs_dir, os.path.basename(_test_image)))

In [7]:
# Count training cases by their channel-0 image ("*_0000.nii.gz") rather than
# by every directory entry, so stray files in imagesTr (hidden files, leftovers
# from an earlier run) cannot inflate numTraining.
num_training_cases = len(glob.glob(os.path.join(imagesTr_dir, "*_0000.nii.gz")))

dataset_json = {
    # A single input channel, named "CT".
    "channel_names": {
        "0": "CT",
    },
    # Name -> integer value for each segmentation class.
    # NOTE(review): confirm these values match the copied label volumes.
    "labels": {
        "background": 0,
        "pancreas": 1,
        "lesion": 2,
    },
    # Classification targets; the values match the class column written to
    # train_subtypes.csv.
    # NOTE(review): presumably read by custom code rather than by stock
    # nnU-Net - confirm.
    "subtypes": {
        "subtype0": 0,
        "subtype1": 1,
        "subtype2": 2,
    },
    "numTraining": num_training_cases,
    "file_ending": ".nii.gz",
}

with open(os.path.join(preprocessed_data_dir, "dataset.json"), "w") as f:
    # Indent so the file stays readable when inspected by hand.
    json.dump(dataset_json, f, indent=4)

In [8]:
def _case_id(image_path):
    """Return the case identifier: the image file name without "_0000.nii.gz"."""
    return os.path.basename(image_path).replace("_0000.nii.gz", "")


# Case ids of the original train / validation folders, subtype 0, 1, 2 in turn.
train_ids = [_case_id(path) for path in train_0_files + train_1_files + train_2_files]
val_ids = [_case_id(path) for path in val_0_files + val_1_files + val_2_files]

# A single fold that reproduces the dataset's own train/validation partition.
splits = [{"train": train_ids, "val": val_ids}]

# NOTE(review): this writes the split next to the raw dataset; confirm whether
# the training pipeline expects it in the preprocessed dataset folder instead.
splits_file = os.path.join(preprocessed_data_dir, "splits_final.json")
with open(splits_file, "w") as f:
    json.dump(splits, f)