# Preprocessing

This notebook implements the following data preprocessing steps:
1. Sample the test set
2. Rename the `.nrrd` files to follow the nnUnet naming conventions
3. Generate the nnUnet datasets in the correct format

This preprocessing only works for the data format that was used in the paper. For general guidance on how to format data for nnU-Net, see `nnUNet/documentation/dataset_format.md`.

Before starting the preprocessing, place the images and labels in the `data/images` and `data/labels` subdirectories, respectively. The images have to be in `.nrrd` format, and each label file must have the same file name as its image. The following code block will then create a training-test split.

In [13]:
from pathlib import Path
import shutil
import random
import os
import sys
import json 

## 1. Sample the test set

In [3]:

# Fraction of all samples that goes into the training split (the rest is the test split)
split_ratio = 0.87

# Input locations: raw images and their label files
source_dir_labels = Path('data/labels')
source_dir_images = Path('data/images')

# Output locations of the training split
train_dir_images = Path('data/train/images')
train_dir_labels = Path('data/train/labels')

# Output locations of the test split
test_dir_images = Path('data/test/images')
test_dir_labels = Path('data/test/labels')

def clear_directory(directory):
    """Reset `directory` to an empty folder.

    Existing content is deleted recursively; the folder (and any missing
    parent folders) is then created again.

    Args:
        directory: `pathlib.Path` of the folder to empty or create.
    """
    has_old_content = directory.exists()
    if has_old_content:
        shutil.rmtree(directory)
    directory.mkdir(exist_ok=True, parents=True)

# Clear before copying new files so a re-run never mixes files of an old and a new split
clear_directory(train_dir_images)
clear_directory(test_dir_images)
clear_directory(train_dir_labels)
clear_directory(test_dir_labels)


# Fixed seed so the shuffle below is repeatable
random.seed(123)

# `iterdir()` yields entries in arbitrary, file-system dependent order. Sorting by
# file name first gives the seeded shuffle the same input everywhere, so the
# train/test split is reproducible across machines.
files_images = sorted(
    (file for file in source_dir_images.iterdir() if file.is_file()),
    key=lambda file: file.name,
)

random.shuffle(files_images)

# The first `split_ratio` share of the shuffled files is used for training
split_index = int(len(files_images) * split_ratio)

train_files = files_images[:split_index]
test_files = files_images[split_index:]

# Copy the files of one split into its image and label directories
def copy_files(files, destination_images, destination_labels):
    """Copy each image in `files` and its same-named label into a split.

    The label is looked up in `source_dir_labels` under the image's file name.
    An image without a label is still copied; a message is printed for it.

    Args:
        files: Iterable of `pathlib.Path` objects pointing at image files.
        destination_images: Directory (`pathlib.Path`) receiving the images.
        destination_labels: Directory (`pathlib.Path`) receiving the labels.
    """
    for image_path in files:
        name = image_path.name
        shutil.copy(image_path, destination_images / name)

        # Labels share the file name of their image
        label_path = source_dir_labels / name
        if not label_path.exists():
            print(f"Label not found for {name}")
            continue
        shutil.copy(label_path, destination_labels / name)



# Fill both splits; each entry is (image files, image destination, label destination)
for split_files, images_dir, labels_dir in (
    (train_files, train_dir_images, train_dir_labels),  # training split
    (test_files, test_dir_images, test_dir_labels),  # test split
):
    copy_files(split_files, images_dir, labels_dir)


## 2. Rename the `.nrrd` files to follow the nnUnet naming conventions

In [22]:
def rename_files(directory):
    """Drop the first underscore from `.nrrd` file names that contain several.

    Every `.nrrd` file in `directory` whose name holds more than one underscore
    is renamed in place with its first underscore removed. Other `.nrrd` files
    are only reported; files with a different ending are ignored.

    Args:
        directory: Path of the folder whose files are inspected.
    """
    for old_name in os.listdir(directory):
        if not old_name.endswith(".nrrd"):
            continue

        # Zero or one underscore: the name is left untouched
        if old_name.count('_') <= 1:
            print(f"No rename needed: {old_name}")
            continue

        new_name = old_name.replace('_', '', 1)
        old_path = os.path.join(directory, old_name)
        new_path = os.path.join(directory, new_name)
        os.rename(old_path, new_path)
        print(f"Renamed: {old_name} to {new_name}")

# Apply the renaming to every split folder (labels first, then images)
for split_dir in (
    "data/test/labels",
    "data/train/labels",
    "data/test/images",
    "data/train/images",
):
    rename_files(split_dir)


No rename needed: 16R_stance.nrrd
No rename needed: F12L_fall.nrrd
No rename needed: F15L_stance.nrrd
No rename needed: 3L_fall.nrrd
No rename needed: 7R_fall.nrrd
No rename needed: F16L_fall.nrrd
No rename needed: F17R_stance.nrrd
No rename needed: 6R_stance.nrrd
No rename needed: F2R_stance.nrrd
No rename needed: 4L_stance.nrrd
No rename needed: F22L_stance.nrrd
No rename needed: F10L_fall.nrrd
No rename needed: F10R_stance.nrrd
No rename needed: 21L_stance.nrrd
No rename needed: 12R_stance.nrrd
No rename needed: F13R_fall.nrrd
No rename needed: F11L_fall.nrrd
No rename needed: 17R_fall.nrrd
No rename needed: 12L_fall.nrrd
No rename needed: 4R_fall.nrrd
No rename needed: F7L_fall.nrrd
No rename needed: F19L_stance.nrrd
No rename needed: 10R_stance.nrrd
No rename needed: F9L_stance.nrrd
No rename needed: 17L_stance.nrrd
No rename needed: F1R_stance.nrrd
No rename needed: 7L_stance.nrrd
No rename needed: F22R_fall.nrrd
No rename needed: 22L_stance.nrrd
No rename needed: F19R_fall.nrrd


## 3. Generate the nnUnet datasets in the correct format

In [None]:
# Dataset name including its numeric ID ("101"). It is used as the name of the
# dataset folder below `nnUNet_raw/` in the following cells.
data_name = "Dataset101_FemurCorrected"

In [11]:
# Split folders to read from, keyed by the role of their content
source_dirs = {
    "test_labels": Path("data/test/labels"),
    "training_labels": Path("data/train/labels"),
    "test_images": Path("data/test/images"),
    "train_images": Path("data/train/images")
}

# Folder inside `nnUNet_raw/<data_name>/` that receives the files of the
# `source_dirs` entry with the same key.
# NOTE(review): imagesTr/labelsTr/imagesTs/labelsTs presumably follow the layout
# described in `nnUNet/documentation/dataset_format.md` - confirm there.
destination_dirs = {
    "test_labels": Path(f"nnUNet_raw/{data_name}/labelsTs"),
    "training_labels": Path(f"nnUNet_raw/{data_name}/labelsTr"),
    "test_images": Path(f"nnUNet_raw/{data_name}/imagesTs"),
    "train_images": Path(f"nnUNet_raw/{data_name}/imagesTr")
}

def clear_and_create_directory(directory):
    """Make sure `directory` exists and is empty.

    Behaves like `clear_directory` from the test-set sampling step: a folder
    that already exists is removed with everything in it, then the folder is
    created again together with any missing parents.

    Args:
        directory: `pathlib.Path` of the folder to reset.
    """
    stale_output = directory.exists()
    if stale_output:
        shutil.rmtree(directory)
    directory.mkdir(exist_ok=True, parents=True)

def copy_and_rename_files(source_dir, destination_dir, append_str="_0000"):
    """Copy all files of `source_dir` into `destination_dir`.

    When the name of `source_dir` contains "image" (case-insensitive), files
    ending in `.nrrd`, `.nii` or `.nii.gz` get `append_str` inserted in front
    of that ending (e.g. `case.nii.gz` -> `case_0000.nii.gz`). All other files
    keep their name. Sub-directories are skipped.

    Args:
        source_dir: `pathlib.Path` of the folder to copy from.
        destination_dir: `pathlib.Path` of an existing folder to copy into.
        append_str: Text inserted between the base name and the file ending
            of image files.
    """
    # Longest ending first so that `.nii.gz` is recognised as one ending.
    # `Path.suffix` only reports the last component (`.gz`), which is why the
    # complete file name is matched instead.
    image_endings = ('.nii.gz', '.nrrd', '.nii')
    is_image_dir = "image" in source_dir.name.lower()

    for file in source_dir.iterdir():
        if not file.is_file():
            continue

        new_filename = file.name
        if is_image_dir:
            for ending in image_endings:
                # The length check leaves names that consist of the ending only untouched
                if file.name.endswith(ending) and len(file.name) > len(ending):
                    new_filename = file.name[:-len(ending)] + append_str + ending
                    break

        shutil.copy(file, destination_dir / new_filename)

# Rebuild every nnUNet_raw sub-folder from its matching split folder
for split_key, source_path in source_dirs.items():
    target_path = destination_dirs[split_key]
    clear_and_create_directory(target_path)
    copy_and_rename_files(source_path, target_path)

In [32]:
# Necessary folders for later training
dir1 = Path("nnUNet_preprocessed")
dir2 = Path("nnUNet_results")

# Create whichever folder is missing and only warn about one that already exists.
# No `sys.exit()` here: inside a notebook it raises SystemExit, which makes every
# re-run of this cell fail and would skip the second folder whenever the first
# one is already present.
for required_dir in (dir1, dir2):
    if required_dir.is_dir():
        print(f"Warning: The directory '{required_dir}' already exists.")
    else:
        # Raises FileExistsError if a regular file occupies the path
        required_dir.mkdir(parents=True, exist_ok=False)
        print(f"Directory '{required_dir}' created successfully.")

Directory 'nnUNet_preprocessed' created successfully.
Directory 'nnUNet_results' created successfully.


Next, a `dataset.json` file with details about the dataset has to be created and placed in `nnUNet_raw/Dataset101_FemurCorrected`.

In [14]:
# File ending shared by all images and labels of the dataset
file_ending = ".nrrd"

imagesTr_dir = Path(f"nnUNet_raw/{data_name}/imagesTr")

# Count the training cases through their channel-0 image (`<case>_0000<file_ending>`).
# A bare `glob('*')` also matches stray entries such as `.DS_Store` or
# `.ipynb_checkpoints`, which would make `numTraining` too large.
num_training_files = sum(
    1 for path in imagesTr_dir.glob(f"*_0000{file_ending}") if path.is_file()
)

# Content of `dataset.json`: one input channel and a binary background/bone labelling
data = {
    "channel_names": { 
        "0": "CT"
    },
    "labels": { 
        "background": 0,
        "bone": 1
    },
    "numTraining": num_training_files,
    "file_ending": file_ending
}

save_directory = Path(f"nnUNet_raw/{data_name}")  
json_filename = save_directory / "dataset.json"

with open(json_filename, 'w', encoding="utf-8") as json_file:
    json.dump(data, json_file, indent=2)

print(f"JSON file '{json_filename}' created successfully.")


JSON file 'nnUNet_raw/Dataset101_FemurCorrected/dataset.json' created successfully.
