In [2]:
import os
import argparse
import sys
import json
import random
import math
import glob
from collections import OrderedDict

In [3]:
class WriteTrainJSON:
    """
    Class for writing .json files to run from training from scratch, finetuning and/or the prediction of Choroid Plexus segmentations.

    """
    def __init__(self, dataroot: str=".", work_dir: str=".", description=None, train: str="."):
        """
        Initializes the class with the given parameters.

        :param dataroot: The path to the data directory. (/var/data/MONAI_Choroid_Plexus/dataset_monai)
        :param description: The description of the experiment.
        :param work_dir: The working directory. (/var/data/student_home/lia/thesis/monai_segmentation/monai_training)
        :param train: The path to the train directory.

        Folderstructure is either: 
        --------------------------------
        i_ChP.nii.gz
        j_ChP.nii.gz
        ...
        labels
            final
                i_ChP.nii.gz
                j_ChP.nii.gz
                ...
        --------------------------------
        or 
        image_Tr
            a_image.nii
            b_image.nii
            ...
        image_Ts
            i_image.nii
            j_image.nii
            ...
        label_Tr
            a_seg.nii
            b_seg.nii
            ...

        where a, b, i, j are subject identifiers.   
        """
        self.dataroot = dataroot
        if description is None:
            self.description='Dataset for Choroid Plexus segmentation'
        elif isinstance(description, str):
            self.description=description
        self.work_dir = work_dir
        self.train = train # maybe not needed

        if not os.path.exists(self.dataroot):
            raise ValueError("The path to the data directory does not exist. Please, provide the correct path.")

    def write_train_val_json(self, num_folds: int=5, train_val_ratio: float=0.5 , json_filename: str="train_val.json"):

        print("json_filename", json_filename)

        # Set path to output file
        output_folder = os.path.join(self.work_dir, 'JSON_dir')

        # Create output folder if it does not exist
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)

        # Check the folder structure
        if os.path.exists(os.path.join(self.dataroot, 'labels')):
            image_dir = self.dataroot
            label_dir = os.path.join(self.dataroot, 'labels', 'final')
        elif os.path.exists(os.path.join(self.dataroot, 'image_Tr')):
            image_dir = os.path.join(self.dataroot, 'image_Tr')
            label_dir = os.path.join(self.dataroot, 'label_Tr')
        else:
            print("self.dataroot", self.dataroot)
            raise ValueError("The folder structure is not correct. Please, provide the data in the correct format.")

        filenames_image = os.listdir(image_dir)
        filenames_image.sort()
        # Check if the label directory is in filenames_image and remove it from the list of filenames
        if 'labels' in filenames_image:
            filenames_image.remove('labels')
    
        filenames_label = os.listdir(label_dir)
        filenames_label.sort()   

        # remove hidden files and .DS_Store files 
        filenames_image = [f for f in filenames_image if not f.startswith('.')]
        filenames_label = [f for f in filenames_label if not f.startswith('.')]

        if len(filenames_image)!=len(filenames_label):
            raise ValueError("The number of images and the number of labels is different. Please, check image_Tr and label_Tr folders.")

        image_paths = [os.path.join(image_dir, filename) for filename in filenames_image]
        label_paths = [os.path.join(label_dir, filename) for filename in filenames_label]

        # Check that all files have ending .nii or .nii.gz
        for i in range(len(filenames_image)):
            if not(filenames_image[i].endswith('.nii') | filenames_image[i].endswith('.nii.gz')):
                raise ValueError("Data are not in the correct format. Please, provide images in .nii or .nii.gz Nifti format")
            if not(filenames_label[i].endswith('.nii') | filenames_label[i].endswith('.nii.gz')):
                raise ValueError("Data are not in the correct format. Please, provide images in .nii or .nii.gz Nifti format")
            
        # Split data into training and validation based on randomly sample jj indices 
        jj=math.ceil(len(filenames_image) * train_val_ratio) 
        random.seed(42) 
        indices = random.sample(range(len(filenames_image)), jj)

        train_ids = [image_paths[i] for i in indices]
        validation_ids = [image_paths[i] for i in range(len(filenames_image)) if i not in indices]
        label_train_ids = [label_paths[i] for i in indices]
        label_valid_ids = [label_paths[i] for i in range(len(filenames_label)) if i not in indices]        
        
        
        # create json file - manually set
        json_dict = OrderedDict()
        json_dict['name'] = "MRI Dataset - Choroid Plexus Segmentation" 
        json_dict['description'] = self.description
        json_dict['tensorImageSize'] = "3D"
        json_dict['modality'] = {
            "0": "MR"
        }
            
        json_dict['labels'] = {
            "0": "background",
            "1": "Choroid Plexus"
        }

        json_dict['numTraining'] = len(train_ids)
        json_dict['numValidation'] = len(validation_ids)
        json_dict['training'] = [{"fold": 0, "image": '%s' %i , "label": '%s' %j} for i, j in zip(train_ids, label_train_ids)]
        json_dict['validation'] = [{"image": '%s' %i, "label": '%s' %j} for i,j in zip(validation_ids, label_valid_ids)]
                
        random.seed(42)
        random.shuffle(json_dict["training"])

        # Split training data into N random folds
        fold_size = len(json_dict["training"]) // num_folds
        for i in range(num_folds):
            for j in range(fold_size):
                json_dict["training"][i * fold_size + j]["fold"] = i

        print("os.path.join(output_folder, json_filename)", os.path.join(output_folder, json_filename))
        with open(os.path.join(output_folder, json_filename), 'w') as f:
                json.dump(json_dict, f, indent=4, sort_keys=True)
                print("file created")
                f.close()


dataroot = "/var/data/MONAI_Choroid_Plexus/dataset_monai_train_from_scratch"
work_dir = "/var/data/student_home/lia/phuse_thesis_2024/monai_segmentation/monai_training"
json_file=WriteTrainJSON(dataroot, work_dir).write_train_val_json(json_filename = "train_val3.json")



DESCRIPTION IS NONE
json_filename train_val3.json
os.path.join(output_folder, json_filename) /var/data/student_home/lia/phuse_thesis_2024/monai_segmentation/monai_training/JSON_dir/train_val3.json
file created
DESCRIPTION IS NONE
json_filename train_val.json
os.path.join(output_folder, json_filename) ./JSON_dir/train_val.json
file created


In [5]:
from sklearn.model_selection import train_test_split


def produce_sample_dict(line: str):
    return {"label": line, "image": line.replace("labelsTr", "imagesTr")}


def produce_datalist(dataset_dir: str, train_size: int = 196):
    """
    This function is used to split the dataset.
    It will produce "train_size" number of samples for training.

    Expected folder structure:
    dataset_dir
    ├── imagesTr
    │   ├── subject1.nii.gz
    │   ├── subject2.nii.gz
    │   └── ...
    └── labelsTr
        ├── subject1.nii.gz
        ├── subject2.nii.gz
        └── ...

    :param dataset_dir: The path to the dataset directory.
    :param train_size: The number of samples to be used for training.
    """

    samples = sorted(glob.glob(os.path.join(dataset_dir, "labelsTr", "*"), recursive=True))
    samples = [_item.replace(os.path.join(dataset_dir, "labelsTr"), "labelsTr") for _item in samples]
    datalist = []
    for line in samples:
        datalist.append(produce_sample_dict(line))
    train_list, other_list = train_test_split(datalist, train_size=train_size)
    val_list, test_list = train_test_split(other_list, train_size=0.66)

    return {"training": train_list, "validation": val_list, "testing": test_list}


DESCRIPTION IS NONE
os.path.join(output_folder, json_filename) /var/data/student_home/lia/thesis/monai_segmentation/monai_training/JSON_dir/train_val.json
file created


In [3]:
class MyParser(argparse.ArgumentParser):
    def error(self, message):
        sys.stderr.write('error: %s\n' % message)
        self.print_help()
        sys.exit(2)

# Main
if __name__ == '__main__':
    print('Starting launching_tool :)')

    # Initialize the parser
    parser = argparse.ArgumentParser(
        description="Pipeline for training selected model from scratch or finetuning with N subjects with selected pretrained models"
    )

    # Add the parameters positional/optional
    parser.add_argument('--dataroot', required=True, default="/var/data/MONAI_Choroid_Plexus/dataset_train_from_scratch_monai" , help="Data directory. Where the data is stored")
    parser.add_argument('--description', required=False, help="Data description")
    parser.add_argument('--work_dir', required=True, help="working directory")
    parser.add_argument('--training_dir', required=False, help="Working directory where to save trained models. If not specified, default folder name and locations will be used")
    parser.add_argument('--train_json', required=False, default="train_val.json", help="Name of the .json file")
    # Parse the arguments
    args = parser.parse_args()
    print(args)
 
    print('Writing JSON file for training.....')
    json_file=WriteTrainJSON(args.dataroot, args.description, args.work_dir, args.training_dir).write_train_val_json(json_filename=args.train_json)



Starting launching_tool :)


usage: ipykernel_launcher.py [-h] --dataroot DATAROOT
                             [--description DESCRIPTION] --work_dir WORK_DIR
                             [--training_dir TRAINING_DIR]
                             [--train_json TRAIN_JSON]
ipykernel_launcher.py: error: the following arguments are required: --dataroot, --work_dir


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
