In [17]:
from pathlib import Path
import nibabel as nib
import numpy as np
from tqdm import tqdm
import shutil
import dataclasses
import os
from collections import defaultdict
import re
import csv
import random

# Main IDs extraction and reorganization of BraTS23 Adult Glioma

In [18]:
def extract_patient_scan_id(folder_name):
    """Pull the patient ID and scan ID out of a dash-separated folder name.

    The name is split on '-'; the third and fourth fields are returned as a
    ``(patient_id, scan_id)`` tuple of strings. Names with fewer than four
    fields yield ``(None, None)``.
    """
    fields = folder_name.split('-')
    if len(fields) < 4:
        return None, None
    patient_id, scan_id = fields[2:4]
    return str(patient_id), str(scan_id)

def extract_sequence(file_name):
    """Return the sequence tag embedded in a file name.

    The tag is the text after the last '-' and before the first '.' that
    follows it (e.g. ``"x-y-t1n.nii.gz"`` -> ``"t1n"``). Returns ``None`` when
    the name contains no '-'.
    """
    _, dash, tail = file_name.rpartition('-')
    if not dash:
        # No dash at all: there is no suffix to interpret as a sequence.
        return None
    return tail.partition('.')[0]

In [19]:
# Root directory that holds the datasets; it must already exist.
target_path = Path("/mnt/93E8-0534/JuanCarlos/")
assert target_path.exists()

# Input / output layout for this dataset.
dataset_name = "BraTS-GLI-2024"
dataset_path = target_path / dataset_name
input_path = dataset_path / "training_data"
output_path = dataset_path / "organized_dataset"

# Create new folders for images and labels
images_out = output_path / "images"
labels_out = output_path / "labels"

# One sub-folder per split, mirrored under images/ and labels/.
train_images_out, val_images_out, test_images_out = (
    images_out / split for split in ("training", "validation", "testing")
)
train_labels_out, val_labels_out, test_labels_out = (
    labels_out / split for split in ("training", "validation", "testing")
)

# os.makedirs creates any missing parents and is a no-op for existing folders.
for folder in (
    dataset_path, input_path, output_path,
    images_out, labels_out,
    train_images_out, val_images_out, test_images_out,
    train_labels_out, val_labels_out, test_labels_out,
):
    os.makedirs(folder, exist_ok=True)

# CSV manifests: one for the whole dataset plus one per split.
output_file = output_path / "dataset_info.csv"
training_csv_file = output_path / "training_dataset_info.csv"
validation_csv_file = output_path / "validation_dataset_info.csv"
test_csv_file = output_path / "testing_dataset_info.csv"

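#### Don't forget to download the BraTS training and validation data into the data folder, unzip it, and rename the folders to training and validation. NOTE: the cell above reads its input from a folder named `training_data`, so make sure the folder name matches `input_path`.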


In [22]:
def _collect_patient_images(input_path, sequence_mapping):
    """Scan the scan folders under ``input_path`` and group their image files by patient ID.

    Returns a dict mapping patient ID to a list of
    ``(scan_id, sequence, sequence_label, image_file, segmentation_file)`` tuples,
    one per non-segmentation ``*.nii.gz`` file.
    """
    patient_data = defaultdict(list)
    # Sorted so the collected order does not depend on directory listing order.
    for patient_folder in sorted(input_path.iterdir()):
        if not patient_folder.is_dir():
            continue
        patient_id, scan_id = extract_patient_scan_id(patient_folder.name)
        if patient_id is None or scan_id is None:
            print(f"Skipping folder: {patient_folder.name}")
            continue

        # First pass: separate the segmentation from the images. The
        # segmentation must be known *before* any image entry is recorded,
        # otherwise images listed ahead of the '-seg' file would be stored
        # without it.
        segmentation_file = None
        images = []
        for image_file in sorted(patient_folder.glob('*.nii.gz')):
            sequence = extract_sequence(image_file.name)
            if sequence is None:
                # File name carries no sequence tag; nothing sensible to record.
                print(f"Skipping file: {image_file.name}")
                continue
            if sequence.lower() == 'seg':
                segmentation_file = image_file
            else:
                images.append((sequence, image_file))

        # Second pass: every image of the scan shares the same segmentation.
        for sequence, image_file in images:
            # Unknown sequences fall back to label 0 (same value as 'seg').
            sequence_label = sequence_mapping.get(sequence.lower(), 0)
            patient_data[patient_id].append(
                (scan_id, sequence, sequence_label, image_file, segmentation_file)
            )
    return patient_data


def process_dataset(input_path, output_path, main_csv_file, training_csv_file, validation_csv_file, testing_csv_file, dataset_name, seed=None):
    """Split the scans under ``input_path`` into training/validation/testing sets.

    Patients (not individual scans) are shuffled and split 80% / 10% / 10%, so
    all scans of one patient land in the same subset. Image files are copied to
    ``output_path/images/<subset>/`` and segmentation files to
    ``output_path/labels/<subset>/``, renamed to
    ``<patient_id>-<scan_id>-<sequence>.nii.gz``. One CSV row per image is
    written to ``main_csv_file`` and to the CSV of the subset it belongs to.

    Args:
        input_path: Directory containing one sub-folder per scan; folder names
            are parsed with ``extract_patient_scan_id``.
        output_path: Directory under which ``images/`` and ``labels/`` are created.
        main_csv_file: Path of the CSV listing every image.
        training_csv_file: Path of the CSV listing training images.
        validation_csv_file: Path of the CSV listing validation images.
        testing_csv_file: Path of the CSV listing testing images.
        dataset_name: Value written to the ``Dataset`` column of every row.
        seed: Optional seed for the patient shuffle. ``None`` (default) uses the
            global ``random`` state, as before; any other value makes the split
            reproducible.
    """
    sequence_mapping = {'seg': 0, 't1n': 1, 't1c': 2, 't2w': 3, 't2f': 4}
    headers = ['PatientID', 'ScanID', 'Sequence', 'SequenceLabel', 'Dataset', 'HasSegmentation']

    input_path = Path(input_path)
    output_path = Path(output_path)

    patient_data = _collect_patient_images(input_path, sequence_mapping)

    # Split patients into train, validation, and test sets. Sorting first makes
    # the shuffle depend only on the seed, not on directory listing order.
    patient_ids = sorted(patient_data)
    if seed is None:
        random.shuffle(patient_ids)
    else:
        random.Random(seed).shuffle(patient_ids)
    total_patients = len(patient_ids)
    train_split = int(total_patients * 0.8)
    val_split = int(total_patients * 0.9)

    train_patients = patient_ids[:train_split]
    val_patients = patient_ids[train_split:val_split]
    test_patients = patient_ids[val_split:]

    print(f"Total patients: {total_patients}")
    print(f"Training patients: {len(train_patients)}")
    print(f"Validation patients: {len(val_patients)}")
    print(f"Testing patients: {len(test_patients)}")

    with open(main_csv_file, 'w', newline='') as main_csvfile, \
         open(training_csv_file, 'w', newline='') as train_csvfile, \
         open(validation_csv_file, 'w', newline='') as val_csvfile, \
         open(testing_csv_file, 'w', newline='') as test_csvfile:

        main_writer = csv.writer(main_csvfile)
        train_writer = csv.writer(train_csvfile)
        val_writer = csv.writer(val_csvfile)
        test_writer = csv.writer(test_csvfile)

        for writer in (main_writer, train_writer, val_writer, test_writer):
            writer.writerow(headers)

        # Process and write data
        for subset, patients, writer in (
            ('training', train_patients, train_writer),
            ('validation', val_patients, val_writer),
            ('testing', test_patients, test_writer),
        ):
            # Destinations are derived from the output_path argument rather
            # than from notebook-level globals.
            images_subset_path = output_path / "images" / subset
            labels_subset_path = output_path / "labels" / subset
            images_subset_path.mkdir(parents=True, exist_ok=True)
            labels_subset_path.mkdir(parents=True, exist_ok=True)

            # A scan's segmentation is shared by all of its images; copy it once.
            copied_segmentations = set()
            for patient_id in patients:
                for scan_id, sequence, sequence_label, image_file, segmentation_file in patient_data[patient_id]:
                    # Copy image file
                    image_output_file = images_subset_path / f"{patient_id}-{scan_id}-{sequence}.nii.gz"
                    shutil.copyfile(image_file, image_output_file)

                    # Copy segmentation file if it exists
                    has_segmentation = segmentation_file is not None
                    if has_segmentation:
                        seg_output_file = labels_subset_path / f"{patient_id}-{scan_id}-seg.nii.gz"
                        if seg_output_file not in copied_segmentations:
                            shutil.copyfile(segmentation_file, seg_output_file)
                            copied_segmentations.add(seg_output_file)

                    row_data = [patient_id, scan_id, sequence, sequence_label, dataset_name, has_segmentation]
                    main_writer.writerow(row_data)
                    writer.writerow(row_data)

# Run the reorganisation with the paths configured earlier in the notebook.
process_dataset(
    input_path=input_path,
    output_path=output_path,
    main_csv_file=output_file,
    training_csv_file=training_csv_file,
    validation_csv_file=validation_csv_file,
    testing_csv_file=test_csv_file,
    dataset_name=dataset_name,
)

Total patients: 613
Training patients: 490
Validation patients: 61
Testing patients: 62
