In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive
# (e.g. the dataset archive) can be read from this Colab runtime.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Utils

## Zip file extraction

In [None]:
import os
import zipfile
from tqdm import tqdm

def extract_zip(zip_path, destination_path = "/content"):
    """Unpack every entry of a zip archive into ``destination_path``.

    The destination directory is created when it does not exist yet. Entries
    are extracted one by one so a tqdm progress bar can report per-file
    progress, and a confirmation line is printed once everything is done.

    Args:
        zip_path: Path of the zip archive to read.
        destination_path: Directory that receives the extracted entries.
    """
    os.makedirs(destination_path, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as archive:
        progress = tqdm(archive.infolist(), desc="Extracting", unit="file")
        for entry in progress:
            archive.extract(entry, destination_path)

    print(f"Extracted to: {destination_path}")

## Tar file extraction

In [None]:
import tarfile
from tqdm import tqdm
import os

def extract_tar(tar_path, destination_path = "/content"):
    """Extract every member of a tar archive into ``destination_path``.

    The archive is opened with mode ``'r:*'`` so compression is detected
    automatically. The destination directory is created when missing, members
    are extracted one at a time under a tqdm progress bar, and a confirmation
    line is printed at the end.

    When the running interpreter supports tarfile extraction filters, the
    ``"data"`` filter is applied so that members trying to escape
    ``destination_path`` (absolute names, ``..`` components, links pointing
    outside) are rejected instead of being written.

    Args:
        tar_path: Path of the tar archive to read.
        destination_path: Directory that receives the extracted members.

    Raises:
        tarfile.TarError: If the archive is unreadable or, when filters are
            available, a member is rejected by the ``"data"`` filter.
    """
    os.makedirs(destination_path, exist_ok=True)

    # ``filter`` is only accepted by interpreters that ship
    # ``tarfile.data_filter``; older ones would raise TypeError on the keyword.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    with tarfile.open(tar_path, 'r:*') as tar:
        members = tar.getmembers()

        for member in tqdm(members, desc="Extracting", unit="file"):
            tar.extract(member, path=destination_path, **extract_kwargs)

    print(f"Extracted to: {destination_path}")

## Zip and upload to drive

In [None]:
import shutil
import os

def save_zip_to_drive(folder_path, zip_name, destination_path = '',
                      drive_root = "/content/drive/MyDrive", work_dir = "/content"):
    """Zip a folder locally and move the archive into the mounted Drive.

    The archive ``<work_dir>/<zip_name>.zip`` is built from the contents of
    ``folder_path`` and then moved below ``drive_root``.

    ``destination_path`` is interpreted relative to ``drive_root``:

    * empty (default): the archive lands directly in ``drive_root``;
    * a path ending in ``.zip``: used as the exact target file;
    * anything else: treated as a folder (created if needed) that receives
      ``<zip_name>.zip``.

    The move targets an explicit file path, so re-running the call replaces a
    previously uploaded archive (subject to ``shutil.move``/``os.rename``
    semantics) instead of failing because the file already exists.

    Args:
        folder_path: Directory whose contents are archived.
        zip_name: Archive name without the ``.zip`` extension.
        destination_path: Folder or ``.zip`` file path relative to
            ``drive_root``.
        drive_root: Root of the mounted Drive.
        work_dir: Local directory in which the archive is first created.
    """
    # make_archive appends the extension itself and returns the final path,
    # so no string surgery on ".zip" is needed.
    archive_base = os.path.join(work_dir, zip_name)
    zip_path = shutil.make_archive(base_name=archive_base, format='zip', root_dir=folder_path)
    print(f"Zipped folder to: {zip_path}")

    # Leading slashes are stripped so the destination always stays below drive_root.
    target = os.path.join(drive_root, destination_path.lstrip('/'))
    if target.lower().endswith('.zip'):
        target_file = target
    else:
        target_file = os.path.join(target, os.path.basename(zip_path))

    os.makedirs(os.path.dirname(target_file), exist_ok=True)
    shutil.move(zip_path, target_file)
    print(f"Uploaded to Drive at: {target_file}")


## Folder deletion from Colab

In [None]:
import shutil
import os

def delete_folders(folders_paths):
    """Recursively remove each directory listed in ``folders_paths``.

    A line is printed per path: ``Deleted: ...`` when the path existed and was
    removed, ``Not found: ...`` when it did not exist.

    Args:
        folders_paths: Iterable of directory paths to remove.
    """
    for folder in folders_paths:
        # Guard clause: report missing paths and move on.
        if not os.path.exists(folder):
            print(f"Not found: {folder}")
            continue

        shutil.rmtree(folder)
        print(f"Deleted: {folder}")

## Folder renaming

In [None]:
import os

def rename_folder(folder_path, old_name, new_name):
    """Rename the entry ``old_name`` inside ``folder_path`` to ``new_name``.

    Prints a confirmation when the rename happened, or a "not found" message
    when ``<folder_path>/<old_name>`` does not exist (nothing is changed then).

    Args:
        folder_path: Directory containing the entry to rename.
        old_name: Current name of the entry.
        new_name: Name the entry should get.
    """
    current_location, renamed_location = (
        f"{folder_path}/{name}" for name in (old_name, new_name)
    )

    if not os.path.exists(current_location):
        print(f"Folder not found: {current_location}")
        return

    os.rename(current_location, renamed_location)
    print(f"{old_name} has been renamed to {new_name}")

## Copy files to another folder

In [None]:

import os
import shutil
from tqdm import tqdm

def copy_files_to_another_folder(source_folder, destination_folder, ids_list = 'all', extension = '.nii.gz'):
    """Copy files from ``source_folder`` into ``destination_folder``.

    Args:
        source_folder: Directory to copy from.
        destination_folder: Directory to copy into (created if missing).
        ids_list: Either the string ``'all'`` (copy every regular file found
            directly in ``source_folder``) or an iterable of sample IDs; each
            ID is turned into the filename ``<id><extension>``. Any iterable
            works, including NumPy arrays.
        extension: Suffix appended to every ID to build its filename.

    Files that are requested but missing from ``source_folder`` are reported
    with a warning line and skipped.
    """
    os.makedirs(destination_folder, exist_ok=True)

    # The isinstance check keeps the sentinel comparison scalar: comparing a
    # NumPy array to 'all' is element-wise and cannot be used in an ``if``.
    if isinstance(ids_list, str) and ids_list == 'all':
        # Sorted so the copy order does not depend on the directory listing order.
        files_to_copy = sorted(
            name for name in os.listdir(source_folder)
            if os.path.isfile(os.path.join(source_folder, name))
        )
    else:
        files_to_copy = [f"{sample_id}{extension}" for sample_id in ids_list]

    for filename in tqdm(files_to_copy, desc=f"Copying from {source_folder} to {destination_folder}", unit="file"):
        source_path = f"{source_folder}/{filename}"
        destination_path = f"{destination_folder}/{filename}"

        if not os.path.exists(source_path):
            print(f"Warning: {source_path} not found, skipping.")
            continue

        shutil.copy(source_path, destination_path)


## Apply a function to a folder

In [None]:
import os
import nibabel as nib
import numpy as np
import shutil
from tqdm import tqdm

def apply_function_to_folder(source_folder, destination_folder, function_to_apply):
    """Apply ``function_to_apply`` to every entry of a folder and save the results.

    For each entry in ``source_folder`` the function is called with the
    entry's path; its return value is written with ``nib.save`` to
    ``destination_folder`` under the same filename.

    Processing is best-effort: an exception raised for one file is printed and
    the loop continues with the next one. The names of the files that failed
    are returned so the caller can check them before, for example, deleting
    the source folder.

    Args:
        source_folder: Directory whose entries are processed.
        destination_folder: Directory receiving the results (created if missing).
        function_to_apply: Callable taking a file path and returning an object
            that ``nib.save`` can write.

    Returns:
        list[str]: Filenames that could not be processed (empty on full success).
    """
    os.makedirs(destination_folder, exist_ok=True)

    failed_files = []

    # Sorted so the processing order (and the returned list) is deterministic.
    for filename in tqdm(sorted(os.listdir(source_folder)), desc=f"Applying function to files in {source_folder}", unit="file"):

        source_path = f"{source_folder}/{filename}"
        destination_path = f"{destination_folder}/{filename}"

        try:
            result_nii = function_to_apply(source_path)
            nib.save(result_nii, destination_path)
        except Exception as e:
            print(f"Error while processing file: {filename}: {e}")
            failed_files.append(filename)

    if failed_files:
        print(f"{len(failed_files)} file(s) could not be processed.")

    return failed_files


# Preprocessing Functions

## Retrieve data IDs to use

In [None]:
import os
import json

def retrieve_ids_list(dataset_json_path):
    """Read the sample IDs listed under ``'training'`` in a dataset JSON file.

    Each entry's ``'image'`` path is reduced to its base filename and the
    ``.nii.gz`` suffix is removed to obtain the ID. The number of IDs and the
    first five of them are printed as a quick sanity check.

    Args:
        dataset_json_path: Path of the JSON file describing the dataset.

    Returns:
        list[str]: IDs in the order the entries appear in the file.
    """
    with open(dataset_json_path, 'r') as json_file:
        dataset_description = json.load(json_file)

    data_ids = [
        os.path.basename(entry['image']).replace('.nii.gz', '')
        for entry in dataset_description['training']
    ]

    print("Number of IDs extracted:", len(data_ids))
    print("First 5 IDs of the list:", data_ids[:5])
    return data_ids

## Cropping

In [None]:
import os
import nibabel as nib
import numpy as np
from tqdm import tqdm
import nibabel as nib

def crop_file(file_path, height_range=(40, 200), width_range=(40, 200), depth_range=(13, 141)):
    """Load a NIfTI file and crop its first three axes to the given index ranges.

    Each range is a ``(start, end)`` pair used as a half-open slice on the
    corresponding axis. Any axes beyond the first three (e.g. a trailing
    modality axis) are kept in full, so the same call works for 3D volumes
    and for arrays with extra trailing dimensions.

    Args:
        file_path: Path of the NIfTI file to load.
        height_range: ``(start, end)`` slice bounds for axis 0.
        width_range: ``(start, end)`` slice bounds for axis 1.
        depth_range: ``(start, end)`` slice bounds for axis 2.

    Returns:
        A ``nib.Nifti1Image`` holding the cropped data, built with the source
        image's affine and header.

    Raises:
        ValueError: If the loaded data has fewer than three dimensions.
    """
    nii_file = nib.load(file_path)
    data = nii_file.get_fdata()

    if data.ndim < 3:
        raise ValueError(
            f"Expected data with at least 3 dimensions, got {data.ndim} for {file_path}"
        )

    h_start, h_end = height_range
    w_start, w_end = width_range
    d_start, d_end = depth_range

    # The trailing Ellipsis expands to "all remaining axes" (possibly none),
    # so one expression covers 3D, 4D and higher-rank data.
    cropped_data = data[h_start:h_end, w_start:w_end, d_start:d_end, ...]

    # NOTE(review): the source affine is reused unchanged, i.e. its translation
    # is not shifted by the crop offsets — confirm that world coordinates are
    # not relied upon downstream.
    cropped_nii = nib.Nifti1Image(cropped_data, affine=nii_file.affine, header=nii_file.header)
    return cropped_nii

## Split samples with almost only background

In [None]:
import os
import nibabel as nib
import numpy as np
from tqdm import tqdm

def split_samples_by_presence_of_tumor_labels(labels_folder, threshold=0.01):
    """Partition label files by the fraction of non-background voxels.

    Every entry of ``labels_folder`` is loaded, cast to ``uint8`` and the
    share of voxels with a value greater than zero is computed. The sample ID
    (the filename up to its first dot) goes into the first list when that
    share is at least ``threshold``, otherwise into the second list. Both
    list sizes are printed at the end.

    Args:
        labels_folder: Directory containing the label files.
        threshold: Minimum fraction of non-background voxels for a sample to
            count as "above the threshold".

    Returns:
        tuple[list[str], list[str]]: ``(above_threshold_ids, below_threshold_ids)``.
    """
    above_threshold_ids, below_threshold_ids = [], []

    label_filenames = os.listdir(labels_folder)
    for label_filename in tqdm(label_filenames, desc="Splitting samples by tumor labels presence: ", unit="file"):

        label_nii = nib.load(f"{labels_folder}/{label_filename}")
        voxels = label_nii.get_fdata().astype(np.uint8)

        # Mean of the boolean mask == (#non-background voxels) / (#voxels).
        tumor_fraction = np.mean(voxels > 0)

        sample_id = label_filename.split('.')[0]
        bucket = above_threshold_ids if tumor_fraction >= threshold else below_threshold_ids
        bucket.append(sample_id)

    print(f"{len(above_threshold_ids)} samples above the threshold")
    print(f"{len(below_threshold_ids)} samples below the threshold")

    return above_threshold_ids, below_threshold_ids

## Normalization

In [None]:
import nibabel as nib
import numpy as np

def normalize_image(image_path):
    """Min-max normalize a NIfTI image to the range [0, 1], per modality.

    A 4D image is treated as a stack of modalities along its last axis and
    each modality is rescaled independently using its own minimum and
    maximum. A 3D image is treated as a single modality. A modality whose
    values are all equal (max == min) is set to zero everywhere.

    Args:
        image_path: Path of the NIfTI image to load.

    Returns:
        A ``nib.Nifti1Image`` with the normalized data, built with the source
        image's affine and header.

    Raises:
        ValueError: If the loaded data is neither 3D nor 4D.
    """
    image_nii = nib.load(image_path)
    image = image_nii.get_fdata()

    if image.ndim not in (3, 4):
        raise ValueError(
            f"Expected a 3D or 4D image, got {image.ndim} dimensions for {image_path}"
        )

    # Give 3D input a singleton modality axis so one loop handles both cases.
    single_modality = image.ndim == 3
    volumes = image[..., np.newaxis] if single_modality else image

    normalized_image = np.zeros_like(volumes)

    for m in range(volumes.shape[-1]):
        modality = volumes[..., m]
        min_val = modality.min()
        max_val = modality.max()

        # Constant modalities keep the zeros they were initialised with,
        # which also avoids a division by zero.
        if max_val > min_val:
            normalized_image[..., m] = (modality - min_val) / (max_val - min_val)

    if single_modality:
        normalized_image = normalized_image[..., 0]

    return nib.Nifti1Image(normalized_image, affine=image_nii.affine, header=image_nii.header)

# Preprocessing pipeline

## Tar file extraction

In [None]:
# Extract the dataset tar stored on the mounted Drive.
# extract_tar is called without a destination, so its default ("/content") is used.
dataset_path = '/content/drive/MyDrive/Colab Notebooks/Deep Learning Project/Task01_BrainTumour.tar'
extract_tar(dataset_path)

# Current working folders: the image and label folders of the extracted dataset.
# Later cells reassign these two variables after each preprocessing step,
# so the cells are meant to be run in order.
images_folder = '/content/Task01_BrainTumour/imagesTr'
labels_folder = '/content/Task01_BrainTumour/labelsTr'

Extracting: 100%|██████████| 1277/1277 [01:09<00:00, 18.44file/s]

Extracted to: /content





In [None]:
# Read the IDs listed under 'training' in the dataset JSON
dataset_json_path = '/content/Task01_BrainTumour/dataset.json'
data_ids = retrieve_ids_list(dataset_json_path)

# Copy the images of the selected IDs to another folder
# (IDs without a matching file are only reported with a warning and skipped)
images_destination_folder = '/content/selected_imagesTr'
copy_files_to_another_folder(images_folder, images_destination_folder, data_ids)

# Copy the labels of the selected IDs to another folder
labels_destination_folder = '/content/selected_labelsTr'
copy_files_to_another_folder(labels_folder, labels_destination_folder, data_ids)

# Delete the extracted dataset folder to free space.
# This must stay after both copies above: it removes their source files.
colab_dataset_path = '/content/Task01_BrainTumour'
delete_folders([colab_dataset_path])

# Current working folders: the selected copies
images_folder = images_destination_folder
labels_folder = labels_destination_folder

Number of IDs extracted: 484
First 5 IDs of the list: ['BRATS_457', 'BRATS_306', 'BRATS_206', 'BRATS_449', 'BRATS_318']


Copying from /content/Task01_BrainTumour/imagesTr to /content/selected_imagesTr: 100%|██████████| 484/484 [00:40<00:00, 12.05file/s]
Copying from /content/Task01_BrainTumour/labelsTr to /content/selected_labelsTr: 100%|██████████| 484/484 [00:00<00:00, 2234.80file/s]


Deleted: /content/Task01_BrainTumour


## Cropping

In [None]:
# Crop images (crop_file is applied with its default height/width/depth ranges)
images_destination_folder = '/content/cropped_imagesTr'
apply_function_to_folder(images_folder, images_destination_folder, crop_file)

# Crop labels with the same default ranges, so images and labels stay aligned
labels_destination_folder = '/content/cropped_labelsTr'
apply_function_to_folder(labels_folder, labels_destination_folder, crop_file)

# Delete the uncropped folders to free space.
# NOTE(review): apply_function_to_folder only prints per-file errors, so a file
# that failed to crop is lost once its source folder is deleted here.
delete_folders([images_folder, labels_folder])

# Current working folders: the cropped copies
images_folder = images_destination_folder
labels_folder = labels_destination_folder

Applying function to files in /content/selected_imagesTr: 100%|██████████| 484/484 [15:38<00:00,  1.94s/file]
Applying function to files in /content/selected_labelsTr: 100%|██████████| 484/484 [01:07<00:00,  7.20file/s]


Deleted: /content/selected_imagesTr
Deleted: /content/selected_labelsTr


## Split samples with almost only background

In [None]:
# Separate samples with almost no tumor labels from the others.
# The function's default threshold is used; the low-tumor samples are archived
# to Drive below (they could be used later).
above_threshold_ids, below_threshold_ids = split_samples_by_presence_of_tumor_labels(labels_folder)

# Put relevant (above-threshold) images in another folder
rel_images_destination_folder = '/content/relevant_samples_imagesTr'
copy_files_to_another_folder(images_folder, rel_images_destination_folder, above_threshold_ids)

# Put relevant (above-threshold) labels in another folder
rel_labels_destination_folder = '/content/relevant_samples_labelsTr'
copy_files_to_another_folder(labels_folder, rel_labels_destination_folder, above_threshold_ids)

# Put non-relevant (below-threshold) images in another folder
non_rel_images_destination_folder = '/content/non_relevant_samples/imagesTr'
copy_files_to_another_folder(images_folder, non_rel_images_destination_folder, below_threshold_ids)

# Put non-relevant (below-threshold) labels in another folder
non_rel_labels_destination_folder = '/content/non_relevant_samples/labelsTr'
copy_files_to_another_folder(labels_folder, non_rel_labels_destination_folder, below_threshold_ids)

# Zip the non-relevant samples (imagesTr + labelsTr) and upload the archive to Drive
non_rel_samples_path = '/content/non_relevant_samples'
save_zip_to_drive(non_rel_samples_path, 'non_relevant_samples')

# Delete the cropped folders and the already-archived non-relevant samples to free space.
# This must stay after the copies and the upload above, which read from these folders.
delete_folders([images_folder, labels_folder, non_rel_samples_path])

# Current working folders: the relevant samples only
images_folder = rel_images_destination_folder
labels_folder = rel_labels_destination_folder

Splitting samples by tumor labels presence: 100%|██████████| 484/484 [00:12<00:00, 37.33file/s]


420 samples above the threshold
64 samples below the threshold


Copying from /content/cropped_imagesTr to /content/relevant_samples_imagesTr: 100%|██████████| 420/420 [00:45<00:00,  9.31file/s]
Copying from /content/cropped_labelsTr to /content/relevant_samples_labelsTr: 100%|██████████| 420/420 [00:00<00:00, 4917.33file/s]
Copying from /content/cropped_imagesTr to /content/non_relevant_samples/imagesTr: 100%|██████████| 64/64 [00:07<00:00,  8.48file/s]
Copying from /content/cropped_labelsTr to /content/non_relevant_samples/labelsTr: 100%|██████████| 64/64 [00:00<00:00, 4665.85file/s]


Zipped folder to: /content/non_relevant_samples.zip
Uploaded to Drive at: /content/drive/MyDrive/
Deleted: /content/cropped_imagesTr
Deleted: /content/cropped_labelsTr
Deleted: /content/non_relevant_samples


## Normalization

In [None]:
# Min-max normalize every image of the relevant samples.
# Only the images are normalized: labels hold class indices and are left untouched.
images_destination_folder = '/content/normalized_imagesTr'
apply_function_to_folder(images_folder, images_destination_folder, normalize_image)

# Delete the non-normalized image folder to free space
delete_folders([images_folder])

# Current working folders: images now point to the normalized copies;
# labels_folder is not modified by this step, so it is not reassigned.
images_folder = images_destination_folder

Applying function to files in /content/relevant_samples_imagesTr: 100%|██████████| 420/420 [09:35<00:00,  1.37s/file]


Deleted: /content/relevant_samples_imagesTr


# Final folder uploaded to Drive

In [None]:
# Gather images and labels under a single final folder.
# The files are copied (ids_list defaults to 'all'); the originals are removed
# by the delete_folders call at the end of this cell.
final_images_destination_folder = '/content/preprocessed_dataset/imagesTr'
final_labels_destination_folder = '/content/preprocessed_dataset/labelsTr'
copy_files_to_another_folder(images_folder, final_images_destination_folder)
copy_files_to_another_folder(labels_folder, final_labels_destination_folder)

# Zip the final dataset folder and upload the archive to Drive
final_dataset_path = '/content/preprocessed_dataset'
save_zip_to_drive(final_dataset_path, 'preprocessed_dataset')

# Delete the working folders and the local final folder to free space.
# This must stay after the upload above, which reads from final_dataset_path.
delete_folders([images_folder, labels_folder, final_dataset_path])

Copying from /content/normalized_imagesTr to /content/preprocessed_dataset/imagesTr: 100%|██████████| 420/420 [00:44<00:00,  9.53file/s]
Copying from /content/relevant_samples_labelsTr to /content/preprocessed_dataset/labelsTr: 100%|██████████| 420/420 [00:00<00:00, 2865.92file/s]


Zipped folder to: /content/preprocessed_dataset.zip
Uploaded to Drive at: /content/drive/MyDrive/
Deleted: /content/normalized_imagesTr
Deleted: /content/relevant_samples_labelsTr
Deleted: /content/preprocessed_dataset
