In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi
import os
from zipfile import ZipFile
import pathlib
import shutil
import pandas as pd

# Create the Kaggle API client used by the download cell further down and
# authenticate it up front, so credential problems surface before any download starts.
# NOTE(review): presumably authenticate() reads credentials from the local Kaggle
# configuration (kaggle.json / environment variables) — confirm against the Kaggle API docs.
api = KaggleApi()
api.authenticate()

#### Utils 

In [None]:
def normalize_dataset_splits(root_path):
    """
    Gives the split folders directly inside root_path canonical names.

    Any directory whose name starts (case-insensitively) with "train", "test" or "val"
    is renamed to "train", "test" or "validation" respectively. Plain files and
    directories with other names are left untouched.
    """
    # (prefix, canonical name) pairs, checked in this order; the first match wins.
    split_prefixes = (("train", "train"), ("test", "test"), ("val", "validation"))

    for entry in os.listdir(root_path):
        source = os.path.join(root_path, entry)
        if not os.path.isdir(source):
            continue

        lowered = entry.lower()
        canonical = next(
            (name for prefix, name in split_prefixes if lowered.startswith(prefix)),
            None,
        )
        if canonical is None:
            continue  # not a split folder

        target = os.path.join(root_path, canonical)
        if target == source:
            continue  # already has the canonical name

        os.rename(source, target)
        print(f"Renamed {source} to {target}")

def move_files_up(root_path, destination_path, depth=1, moveable_file_extensions=None):
    """
    Recursively moves the files found under root_path into destination_path,
    flattening the directory tree.

    Args:
        root_path: Directory to scan.
        destination_path: Directory that receives the files (handed to shutil.move).
        depth: Recursion depth; leave at 1 for the initial call. Directories visited
            at depth > 1 (i.e. sub-directories of the initial root_path) are deleted
            once they have been processed.
        moveable_file_extensions: Optional collection of file suffixes including the
            leading dot (e.g. [".tif", ".csv"]), compared case-sensitively. When None,
            every file is moved.

    Note:
        Files filtered out by moveable_file_extensions are not moved. Those sitting
        directly in the initial root_path stay where they are; those inside
        sub-directories are deleted together with their directory.
    """
    for entry_name in os.listdir(root_path):
        entry_path = os.path.join(root_path, entry_name)

        if os.path.isdir(entry_path):
            move_files_up(entry_path, destination_path, depth + 1, moveable_file_extensions)
            continue

        # The extension filter only decides whether a file is skipped; the move itself
        # must happen whether or not a filter was supplied.
        if (
            moveable_file_extensions is not None
            and pathlib.Path(entry_path).suffix not in moveable_file_extensions
        ):
            continue

        shutil.move(entry_path, destination_path)
        print(f"Moved {entry_path} to {destination_path}")

    if depth > 1:  # only remove folders if we are not at the root
        shutil.rmtree(root_path)

### Downloading the datasets

In [None]:
# Kaggle dataset keys ("<owner>/<dataset>") and the local folder name each dataset is
# stored under. DATASET_IDS and DATASET_NAMES are parallel lists: entries at the same
# position belong together.
LGG_MRI_SEGMENTATION_ID = "mateuszbuda/lgg-mri-segmentation"
LGG_MRI_SEGMENTATION = "lgg-mri-segmentation"

DATASET_IDS = [
    "pkdarabi/brain-tumor-image-dataset-semantic-segmentation",
    "masoudnickparvar/brain-tumor-mri-dataset",
    LGG_MRI_SEGMENTATION_ID,
]
DATASET_NAMES = [
    "tumor-segmentation-boxes",
    "tumor-classification",
    LGG_MRI_SEGMENTATION,
]

# Destination folder: "datasets", located next to the current working directory.
DATASET_FOLDER_PATH = pathlib.Path.cwd().parent / "datasets"


In [None]:

# Download every dataset into its own folder, unpack the archive, delete it and
# normalise the names of the split folders.
for dataset, dataset_name in zip(DATASET_IDS, DATASET_NAMES):
    print(f"Downloading {dataset}...")
    dataset_path = DATASET_FOLDER_PATH / dataset_name

    # An existing folder is taken to mean the dataset was fetched on an earlier run.
    if dataset_path.exists():
        print(f"{dataset} already exists. Skipping...")
        continue

    api.dataset_download_files(dataset, path=dataset_path, quiet=False)

    # The archive is expected to be named after the last component of the dataset key.
    archive_stem = dataset.split('/')[-1]
    zip_file_path = f"{dataset_path}/{archive_stem}.zip"

    # Unzip the downloaded files
    with ZipFile(zip_file_path, "r") as archive:
        archive.extractall(dataset_path)

    # Remove the zip file
    os.remove(zip_file_path)

    # Rename folders
    normalize_dataset_splits(dataset_path)

### Cleaning up LGG MRI Dataset (Organizing and extracting useful images)

The full dataset can be found at: https://www.kaggle.com/datasets/mateuszbuda/lgg-mri-segmentation

The following cells extract, identify, and organize the MRI images for brain tumor segmentation.

In [None]:
# Flatten the LGG download: pull every .tif/.csv/.md file out of kaggle_3m (and its
# sub-folders) into LGG_PATH, then delete the sub-folders of LGG_PATH.
LGG_PATH = pathlib.Path(DATASET_FOLDER_PATH) / LGG_MRI_SEGMENTATION
assert LGG_PATH.exists(), f"{LGG_PATH} does not exist"

# Snapshot the sub-folders before anything is moved; all of them are removed afterwards.
lgg_subdirs = [subdir for subdir in LGG_PATH.iterdir() if subdir.is_dir()]
kaggle_3m_path = LGG_PATH / "kaggle_3m"

# The download cell skips datasets whose folder already exists, so when the notebook is
# re-run this folder has already been flattened and kaggle_3m is gone. Treat that state
# as "nothing to do" instead of failing, so the notebook survives Restart & Run All.
already_flattened = not kaggle_3m_path.is_dir() and any(LGG_PATH.glob("*.tif"))

if already_flattened:
    print(f"{LGG_PATH} is already flattened. Skipping...")
else:
    # ensure kaggle_3m exists before touching anything
    assert kaggle_3m_path.is_dir(), "kaggle_3m not found in lgg-mri-segmentation"

    # move all matching files from kaggle_3m (recursively) to LGG_PATH
    move_files_up(kaggle_3m_path, LGG_PATH, moveable_file_extensions=[".tif", ".csv", '.md'])

    # everything worth keeping has been moved; drop the leftover sub-folders
    for subdir in lgg_subdirs:
        shutil.rmtree(subdir)

In [None]:
# Names (not full paths) of every .tif file located directly in LGG_PATH.
lgg_files = [
    entry.name
    for entry in LGG_PATH.iterdir()
    if entry.is_file() and entry.suffix == ".tif"
]

def get_lgg_dataframe(file_paths):
    """
    Returns a DataFrame of the LGG dataset pairing every MRI image with its mask.

    File names are expected to look like ``<case>_<number>_<slice>.tif`` for images and
    ``<case>_<number>_<slice>_mask.tif`` for masks, where ``<number>`` and ``<slice>``
    are integers and ``<case>`` is whatever precedes them.

    Args:
        file_paths: Iterable of file names (strings). Entries that do not end in
            ".tif" are ignored.

    Returns:
        pandas.DataFrame with the columns "image_path" and "mask_path", one row per
        image/mask pair, ordered by ``<number>``, then ``<slice>`` (numerically), then
        ``<case>``. Empty input yields an empty DataFrame with the same columns.

    Raises:
        ValueError: If a .tif name does not follow the expected pattern, or if an image
            has no mask (or a mask has no image).
    """
    image_suffix = ".tif"
    mask_suffix = "_mask.tif"

    def pairing_key(name):
        # Strip the suffix, then read the two trailing numeric fields. The same key is
        # produced for an image and its mask, which is what keeps the pairs aligned.
        suffix = mask_suffix if name.endswith(mask_suffix) else image_suffix
        case, number, slice_number = name[: -len(suffix)].rsplit("_", 2)
        return (int(number), int(slice_number), case)

    images, masks = {}, {}
    for name in file_paths:
        if not name.endswith(image_suffix):
            continue
        bucket = masks if name.endswith(mask_suffix) else images
        bucket[pairing_key(name)] = name

    # Pair by key rather than by position so that a missing file cannot silently shift
    # every following mask onto the wrong image.
    unpaired = sorted(images.keys() ^ masks.keys())
    if unpaired:
        examples = [images.get(key) or masks.get(key) for key in unpaired[:5]]
        raise ValueError(
            f"{len(unpaired)} file(s) without a matching image/mask, e.g. {examples}"
        )

    ordered_keys = sorted(images)
    return pd.DataFrame(
        {
            "image_path": [images[key] for key in ordered_keys],
            "mask_path": [masks[key] for key in ordered_keys],
        }
    )

In [None]:
# Scratch inspection: show how a file name splits on its last three underscores.
# NOTE: despite its name, `images` holds the names ending in "mask.tif".
images = [name for name in lgg_files if name.endswith("mask.tif")]
images[0].rsplit("_", 3)