In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi
import os
from zipfile import ZipFile
import pathlib
import shutil
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from typing import List, Tuple
from matplotlib import pyplot as plt
from tqdm import tqdm

# Kaggle API client shared by the whole notebook; the download cell below calls
# `api.dataset_download_files` on it.
# NOTE(review): authenticate() presumably reads local Kaggle credentials
# (kaggle.json or environment variables) - confirm before running on a new machine.
api = KaggleApi()
api.authenticate()

#### Utils 

In [None]:
def normalize_dataset_splits(root_path):
    """
    Renames the split folders directly under ``root_path`` to canonical names.

    Every directory whose name starts (case-insensitively) with "train", "test"
    or "val" is renamed to "train", "test" or "validation" respectively. Files
    and directories with other names are left untouched.

    If the canonical target already exists as a different folder (for example both
    "train" and "Train_extra" are present), the folder is left where it is and a
    message is printed, instead of crashing or replacing the existing folder.

    Args:
        root_path: directory whose immediate sub-folders are normalized.
    """
    # (prefix to match, canonical folder name); checked in this order
    canonical_names = (("train", "train"), ("test", "test"), ("val", "validation"))

    # sorted() keeps the outcome deterministic when several folders map to the same target
    for folder_name in sorted(os.listdir(root_path)):
        folder_path = os.path.join(root_path, folder_name)
        if not os.path.isdir(folder_path):
            continue

        lower_folder_name = folder_name.lower()
        target_name = next(
            (target for prefix, target in canonical_names if lower_folder_name.startswith(prefix)),
            None,
        )
        # Nothing to do for unrelated folders or folders that already have the canonical name
        if target_name is None or target_name == folder_name:
            continue

        new_folder_path = os.path.join(root_path, target_name)

        # On case-insensitive file systems "Train" and "train" are the same folder; that
        # case-only rename is still allowed. A genuinely different existing folder is not
        # overwritten.
        if os.path.exists(new_folder_path) and not os.path.samefile(folder_path, new_folder_path):
            print(f"Skipped {folder_path}: {new_folder_path} already exists")
            continue

        os.rename(folder_path, new_folder_path)
        print(f"Renamed {folder_path} to {new_folder_path}")

def move_files_up(root_path, destination_path, depth=1, moveable_file_extensions=None):
    """
    Recursively moves the files found under ``root_path`` into ``destination_path``,
    flattening the directory tree.

    Args:
        root_path: directory to walk.
        destination_path: directory that receives the moved files.
        depth: recursion depth, used internally; callers leave it at the default of 1.
            Every sub-directory visited (depth > 1) is deleted once it has been processed,
            together with any files in it that were skipped by the extension filter.
        moveable_file_extensions: optional collection of suffixes including the dot
            (e.g. ``[".tif", ".csv"]``). When given, only files with one of these suffixes
            are moved. When ``None``, every file is moved.
    """
    for file_name in os.listdir(root_path):
        file_path = os.path.join(root_path, file_name)

        if os.path.isdir(file_path):
            move_files_up(file_path, destination_path, depth + 1, moveable_file_extensions)
            continue

        if moveable_file_extensions is not None:
            if pathlib.Path(file_path).suffix not in moveable_file_extensions:
                continue

        # The move must happen outside the filter check: previously it was nested inside it,
        # so with no filter nothing was moved and the files were lost to the rmtree below.
        shutil.move(file_path, destination_path)
        print(f"Moved {file_path} to {destination_path}")

    if depth > 1: # only remove folders if we are not at the root
        shutil.rmtree(root_path)

### Downloading the datasets

In [None]:
# Dataset keys for Kaggle API

# Named separately because the same identifier is reused inside DATASET_IDS below.
LGG_MRI_SEGMENTATION_ID = "mateuszbuda/lgg-mri-segmentation"

# Kaggle dataset identifiers in "<owner>/<dataset>" form.
# The order matters: the download cell pairs DATASET_IDS[i] with DATASET_NAMES[i].
DATASET_IDS = [
    "pkdarabi/brain-tumor-image-dataset-semantic-segmentation",
    "masoudnickparvar/brain-tumor-mri-dataset",
    LGG_MRI_SEGMENTATION_ID
]

# Local folder name used for each dataset, in the same order as DATASET_IDS.
# Keep both lists the same length when adding or removing a dataset.
LGG_MRI_SEGMENTATION = 'lgg-mri-segmentation'
DATASET_NAMES = [
    'tumor-segmentation-boxes',
    'tumor-classification',
    LGG_MRI_SEGMENTATION
]

# Destination folder: "datasets" inside the parent of the current working directory
DATASET_FOLDER_PATH = pathlib.Path().absolute().parent / "datasets"


In [None]:

# Download every configured Kaggle dataset into its own folder under DATASET_FOLDER_PATH.
# A dataset whose folder already exists is assumed to be complete and is not fetched again.
for dataset, dataset_name in zip(DATASET_IDS, DATASET_NAMES):
    print(f"Downloading {dataset}...")
    dataset_path = DATASET_FOLDER_PATH / dataset_name

    if dataset_path.exists():
        print(f"{dataset} already exists. Skipping...")
        continue

    api.dataset_download_files(dataset, path=dataset_path, quiet=False)

    # The archive is named after the dataset slug (the part after the owner)
    dataset_slug = dataset.rsplit('/', 1)[-1]
    zip_file_path = dataset_path / f"{dataset_slug}.zip"

    # Unpack next to the archive, then discard the archive
    with ZipFile(zip_file_path, "r") as archive:
        archive.extractall(dataset_path)
    os.remove(zip_file_path)

    # Bring the split folder names into the canonical train/test/validation form
    normalize_dataset_splits(dataset_path)

### Cleaning up LGG MRI Dataset (Organizing and extracting useful images)

The full dataset can be found at: https://www.kaggle.com/datasets/mateuszbuda/lgg-mri-segmentation

The following cells extract, identify, and organize the MRI images for brain tumor segmentation.

In [None]:
def extract_and_clean_lgg_files(lgg_path: pathlib.Path):
    """
    Flattens the downloaded lgg-mri-segmentation dataset.

    The .tif, .csv and .md files found anywhere under ``kaggle_3m`` are moved directly
    into ``lgg_path``; afterwards every sub-folder that was present in ``lgg_path``
    when the function started is deleted.

    Args:
        lgg_path (pathlib.Path) - path to lgg dataset folder
    """
    # Snapshot the sub-folders before anything is moved; all of them are removed at the end
    top_level_dirs = []
    for entry in lgg_path.iterdir():
        if entry.is_dir():
            top_level_dirs.append(entry)

    dir_names = {folder.name for folder in top_level_dirs}
    assert "kaggle_3m" in dir_names, "kaggle_3m not found in lgg-mri-segmentation"

    # kaggle_3m holds all the .tif files; pull the wanted files up into lgg_path
    move_files_up(lgg_path / "kaggle_3m", lgg_path, moveable_file_extensions=[".tif", ".csv", '.md'])

    for stale_dir in top_level_dirs:
        shutil.rmtree(stale_dir)

def extract_and_holdout_split(lgg_path: pathlib.Path, holdout_ratio: float = 0.2) -> pd.DataFrame:
    """
    Builds a table of the image/mask pairs in ``lgg_path`` and assigns each pair to a
    train or test split.

    Every ``.tif`` file directly inside ``lgg_path`` whose name does not end in
    "mask.tif" is treated as an image; its mask is expected to be the file with the same
    stem plus "_mask.tif". A pair is labelled Diagnosis=1 when its mask contains any
    non-zero pixel and 0 otherwise.

    The split is drawn per row (one row per image), stratified on Diagnosis, with a fixed
    random seed. Rows are not grouped in any way before splitting, so images that belong
    together (e.g. the same patient) can end up in both splits.

    Args:
        lgg_path: folder containing the flattened .tif files.
        holdout_ratio: fraction of rows assigned to the "test" split.

    Returns:
        DataFrame with the columns ID, Image, Mask, Diagnosis and Split
        ("train" or "test").

    Raises:
        ValueError: if no images are found, or if images and masks do not pair up
            one-to-one.
    """
    tif_names = [entry.name for entry in lgg_path.iterdir() if entry.is_file() and entry.suffix == ".tif"]

    def _sort_key(image_name):
        # Image names end in "..._<number>_<slice>.tif". Order by the second-to-last number
        # first and the slice number second, exactly as before.
        # NOTE(review): the original comments call the second-to-last field the patient id;
        # confirm against the actual file naming scheme.
        fields = image_name[:-4].rsplit("_", 2)
        # The name itself breaks ties so the order never depends on directory listing order.
        return int(fields[-2]), int(fields[-1]), image_name

    images = sorted((name for name in tif_names if not name.endswith("mask.tif")), key=_sort_key)
    if not images:
        raise ValueError(f"No .tif images found in {lgg_path}")

    IDs = [image[:-4] for image in images]

    # Derive each mask from its image instead of sorting masks independently; two separate
    # sorts can silently misalign images and masks whenever sort keys tie.
    masks = [f"{image_id}_mask.tif" for image_id in IDs]
    found_masks = {name for name in tif_names if name.endswith("mask.tif")}
    unpaired = found_masks.symmetric_difference(masks)
    if unpaired:
        raise ValueError(
            f"Images and masks in {lgg_path} do not pair up one-to-one; "
            f"unmatched mask names include: {sorted(unpaired)[:5]}"
        )

    def _has_tumor(mask_name):
        # Read from the lgg_path argument (not a notebook global) and close the file handle
        with Image.open(lgg_path / mask_name) as mask_image:
            return 1 if np.asarray(mask_image).max() > 0 else 0

    diagnoses = [_has_tumor(mask) for mask in masks]

    df = pd.DataFrame({"ID": IDs, 'Image': images, 'Mask': masks, 'Diagnosis': diagnoses})

    # create holdout split
    train_idx, test_idx = train_test_split(df.index, test_size=holdout_ratio, stratify=df['Diagnosis'], shuffle=True, random_state=42)
    df.loc[train_idx, 'Split'] = 'train'
    df.loc[test_idx, 'Split'] = 'test'
    return df

def separate_splits(split_df: pd.DataFrame, lgg_path=None):
    """
    Moves every image/mask pair into a folder for its split and writes one CSV per split.

    For each distinct value in the 'Split' column a folder ``<root>/<split>`` is created.
    Each row gets its own sub-folder named after the row's 'ID'; the row's 'Image' and
    'Mask' files are copied there and then deleted from the root folder. Finally the rows
    of the split are saved as ``<root>/<split>/<split>.csv``.

    Args:
        split_df (pd.DataFrame) - dataframe containing the split information; the columns
            'Split', 'ID', 'Image' and 'Mask' are read
        lgg_path (pathlib.Path, optional) - dataset root folder holding the files. When
            omitted, the notebook-level LGG_PATH is used, which keeps existing calls working.
    """

    assert 'Split' in split_df.columns, "Split column not found in dataframe"

    root = LGG_PATH if lgg_path is None else pathlib.Path(lgg_path)

    # The context manager closes the progress bar even if a copy fails midway
    with tqdm(total=len(split_df)) as pbar:
        for split in split_df['Split'].unique():
            split_path = root / split
            split_path.mkdir(exist_ok=True)

            split_df_split = split_df[split_df['Split'] == split]

            for _, row in split_df_split.iterrows():
                # one folder per row ID
                sample_path = split_path / row['ID']
                sample_path.mkdir(exist_ok=True)

                image_file = root / row['Image']
                mask_file = root / row['Mask']

                # copy both files first, and only then remove the originals, so a failed
                # copy never leaves a pair half-deleted in the root folder
                shutil.copy(image_file, sample_path)
                shutil.copy(mask_file, sample_path)
                os.remove(image_file)
                os.remove(mask_file)

                # add description to pbar about current split, ID
                pbar.set_description(f"{split} - {row['ID']}")
                pbar.update(1)

            # add csv file for the split to the split folder
            split_df_split.to_csv(split_path / f"{split}.csv", index=False)

In [None]:
# Root folder of the LGG MRI dataset; it must already exist (created by the download cell)
LGG_PATH = pathlib.Path(DATASET_FOLDER_PATH) / LGG_MRI_SEGMENTATION
assert LGG_PATH.exists(), f"{LGG_PATH} does not exist"

# The sub-folders currently present decide which preparation steps still have to run
subdirs = [subdir for subdir in LGG_PATH.iterdir() if subdir.is_dir()]
subdir_names = {subdir.name for subdir in subdirs}

if {'train', 'test'} <= subdir_names:
    # Both split folders are already there, so there is nothing left to do
    print(f'train and test splits already exist for {LGG_PATH}. Skipping')

else:
    # A kaggle_3m folder means the archive has not been flattened yet
    if 'kaggle_3m' in subdir_names:
        print('Extracting MRI Images and Masks...')
        extract_and_clean_lgg_files(LGG_PATH)

    print('Creating Train / Test Split...')
    split_df = extract_and_holdout_split(LGG_PATH, holdout_ratio=0.2)

    # Report how many positive / negative diagnoses ended up in each split
    def diagnosis_counts(split):
        return split_df[split_df['Split'] == split]['Diagnosis'].value_counts()

    train_diagnosis_counts = diagnosis_counts('train')
    test_diagnosis_counts = diagnosis_counts('test')

    print(f"Train Split: \n {train_diagnosis_counts}")
    print(f"Test Split: \n {test_diagnosis_counts}")

    print('Separating Train / Test Splits...')
    separate_splits(split_df)