In [31]:
from typing import Tuple, Union
from os import scandir
from os.path import join

import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import StratifiedShuffleSplit

In [62]:
# Name of the split to process; it selects the sub-folder that is scanned below.
_set = "val"
dataset_dir = f"../datasets/splitted_dataset/{_set}"

# Print the path of every sub-directory found directly under the split folder.
with scandir(dataset_dir) as entries:
    for entry in entries:
        if not entry.is_dir():
            continue
        print(entry.path)

../datasets/splitted_dataset/val/COVID
../datasets/splitted_dataset/val/Lung_Opacity
../datasets/splitted_dataset/val/Normal
../datasets/splitted_dataset/val/Viral Pneumonia


In [63]:
# Build the image index in a single pass: one row per PNG file holding its file
# name, its label (the name of the folder it sits in) and the folder path with
# the local split prefix replaced by "COVID-19_Radiography_Dataset".
#
# Rows are gathered in a plain list and converted to a DataFrame once, instead
# of re-concatenating a growing DataFrame for every folder. os.scandir yields
# entries in arbitrary order, so folders and files are sorted by name to keep
# the row order (and anything derived from it) reproducible.
records = []
for entry in sorted(scandir(dataset_dir), key=lambda e: e.name):
    if not entry.is_dir():
        continue
    label = entry.name
    # Identical for every image of this folder, so it is computed once here.
    rel_dir = entry.path.replace(f"../datasets/splitted_dataset/{_set}", "COVID-19_Radiography_Dataset")
    img_list = sorted(
        (img for img in scandir(entry.path) if img.is_file() and img.name.endswith(".png")),
        key=lambda img: img.name,
    )
    records.extend((img.name, label, rel_dir) for img in img_list)

# An empty `records` list still produces the three expected (empty) columns.
data_df = pd.DataFrame(records, columns=["Image Index", "Finding Labels", "Path"])

In [64]:
def get_dummy_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Append one-hot encoded label columns to a copy of ``df``.

    Each value of the "Finding Labels" column is split on "|", so a row that
    carries several labels is marked in each of the matching columns. A cell
    holds how many times the label occurs in that row (0 or 1 unless a label
    is repeated within the same row).

    Calling the function on a frame that already contains those label columns
    (for example its own output) replaces them instead of adding duplicates.

    Args:
    -----
        df (pd.DataFrame): Dataframe containing a "Finding Labels" column.

    Returns:
    --------
        pd.DataFrame: New dataframe with the original columns followed by one
        column per distinct label. ``df`` itself is left untouched.
    """
    # One row per (original row, label) pair; the original index is repeated.
    exploded = df["Finding Labels"].str.split("|").explode()
    # Collapse back to one row per original index by summing the indicators.
    labels = pd.get_dummies(exploded).groupby(level=0).sum()

    # Drop label columns left over from an earlier call so the result never
    # contains the same label column twice. The source column is never dropped.
    stale = [
        col for col in labels.columns
        if col in df.columns and col != "Finding Labels"
    ]
    base_df = df.drop(columns=stale)

    return pd.concat([base_df, labels], axis=1)


def stratified_train_test_split(
    df_file: Union[str, pd.DataFrame],
    test_size: float = 0.2,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split the data into two subsets, stratified on "Finding Labels".

    Stratification uses the raw string of the "Finding Labels" column, so a
    multi-label value such as "A|B" is treated as its own class.

    Args:
    -----
        df_file (Union[str, pd.DataFrame]): DataFrame containing the data, or
            the path of a csv file to load it from (anything that is not a
            DataFrame is handed to ``pd.read_csv``).
        test_size (float): Passed to ``StratifiedShuffleSplit`` as
            ``test_size``. Defaults to 0.2.
        random_state (int): Passed to ``StratifiedShuffleSplit`` as
            ``random_state``. Defaults to 42.

    Returns:
    --------
        Tuple[pd.DataFrame, pd.DataFrame]: The training subset and the
        held-out subset, in that order. Both keep the index of the source
        data.
    """
    if isinstance(df_file, pd.DataFrame):
        df = df_file.copy()
    else:
        df = pd.read_csv(df_file)

    # Five shuffled splits are generated but only the last one is returned;
    # the first four are discarded. This matches the previous behaviour
    # exactly, so the default arguments keep producing the same subsets.
    sss = StratifiedShuffleSplit(
        n_splits=5, test_size=test_size, random_state=random_state
    )
    *_, (train_index, test_index) = sss.split(df, df["Finding Labels"])

    return df.iloc[train_index], df.iloc[test_index]

In [65]:
# Add the one-hot label columns, then write the index of the current split to csv.
data_df = get_dummy_labels(data_df)
data_df.to_csv(
    path_or_buf=f"../datasets/csv_splits/COVID-19_Radiography_Dataset_{_set}.csv",
    index=False,
)

In [30]:
# Add the one-hot label columns before splitting.
data_df = get_dummy_labels(data_df)

# Two-stage split: first hold out the test set, then split the remaining rows
# again into the final training set and the validation set.
train_df, test_df = stratified_train_test_split(df_file=data_df)
train_df, val_df = stratified_train_test_split(df_file=train_df)

In [15]:
# train_df.to_csv("datasets/csv_splits/COVID-19_Radiography_Dataset_train.csv", index=False)
# val_df.to_csv("datasets/csv_splits/COVID-19_Radiography_Dataset_val.csv", index=False)
# test_df.to_csv("datasets/csv_splits/COVID-19_Radiography_Dataset_test.csv", index=False)

In [37]:
# File names and folder paths of the training rows, pulled out of the frame.
_img_names, _paths = (train_df[column].values for column in ("Image Index", "Path"))

In [32]:
# Drop the three descriptive columns and keep whatever remains (the one-hot
# label columns added by get_dummy_labels) as a plain array.
_data_df = data_df.drop(["Image Index", "Finding Labels", "Path"], axis="columns").values
_data_df

array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       ...,
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1]])

In [35]:
# Display the label array again.
_data_df

array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       ...,
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1]])