In [48]:
import csv
import os
import pandas as pd

from sklearn.model_selection import train_test_split


# Columns kept in the processed datasets, in output order: the 'Path' column
# followed by the finding columns. Kept as a list because it is used to select
# DataFrame columns.
COLUMNS = ['Path'] + [
    'No Finding',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
]

# Names of the dataset splits. They serve as dictionary keys for the split
# data and as the stems of the CSV files written to disk.
DATASET_TYPES = (
    'train',
    'validation',
    'test',
)


In [49]:
def divide_dataset(csv_file: str, test_size: float = 0.2,
                   random_state=None) -> dict:
    """Split the rows of a CSV file into train, validation and test sets.

    The first row is treated as the header and is placed at the front of
    every split. The remaining rows are shuffled; ``test_size`` of them are
    held out, and that held-out part is then halved into the validation and
    test sets.

    Args:
        csv_file: Path of the CSV file to read.
        test_size: Share of the data rows held out for validation and test
            together; forwarded to ``train_test_split``.
        random_state: Optional seed forwarded to both ``train_test_split``
            calls so that the split can be reproduced. ``None`` (the default)
            keeps the unseeded behaviour.

    Returns:
        A dict keyed by the entries of ``DATASET_TYPES`` (in that order:
        train, validation, test). Each value is a list of rows, where a row
        is a list of strings and the first row is the header.

    Raises:
        IndexError: If the file contains no rows at all (no header to pop).
    """
    # The csv module asks for newline='' so that line breaks embedded in
    # quoted fields are parsed correctly on every platform.
    with open(csv_file, 'r', newline='') as file:
        rows = list(csv.reader(file))

    # After the pop, `rows` holds only the data rows.
    header = rows.pop(0)

    train, held_out = train_test_split(
        rows, shuffle=True, test_size=test_size, random_state=random_state)
    valid, test = train_test_split(
        held_out, shuffle=True, test_size=0.5, random_state=random_state)

    # Pair each split name with its rows and prepend the shared header row.
    return {
        name: [header, *split]
        for name, split in zip(DATASET_TYPES, (train, valid, test))
    }


In [None]:
def process_dataset(dataset: dict) -> dict:
    """Clean every split and keep only the columns listed in ``COLUMNS``.

    For each name in ``DATASET_TYPES`` the rows of ``dataset[name]`` are
    loaded into a DataFrame whose column labels are taken from the first row.
    Cells equal to ``''`` or ``'-1.0'`` are replaced by ``'0.0'`` and the
    frame is reduced to ``COLUMNS``.

    Note that the first (header) row is also passed in as a data row, so it
    comes back out as the first row of each processed split.

    Args:
        dataset: Mapping from split name to a list of rows whose first row
            holds the column names.

    Returns:
        A dict with the same split names, each mapped to a list of cleaned
        rows restricted to ``COLUMNS``.
    """
    def _clean(rows: list) -> list:
        # The header row doubles as the column labels and stays in the data.
        frame = pd.DataFrame(rows, columns=rows[0])
        frame = frame.replace('', '0.0').replace('-1.0', '0.0')
        return frame[COLUMNS].values.tolist()

    return {split: _clean(dataset[split]) for split in DATASET_TYPES}


In [None]:
def save_to_directory(datasets: dict, directory: str) -> None:
    """Write each dataset to ``<directory>/<name>.csv``.

    Args:
        datasets: Mapping from dataset name to an iterable of rows. The name
            becomes the file stem and each row becomes one CSV line.
        directory: Output directory. It is created, together with any missing
            parent directories, if it does not exist yet.
    """
    # makedirs handles nested paths and an already existing directory, which
    # the exists()/mkdir() pair did not (nested paths failed, and the check
    # could race with another process creating the directory).
    os.makedirs(directory, exist_ok=True)

    for name, rows in datasets.items():
        target = os.path.join(directory, name + '.csv')
        # newline='' stops the platform from translating the '\r\n' that
        # csv.writer emits, which would otherwise yield blank lines on Windows.
        with open(target, 'w', newline='') as file:
            csv.writer(file).writerows(rows)


In [None]:
# Read the source CSV and split its rows into train/validation/test lists.
divided_dataset = divide_dataset(
    csv_file='../CheXpert-v1.0-small/train.csv',
)


In [None]:

# Clean each split, then write one CSV file per split into the output folder.
processed_dataset = process_dataset(dataset=divided_dataset)
save_to_directory(
    datasets=processed_dataset,
    directory='../dataset',
)
