In [3]:
import csv
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Columns selected (in this order) from a dataset CSV by process_dataset:
# the 'Path' column first, followed by the finding columns.
COLUMNS = [
    "Path",
    "Atelectasis", "Consolidation", "Edema",
    "Pleural Effusion", "Pneumonia", "Pneumothorax",
    "Lung Opacity", "Lung Lesion", "Fracture",
    "No Finding",
]

# Names of the dataset splits; used as dictionary keys and, when saved,
# as the base names of the generated CSV files.
DATASET_TYPES = ("train", "validation", "test")


In [4]:
def divide_dataset(csv_file: str, test_size: float = 0.2,
                   random_state: "int | None" = None) -> dict:
    """Split the rows of a CSV file into train, validation and test sets.

    The first row of the file is treated as the header and is placed at the
    top of every returned split. The remaining rows are shuffled; a
    ``test_size`` share is held out and then divided in half between the
    validation and the test split.

    Args:
        csv_file: Path of the CSV file to split.
        test_size: Share of the data rows held out from the train split
            (validation and test each receive half of it).
        random_state: Seed forwarded to both ``train_test_split`` calls.
            Pass an integer to get the same split on every run; the default
            ``None`` keeps the split random.

    Returns:
        A dict keyed by the entries of ``DATASET_TYPES`` whose values are
        lists of rows (each row a list of strings), header row first.

    Raises:
        IndexError: If the file contains no rows at all.
    """
    # newline='' is what the csv module requires so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(csv_file, 'r', newline='') as file:
        rows = list(csv.reader(file))
    header = rows.pop(0)

    train, held_out = train_test_split(
        rows, shuffle=True, test_size=test_size, random_state=random_state)
    valid, test = train_test_split(
        held_out, shuffle=True, test_size=0.5, random_state=random_state)

    # Build new lists rather than inserting into the ones returned by sklearn.
    splits = (train, valid, test)
    return {
        name: [header, *split]
        for name, split in zip(DATASET_TYPES, splits)
    }


In [69]:
def process_dataset(csv_file: "str | dict") -> dict:
    """Load every dataset split and reduce it to the cleaned COLUMNS values.

    Each split is read from its own CSV file. Previously the same
    ``csv_file`` was read once per split, so 'train', 'validation' and
    'test' all ended up with identical rows.

    Args:
        csv_file: Either a path to one of the split files (for example
            ``'../dataset/train.csv'``), in which case every split named in
            ``DATASET_TYPES`` is read from ``<same directory>/<split>.csv``;
            or a dict mapping split names to the CSV path of each split.

    Returns:
        A dict mapping each split name to a list of rows. Each row holds the
        values of ``COLUMNS`` in order, with missing values and -1 replaced
        by 0. The rows do not include a header row.

    Raises:
        FileNotFoundError: If the CSV file of a split does not exist.
        KeyError: If a CSV file lacks one of the ``COLUMNS``.
    """
    if isinstance(csv_file, dict):
        split_paths = dict(csv_file)
    else:
        # Split files are expected next to csv_file, named after the split.
        split_dir = os.path.dirname(csv_file)
        split_paths = {
            name: os.path.join(split_dir, name + '.csv')
            for name in DATASET_TYPES
        }

    processed_datasets = {}
    for name, path in split_paths.items():
        frame = pd.read_csv(path)
        # Keep only the wanted columns, then map NaN and -1 to 0.
        cleaned = frame[COLUMNS].fillna(0).replace(-1, 0)
        processed_datasets[name] = cleaned.values.tolist()
    return processed_datasets


In [6]:
def save_to_directory(datasets: dict, directory: str) -> None:
    """Write every dataset to ``<directory>/<name>.csv``.

    Args:
        datasets: Mapping of dataset name to an iterable of rows; each row
            is an iterable of cell values accepted by ``csv.writer``.
        directory: Output directory. It is created, including missing
            parent directories, when it does not exist yet.

    Existing files with the same name are overwritten.
    """
    # makedirs(exist_ok=True) handles nested paths and avoids the
    # check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(directory, exist_ok=True)
    for name, rows in datasets.items():
        target = os.path.join(directory, name + '.csv')
        # newline='' stops the platform from translating the writer's own
        # line terminator (which yields blank rows on Windows).
        with open(target, 'w', newline='') as file:
            csv.writer(file).writerows(rows)


In [70]:
# Split the CheXpert training CSV and write one CSV file per split
# into ../dataset.
divided_dataset = divide_dataset(csv_file='../CheXpert-v1.0-small/train.csv')
save_to_directory(datasets=divided_dataset, directory='../dataset')


In [71]:
# Clean the saved data and write the result back into ../dataset.
# NOTE(review): the output directory is the one the raw splits were saved
# to, so the existing <split>.csv files there are overwritten.
processed_dataset = process_dataset(csv_file='../dataset/train.csv')
save_to_directory(datasets=processed_dataset, directory='../dataset')
