In [175]:
import csv
import os
import pandas as pd
from sklearn.model_selection import train_test_split


COLUMNS = [
    'Path',
    'No Finding',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',  
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
]

DATASET_TYPES = ('train', 'validation', 'test')


In [176]:
def divide_dataset(csv_file: str, test_size: float = 0.2) -> dict:
    with open(csv_file, 'r') as file:
        reader = list(csv.reader(file))
    info = reader.pop(0)
    train, test = train_test_split(
        reader, shuffle=True, test_size=test_size)
    valid, test = train_test_split(test, shuffle=True, test_size=0.5)
    for dataset in (train, valid, test):
        dataset.insert(0, info)

    return {
        DATASET_TYPES[0]: train,
        DATASET_TYPES[1]: valid,
        DATASET_TYPES[2]: test
    }


In [177]:
def process_dataset(csv_file: dict) -> dict:
    processed_datasets = {}
    for dataset in DATASET_TYPES:
        data = pd.read_csv(csv_file)
        data = data.fillna(0)
        data = data.replace(-1, 0)
        data = data[COLUMNS]
        data.columns = COLUMNS
        print(data.head)
        processed_datasets[dataset] = data.values.tolist()
    return processed_datasets


In [178]:
def save_to_directory(datasets: dict, directory: str) -> None:
    if not os.path.exists(directory):
        os.mkdir(directory)
    for dataset in datasets:
        with open(directory + '/' + dataset + '.csv', 'w') as file:
            writer = csv.writer(file)
            writer.writerows(datasets[dataset])


In [179]:
divided_dataset = divide_dataset('../CheXpert-v1.0-small/train.csv')
save_to_directory(divided_dataset, '../dataset')


In [180]:
processed_dataset = process_dataset('../dataset/train.csv')
save_to_directory(processed_dataset, '../dataset')


<bound method NDFrame.head of                                                      Path  No Finding  \
0       CheXpert-v1.0-small/train/patient03391/study1/...         0.0   
1       CheXpert-v1.0-small/train/patient36625/study3/...         0.0   
2       CheXpert-v1.0-small/train/patient00124/study7/...         0.0   
3       CheXpert-v1.0-small/train/patient28251/study15...         0.0   
4       CheXpert-v1.0-small/train/patient25429/study1/...         0.0   
...                                                   ...         ...   
178726  CheXpert-v1.0-small/train/patient04478/study1/...         1.0   
178727  CheXpert-v1.0-small/train/patient26391/study1/...         0.0   
178728  CheXpert-v1.0-small/train/patient53232/study1/...         0.0   
178729  CheXpert-v1.0-small/train/patient25516/study2/...         0.0   
178730  CheXpert-v1.0-small/train/patient20014/study6/...         0.0   

        Lung Opacity  Lung Lesion  Edema  Consolidation  Pneumonia  \
0                0.0   