In [1]:
import numpy as np
import pandas as pd
import pickle
import os
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Annotation table for the combined train/val pool; the CSV's first column becomes the index.
vinbig_train_val = pd.read_csv(
    "../Image2StyleGAN/vinbig_25_samples.csv",
    index_col=0,
)

In [3]:
# Annotation table for the test pool; the CSV's first column becomes the index.
vinbig_test_main = pd.read_csv(
    "../Image2StyleGAN/vinbig_25_samples_test.csv",
    index_col=0,
)

In [4]:
# Preview the first rows of the train/val annotation table.
vinbig_train_val.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,
33,42d472bdda3ad93dac63c8e5e29977bb,No finding,14,R9,,,,
34,2b1293d9c276e5439e499f58ce2e31ab,No finding,14,R8,,,,
65,2561ba15f063a8e3a0322ad943738bb5,No finding,14,R2,,,,


In [5]:
# List the distinct values present in the class_name column.
vinbig_train_val["class_name"].unique()

array(['No finding', 'Aortic enlargement', 'Pleural thickening', 'ILD',
       'Nodule/Mass', 'Pulmonary fibrosis', 'Cardiomegaly',
       'Pleural effusion', 'Calcification', 'Other lesion',
       'Lung Opacity', 'Infiltration', 'Consolidation', 'Atelectasis',
       'Pneumothorax'], dtype=object)

In [6]:
# Read the raw lines of each split list file (trailing newlines are kept
# here and stripped later when the image ids are extracted).

# train images
with open('vinbig/train_list.txt') as train_list_file:
    train = list(train_list_file)

# val images
with open('vinbig/val_list.txt') as val_list_file:
    val = list(val_list_file)

# test images
with open('vinbig/test_list.txt') as test_list_file:
    test = list(test_list_file)

In [7]:
def _to_image_id(list_entry, folder_prefix):
    """Reduce a list-file line to its bare image id.

    Surrounding whitespace is stripped, then every occurrence of
    ``folder_prefix`` and of ``".png"`` is removed from the entry.
    """
    return list_entry.strip().replace(folder_prefix, "").replace(".png", "")


# Train and val entries are both prefixed with "./train/"; test entries use "./test/".
train_image_ids = [_to_image_id(entry, "./train/") for entry in train]
val_image_ids = [_to_image_id(entry, "./train/") for entry in val]
test_image_ids = [_to_image_id(entry, "./test/") for entry in test]

In [8]:
# Boolean masks marking the annotation rows whose image_id is in each split list.
in_train_split = vinbig_train_val["image_id"].isin(train_image_ids)
in_val_split = vinbig_train_val["image_id"].isin(val_image_ids)
in_test_split = vinbig_test_main["image_id"].isin(test_image_ids)

# Train and val rows come from the shared train/val table; test rows from the test table.
vinbig_train = vinbig_train_val.loc[in_train_split]
vinbig_val = vinbig_train_val.loc[in_val_split]
vinbig_test = vinbig_test_main.loc[in_test_split]

In [9]:
# Row counts: train split, val split, and the full train/val table.
len(vinbig_train), len(vinbig_val), len(vinbig_train_val)

(270, 105, 375)

In [10]:
# Row counts: full test table and the rows kept after filtering by the test list.
len(vinbig_test_main), len(vinbig_test)

(367, 367)

In [11]:
# construct train/val/test files
def dump_image_sets(file_list, binarized_label_list, set_name = "train", folder = "train"):
    """Write one line per image to ``{set_name}_binarized_list.txt``.

    Each line has the form ``./{folder}/{image_id}.png l0 l1 ... lk``, where
    the ``l*`` values are the entries of the matching row of
    ``binarized_label_list`` rendered with ``str`` and separated by spaces.
    The file is created (or overwritten) in the current working directory.

    Parameters
    ----------
    file_list : sequence of str
        Image ids, without folder prefix or ``.png`` suffix.
    binarized_label_list : indexable of iterables
        Row ``i`` holds the label indicators for ``file_list[i]``.
    set_name : str
        Prefix of the output file name.
    folder : str
        Folder name embedded in every written image path.

    Raises
    ------
    IndexError
        If ``binarized_label_list`` has fewer rows than ``file_list``.
    """
    # The context manager closes the file even if formatting a row fails.
    with open(f"{set_name}_binarized_list.txt", "w") as images_f:
        for idx, file in enumerate(file_list):
            label_text = " ".join(map(str, binarized_label_list[idx]))
            # Text mode already translates "\n" into the platform line ending;
            # writing os.linesep here would yield "\r\r\n" on Windows.
            images_f.write(f"./{folder}/{file}.png {label_text}\n")

def construct_files(vinbig_df, dataset, path):
    """Binarize the per-image class labels and write them with ``dump_image_sets``.

    For every distinct ``image_id`` in ``vinbig_df`` (in order of first
    appearance) the set of its ``class_name`` values is collected. The sets are
    encoded with ``MultiLabelBinarizer`` using the fixed class list below, so
    the output columns follow that list's order.

    Parameters
    ----------
    vinbig_df : pandas.DataFrame
        Annotation table with ``image_id`` and ``class_name`` columns.
    dataset : str
        Forwarded to ``dump_image_sets`` as ``set_name`` (output file prefix).
    path : str
        Forwarded to ``dump_image_sets`` as ``folder`` (image path folder).
    """
    class_names = ['Aortic enlargement', 'Atelectasis', 'Calcification',
       'Cardiomegaly', 'Consolidation', 'ILD', 'Infiltration',
       'Lung Opacity', 'No finding', 'Nodule/Mass', 'Other lesion',
       'Pleural effusion', 'Pleural thickening', 'Pneumothorax',
       'Pulmonary fibrosis']

    image_ids = []
    label_lists = []
    # One grouping pass replaces a string-built DataFrame.query per image,
    # which rescanned the whole frame each time and broke on ids containing
    # a quote. sort=False yields groups in first-appearance order, matching
    # the order of Series.unique().
    # NOTE(review): groupby skips rows whose image_id is missing (NaN).
    for image_id, image_rows in vinbig_df.groupby("image_id", sort=False):
        image_ids.append(image_id)
        label_lists.append(set(image_rows["class_name"].values))

    mlb = MultiLabelBinarizer(classes=class_names)
    binarized_labels = mlb.fit_transform(label_lists)

    dump_image_sets(image_ids, binarized_labels, dataset, path)

In [12]:
# Write the binarized label list for each split. The val split is written
# with the "train" folder in its image paths.
for split_df, split_name, image_folder in (
    (vinbig_train, "train", "train"),
    (vinbig_val, "val", "train"),
    (vinbig_test, "test", "test"),
):
    construct_files(split_df, split_name, image_folder)