In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import glob
import random
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Input locations. The defaults are the original hard-coded locations; set the
# environment variables below to run the notebook on another machine.
# NOTE: other cells in this notebook still build the `*_binarized_list.txt`
# paths from their own hard-coded directory.
VINBIG_TASK_DIR = os.environ.get(
    "VINBIG_TASK_DIR", "/home/jessica/labelGAN/downstream_tasks/vinbig"
)
# CSV that provides the `image_id` / `class_name` columns used for the labels.
train_csv_path = os.path.join(VINBIG_TASK_DIR, "train.csv")
# Directory whose file names give the self-annotated image ids.
self_annotated_vinbig_path = os.environ.get(
    "VINBIG_ANNOTATED_IMG_DIR",
    "/data1/shared/jessica/data/labelGAN/vinbig_test_imgs_and_segm/imgs/",
)

In [3]:
# Image ids are the PNG file names in the annotated-image directory with the
# extension removed.
#  - only the trailing ".png" is stripped (str.replace would also remove a
#    ".png" occurring in the middle of a name),
#  - entries that are not PNG files are skipped,
#  - the list is sorted because os.listdir returns entries in arbitrary order,
#    which would otherwise make the later random split depend on the filesystem.
image_ids_annotated = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(self_annotated_vinbig_path)
    if file_name.endswith(".png")
)
len(image_ids_annotated)

322

In [5]:
# Full VinBig annotation table; later cells read its `image_id` and
# `class_name` columns.
vinbig_full_metadata = pd.read_csv(train_csv_path)


def _read_split_ids(split_name):
    """Return the first column (image ids) of `<split_name>_binarized_list.txt`.

    The list files are space-delimited with no header row.
    """
    list_path = f"/home/jessica/labelGAN/downstream_tasks/vinbig/{split_name}_binarized_list.txt"
    return pd.read_csv(list_path, delimiter=' ', header=None)[0]


train_img_ids = _read_split_ids("train")
val_img_ids = _read_split_ids("val")
test_img_ids = _read_split_ids("test")

# Keep only the annotated ids that are not already listed in any split file.
# NOTE(review): `dump_image_sets` appends the selected ids to the val/test list
# files, so once those cells have run, re-running this filter removes every
# annotated id -- this looks like the reason for the recorded shape (0,).
for listed_ids in (train_img_ids, val_img_ids, test_img_ids):
    m = ~pd.Series(image_ids_annotated).isin(listed_ids)
    image_ids_annotated = np.array(image_ids_annotated)[m]

image_ids_annotated.shape

(0,)

## EDA on the available annotated image IDs

In [11]:
# Binarized labels for the multi-label classification problem.
#
# Collect the set of class names per annotated image in a single pass over the
# metadata rows. The previous version ran one DataFrame.query per image with
# the id interpolated into the query string, which rescans the whole table for
# every image and breaks if an id ever contains a quote character.
annotated_rows = vinbig_full_metadata[
    vinbig_full_metadata["image_id"].isin(image_ids_annotated)
]
labels_by_image = {}
for row_image_id, row_class_name in zip(
    annotated_rows["image_id"], annotated_rows["class_name"]
):
    labels_by_image.setdefault(row_image_id, set()).add(row_class_name)

# One label set per annotated image, in the order of `image_ids_annotated`;
# an image with no metadata rows gets an empty set (an all-zero label row).
label_lists = [labels_by_image.get(image_id, set()) for image_id in image_ids_annotated]

# The class order fixes the column order of `binarized_labels`;
# 'No finding' is column 8.
mlb = MultiLabelBinarizer(classes=['Aortic enlargement', 'Atelectasis', 'Calcification',
   'Cardiomegaly', 'Consolidation', 'ILD', 'Infiltration',
   'Lung Opacity', 'No finding', 'Nodule/Mass', 'Other lesion',
   'Pleural effusion', 'Pleural thickening', 'Pneumothorax',
   'Pulmonary fibrosis'])

binarized_labels = mlb.fit_transform(label_lists)

# image id -> its row of 0/1 labels
image_id_2_label = dict(zip(image_ids_annotated, binarized_labels))

binarized_labels

array([], shape=(0, 15), dtype=int64)

In [188]:
# Number of images labelled "No finding" (column 8 of the binarized labels),
# i.e. images with no disease.
no_dis = int(np.count_nonzero(binarized_labels[:, 8] == 1))
no_dis

0

In [189]:
# Number of images NOT labelled "No finding" (column 8 of the binarized
# labels), i.e. images with at least one disease finding.
dis = int(np.count_nonzero(binarized_labels[:, 8] != 1))
dis

173

In [190]:
# Ratio of "No finding" images to images with disease.
# Guard the division: when no annotated image survives the split filtering,
# both counts are 0 and a plain `no_dis/dis` raises ZeroDivisionError.
if dis:
    label_split = no_dis / dis
else:
    # No diseased images: the ratio is unbounded (or undefined if both are 0).
    label_split = float("inf") if no_dis else float("nan")
label_split

0.0

In [191]:
# Fraction of each group (no-disease / disease) placed in the first half of the
# split; whatever remains forms the second half.
# No validation set as we are not doing any hyper parameter tuning in our work!
train_test_split = 0.5

train_no_dis_num, train_dis_num = (
    int(train_test_split * group_size) for group_size in (no_dis, dis)
)
test_no_dis_num, test_dis_num = no_dis - train_no_dis_num, dis - train_dis_num

((train_no_dis_num, train_dis_num), (test_no_dis_num, test_dis_num))

((0, 86), (0, 87))

In [192]:
# Draw the first half of the split, stratified by "No finding" (column 8)
# versus disease. A dedicated seeded generator makes the draw reproducible
# across kernel restarts; the unseeded global np.random state did not.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

is_no_finding = binarized_labels[:, 8] == 1

train_ids = []
# Sample without replacement so no image id is selected twice.
train_ids.extend(split_rng.choice(image_ids_annotated[is_no_finding], size=train_no_dis_num, replace=False))
train_ids.extend(split_rng.choice(image_ids_annotated[~is_no_finding], size=train_dis_num, replace=False))
len(train_ids)

86

In [193]:
# Second half of the split: every annotated id that was not drawn into
# `train_ids`, taken group by group ("No finding" first, then disease).
# np.setdiff1d returns each group's remaining ids sorted and de-duplicated.
no_finding_rows = binarized_labels[:, 8] == 1

test_ids = []
for group_rows in (no_finding_rows, ~no_finding_rows):
    test_ids.extend(np.setdiff1d(image_ids_annotated[group_rows], train_ids))
len(test_ids)

87

In [194]:
# Sanity check: number of distinct ids across both halves. It equals
# len(train_ids) + len(test_ids) only if the halves do not overlap.
len({*train_ids, *test_ids})

173

In [196]:
# construct train/val/test files
def dump_image_sets(image_ids, binarized_label_map, set_name = "train",
                    output_dir = "/home/jessica/labelGAN/downstream_tasks/vinbig",
                    mode = "a"):
    """Write one `<image_id> <l0> <l1> ...` line per image to a split list file.

    The target file is `<output_dir>/<set_name>_binarized_list.txt`.

    Args:
        image_ids: iterable of image ids to write, in output order.
        binarized_label_map: mapping from image id to an iterable of label
            values; each value is written with `str()` and space-separated.
        set_name: split name used as the file-name prefix.
        output_dir: directory containing the list file. Defaults to the
            location that was previously hard-coded.
        mode: mode passed to `open`. The default "a" appends to an existing
            file (the original behaviour), so calling this twice with the same
            ids duplicates rows; pass "w" to overwrite instead.

    Raises:
        KeyError: if an id in `image_ids` is missing from `binarized_label_map`.
        OSError: if the file cannot be opened for writing.
    """
    list_path = os.path.join(output_dir, f"{set_name}_binarized_list.txt")
    # The context manager closes the file even if a lookup or write fails.
    with open(list_path, mode) as images_f:
        for image_id in image_ids:
            label_str = ' '.join(map(str, binarized_label_map[image_id]))
            # Text mode already translates "\n" to the platform line ending;
            # writing os.linesep here would yield "\r\r\n" on Windows.
            images_f.write(f"{image_id} {label_str}\n")

In [197]:
# NOTE: the randomly drawn `train_ids` half is written to the "val" list file.
# dump_image_sets opens that file in append mode, so the rows are added to
# whatever the file already contains rather than replacing it.
dump_image_sets(train_ids, image_id_2_label, "val")


In [198]:
# Append the remaining half (`test_ids`) with its binarized labels to the
# "test" list file (dump_image_sets opens the file in append mode).
dump_image_sets(test_ids, image_id_2_label, "test")