In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import glob
import random
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Root directory that holds train.csv and the vinbig_test_imgs_and_segm/ folder.
# The default is the location the notebook was written against; set the
# VINBIG_DATA_DIR environment variable to run against another copy of the data
# without editing this cell.
vinbig_data_dir = os.environ.get(
    "VINBIG_DATA_DIR", "/home/rmpatil/multi_task_gen/data/vinbig_we_labeled"
)

# CSV that is loaded into the metadata DataFrame.
train_csv_path = os.path.join(vinbig_data_dir, "train.csv")

# Directory that is globbed for "*.json" files. The trailing separator is kept on
# purpose: the glob pattern is built from this value by plain string concatenation.
self_annotated_vinbig_path = os.path.join(vinbig_data_dir, "vinbig_test_imgs_and_segm", "")

In [3]:
# Read the CSV at train_csv_path into a DataFrame; later cells filter it by image_id.
vinbig_full_metadata = pd.read_csv(filepath_or_buffer=train_csv_path)

In [4]:
# Preview the first five rows to check which columns were parsed.
vinbig_full_metadata.head(n=5)

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max
0,50a418190bc3fb1ef1633bf9678929b3,No finding,14,R11,,,,
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,


In [5]:
# Ids of the annotated images: every "<image_id>.json" file in the annotation
# directory contributes its base name without the extension.
# - basename/splitext strip only the directory part and the final ".json"; chained
#   str.replace calls would also delete those substrings anywhere else in the path.
# - glob returns matches in an order that depends on the file system, so the ids
#   are sorted to make everything derived from this array deterministic.
image_ids_annotated = np.array(sorted(
    os.path.splitext(os.path.basename(json_path))[0]
    for json_path in glob.glob(f"{self_annotated_vinbig_path}*.json")
))
len(image_ids_annotated)

149

## EDA on the available annotated image ids

In [6]:
# Multi-label targets: one 0/1 indicator row per annotated image.

# For each annotated image, gather the distinct class names recorded for it in the
# metadata table (an id with no rows in the table yields an empty set).
label_lists = [
    set(vinbig_full_metadata.loc[vinbig_full_metadata["image_id"] == annotated_id, "class_name"].values)
    for annotated_id in image_ids_annotated
]

# The class list is passed explicitly so the column order of the indicator matrix is
# fixed; with this ordering "No finding" is column index 8.
mlb = MultiLabelBinarizer(
    classes=[
        'Aortic enlargement',
        'Atelectasis',
        'Calcification',
        'Cardiomegaly',
        'Consolidation',
        'ILD',
        'Infiltration',
        'Lung Opacity',
        'No finding',
        'Nodule/Mass',
        'Other lesion',
        'Pleural effusion',
        'Pleural thickening',
        'Pneumothorax',
        'Pulmonary fibrosis',
    ]
)
binarized_labels = mlb.fit_transform(label_lists)

# Look-up table: image id -> its row of binarized_labels.
image_id_2_label = dict(zip(image_ids_annotated, binarized_labels))

In [7]:
# (number of annotated images, number of label columns)
np.shape(binarized_labels)

(149, 15)

In [8]:
# number of images flagged "No finding" (column 8 of the indicator matrix)
no_dis = int(np.count_nonzero(binarized_labels[:, 8] == 1))
no_dis

79

In [9]:
# number of images NOT flagged "No finding" (column 8 of the indicator matrix)
dis = int(np.count_nonzero(binarized_labels[:, 8] != 1))
dis

70

In [10]:
# ratio of "No finding" images to all remaining images (1.0 would mean equal group sizes)
label_split = no_dis/dis
label_split

1.1285714285714286

In [11]:
# Fraction of each group that goes to training; the remainder becomes the test set.
# There is no validation set because no hyper-parameter tuning is done in this work.
train_test_split = 0.7

# Size each group separately (truncating towards zero) so both splits keep roughly
# the same no-finding / finding proportions as the full set.
train_no_dis_num, train_dis_num = (
    int(train_test_split * group_size) for group_size in (no_dis, dis)
)
test_no_dis_num, test_dis_num = no_dis - train_no_dis_num, dis - train_dis_num

((train_no_dis_num, train_dis_num), (test_no_dis_num, test_dis_num))

((55, 49), (24, 21))

In [12]:
# Draw the training ids with a fixed seed so that re-running the notebook reproduces
# the same train/test split; an unseeded np.random.choice gives a different split on
# every run.
split_rng = np.random.default_rng(0)

# "No finding" is column 8 of binarized_labels. The candidates of each group are
# sorted first so the draw does not depend on the order in which the ids were listed.
no_finding_ids = np.sort(image_ids_annotated[binarized_labels[:, 8] == 1])
finding_ids = np.sort(image_ids_annotated[binarized_labels[:, 8] != 1])

# Sample each group separately, without replacement, to its training size.
train_ids = []
train_ids.extend(split_rng.choice(no_finding_ids, size=train_no_dis_num, replace=False))
train_ids.extend(split_rng.choice(finding_ids, size=train_dis_num, replace=False))
len(train_ids)

104

In [13]:
# Test ids are whatever remains in each group once the training ids are removed.
# np.setdiff1d returns the remaining ids sorted and de-duplicated; the "No finding"
# group (column 8 == 1) is listed first, then the other group.
test_ids = [
    held_out_id
    for group_mask in (binarized_labels[:, 8] == 1, binarized_labels[:, 8] != 1)
    for held_out_id in np.setdiff1d(image_ids_annotated[group_mask], train_ids)
]
len(test_ids)

45

In [14]:
# sanity check: number of distinct ids across both splits
# (equals len(image_ids_annotated) when every annotated id landed in a split)
len({*train_ids, *test_ids})

149

In [15]:
# write the split files: one line per image (id followed by its binarized labels); no validation file is produced
def dump_image_sets(image_ids, binarized_label_map, set_name = "train"):
    """Write one split's image ids and label vectors to ``{set_name}_binarized_list.txt``.

    The file is created in the current working directory (replacing any existing
    file of that name) and gets one line per image: the image id followed by that
    image's label values, all separated by single spaces.

    Parameters
    ----------
    image_ids : iterable
        Ids to write, in output order.
    binarized_label_map : mapping
        Maps each id in ``image_ids`` to an iterable of label values; every value
        is converted with ``str``.
    set_name : str, default "train"
        Prefix of the output file name.

    Raises
    ------
    KeyError
        If ``binarized_label_map`` is a dict and an id in ``image_ids`` is missing
        from it.
    """
    # ``with`` closes the file even if a lookup or write fails part-way through.
    with open(f"{set_name}_binarized_list.txt", "w") as images_f:
        for image_id in image_ids:
            label_text = " ".join(str(value) for value in binarized_label_map[image_id])
            # Text mode already translates "\n" into the platform line ending, so
            # writing os.linesep here would produce "\r\r\n" on Windows.
            images_f.write(f"{image_id} {label_text}\n")

In [16]:
# Write train_binarized_list.txt and test_binarized_list.txt to the working directory.
# No validation file is produced: this notebook only builds train and test id lists.
dump_image_sets(image_ids=train_ids, binarized_label_map=image_id_2_label, set_name="train")
dump_image_sets(image_ids=test_ids, binarized_label_map=image_id_2_label, set_name="test")