# Imports

In [1]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset

Here, we will look into the data labels of the NIH x-ray data and format them into N-hot encoded label arrays, one for each image.

Note that the order of the images is obtained by taking the array of image names from the data frame we load below and passing it to the built-in `sorted` function. Let's make sure we map the correct labels to the images we have batched into the Arrow datasets.

### Load in the csv file

In [2]:
df = pd.read_csv('Data_Entry_2017_v2020.csv')

In [3]:
df

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168
...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,38,M,PA,2048,2500,0.168,0.168
112116,00030802_000.png,No Finding,0,30802,28,M,PA,2048,2500,0.168,0.168
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168
112118,00030804_000.png,No Finding,0,30804,29,F,PA,2048,2500,0.168,0.168


In [5]:
df['Finding Labels'].nunique()

836

In [4]:
df['Finding Labels'].unique()

array(['Cardiomegaly', 'Cardiomegaly|Emphysema', 'Cardiomegaly|Effusion',
       'No Finding', 'Hernia', 'Hernia|Infiltration', 'Mass|Nodule',
       'Infiltration', 'Effusion|Infiltration', 'Nodule', 'Emphysema',
       'Effusion', 'Atelectasis', 'Effusion|Mass', 'Infiltration|Mass',
       'Infiltration|Mass|Pneumothorax', 'Mass',
       'Cardiomegaly|Infiltration|Mass|Nodule',
       'Cardiomegaly|Effusion|Emphysema|Mass',
       'Atelectasis|Cardiomegaly|Emphysema|Mass|Pneumothorax',
       'Emphysema|Mass', 'Emphysema|Mass|Pneumothorax', 'Pneumothorax',
       'Emphysema|Pneumothorax', 'Atelectasis|Pneumothorax',
       'Cardiomegaly|Emphysema|Pneumothorax', 'Mass|Pleural_Thickening',
       'Cardiomegaly|Mass|Pleural_Thickening', 'Pleural_Thickening',
       'Effusion|Emphysema|Infiltration|Pneumothorax',
       'Emphysema|Infiltration|Pleural_Thickening|Pneumothorax',
       'Effusion|Pneumonia|Pneumothorax',
       'Effusion|Infiltration|Pneumothorax',
       'Effusion|Infiltra

If a patient has multiple pathologies, they are separated by a `|` (vertical bar, or "pipe") delimiter.

Let's split them up and see how many unique labels there really are

In [6]:
ids = df['Image Index'].values

In [11]:
# Pair each image file name with its row position in the data frame.
# NumPy stores the mixed (int, str) pairs under a single string dtype, so
# column 0 holds the row position as text rather than as an integer.
sorted_id_ind_pairs = np.array(list(enumerate(ids)))

In [13]:
sorted_id_ind_pairs.shape

(112120, 2)

In [14]:
len(sorted_id_ind_pairs)

112120

In [15]:
# Toy check: ordering tuples by their second element leaves the first
# element of each tuple attached to its partner.
fake = [(0, 1), (0, 2), (0, 5), (0, 3)]

sort_fake = list(fake)
sort_fake.sort(key=lambda pair: pair[1])

In [16]:
sort_fake

[(0, 1), (0, 2), (0, 3), (0, 5)]

In [17]:
# Sort based on the actual id values, and keep the first value in the tuple as the 
# original index in the data frame
sorted_id_ind_pairs = sorted(sorted_id_ind_pairs, key = lambda x: x[1])

In [21]:
# Row positions of the data frame arranged by ascending image file name.
# np.argsort yields this permutation directly from `ids`, so there is no need
# to round-trip the positions through a string array and back through int().
# kind="stable" keeps ties in their original order, matching Python's sorted().
sorted_inds = np.argsort(ids, kind="stable")

In [22]:
sorted_inds

array([     0,      1,      2, ..., 112117, 112118, 112119])

In [24]:
# for i, val in enumerate(sorted_inds):
#     if np.abs(val - sorted_inds[i+1]) != 1:
#         print(i, val) 

$\uparrow$ Just proving to myself that the labels did in fact need to be reordered

ex: The index in the sorted list is not equal to the index in the dataframe

In [27]:
sorted_inds[4]

11

In [29]:
labels = df['Finding Labels'].values

In [30]:
labels[0]

'Cardiomegaly'

In [33]:
labels[2]

'Cardiomegaly|Effusion'

In [53]:
labels[39]

'Infiltration|Mass|Pneumothorax'

In [54]:
labels[39].split('|')

['Infiltration', 'Mass', 'Pneumothorax']

In [56]:
sorted_inds

array([     0,      1,      2, ..., 112117, 112118, 112119])

In [64]:
sorted_labels = labels[sorted_inds]

In [65]:
split_labels = [label.split('|') for label in sorted_labels]

In [66]:
len(split_labels)

112120

In [68]:
# split_labels

In [69]:
# split_labels = np.array(split_labels, dtype='object')

In [70]:
# split_labels

Let's make as many entries as there are pathologies for each patient. Then we can map the existence of a pathology for each patient to a one or a zero, in columns where each column is a unique pathology.

There might be an easier way...

In [75]:
# Every distinct pathology name, in alphabetical order, as a plain list of str.
# Each 'Finding Labels' entry is split on '|' and `explode` puts one label on
# each row, so `unique` sees individual pathologies rather than combinations.
unique_labels = sorted(
    df['Finding Labels'].str.split('|').explode().unique()
)

In [106]:
unique_labels

['Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'No Finding',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax']

In [76]:
label_index = {v: i for i, v in enumerate(unique_labels)}

In [77]:
label_index

{'Atelectasis': 0,
 'Cardiomegaly': 1,
 'Consolidation': 2,
 'Edema': 3,
 'Effusion': 4,
 'Emphysema': 5,
 'Fibrosis': 6,
 'Hernia': 7,
 'Infiltration': 8,
 'Mass': 9,
 'No Finding': 10,
 'Nodule': 11,
 'Pleural_Thickening': 12,
 'Pneumonia': 13,
 'Pneumothorax': 14}

In [78]:
df

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168
...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,38,M,PA,2048,2500,0.168,0.168
112116,00030802_000.png,No Finding,0,30802,28,M,PA,2048,2500,0.168,0.168
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168
112118,00030804_000.png,No Finding,0,30804,29,F,PA,2048,2500,0.168,0.168


In [79]:
def string_to_N_hot(string: str, classes=None):
    """Convert a ``|``-delimited finding string into an N-hot label vector.

    Parameters
    ----------
    string : str
        One or more label names joined by ``|``, e.g. ``"Mass|Nodule"``.
    classes : mapping of str to int, optional
        Maps each label name to its position in the output vector. When
        omitted, the notebook-level ``label_index`` mapping is used, so the
        function can still be passed directly to ``Series.apply``.

    Returns
    -------
    numpy.ndarray
        1-D float array with one slot per entry in the mapping: 1.0 at the
        position of every label named in ``string``, 0.0 elsewhere.

    Raises
    ------
    KeyError
        If ``string`` names a label that is not a key of the mapping.
    """
    mapping = label_index if classes is None else classes
    true_index = [mapping[cl] for cl in string.split("|")]
    # Size the vector from the same mapping that supplies the positions so the
    # two can never disagree.
    label = np.zeros((len(mapping),), dtype=float)
    label[true_index] = 1
    return label

# Add a "labels" column to df in place: each cell holds the N-hot float array
# returned by string_to_N_hot for that row's "Finding Labels" string.
df["labels"] = df["Finding Labels"].apply(string_to_N_hot)

In [80]:
df

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],labels
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,"[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,"[0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,38,M,PA,2048,2500,0.168,0.168,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
112116,00030802_000.png,No Finding,0,30802,28,M,PA,2048,2500,0.168,0.168,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
112118,00030804_000.png,No Finding,0,30804,29,F,PA,2048,2500,0.168,0.168,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


<hr>

In [81]:
sorted_labels = df['labels'].values[sorted_inds]

In [83]:
ids

array(['00000001_000.png', '00000001_001.png', '00000001_002.png', ...,
       '00030803_000.png', '00030804_000.png', '00030805_000.png'],
      dtype=object)

In [84]:
sorted_ids = ids[sorted_inds]

In [86]:
sorted_ids[4]

'00000003_000.png'

In [87]:
ids[4]

'00000003_001.png'

$\checkmark$

In [94]:
sorted_labels

array([array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
       array([0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
       array([0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
       ...,
       array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]),
       array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]),
       array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])],
      dtype=object)

In [100]:
np.stack(sorted_labels).shape

(112120, 15)

In [101]:
sorted_labels = np.stack(sorted_labels)

In [102]:
sorted_labels

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [103]:
sorted_labels[0]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [104]:
np.save('sorted_nih_n_hot_encoded_labels.npy', sorted_labels)

In [105]:
np.load('sorted_nih_n_hot_encoded_labels.npy')

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [107]:
# Reload the N-hot label array from the .npy file and wrap it in a torch tensor.
# NOTE(review): the resulting tensor is torch.float64 (see the dtype in the
# output of label_tensor[0] below). If the downstream loss/model expects
# float32, a cast such as .float() will be needed — confirm against the consumer.
label_tensor = torch.tensor(np.load('sorted_nih_n_hot_encoded_labels.npy'))

In [109]:
label_tensor.shape

torch.Size([112120, 15])

In [110]:
label_tensor[0]

tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       dtype=torch.float64)