This notebook demonstrates how to build a one-hot style (multi-hot) encoding for a multi-label classification problem: each image gets one 0/1 column per label. While the original dataset contains 14 labels, we'll focus on the following 8:

```python
labels = ["Atelectasis", "Cardiomegaly", "Effusion", "Infiltration", "Mass", "Nodule", "Pneumonia", "Pneumothorax"]
```
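The encoded data is saved to a CSV file for use in the subsequent image feature extraction step. Images whose findings include none of these 8 labels (including "No Finding") are encoded as all zeros.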

In [9]:
import ast

import numpy as np
import pandas as pd

In [10]:
# Load the test-set metadata table and print its column summary.
testset_csv_path = '/Users/xuantruong/Documents/JAIST/inference_prob_mlc_code/datasets/NIH/Data_Entry_2017__testset.csv'
df = pd.read_csv(testset_csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25596 entries, 0 to 25595
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Image Index                  25596 non-null  object 
 1   Finding Labels               25596 non-null  object 
 2   Follow-up #                  25596 non-null  int64  
 3   Patient ID                   25596 non-null  int64  
 4   Patient Age                  25596 non-null  int64  
 5   Patient Gender               25596 non-null  object 
 6   View Position                25596 non-null  object 
 7   OriginalImage[Width          25596 non-null  int64  
 8   Height]                      25596 non-null  int64  
 9   OriginalImagePixelSpacing[x  25596 non-null  float64
 10  y]                           25596 non-null  float64
dtypes: float64(2), int64(5), object(4)
memory usage: 2.1+ MB


In [11]:
# Preview the first rows of the loaded metadata table.
df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143
1,00000003_001.png,Hernia,1,3,74,F,PA,2500,2048,0.168,0.168
2,00000003_002.png,Hernia,2,3,75,F,PA,2048,2500,0.168,0.168
3,00000003_003.png,Hernia|Infiltration,3,3,76,F,PA,2698,2991,0.143,0.143
4,00000003_004.png,Hernia,4,3,77,F,PA,2500,2048,0.168,0.168


In [12]:
# Keep only the image file name and its raw findings string.
# .copy() makes this an independent frame, so assigning new label columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df = df[['Image Index', 'Finding Labels']].copy()

In [13]:
# Full 14-label list, kept for reference only:
# labels = ["Atelectasis", "Consolidation", "Infiltration",
#           "Pneumothorax", "Edema", "Emphysema", "Fibrosis",
#           "Effusion", "Pneumonia", "Pleural_Thickening",
#           "Cardiomegaly", "Nodule", "Mass", "Hernia"]

# The 8 labels encoded by this notebook.
labels = [
    "Atelectasis",
    "Cardiomegaly",
    "Effusion",
    "Infiltration",
    "Mass",
    "Nodule",
    "Pneumonia",
    "Pneumothorax",
]

In [15]:
def process_labels(label_string, label_names=None):
    """Encode a pipe-separated findings string as a list of 0/1 flags.

    Args:
        label_string: Findings for one image, e.g. "Hernia|Infiltration",
            or the literal "No Finding".
        label_names: Ordered label names to encode. When omitted, the
            notebook-level ``labels`` list is used.

    Returns:
        list[int]: One entry per name in ``label_names``, in the same order;
        1 if that name is one of the findings in ``label_string``, else 0.
        "No Finding" always yields all zeros.
    """
    if label_names is None:
        label_names = labels  # fall back to the notebook-level label list
    if label_string == "No Finding":
        return [0] * len(label_names)
    # Exact-name matching on the split parts (not substring matching).
    present = set(label_string.split("|"))
    return [1 if name in present else 0 for name in label_names]


In [16]:
# # Create new one-hot encoded columns
# for label in labels:
#     df["Labels"] = df['Finding Labels'].apply(process_labels)
# df = df[['Image Index', "Finding Labels", "Labels"]]


In [17]:
def has_exist_label(label_string, label_column):
    """Return 1 if ``label_column`` is one of the findings in ``label_string``, else 0.

    Args:
        label_string: Pipe-separated findings, e.g. "Hernia|Infiltration",
            or the literal "No Finding".
        label_column: The single label name to look for (exact match against
            the split parts, not a substring test).

    Returns:
        int: 1 when the label is present, 0 otherwise. "No Finding" always
        gives 0.
    """
    if label_string != "No Finding":
        findings = label_string.split("|")
        return int(label_column in findings)
    return 0

In [18]:
# Add one 0/1 indicator column per target label, in the order given by `labels`.
for label in labels:
    # `args` forwards the current label as the second argument of has_exist_label.
    df[label] = df['Finding Labels'].apply(has_exist_label, args=(label,))

df.head()

Unnamed: 0,Image Index,Finding Labels,Atelectasis,Cardiomegaly,Effusion,Infiltration,Mass,Nodule,Pneumonia,Pneumothorax
0,00000003_000.png,Hernia,0,0,0,0,0,0,0,0
1,00000003_001.png,Hernia,0,0,0,0,0,0,0,0
2,00000003_002.png,Hernia,0,0,0,0,0,0,0,0
3,00000003_003.png,Hernia|Infiltration,0,0,0,1,0,0,0,0
4,00000003_004.png,Hernia,0,0,0,0,0,0,0,0


In [19]:
# Check the column list, non-null counts and dtypes after adding the label columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25596 entries, 0 to 25595
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Image Index     25596 non-null  object
 1   Finding Labels  25596 non-null  object
 2   Atelectasis     25596 non-null  int64 
 3   Cardiomegaly    25596 non-null  int64 
 4   Effusion        25596 non-null  int64 
 5   Infiltration    25596 non-null  int64 
 6   Mass            25596 non-null  int64 
 7   Nodule          25596 non-null  int64 
 8   Pneumonia       25596 non-null  int64 
 9   Pneumothorax    25596 non-null  int64 
dtypes: int64(8), object(2)
memory usage: 2.0+ MB


In [20]:
print(f"\U0001F600 Labels: {labels} \U0001F600")
# Spot-check random rows against their raw findings string.
# A fixed random_state makes the same rows appear on every run.
df.sample(20, random_state=42)

😀 Labels: ['Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax'] 😀


Unnamed: 0,Image Index,Finding Labels,Atelectasis,Cardiomegaly,Effusion,Infiltration,Mass,Nodule,Pneumonia,Pneumothorax
4077,00006585_016.png,Pneumothorax,0,0,0,0,0,0,0,1
2052,00002982_001.png,No Finding,0,0,0,0,0,0,0,0
19042,00022416_064.png,No Finding,0,0,0,0,0,0,0,0
17699,00020673_039.png,No Finding,0,0,0,0,0,0,0,0
2490,00003973_014.png,Atelectasis|Cardiomegaly|Effusion|Infiltration...,1,1,1,1,0,1,0,0
12376,00015313_013.png,No Finding,0,0,0,0,0,0,0,0
17095,00020113_032.png,Effusion,0,0,1,0,0,0,0,0
11638,00014626_038.png,Infiltration|Pneumothorax,0,0,0,1,0,0,0,1
17496,00020429_028.png,Emphysema,0,0,0,0,0,0,0,0
8428,00012087_050.png,No Finding,0,0,0,0,0,0,0,0


In [21]:

# df['Labels'] = df['Labels'].apply(lambda x: np.array(x).tolist())

# Write the encoded table (raw findings string plus one column per label) to CSV,
# without the index column.
encoded_csv_path = '/Users/xuantruong/Documents/JAIST/inference_prob_mlc_code/datasets/NIH_Data_Entry_2017__testset2.csv'
df.to_csv(encoded_csv_path, index=False)

In [23]:
# Read the saved CSV back and print its column summary to check the round trip.
saved_csv_path = '/Users/xuantruong/Documents/JAIST/inference_prob_mlc_code/datasets/NIH_Data_Entry_2017__testset2.csv'
df1 = pd.read_csv(saved_csv_path)

df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25596 entries, 0 to 25595
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Image Index     25596 non-null  object
 1   Finding Labels  25596 non-null  object
 2   Atelectasis     25596 non-null  int64 
 3   Cardiomegaly    25596 non-null  int64 
 4   Effusion        25596 non-null  int64 
 5   Infiltration    25596 non-null  int64 
 6   Mass            25596 non-null  int64 
 7   Nodule          25596 non-null  int64 
 8   Pneumonia       25596 non-null  int64 
 9   Pneumothorax    25596 non-null  int64 
dtypes: int64(8), object(2)
memory usage: 2.0+ MB


In [24]:
# Preview the first rows of the reloaded table.
df1.head()

Unnamed: 0,Image Index,Finding Labels,Atelectasis,Cardiomegaly,Effusion,Infiltration,Mass,Nodule,Pneumonia,Pneumothorax
0,00000003_000.png,Hernia,0,0,0,0,0,0,0,0
1,00000003_001.png,Hernia,0,0,0,0,0,0,0,0
2,00000003_002.png,Hernia,0,0,0,0,0,0,0,0
3,00000003_003.png,Hernia|Infiltration,0,0,0,1,0,0,0,0
4,00000003_004.png,Hernia,0,0,0,0,0,0,0,0


In [25]:
# Remove the raw findings string so the exported file holds only the image name
# and the per-label columns.
# Rebinding with errors='ignore' (instead of an in-place drop) lets this cell be
# re-run without a KeyError once the column is already gone.
df1 = df1.drop(columns=['Finding Labels'], errors='ignore')
df1.to_csv('/Users/xuantruong/Documents/JAIST/inference_prob_mlc_code/datasets/NIH_Data_Entry_2017__testset3.csv', index=False)

In [None]:
# Define a converter function
def list_to_array(list_str):
    """Parse the text form of a Python list (e.g. "[0, 1, 0]") into a NumPy array.

    Uses ``ast.literal_eval`` rather than ``eval`` so only Python literals are
    accepted; any other expression found in the text is rejected instead of
    being executed.

    Args:
        list_str: String holding a Python list literal.

    Returns:
        numpy.ndarray: Array built from the parsed list.

    Raises:
        ValueError, SyntaxError: If ``list_str`` is not a valid Python literal.
    """
    return np.array(ast.literal_eval(list_str))

# Attach each image's label vector as a single 'Labels' column of NumPy arrays.
# The CSV files written by this notebook hold one 0/1 column per label and no
# 'Labels' column, so indexing df1['Labels'] unconditionally raised a KeyError.
if 'Labels' in df1.columns:
    # A 'Labels' column already exists: parse entries that are still text
    # (as loaded from CSV) and leave already-converted entries as arrays,
    # so the cell can be re-run.
    df1['Labels'] = df1['Labels'].apply(
        lambda value: list_to_array(value) if isinstance(value, str) else np.asarray(value)
    )
else:
    # Build the vectors from the per-label columns, in `labels` order.
    df1['Labels'] = pd.Series(list(df1[labels].to_numpy()), index=df1.index)

df1.head()

In [None]:
# Print the column summary of df1 again.
df1.info()

In [None]:
# Show the 'Labels' value stored for the first row.
df1.iloc[0]['Labels']