This notebook demonstrates how to build binary label vectors (one 0/1 indicator per label, often called multi-hot encoding) for a multi-label classification problem. While the original dataset contains 14 labels, we focus on the following 8:

```python
labels = ["Atelectasis", "Cardiomegaly", "Effusion", "Infiltration", "Mass", "Nodule", "Pneumonia", "Pneumothorax"]
```
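The encoded data is saved to a CSV file for use in the subsequent image feature extraction step. Images with none of these 8 labels (including those marked "No Finding") are encoded as an all-zero vector.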

In [1]:
import ast

import numpy as np
import pandas as pd

In [2]:
# Location of the raw metadata CSV.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
RAW_CSV_PATH = '/Users/xuantruong/Documents/JAIST/inference_prob_mlc_code/datasets/NIH/Data_Entry_2017__testset.csv'

# Read the table, then summarise its columns, dtypes and non-null counts.
df = pd.read_csv(RAW_CSV_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25596 entries, 0 to 25595
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Image Index                  25596 non-null  object 
 1   Finding Labels               25596 non-null  object 
 2   Follow-up #                  25596 non-null  int64  
 3   Patient ID                   25596 non-null  int64  
 4   Patient Age                  25596 non-null  int64  
 5   Patient Gender               25596 non-null  object 
 6   View Position                25596 non-null  object 
 7   OriginalImage[Width          25596 non-null  int64  
 8   Height]                      25596 non-null  int64  
 9   OriginalImagePixelSpacing[x  25596 non-null  float64
 10  y]                           25596 non-null  float64
dtypes: float64(2), int64(5), object(4)
memory usage: 2.1+ MB


In [3]:
# Preview the first rows; "Finding Labels" holds the findings as a
# "|"-separated string (e.g. "Hernia|Infiltration").
df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143
1,00000003_001.png,Hernia,1,3,74,F,PA,2500,2048,0.168,0.168
2,00000003_002.png,Hernia,2,3,75,F,PA,2048,2500,0.168,0.168
3,00000003_003.png,Hernia|Infiltration,3,3,76,F,PA,2698,2991,0.143,0.143
4,00000003_004.png,Hernia,4,3,77,F,PA,2500,2048,0.168,0.168


In [4]:
# Full 14-label set, kept for reference only (not used below):
#   Atelectasis, Consolidation, Infiltration, Pneumothorax, Edema,
#   Emphysema, Fibrosis, Effusion, Pneumonia, Pleural_Thickening,
#   Cardiomegaly, Nodule, Mass, Hernia

# The 8 labels encoded by this notebook. Their order here determines the
# position of each label in the encoded vector.
labels = [
    "Atelectasis",
    "Cardiomegaly",
    "Effusion",
    "Infiltration",
    "Mass",
    "Nodule",
    "Pneumonia",
    "Pneumothorax",
]

In [6]:
def process_labels(label_string, target_labels=None):
    """Encode a "|"-separated findings string as a list of 0/1 indicators.

    Parameters
    ----------
    label_string : str
        Findings for one image, e.g. "Hernia|Infiltration" or "No Finding".
    target_labels : sequence of str, optional
        Labels to encode, in output order. When omitted, the notebook-level
        ``labels`` list is used.

    Returns
    -------
    list of int
        One entry per target label: 1 if that label appears in
        ``label_string``, otherwise 0. "No Finding" always yields all zeros.
    """
    if target_labels is None:
        # Resolved at call time so the function follows the current `labels`.
        target_labels = labels

    if label_string == "No Finding":
        return [0] * len(target_labels)

    # Set membership keeps each lookup constant-time.
    present = set(label_string.split("|"))
    return [1 if name in present else 0 for name in target_labels]

In [None]:
# Build the "Labels" column: one encoded vector per image.
# A single pass is enough — process_labels already returns the whole vector
# for a row, so there is nothing to repeat per label.
df["Labels"] = df["Finding Labels"].apply(process_labels)

# Keep only the columns needed downstream. The explicit copy makes the result
# an independent frame, so later column assignments on it do not raise
# SettingWithCopyWarning.
df = df[["Image Index", "Finding Labels", "Labels"]].copy()


In [None]:
# Check the columns and dtypes of the frame after encoding.
df.info()

In [None]:
# Print the encoded label names so the vectors below can be read position by position.
print(f"\U0001F600 Labels: {labels} \U0001F600")
# Spot-check random rows; the fixed seed keeps the displayed sample identical
# across Restart & Run All.
df.sample(20, random_state=42)

In [None]:

# Destination of the encoded table.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
ENCODED_CSV_PATH = '/Users/xuantruong/Documents/JAIST/inference_prob_mlc_code/datasets/NIH_Data_Entry_2017__testset.csv'

# Round-trip each vector through NumPy so every cell is a plain Python list
# before the frame is written out.
df['Labels'] = df['Labels'].apply(lambda vec: np.array(vec).tolist())
df.to_csv(ENCODED_CSV_PATH, index=False)

In [None]:
# Read the encoded CSV back to check what a downstream consumer will see.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
ENCODED_CSV_PATH = '/Users/xuantruong/Documents/JAIST/inference_prob_mlc_code/datasets/NIH_Data_Entry_2017__testset.csv'
df1 = pd.read_csv(ENCODED_CSV_PATH)

df1.info()

In [None]:
# Preview the reloaded rows before the 'Labels' column is parsed.
df1.head()

In [None]:
# Converter from the CSV text form of a label vector to a NumPy array
def list_to_array(list_str):
    """Parse a label vector such as "[0, 1, 0]" into a NumPy array.

    Parameters
    ----------
    list_str : str or sequence
        Text form of a Python list literal. Values that are already a list,
        tuple or ndarray are converted directly, so re-running the conversion
        on an already-parsed column does not fail.

    Returns
    -------
    numpy.ndarray
        The parsed vector.

    Raises
    ------
    ValueError, SyntaxError
        If the text is not a valid Python literal.
    """
    if isinstance(list_str, (list, tuple, np.ndarray)):
        return np.asarray(list_str)
    # literal_eval only accepts Python literals, so text read from a file
    # cannot execute arbitrary code the way eval() would.
    return np.array(ast.literal_eval(list_str))

# Parse every entry of the 'Labels' column with list_to_array and store the
# parsed values back in the same column.
labels_as_read = df1['Labels']
df1['Labels'] = labels_as_read.apply(list_to_array)

# Show the first rows with the parsed column.
df1.head()

In [None]:
# Inspect the column dtypes after the 'Labels' column has been parsed.
df1.info()

In [None]:
# Display the parsed label vector of the first row.
df1['Labels'].iloc[0]