# Create Proper Labels for Datapoints
### The data comes with pipe-concatenated, string-based disease labels for each image (along with some extraneous information). The goal is to convert the text labels from the format "Disease|Disease|Disease" into more usable ground-truth columns, one 0/1 column per disease, formatted for One-Hot encoding.

In [166]:
import pandas as pd

## Read the CSV file into a DataFrame

In [167]:
# Load the raw label table; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer="Data_Entry_2017.csv")

In [168]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


## Remove data columns that won't be used for training

In [169]:
# List every column label, one per line (iterating a DataFrame yields its
# column labels).
for column_name in df:
    print(column_name)

Image Index
Finding Labels
Follow-up #
Patient ID
Patient Age
Patient Gender
View Position
OriginalImage[Width
Height]
OriginalImagePixelSpacing[x
y]
Unnamed: 11


In [170]:
# Columns that will not be used for training. The bracketed names look odd
# because that is how they appear in the column listing of the loaded file.
unused_columns = [
    'Follow-up #',
    'Patient ID',
    'Patient Age',
    'Patient Gender',
    'View Position',
    'OriginalImage[Width',
    'Height]',
    'OriginalImagePixelSpacing[x',
    'y]',
    'Unnamed: 11',
]

# Remove them from df in place, leaving the image name and label string.
df.drop(columns=unused_columns, inplace=True)

In [171]:
# Preview the first five rows after dropping the unused columns.
df.head(n=5)

Unnamed: 0,Image Index,Finding Labels
0,00000001_000.png,Cardiomegaly
1,00000001_001.png,Cardiomegaly|Emphysema
2,00000001_002.png,Cardiomegaly|Effusion
3,00000002_000.png,No Finding
4,00000003_000.png,Hernia


## Create new columns for each disease & assign a default value of 0

In [172]:
# One ground-truth column per disease, in the order they should appear.
disease_columns = (
    "Atelectasis",
    "Cardiomegaly",
    "Effusion",
    "Infiltration",
    "Mass",
    "Nodule",
    "Pneumonia",
    "Pneumothorax",
    "Consolidation",
    "Edema",
    "Emphysema",
    "Fibrosis",
    "Pleural_Thickening",
    "Hernia",
)

# Add every disease column with a default of 0 for all rows; dict.fromkeys
# keeps the tuple's order, so the new columns are appended in that order.
df = df.assign(**dict.fromkeys(disease_columns, 0))

In [173]:
# Preview the first five rows; every new disease column should read 0.
df.head(n=5)

Unnamed: 0,Image Index,Finding Labels,Atelectasis,Cardiomegaly,Effusion,Infiltration,Mass,Nodule,Pneumonia,Pneumothorax,Consolidation,Edema,Emphysema,Fibrosis,Pleural_Thickening,Hernia
0,00000001_000.png,Cardiomegaly,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,00000001_001.png,Cardiomegaly|Emphysema,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,00000001_002.png,Cardiomegaly|Effusion,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,00000002_000.png,No Finding,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,00000003_000.png,Hernia,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Set each disease column to 1 wherever the label string contains that disease

In [174]:
# Disease names exactly as they are searched for in "Finding Labels"; each
# name is also the name of the 0/1 column it fills.
disease_names = [
    "Atelectasis",
    "Cardiomegaly",
    "Effusion",
    "Infiltration",
    "Mass",
    "Nodule",
    "Pneumonia",
    "Pneumothorax",
    "Consolidation",
    "Edema",
    "Emphysema",
    "Fibrosis",
    "Pleural_Thickening",
    "Hernia",
]

# A single loop replaces fourteen copy-pasted statements, so the search term
# and the target column can no longer drift apart through a typo.
for disease in disease_names:
    # regex=False: treat the name as a literal substring rather than as a
    # regular-expression pattern (the str.contains default).
    has_disease = df["Finding Labels"].str.contains(disease, regex=False)
    # Rows whose label string mentions the disease get a 1; other rows are
    # left untouched.
    df.loc[has_disease, disease] = 1

In [175]:
# Preview the first five rows to spot-check the 0/1 flags against the labels.
df.head(n=5)

Unnamed: 0,Image Index,Finding Labels,Atelectasis,Cardiomegaly,Effusion,Infiltration,Mass,Nodule,Pneumonia,Pneumothorax,Consolidation,Edema,Emphysema,Fibrosis,Pleural_Thickening,Hernia
0,00000001_000.png,Cardiomegaly,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,00000001_001.png,Cardiomegaly|Emphysema,0,1,0,0,0,0,0,0,0,0,1,0,0,0
2,00000001_002.png,Cardiomegaly|Effusion,0,1,1,0,0,0,0,0,0,0,0,0,0,0
3,00000002_000.png,No Finding,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,00000003_000.png,Hernia,0,0,0,0,0,0,0,0,0,0,0,0,0,1


## Save new One-Hot label encodings to a new CSV file

In [176]:
# Write the labelled table to a new file in the current working directory.
# With to_csv's defaults the row index is written as well, as a leading
# column with an empty header.
df.to_csv(path_or_buf="corrected_labels.csv")