# Understand the data
Make sure that the data downloaded from various sources all tell the same story and there is no discrepancy

### Preprocess data to make the column names consitent

In [39]:
import pandas as pd
from pathlib import Path

In [40]:
# Assuming that we are currently in the notebooks directory
project_path = Path.cwd().parent

In [41]:
df_miccai_train = pd.read_csv(f'{project_path}/meta/miccai2023_nih-cxr-lt_labels_train.csv')
df_miccai_val = pd.read_csv(f'{project_path}/meta/miccai2023_nih-cxr-lt_labels_val.csv')
df_miccai_test = pd.read_csv(f'{project_path}/meta/miccai2023_nih-cxr-lt_labels_test.csv')
df_kaggle = pd.read_csv('Data_Entry_2017.csv')
df_nih = pd.read_csv('Data_Entry_2017_v2020.csv')

In [42]:
df_miccai_test.columns

Index(['id', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
       'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass',
       'Nodule', 'Pleural Thickening', 'Pneumonia', 'Pneumothorax',
       'Pneumoperitoneum', 'Pneumomediastinum', 'Subcutaneous Emphysema',
       'Tortuous Aorta', 'Calcification of the Aorta', 'No Finding',
       'subj_id'],
      dtype='object')

In [43]:
df_miccai = pd.concat([df_miccai_train, df_miccai_val, df_miccai_test])

In [44]:
df_miccai.rename(columns={'id': 'Image Index', 'Pleural Thickening': 'Pleural_Thickening'}, inplace=True)

In [45]:
# Lets ensure all the dataframes are sorted by image file name
df_kaggle = df_kaggle.sort_values(by='Image Index').reset_index()
df_nih = df_nih.sort_values(by='Image Index').reset_index()
df_miccai = df_miccai.sort_values(by='Image Index').reset_index()

In [46]:
# Append the diseases as column names
diseases = df_kaggle['Finding Labels'].str.get_dummies(sep='|')
df_kaggle = pd.concat([df_kaggle, diseases], axis=1)

diseases = df_nih['Finding Labels'].str.get_dummies(sep='|')
df_nih = pd.concat([df_nih, diseases], axis=1)

In [47]:
extra_in_miccai = set(df_miccai.columns) - set(df_kaggle.columns)
list(extra_in_miccai)

['subj_id',
 'Pneumomediastinum',
 'Subcutaneous Emphysema',
 'Calcification of the Aorta',
 'Pneumoperitoneum',
 'Tortuous Aorta']

In [48]:
df_miccai[list(extra_in_miccai)].sum().sort_values(ascending=False)

subj_id                       1608516321
Subcutaneous Emphysema              1991
Tortuous Aorta                       742
Calcification of the Aorta           455
Pneumoperitoneum                     316
Pneumomediastinum                    253
dtype: int64

In [49]:
df_miccai[['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
       'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass',
       'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax',
       'Pneumoperitoneum', 'Pneumomediastinum', 'Subcutaneous Emphysema',
       'Tortuous Aorta', 'Calcification of the Aorta', 'No Finding']].sum().sort_values(ascending=False)

No Finding                    59406
Infiltration                  19894
Effusion                      13317
Atelectasis                   11559
Nodule                         6331
Mass                           5782
Pneumothorax                   5302
Consolidation                  4667
Pleural_Thickening             3385
Cardiomegaly                   2776
Emphysema                      2516
Edema                          2303
Subcutaneous Emphysema         1991
Fibrosis                       1686
Pneumonia                      1431
Tortuous Aorta                  742
Calcification of the Aorta      455
Pneumoperitoneum                316
Pneumomediastinum               253
Hernia                          227
dtype: int64

In [54]:
diseases = ['Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema', 'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia', 'No Finding']
len(set(diseases))

15

### Check whether all the data is same


In [55]:
# Make sure the dataframes are the same
for disease in diseases:
    assert(df_kaggle[disease].equals(df_nih[disease]))

In [58]:
for disease in diseases:
    print(disease, (df_kaggle[disease] != (df_miccai[disease])).sum())

Atelectasis 0
Cardiomegaly 0
Effusion 0
Infiltration 0
Mass 0
Nodule 0
Pneumonia 0
Pneumothorax 0
Consolidation 0
Edema 0
Emphysema 0
Fibrosis 0
Pleural_Thickening 0
Hernia 0
No Finding 955


In [59]:
df_kaggle[df_kaggle[disease] != (df_miccai[disease])]

Unnamed: 0,index,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],...,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
52,52,00000013_014.png,No Finding,14,13,61,M,PA,2021,2021,...,0,0,0,0,0,1,0,0,0,0
178,178,00000035_000.png,No Finding,0,35,79,M,PA,2754,2873,...,0,0,0,0,0,1,0,0,0,0
179,179,00000035_001.png,No Finding,1,35,80,M,PA,2718,2973,...,0,0,0,0,0,1,0,0,0,0
583,583,00000143_007.png,No Finding,7,143,92,M,AP,2500,2048,...,0,0,0,0,0,1,0,0,0,0
613,613,00000150_001.png,No Finding,1,150,54,F,AP,3056,2544,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111548,111548,00030535_002.png,No Finding,2,30535,51,F,AP,3056,2544,...,0,0,0,0,0,1,0,0,0,0
111593,111593,00030558_000.png,No Finding,0,30558,65,M,PA,2021,2021,...,0,0,0,0,0,1,0,0,0,0
111925,111925,00030673_000.png,No Finding,0,30673,58,F,PA,2021,2015,...,0,0,0,0,0,1,0,0,0,0
111940,111940,00030682_001.png,No Finding,1,30682,79,F,PA,2021,2021,...,0,0,0,0,0,1,0,0,0,0


### All the rows which are not matching have been identified with the new labels. So, looks like we are okay

In [62]:
df_miccai[df_kaggle[disease] != (df_miccai[disease])].sum()

index                                                                  27471623
Image Index                   00000013_014.png00000035_000.png00000035_001.p...
Atelectasis                                                                   0
Cardiomegaly                                                                  0
Consolidation                                                                 0
Edema                                                                         0
Effusion                                                                      0
Emphysema                                                                     0
Fibrosis                                                                      0
Hernia                                                                        0
Infiltration                                                                  0
Mass                                                                          0
Nodule                                  