In [3]:
"""
author: jakekim 
description: this file maps 51 vindr labels into 14 nih labels. 

"""

import pandas as pd
import numpy as np
import os 

### Matching the obvious and easy ones first: 
# path
parent = ('/home/jkim/research/peds_cxr/') ### parent set to metadata folder. 
vindr_path = (parent + 'peds_cxr_metadata/raw_metadata/vindr_raw_metadata_jk.csv')

new_directory_path = parent + 'peds_cxr_metadata/processed_metadata/'
os.makedirs(new_directory_path, exist_ok=True)


data = pd.read_csv(vindr_path)
print(data.shape)
new_data = pd.DataFrame()

# Directly copied columns
new_data['Image Index'] = data['image_id']
new_data['Patient Age'] = data['Patient_age_year']
new_data['Patient Gender'] = data['Patient_sex']

# Ensure integer type for boolean data
new_data['No Finding'] = data['No finding'].astype(int)
new_data['Atelectasis'] = data['Atelectasis'].astype(int)
new_data['Consolidation'] = data['Consolidation'].astype(int)
new_data['Edema'] = data['Edema'].astype(int)
new_data['Effusion'] = data['Pleural effusion'].astype(int)
new_data['Emphysema'] = data[['Emphysema', 'Congenital emphysema']].any(axis=1).astype(int)
new_data['Fibrosis'] = data['Pulmonary fibrosis'].astype(int)
new_data['Hernia'] = data[['Diagphramatic hernia', 'Intrathoracic digestive structure']].any(axis=1).astype(int)
new_data['Pneumothorax'] = data['Pneumothorax'].astype(int)
new_data['Pleural_Thickening'] = data['Pleural thickening'].astype(int)
new_data['No Finding'] = data['No finding'].astype(int)

# Combined columns
new_data['Mass/Nodule'] = data[['Mediastinal tumor', 'Chest wall mass', 'Lung tumor', 'Other nodule/mass', 'Anterior mediastinal mass', 'Paraveterbral mass', 'Reticulonodular opacity']].any(axis=1).astype(int)
new_data['Cardiomegaly'] = data[['Cardiomegaly', 'Boot-shaped heart']].any(axis=1).astype(int)
new_data['Pneumonia'] = data[['Pneumonia', 'Brocho-pneumonia', 'Pleuro-pneumonia']].any(axis=1).astype(int)
new_data['Infiltration'] = data[['Infiltration', 'Diffuse aveolar opacity', 'Hyaline membrane disease', 'Peribronchovascular interstitial opacity']].any(axis=1).astype(int)

# ### Now matching Dr. Kamel's manual matches for the following labels: 
# ### interstitial lung disease - ILD, lung cavity, lung cyst, other lesion, other opacity, and tuberculosis
# Load the additional mapping file
additional_mapping_path = parent + '/peds_cxr_metadata/peds_xray_images_for_review_modified.xlsx'
additional_mapping = pd.read_excel(additional_mapping_path)

# Fill the NaNs in the relevant columns with 0
additional_mapping[['Infiltration', 'Consolidation', 'Mass/Nodule', 'Fibrosis', 'Pneumonia']] = additional_mapping[['Infiltration', 'Consolidation', 'Mass/Nodule', 'Fibrosis', 'Pneumonia']].fillna(0)

# Make sure they're integers
additional_mapping[['Infiltration', 'Consolidation', 'Mass/Nodule', 'Fibrosis', 'Pneumonia']] = additional_mapping[['Infiltration', 'Consolidation', 'Mass/Nodule', 'Fibrosis', 'Pneumonia']].astype(int)

# Now update the new_data DataFrame with this additional mapping, but only where the current value is 0
for index, row in additional_mapping.iterrows():
    image_id = row['image_id']

    for label in ['Infiltration', 'Consolidation', 'Mass/Nodule', 'Fibrosis', 'Pneumonia']:
        if new_data.loc[new_data['Image Index'] == image_id, label].item() == 0:
            new_data.loc[new_data['Image Index'] == image_id, label] = row[label]

# Create a list of new_labels for reference
### Now match the remaining extra labels. Extra labels are by default considered no finding, unless the image was already labeled with a diagnostic label. 
new_labels = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 
              'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass/Nodule',
              'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']

# Create a list of extra_labels for reference
extra_labels = ['Aortic enlargement', 'Bronchectasis', 'Bronchial thickening', 'Bronchiolitis', 'Bronchitis',
                'Calcification', 'Clavicle fracture', 'CPAM', 'Dextro cardia', 'Egg on string sign',
                'Enlarged PA', 'Expanded edges of the anterior ribs', 'Lung hyperinflation', 'Mediastinal shift',
                'Other disease', 'Situs inversus', 'Stomach on the right side', 'Tuberculosis', 'Other opacity',
                'Other lesion', 'Interstitial lung disease - ILD', 'Lung cavity', 'Lung cyst' ]


# add column for dataset
new_data['Dataset'] = 'VinDR'

# Reorder the columns
new_order = ['Image Index', 'Dataset', 'Patient Age', 'Patient Gender', 'No Finding',
       'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion',
       'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass/Nodule',
       'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']

new_data = new_data.reindex(columns=new_order)

#keep those with sex and age listed in the variables below
sex_list = ['M','F'] #,'O'
age_list = ['000','001','002','003','004','005','006','007','008','009','010','011','012','013','014','015','016','017'] #,'mis', 'Y', np.nan, ''

new_data = new_data.loc[(new_data['Patient Gender'].isin(sex_list)) & (new_data['Patient Age'].isin(age_list))]
print('missing sex and age dropped')
print(new_data.shape)
# add .png to the end of the image index
new_data['Image Index'] = new_data['Image Index'].apply(lambda x: str(x) + '.png')

# Define the labels to check for all zeros (excluding 'No Finding' if it's considered a valid label on its own)
labels= ['No Finding', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion',
                   'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass/Nodule',
                   'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']

# Filter out rows where all labels in labels_to_check are 0
new_data_filtered = new_data[(new_data[labels] != 0).any(axis=1)]
print('all zero dropped')
print(new_data_filtered.shape)
# print(new_data.sum())
new_data_filtered.to_csv(parent + 'peds_cxr_metadata/processed_metadata/vindr_14labels.csv', index=False)

### check if any rows are all 0
# Check if there are any images where all labels are 0
all_zero_images = new_data[(new_data[labels] == 0).all(axis=1)]

print("Number of images where all labels are zero:", all_zero_images.shape[0])

if not all_zero_images.empty:
    print("Here are some of those image indices:")
    print(all_zero_images['Image Index'].head())  # printing first few rows

all_zero_images.to_csv(parent + 'peds_cxr_metadata/processed_metadata/all_zero_images.csv', index=False)

### printing counts
# counting number of labels per disease by patient gender 
grouped = new_data.groupby('Patient Gender').sum().reset_index()
total = new_data.sum(numeric_only=True).rename('All')
summary = grouped.append(total)
summary = summary.transpose()
summary.to_csv(parent + 'peds_cxr_metadata/processed_metadata/vindr_counts.csv')


(9125, 57)
missing sex and age dropped
(8755, 18)
all zero dropped
(8268, 18)
Number of images where all labels are zero: 487
Here are some of those image indices:
14     18b46a3a49c78f9d01df774de16a107c.png
61     79b6d335e842734726db35ea86773646.png
75     acabadf495809b8a571583706c7520b9.png
80     e5e0d99b6f004f0e89c0a300dba144cc.png
127    ca19a99159af23371354a9f5ed403bd4.png
Name: Image Index, dtype: object


  grouped = new_data.groupby('Patient Gender').sum().reset_index()
  summary = grouped.append(total)


In [4]:
### editing NIH raw metadata
nih_path = parent + 'peds_cxr_metadata/raw_metadata/nih_raw_metadata_jk.csv'
nih_data = pd.read_csv(nih_path)

nih_data = nih_data.drop(columns=['Patient ID', 'View Position'])
print(nih_data.shape)
# merge mass and nodule columns in NIH
nih_data['Mass/Nodule'] = np.where((nih_data['Mass'] == 1) | (nih_data['Nodule'] == 1), 1, 0)
nih_data = nih_data.drop(columns=['Mass', 'Nodule'])
nih_data['Dataset'] = 'NIH'
nih_data = nih_data.reindex(columns=new_order)
nih_data.to_csv(parent + 'peds_cxr_metadata/processed_metadata/nih_14labels.csv', index=False)

# counting NIH labels
grouped = nih_data.groupby('Patient Gender').sum().reset_index()
total = nih_data.sum(numeric_only=True).rename('All')
summary = grouped.append(total)
summary = summary.transpose()
summary.to_csv(parent + 'peds_cxr_metadata/processed_metadata/nih_counts.csv')
print(nih_data.shape)

# merging VINDR and NIH into aggregate metadata
merged_data = pd.concat([new_data_filtered, nih_data], ignore_index=True)
print(merged_data.shape)
merged_data.to_csv(parent + 'peds_cxr_metadata/processed_metadata/aggregate_14labels.csv', index=False)


# counting aggregate labels
grouped = merged_data.groupby('Patient Gender').sum().reset_index()
total = merged_data.sum(numeric_only=True).rename('All')
summary = grouped.append(total)
summary = summary.transpose()
summary.to_csv(parent + 'peds_cxr_metadata/processed_metadata/aggregate_counts.csv')

(5241, 18)
(5241, 18)
(13509, 18)


  grouped = nih_data.groupby('Patient Gender').sum().reset_index()
  summary = grouped.append(total)
  grouped = merged_data.groupby('Patient Gender').sum().reset_index()
  summary = grouped.append(total)
