In [1]:
import pandas as pd
import re, os

In [2]:
MIMIC_CXR_JPEG = '/home/huent/fairness-ai/dataset/physionet.org/files/mimic-cxr-jpg/2.0.0'
MIMIC_CXR_EMBEDDING = '/home/huent/fairness-ai/dataset/image-embeddings-mimic-cxr/generalized-image-embeddings-for-the-mimic-chest-x-ray-dataset-1.0'
MIMIC_CXR_DEMOGRAPHIC = '/home/huent/fairness-ai/dataset/mimic-demographic'

In [3]:
# Example: 'files/p19/p19692222/s59566639/965b6053-a2c70d67-c0467ca6-02372346-fb7c6224.tfrecord'
FILE_PATTERN = re.compile(
    r"files/(?:\w+)/p(?P<subject_id>\w+)/s(?P<study_id>\w+)/(?P<dicom_id>[\w-]+)\.tfrecord"
)


def parse_embedding_file_pattern(file_path: str):
  """Extracts the subject_id, study_id, and dicom_id

  from the full file path string of a MIMIC CXR Embedding file:

  https://physionet.org/content/image-embeddings-mimic-cxr/

  Example input:
  files/p19/p19692222/s59566639/965b6053-a2c70d67-c0467ca6-02372346-fb7c6224.tfrecord
  """
  match = FILE_PATTERN.fullmatch(file_path)
  if not match:
    raise Exception(f"Failed to match file path: {file_path}")
  return (int(match[1]), int(match[2]), match[3])

In [4]:
df_embeddings = pd.read_csv(f"{MIMIC_CXR_EMBEDDING}/SHA256SUMS.txt", delimiter=" ", header=None, skiprows=[0])  # Skip the license file entry
display(df_embeddings.head())

Unnamed: 0,0,1
0,70e8c87d38d5dfba586218ccafb7428f76a3c7b08f6179...,files/p10/p10000032/s50414267/02aa804e-bde0afd...
1,a0c7a01d9fc223b25767ebf04d6f736ac4cca1f9336946...,files/p10/p10000032/s53189527/2a2277a9-b0ded15...
2,38202feabe98d8cd042b50879a886c8a099aba9afef18c...,files/p10/p10000032/s53911762/68b5c4b1-227d048...
3,23283d7a2cb72d8e59f21dfd33d66be9082f69b18c5c81...,files/p10/p10000032/s53911762/fffabebf-74fd3a1...
4,c62e51c914c91b2bc2d3c14a82046709b800bada664e5c...,files/p10/p10000032/s56699142/ea030e7a-2e3b134...


In [5]:
SOURCE_COL_NAME = "embeddings_file"
DL_COL_NAME = "local_embeddings_file"
df_embeddings = df_embeddings[[1]]
df_embeddings.rename(columns={1: SOURCE_COL_NAME}, inplace=True)
df_embeddings[["subject_id","study_id", "dicom_id"]] = df_embeddings.apply(
    lambda x: parse_embedding_file_pattern(x[SOURCE_COL_NAME]), axis=1, result_type="expand")

df_embeddings[DL_COL_NAME] = df_embeddings[SOURCE_COL_NAME].apply(lambda x: os.path.join(MIMIC_CXR_EMBEDDING, x))  # For download
display(df_embeddings)

Unnamed: 0,embeddings_file,subject_id,study_id,dicom_id,local_embeddings_file
0,files/p10/p10000032/s50414267/02aa804e-bde0afd...,10000032,50414267,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,/home/huent/fairness-ai/dataset/image-embeddin...
1,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,10000032,53189527,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,/home/huent/fairness-ai/dataset/image-embeddin...
2,files/p10/p10000032/s53911762/68b5c4b1-227d048...,10000032,53911762,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,/home/huent/fairness-ai/dataset/image-embeddin...
3,files/p10/p10000032/s53911762/fffabebf-74fd3a1...,10000032,53911762,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,/home/huent/fairness-ai/dataset/image-embeddin...
4,files/p10/p10000032/s56699142/ea030e7a-2e3b134...,10000032,56699142,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,/home/huent/fairness-ai/dataset/image-embeddin...
...,...,...,...,...,...
243319,files/p19/p19999733/s57132437/3fcd0406-9b11160...,19999733,57132437,3fcd0406-9b111603-feae7033-96632b3a-111333e5,/home/huent/fairness-ai/dataset/image-embeddin...
243320,files/p19/p19999733/s57132437/428e2c18-5721d8f...,19999733,57132437,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,/home/huent/fairness-ai/dataset/image-embeddin...
243321,files/p19/p19999987/s55368167/58766883-376a15c...,19999987,55368167,58766883-376a15ce-3b323a28-6af950a0-16b793bd,/home/huent/fairness-ai/dataset/image-embeddin...
243322,files/p19/p19999987/s58621812/7ba273af-3d290f8...,19999987,58621812,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,/home/huent/fairness-ai/dataset/image-embeddin...


In [6]:
CXR_JPG_METADATA_FILES = (
    "mimic-cxr-2.0.0-metadata.csv",
    "mimic-cxr-2.0.0-split.csv",
    "mimic-cxr-2.0.0-chexpert.csv")

df_metadata = pd.read_csv(f"{MIMIC_CXR_JPEG}/{CXR_JPG_METADATA_FILES[0]}")
df_split = pd.read_csv(f"{MIMIC_CXR_JPEG}/{CXR_JPG_METADATA_FILES[1]}")
df_labels_chexpert = pd.read_csv(f"{MIMIC_CXR_JPEG}/{CXR_JPG_METADATA_FILES[2]}")

display(df_metadata.head())
display(df_split.head())
display(df_labels_chexpert.head())

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,


Unnamed: 0,dicom_id,study_id,subject_id,split
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,10000032,train
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,10000032,train
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,10000032,train
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,10000032,train


Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,


In [7]:
df_labels_chexpert['No Finding'].unique()

array([ 1., nan])

In [8]:
# Each study contains one or more DICOMs
# Chexpert labels df does not contain DICOM ID. Must join on (subject_id + study_id)
df_labels_all = df_split.merge(df_labels_chexpert, on=['subject_id', 'study_id'])
df_labels_all = df_labels_all.merge(df_metadata, on=['dicom_id'])
df_labels_all = df_embeddings.merge(df_labels_all, on=['dicom_id'], how='left')

display(df_labels_all)

Unnamed: 0,embeddings_file,subject_id,study_id,dicom_id,local_embeddings_file,study_id_x,subject_id_x,split,Atelectasis,Cardiomegaly,...,study_id_y,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,files/p10/p10000032/s50414267/02aa804e-bde0afd...,10000032,50414267,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,/home/huent/fairness-ai/dataset/image-embeddin...,50414267,10000032,train,,,...,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,10000032,53189527,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,/home/huent/fairness-ai/dataset/image-embeddin...,53189527,10000032,train,,,...,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
2,files/p10/p10000032/s53911762/68b5c4b1-227d048...,10000032,53911762,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,/home/huent/fairness-ai/dataset/image-embeddin...,53911762,10000032,train,,,...,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,
3,files/p10/p10000032/s53911762/fffabebf-74fd3a1...,10000032,53911762,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,/home/huent/fairness-ai/dataset/image-embeddin...,53911762,10000032,train,,,...,53911762,CHEST (PORTABLE AP),AP,2906,2258,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,Erect
4,files/p10/p10000032/s56699142/ea030e7a-2e3b134...,10000032,56699142,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,/home/huent/fairness-ai/dataset/image-embeddin...,56699142,10000032,train,,,...,56699142,CHEST (PORTABLE AP),AP,3056,2544,21800805,234424.765,CHEST (PORTABLE AP),antero-posterior,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243319,files/p19/p19999733/s57132437/3fcd0406-9b11160...,19999733,57132437,3fcd0406-9b111603-feae7033-96632b3a-111333e5,/home/huent/fairness-ai/dataset/image-embeddin...,57132437,19999733,train,,,...,57132437,CHEST (PA AND LAT),PA,3056,2544,21520708,224550.171,CHEST (PA AND LAT),postero-anterior,Erect
243320,files/p19/p19999733/s57132437/428e2c18-5721d8f...,19999733,57132437,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,/home/huent/fairness-ai/dataset/image-embeddin...,57132437,19999733,train,,,...,57132437,CHEST (PA AND LAT),PA,3056,2544,21520708,224550.171,CHEST (PA AND LAT),postero-anterior,Erect
243321,files/p19/p19999987/s55368167/58766883-376a15c...,19999987,55368167,58766883-376a15ce-3b323a28-6af950a0-16b793bd,/home/huent/fairness-ai/dataset/image-embeddin...,55368167,19999987,train,1.0,-1.0,...,55368167,CHEST (PORTABLE AP),AP,2544,3056,21451104,51448.218,CHEST (PORTABLE AP),antero-posterior,Erect
243322,files/p19/p19999987/s58621812/7ba273af-3d290f8...,19999987,58621812,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,/home/huent/fairness-ai/dataset/image-embeddin...,58621812,19999987,train,1.0,,...,58621812,CHEST (PORTABLE AP),AP,3056,2544,21451102,202809.234,CHEST (PORTABLE AP),antero-posterior,Erect


In [9]:
patients = pd.read_csv(os.path.join(MIMIC_CXR_DEMOGRAPHIC,'patients.csv'))
ethnicities = pd.read_csv(os.path.join(MIMIC_CXR_DEMOGRAPHIC,'admissions.csv')).drop_duplicates(subset = ['subject_id']).set_index('subject_id')['ethnicity'].to_dict()
patients['ethnicity'] = patients['subject_id'].map(ethnicities)

In [10]:
df_labels_with_demographic = df_labels_all.merge(patients, on=['subject_id'], how='left')
df_labels_with_demographic['age_decile'] = pd.cut(df_labels_with_demographic['anchor_age'], bins = list(range(0, 101, 20))).apply(lambda x: f'{x.left}-{x.right}').astype(str)
df_labels_with_demographic['frontal'] = df_labels_with_demographic.ViewPosition.isin(['AP', 'PA'])

In [11]:
df_labels_with_demographic.columns

Index(['embeddings_file', 'subject_id', 'study_id', 'dicom_id',
       'local_embeddings_file', 'study_id_x', 'subject_id_x', 'split',
       'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
       'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion', 'Lung Opacity',
       'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia',
       'Pneumothorax', 'Support Devices', 'subject_id_y', 'study_id_y',
       'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns',
       'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning',
       'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning', 'gender', 'anchor_age',
       'anchor_year', 'anchor_year_group', 'dod', 'ethnicity', 'age_decile',
       'frontal'],
      dtype='object')

In [12]:
disease_columns = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
           'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion', 'Lung Opacity',
           'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia',
           'Pneumothorax', 'Support Devices']

df_labels_with_demographic = df_labels_with_demographic.dropna(subset=disease_columns, how='all')

In [13]:
diagnoses_dataframes = {}
for diagnosis in ['No Finding']:
  # Only extract required columns for the ML model
  df_diagnosis = df_labels_with_demographic[[SOURCE_COL_NAME, DL_COL_NAME, 'gender', 'age_decile', 'frontal', 'ethnicity', 'split', diagnosis]]
  df_diagnosis['No Finding'] = df_diagnosis['No Finding'].fillna(0)
  diagnoses_dataframes[diagnosis] = df_diagnosis
  df_diagnosis.to_csv(f'{MIMIC_CXR_EMBEDDING}/{diagnosis}.csv', index=False)
  print(f"Created {diagnosis}.csv with {len(df_diagnosis)} rows")
  display(df_diagnosis.nunique())
  print(df_diagnosis['No Finding'].unique())
  # Show label and split value distributions
  display(df_diagnosis[diagnosis].value_counts())
  display(df_diagnosis['split'].value_counts())
  print("\n")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_diagnosis['No Finding'] = df_diagnosis['No Finding'].fillna(0)


Created No Finding.csv with 240723 rows


embeddings_file          240723
local_embeddings_file    240723
gender                        2
age_decile                    6
frontal                       1
ethnicity                     8
split                         3
No Finding                    2
dtype: int64

[1. 0.]


No Finding
0.0    159606
1.0     81117
Name: count, dtype: int64

split
train       235410
test          3380
validate      1933
Name: count, dtype: int64





# Verify implementation

In [14]:
print(df_diagnosis['ethnicity'].unique())

['WHITE' nan 'BLACK/AFRICAN AMERICAN' 'OTHER' 'UNKNOWN' 'ASIAN'
 'HISPANIC/LATINO' 'UNABLE TO OBTAIN' 'AMERICAN INDIAN/ALASKA NATIVE']


In [18]:
df_embeddings.iloc[0]

embeddings_file          files/p10/p10000032/s50414267/02aa804e-bde0afd...
subject_id                                                        10000032
study_id                                                          50414267
dicom_id                      02aa804e-bde0afdd-112c0b34-7bc16630-4e384014
local_embeddings_file    /home/huent/fairness-ai/dataset/image-embeddin...
Name: 0, dtype: object

In [23]:
df_embeddings.iloc[0]['embeddings_file']

'files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.tfrecord'

In [22]:
df_metadata.iloc[0]

dicom_id                                      02aa804e-bde0afdd-112c0b34-7bc16630-4e384014
subject_id                                                                        10000032
study_id                                                                          50414267
PerformedProcedureStepDescription                                       CHEST (PA AND LAT)
ViewPosition                                                                            PA
Rows                                                                                  3056
Columns                                                                               2544
StudyDate                                                                         21800506
StudyTime                                                                       213014.531
ProcedureCodeSequence_CodeMeaning                                       CHEST (PA AND LAT)
ViewCodeSequence_CodeMeaning                                              postero-anterior

In [27]:
patients[patients['subject_id'] == 10000032]

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod,ethnicity
286875,10000032,F,52,2180,2014 - 2016,,WHITE


In [21]:
df_diagnosis.head().iloc[0]

embeddings_file          files/p10/p10000032/s50414267/02aa804e-bde0afd...
local_embeddings_file    /home/huent/fairness-ai/dataset/image-embeddin...
gender                                                                   F
age_decile                                                           40-60
frontal                                                               True
ethnicity                                                            WHITE
split                                                                train
No Finding                                                             1.0
Name: 0, dtype: object

# Convert tf record to numpy and save

In [3]:
import numpy as np
import tensorflow as tf
import pandas as pd
import os
import tqdm

def tfrecord2numpy(filename):
    raw_dataset = tf.data.TFRecordDataset(filename)
  # Expect only one element in the TFRecord.
    filename = filename.replace('.tfrecord', '.npy')
    filename = filename.replace('/home/huent/fairness-ai/dataset/image-embeddings-mimic-cxr/generalized-image-embeddings-for-the-mimic-chest-x-ray-dataset-1.0/', '/local/ssd/huent/mimic-cxr-emb/')
    if (os.path.exists(filename)):
        return
    for raw_record in raw_dataset.take(1):
        example = tf.train.Example()
        example.ParseFromString(raw_record.numpy())
        
        # Extract the embedding feature as a list
        embedding_list = example.features.feature['embedding'].float_list.value
        
        # Convert the list to a numpy array
        np_record = np.array(embedding_list)
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    np.save(filename, np_record)

In [4]:

csv_file = '/home/huent/fairness-ai/dataset/image-embeddings-mimic-cxr/generalized-image-embeddings-for-the-mimic-chest-x-ray-dataset-1.0/processed_mimic_df.csv'
df = pd.read_csv(csv_file)

for i in tqdm.tqdm(range(len(df))):
    tfrecord2numpy(df.iloc[i]['path'])

100%|██████████| 136462/136462 [14:26<00:00, 157.42it/s]


In [21]:
filename = df.iloc[0]['local_embeddings_file']
raw_dataset = tf.data.TFRecordDataset(filename)
# Expect only one element in the TFRecord.
for raw_record in raw_dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    
    # Extract the embedding feature as a list
    embedding_list = example.features.feature['embedding'].float_list.value
    
    # Convert the list to a numpy array
    np_record = np.array(embedding_list)
np_record.dtype

dtype('float64')

In [10]:
with open(csv_file) as file:
    file_data = file.read()

# Replace 'generalized-image-embeddings-for-the-mimic-chest-x-ray-dataset-1.0' with 'numpy'
file_data = file_data.replace('generalized-image-embeddings-for-the-mimic-chest-x-ray-dataset-1.0', 'numpy')

# Replace '.tfrecord' with '.npy'
file_data = file_data.replace('.tfrecord', '.npy')

# Open the file in write mode
with open('/home/huent/fairness-ai/dataset/image-embeddings-mimic-cxr/numpy/No Finding.csv', 'w') as file:
    file.write(file_data)