In [None]:
#======== CONSTRUCCIÓN, DATA CLEANING/PREPROCESSING ========

In [None]:
"""
Observaciones:
*La infomación de la race de cada paciente solo se encuntra en la tabla de admisiones
*La información de edad y género de encuentra de la tabla de pacientes
*La información de las labels (patologías) se encunetra en mimic-cxr-2.0.0-chexpert.csv

--Del df clabels (clean labels, mimic-cxr-2.0.0-chexpert.csv -> subjecvt_id, study_id, labels) se cuenta con:
    (Data cleaning / preprocessing:  NaN as 0 (negative), and drop the -1 (Uncertain))
    
    Total Patients Number: 64653
    Total Studies Number: 215280

--Del df cmetadata (mimic-cxr-2.0.0-metadata.csv -> dicom_id, subject_id, study_id, view position) se cuenta con:
    (Data cleaning / preprocessing: drop NaN of ViewPoint and filter PA and AP CXrs)
    
    Total number of images: 243334
    Total number of Patients: 63945 
    Total number of Studies: 218139

--[MERGE] img_data_df: surge del merge de clabels_df y cmetadata_df -> (subjecvt_id, study_id, labels, dicom_id, view position)
    (Limitado por info de patientes dispoible subject_id de cmetadata_df (menor cantidad de patientes))
    
    Total number of images: 229968
    Total number of Patients: 63198
    Total number of Studies: 206067 
        

--Del df crace (crace_df -> admissions.csv-> race info -> subject_id), se cuenta con:
    (Data cleaning / preprocessing: drop_duplicates, no hay NaN)
    
     Total number of patients with race information: 180733

--Del df patients (patients.csv ->  subject_id, age, gender), se cuenta con:
    (Data cleaning / preprocessing: se deja tal cual, no hay NaN, no hoy duplicados )
    
    Total number of Patients: 299712

--[MERGE] demo_patients_df: que surge al unir crace_df y patientes_df: (subject_id (unique),  age, gender, race)
    (Limitado por info de race disponible (menor cantidad de patients))
     
    El total de pacientes registrados en la tabla de pacientes (que contiene la info de genero, edad) es: 299712 Pacientes
    Total de pacientes de los qu se cuenta con la info de Race de 180733 pacientes.
    Por tanto, total de pacientes con info demográfica completa (demo_patients_df): 180733 

--Del masks_df (dicom_id,Left Lung, Right Lung, Heart, Height, Width), se cuenta con:
    (Data cleaning / preprocessing: drop NaN)
    
    Total number of masks (unique dicom_id): 243285

--[MERGE] image_df: entre la info demografica (demo_patients_df) y la info de labels de metadata (img_data_df):
    (Limitado por los patients con info demográfica completa disponible)
     
    Total number of images: 205000 
    Total number of Patients: 49768 
    Total number of Studies: 183652 
    

--FINAL [MERGE] image_mask_df_raw: image_df(demo_patients_df, img_data_df) y  masks_df
    (Limitado por la cantidad de masks diponibles (solo AP y PA en ViewPosition))
    
    Total number of images: 204957
    Total number of Patients: 49766
    Total number of Studies: 183617


Locations: 
Base path: /mnt/NAS3/datasets/external/MIMIC_CXR/physionet.org/files

#MIMIC IV     
labels: /mnt/NAS3/datasets/external/MIMIC_CXR/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-chexpert.csv
metadata: /mnt/NAS3/datasets/external/MIMIC_CXR/physionet.org/files/mimic-cxr-jpg/2.0.0mimic-cxr-2.0.0-metadata.csv
#MIMIC CXR - JPG
admissions: /mnt/NAS3/datasets/external/MIMIC_ICU/physionet.org/files/mimiciv/2.2/hosp/admissions.csv.gz
patients: /mnt/NAS3/datasets/external/MIMIC_ICU/physionet.org/files/mimiciv/2.2/hosp/patients.csv.gz
#MIMIC CheXmask
masks: /mnt/NAS3/datasets/external/MIMIC_CXR/physionet.org/files/chexmask-cxr-segmentation-data/0.4/OriginalResolution/MIMIC-CXR-JPG.csv


DF FINAL DE LA ETAPA DE CONSTRUCCIÓN: image_mask_df_raw: image_df(demo_patients_df, img_data_df) y  masks_df
    Total number of images: 204957
    Total number of Patients: 49766
    Total number of Studies: 183617

"""

In [2]:
import pandas as pd
import os
import numpy as np
from pathlib import Path

In [3]:
base_path = Path("/mnt/NAS3/datasets/external/MIMIC_CXR/physionet.org/files/mimic-cxr-jpg/2.0.0") #MIMIC CXR-JGP
base_path2 = Path("/mnt/NAS3/datasets/external/MIMIC_ICU/physionet.org/files/mimiciv/2.2/hosp") #MIMIC IV
base_path3 = Path ("/mnt/NAS3/datasets/external/MIMIC_CXR/physionet.org/files/chexmask-cxr-segmentation-data/0.4/OriginalResolution/") #MIMIC CheXmask

In [4]:
archivos = os.listdir(base_path)
print(archivos)

['files', 'LICENSE.txt', 'README', 'SHA256SUMS.txt', 'index.html', 'mimic-cxr-2.0.0-chexpert.csv', 'mimic-cxr-2.0.0-metadata.csv', 'mimic-cxr-2.0.0-negbio.csv', 'mimic-cxr-2.0.0-split.csv', 'Recon_IMG', 'FID_images', 'Feature_extraction_MIMIC.csv', 'MIMIC_frontal_embeddings_50000_66220.csv', 'MIMIC_frontal_embeddings_0_21110.csv', 'MIMIC_frontal_embeddings_21110_50000.csv', 'MIMIC_frontal_embeddings_66220_100000.csv', 'MIMIC_frontal_embeddings_100000_150000.csv', 'MIMIC_frontal_embeddings_150000_200000.csv', 'MIMIC_frontal_embeddings_200000_210099.csv', 'features_RADDINO_Recon']


In [5]:
#info mimic
readme_path = base_path / "README"
with open(readme_path, 'r') as file:
    readme_content = file.read()

print(readme_content)

# MIMIC-CXR-JPG v2.0.0

The MIMIC Chest X-ray JPG (MIMIC-CXR-JPG) Database v2.0.0 is a large publicly available dataset of chest radiographs in JPG format with structured labels derived from free-text radiology reports. The MIMIC-CXR-JPG dataset is wholly derived from MIMIC-CXR, providing JPG format files derived from the DICOM images and structured labels derived from the free-text reports. The aim of MIMIC-CXR-JPG is to provide a convenient processed version of MIMIC-CXR, as well as to provide a standard reference for data splits and image labels. The dataset contains 377,110 JPG format images and structured labels derived from the 227,827 free-text radiology reports associated with these images. The dataset is de-identified to satisfy the US Health Insurance Portability and Accountability Act of 1996 (HIPAA) Safe Harbor requirements. Protected health information (PHI) has been removed. The dataset is intended to support a wide body of research in medicine including image understandi

In [None]:
#======= clabel_df->  mimic-cxr-2.0.0-chexpert.csv -> subjecvt_id, study_id and labels ======= 

In [10]:
path_labels = base_path / "mimic-cxr-2.0.0-chexpert.csv"
mimic_info_df = pd.read_csv(path_labels)
mimic_info_df.head()

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,


In [None]:
# Selection of labels and pathologies to work with

In [11]:
labels_df=mimic_info_df[["subject_id","study_id","No Finding","Pleural Effusion", "Cardiomegaly","Pneumothorax"]]
labels_df

Unnamed: 0,subject_id,study_id,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax
0,10000032,50414267,1.0,,,
1,10000032,53189527,1.0,,,
2,10000032,53911762,1.0,,,
3,10000032,56699142,1.0,,,
4,10000764,57375967,,,,
...,...,...,...,...,...,...
227822,19999442,58708861,1.0,,,
227823,19999733,57132437,1.0,,,
227824,19999987,55368167,,0.0,-1.0,0.0
227825,19999987,58621812,,,,


In [12]:
labels_df.isna().sum()

subject_id               0
study_id                 0
No Finding          152372
Pleural Effusion    140555
Cardiomegaly        161028
Pneumothorax        173979
dtype: int64

In [13]:
labels = ["No Finding", "Pleural Effusion", "Cardiomegaly", "Pneumothorax"]

count_minus_one = (labels_df[labels] == -1).sum()
print(count_minus_one)


No Finding             0
Pleural Effusion    5814
Cardiomegaly        6043
Pneumothorax        1134
dtype: int64


In [15]:
print("Total Patients Number: " + str(labels_df.subject_id.nunique()))
print("Total Studies Number: " + str(labels_df.study_id.nunique()))

Total Patients Number: 65379
Total Studies Number: 227827


In [None]:
# Data cleaning and preprocessing: We treat the NaN as 0 (negative), and drop the -1 (Uncertain) 

**Different approaches to use the uncertainty labels during the model training.**

- U-Ignore: We ignore the uncertain labels during training (**The easiest way**).
- U-Zeroes: We map all instances of the uncertain label to 0.
- U-Ones: We map all instances of the uncertain label to 1.
- U-SelfTrained: We first train a model using the U-Ignore approach to convergence, and then use the model to make predictions that re-label each of the uncertainty labels with the probability prediction outputted by the model.
- U-MultiClass: We treat the uncertainty label as its own class.

We treat the NaN as 0 (negative), and drop the -1 (Uncertain) 

In [16]:
clabels_df=labels_df.copy()
clabels_df.replace(np.nan, 0, inplace=True) # NaN -> 0 (negative)
clabels_df.replace(-1, np.nan, inplace=True) # -1 -> NaN
clabels_df.dropna(inplace=True) #drop the -1 (Uncertain)
clabels_df

Unnamed: 0,subject_id,study_id,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax
0,10000032,50414267,1.0,0.0,0.0,0.0
1,10000032,53189527,1.0,0.0,0.0,0.0
2,10000032,53911762,1.0,0.0,0.0,0.0
3,10000032,56699142,1.0,0.0,0.0,0.0
4,10000764,57375967,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
227820,19999376,57540554,1.0,0.0,0.0,0.0
227822,19999442,58708861,1.0,0.0,0.0,0.0
227823,19999733,57132437,1.0,0.0,0.0,0.0
227825,19999987,58621812,0.0,0.0,0.0,0.0


In [17]:
clabels_df.isna().sum()

subject_id          0
study_id            0
No Finding          0
Pleural Effusion    0
Cardiomegaly        0
Pneumothorax        0
dtype: int64

In [18]:
print("Total Patients Number: " + str(clabels_df.subject_id.nunique()))
print("Total Studies Number: " + str(clabels_df.study_id.nunique()))

Total Patients Number: 64653
Total Studies Number: 215280


In [22]:
df_pre_img=pd.merge(metadata_df, clabels_df, on=["subject_id","study_id"], how="inner")
df_pre_img

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA,1.0,0.0,0.0,0.0
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,LATERAL,1.0,0.0,0.0,0.0
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA,1.0,0.0,0.0,0.0
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,LATERAL,1.0,0.0,0.0,0.0
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
359065,3fcd0406-9b111603-feae7033-96632b3a-111333e5,19999733,57132437,PA,1.0,0.0,0.0,0.0
359066,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,PA,1.0,0.0,0.0,0.0
359067,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,19999733,57132437,LATERAL,1.0,0.0,0.0,0.0
359068,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,AP,0.0,0.0,0.0,0.0


In [23]:
print("Total number of images: " + str(df_pre_img.dicom_id.nunique()))
print("Total number of Patients: " + str(df_pre_img.subject_id.nunique()))
print("Total number of Studies: " + str(df_pre_img.study_id.nunique()))

Total number of images: 359070
Total number of Patients: 64653
Total number of Studies: 215280


In [None]:
#======= label_df->  mimic-cxr-2.0.0-chexpert.csv -> subjecvt_id, study_id and labels =======

In [None]:
#======= metadata_df->  mimic-cxr-2.0.0-metadata.csv -> dicom_id, subject_id, study_id, view position =======

In [19]:
path_meta = base_path / "mimic-cxr-2.0.0-metadata.csv"
metadata_df = pd.read_csv(path_meta)

metadata_df.head()

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,


In [None]:
# Selection data to work with

In [20]:
metadata_df=metadata_df[["dicom_id", "subject_id", "study_id", "ViewPosition"]]
metadata_df

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,LATERAL
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,LATERAL
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP
...,...,...,...,...
377105,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,PA
377106,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,19999733,57132437,LATERAL
377107,58766883-376a15ce-3b323a28-6af950a0-16b793bd,19999987,55368167,AP
377108,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,AP


In [24]:
metadata_df.isna().sum()

dicom_id            0
subject_id          0
study_id            0
ViewPosition    15769
dtype: int64

In [25]:
print("Total number of images: " + str(metadata_df.dicom_id.nunique()))
print("Total number of Patients: " + str(metadata_df.subject_id.nunique()))
print("Total number of Studies: " + str(metadata_df.study_id.nunique()))

Total number of images: 377110
Total number of Patients: 65379
Total number of Studies: 227835


In [26]:
#Data cleanning
cmetadata_df=metadata_df.copy()
cmetadata_df.dropna(inplace=True)
cmetadata_df.head()

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,LATERAL
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,LATERAL
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP


In [27]:
print("Total number of images: " + str(cmetadata_df.dicom_id.nunique()))
print("Total number of Patients: " + str(cmetadata_df.subject_id.nunique()))
print("Total number of Studies: " + str(cmetadata_df.study_id.nunique()))

Total number of images: 361341
Total number of Patients: 64126
Total number of Studies: 220543


In [None]:
#selecting PA and AP CXRs

In [29]:
cmetadata_df["ViewPosition"].value_counts()

ViewPosition
AP                147173
PA                 96161
LATERAL            82853
LL                 35133
PA LLD                 4
LAO                    3
RAO                    3
AP AXIAL               2
AP LLD                 2
XTABLE LATERAL         2
AP RLD                 2
SWIMMERS               1
PA RLD                 1
LPO                    1
Name: count, dtype: int64

In [None]:
#not AP an PA : 118007

In [30]:
cmetadata_df_ap_pa = cmetadata_df[
    cmetadata_df["ViewPosition"].isin(["AP", "PA"])
]


In [31]:
cmetadata_df_ap_pa["ViewPosition"].value_counts()

ViewPosition
AP    147173
PA     96161
Name: count, dtype: int64

In [32]:
print("Total number of images: " + str(cmetadata_df_ap_pa.dicom_id.nunique()))
print("Total number of Patients: " + str(cmetadata_df_ap_pa.subject_id.nunique()))
print("Total number of Studies: " + str(cmetadata_df_ap_pa.study_id.nunique()))

Total number of images: 243334
Total number of Patients: 63945
Total number of Studies: 218139


In [None]:
#====== cmetadata_df->  mimic-cxr-2.0.0-metadata.csv -> dicom_id, subject_id, study_id, view position =======

In [None]:
#=====[MERGE] Del img_data_df: surge del merge de clabels_df y cmetadata_df -> (subjecvt_id, study_id, labels, dicom_id, view position)=======

In [34]:
img_data_df=pd.merge(cmetadata_df_ap_pa, clabels_df, on=["subject_id","study_id"], how="inner")
img_data_df

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA,1.0,0.0,0.0,0.0
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA,1.0,0.0,0.0,0.0
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP,1.0,0.0,0.0,0.0
3,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,AP,1.0,0.0,0.0,0.0
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,AP,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
229963,16b6c70f-6d36bd77-89d2fef4-9c4b8b0a-79c69135,19999442,58708861,AP,1.0,0.0,0.0,0.0
229964,3fcd0406-9b111603-feae7033-96632b3a-111333e5,19999733,57132437,PA,1.0,0.0,0.0,0.0
229965,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,PA,1.0,0.0,0.0,0.0
229966,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,AP,0.0,0.0,0.0,0.0


In [35]:
img_data_df.isna().sum()

dicom_id            0
subject_id          0
study_id            0
ViewPosition        0
No Finding          0
Pleural Effusion    0
Cardiomegaly        0
Pneumothorax        0
dtype: int64

In [36]:
print("Total number of images: " + str(img_data_df.dicom_id.nunique()))
print("Total number of Patients: " + str(img_data_df.subject_id.nunique()))
print("Total number of Studies: " + str(img_data_df.study_id.nunique()))

Total number of images: 229968
Total number of Patients: 63198
Total number of Studies: 206067


In [None]:
#=====[MERGE] Del img_data_df: surge del merge de clabels_df y cmetadata_df -> (subjecvt_id, study_id, labels, dicom_id, view position)=======

In [None]:
#====== crace_df -> admissions.csv-> race info -> subject_id, race =======

In [37]:
path3 = "/mnt/NAS3/datasets/external/MIMIC_ICU/physionet.org/files/mimiciv/2.2/hosp"
archivos3 = os.listdir(path3)

print(archivos3)

['index.html', 'admissions.csv.gz', 'd_hcpcs.csv.gz', 'd_icd_diagnoses.csv.gz', 'd_icd_procedures.csv.gz', 'd_labitems.csv.gz', 'diagnoses_icd.csv.gz', 'drgcodes.csv.gz', 'emar.csv.gz', 'emar_detail.csv.gz', 'hcpcsevents.csv.gz', 'labevents.csv.gz', 'microbiologyevents.csv.gz', 'omr.csv.gz', 'patients.csv.gz', 'pharmacy.csv.gz', 'poe.csv.gz', 'poe_detail.csv.gz', 'prescriptions.csv.gz', 'procedures_icd.csv.gz', 'provider.csv.gz', 'services.csv.gz', 'transfers.csv.gz']


In [38]:
path_admi = base_path2 / "admissions.csv.gz"

admissions_df = pd.read_csv(path_admi , compression='gzip')

admissions_df.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P874LG,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P09Q6Y,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P60CC5,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P30KEH,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P51VDL,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0


In [None]:
# Selection data to work with

In [39]:
race_df = admissions_df[["subject_id","race"]]
race_df.head()

Unnamed: 0,subject_id,race
0,10000032,WHITE
1,10000032,WHITE
2,10000032,WHITE
3,10000032,WHITE
4,10000068,WHITE


In [40]:
race_df.isna().sum()
race_df.duplicated(subset=["subject_id"]).sum()

250498

In [41]:
print("Missing values per column (NaN):")
print(race_df.isna().sum())

print("\nNumber of duplicated rows (by subject_id):", race_df.duplicated(subset=["subject_id"]).sum())


Missing values per column (NaN):
subject_id    0
race          0
dtype: int64

Number of duplicated rows (by subject_id): 250498


In [42]:
print("Total number of Patients: " + str(len(race_df)))

Total number of Patients: 431231


In [43]:
#Data cleanning
crace_df=race_df.copy()
crace_df=crace_df.drop_duplicates(subset='subject_id', keep='first')
crace_df.head()

Unnamed: 0,subject_id,race
0,10000032,WHITE
4,10000068,WHITE
5,10000084,WHITE
7,10000108,WHITE
8,10000117,WHITE


In [44]:
print("Missing values per column (NaN):")
print(crace_df.isna().sum())

print("\nNumber of duplicated rows (by subject_id):", crace_df.duplicated(subset=["subject_id"]).sum())


Missing values per column (NaN):
subject_id    0
race          0
dtype: int64

Number of duplicated rows (by subject_id): 0


In [45]:
print("Total number of Patients: " + str(len(crace_df)))

Total number of Patients: 180733


In [None]:
#====== crace_df -> admissions.csv-> race info -> subject_id, race =======

In [None]:
#====== patiens_df -> patients.csv ->  subject_id, age, gender  =======

In [46]:
path_p = base_path2 / "patients.csv.gz"
patients_df = pd.read_csv(path_p, compression='gzip')

patients_df.head()

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,F,52,2180,2014 - 2016,2180-09-09
1,10000048,F,23,2126,2008 - 2010,
2,10000068,F,19,2160,2008 - 2010,
3,10000084,M,72,2160,2017 - 2019,2161-02-13
4,10000102,F,27,2136,2008 - 2010,


In [None]:
# Selection data to work with

In [47]:
patients_df=patients_df[["subject_id","gender","anchor_age"]]
patients_df.head()

Unnamed: 0,subject_id,gender,anchor_age
0,10000032,F,52
1,10000048,F,23
2,10000068,F,19
3,10000084,M,72
4,10000102,F,27


In [50]:
print("Missing values per column (NaN):")
print(patients_df.isna().sum())

print("\nNumber of duplicated rows (by subject_id):", patients_df.duplicated(subset=["subject_id"]).sum())


Missing values per column (NaN):
subject_id    0
gender        0
anchor_age    0
dtype: int64

Number of duplicated rows (by subject_id): 0


In [51]:
print("Total number of Patients: " + str(len(patients_df)))

Total number of Patients: 299712


In [None]:
#====== patiens_df -> patients.csv ->  subject_id, age, gender  =======

In [None]:
#======[MERGE] demo_patients_df -> merge of crace_df and patientes_df ========

In [52]:
demo_patients_df=pd.merge(patients_df, crace_df, on="subject_id", how="inner")
demo_patients_df

Unnamed: 0,subject_id,gender,anchor_age,race
0,10000032,F,52,WHITE
1,10000068,F,19,WHITE
2,10000084,M,72,WHITE
3,10000108,M,25,WHITE
4,10000117,F,48,WHITE
...,...,...,...,...
180728,19999733,F,19,WHITE
180729,19999784,M,57,BLACK/AFRICAN AMERICAN
180730,19999828,F,46,WHITE
180731,19999840,M,58,WHITE


In [53]:
print("Missing values per column (NaN):")
print(demo_patients_df.isna().sum())

print("\nNumber of duplicated rows (by subject_id):", demo_patients_df.duplicated(subset=["subject_id"]).sum())


Missing values per column (NaN):
subject_id    0
gender        0
anchor_age    0
race          0
dtype: int64

Number of duplicated rows (by subject_id): 0


In [54]:
print("N Patients: " + str(len(demo_patients_df)))

N Patients: 180733


In [None]:
#El total de pacientes registrados en la tabla de pacientes (que contiene la info de genero, edad) es: 299712 Pacientes
#Total de pacientes de los qu se cuenta con la info de Race de 180733 pacientes.
#Por tanto, total de pacientes con info demográfica completa: 180733 (demo_patients_df)

In [None]:
#======[MERGE] demo_patients_df -> merge of crace_df and patientes_df ========

In [None]:
#=====mask_df -> dicom_id,Left Lung,Right Lung,Heart, Height, Width========

In [55]:
path_masks = base_path3 / "MIMIC-CXR-JPG.csv"
mask_df = pd.read_csv(path_masks)

mask_df

Unnamed: 0,dicom_id,Dice RCA (Mean),Dice RCA (Max),Landmarks,Left Lung,Right Lung,Heart,Height,Width
0,f4a185f1-db2de1fd-a05b274e-21f07d10-63a30841,0.885982,0.910834,[[ 904 433]\n [ 845 441]\n [ 777 489]\n [ 6...,1110713 9 1113242 27 1115772 44 1118301 61 112...,1102454 5 1104990 15 1107527 24 1110064 33 111...,3097284 10 3099821 31 3102358 51 3104896 70 31...,3056,2544
1,1534c820-a44d5232-30d7c596-ca557ed1-ef14d4ae,0.866419,0.928404,[[1143 150]\n [1048 157]\n [ 956 194]\n [ 8...,434219 90 436761 94 439303 99 441844 105 44438...,382738 8 385268 25 387799 41 390329 58 392859 ...,2263031 26 2265529 80 2268048 113 2270590 122 ...,3056,2544
2,95aa18d9-27969db3-9c904c3b-7e6eb3fa-95ee0061,0.884501,0.905510,[[ 929 427]\n [ 863 436]\n [ 794 486]\n [ 7...,935245 4 937785 13 940326 21 942866 30 945407 ...,1087215 4 1089752 13 1092288 23 1094825 32 109...,3249987 98 3252504 133 3255021 169 3257550 192...,3056,2544
3,80321566-1d300fe2-358ad58a-98a6b6d2-d7ad6c3f,0.864541,0.897160,[[ 814 504]\n [ 749 501]\n [ 677 555]\n [ 6...,1298994 15 1301510 45 1304026 75 1306556 91 13...,1275294 11 1277837 34 1280379 58 1282922 70 12...,3674729 16 3677265 47 3679800 79 3682336 144 3...,3056,2544
4,a9a74a32-490cae66-d7a2fd39-cab8a408-858f665b,0.876013,0.908857,[[ 906 480]\n [ 825 492]\n [ 745 552]\n [ 6...,1210021 4 1212560 13 1215099 21 1217639 29 122...,1222024 5 1224561 14 1227099 23 1229636 32 123...,3570563 18 3573081 56 3575599 94 3578128 121 3...,3056,2544
...,...,...,...,...,...,...,...,...,...
243329,21570ef0-ba470921-6e936b78-2e46f17e-31c6cc51,0.862717,0.922274,[[1165 468]\n [1082 479]\n [ 998 519]\n [ 9...,1240741 8 1243271 26 1245801 44 1248331 62 125...,1191755 5 1194291 17 1196828 28 1199364 40 120...,2670135 9 2672670 28 2675205 46 2677740 64 268...,2753,2544
243330,c2714aec-df227cb9-c87c3261-5526f21e-b67883dd,0.873246,0.906061,[[ 955 529]\n [ 870 542]\n [ 788 586]\n [ 7...,1423500 10 1426015 31 1428530 52 1431045 72 14...,1339323 6 1341847 16 1344370 28 1346894 39 134...,2786979 11 2789487 35 2791995 58 2794508 76 27...,2942,2530
243331,d5fd7e0c-5961fd50-cc03a030-723ee3fb-048b7fda,0.868462,0.881600,[[ 968 582]\n [ 898 582]\n [ 830 608]\n [ 7...,1609402 7 1611943 21 1614485 34 1617026 47 161...,1481506 73 1484048 78 1486589 85 1489130 91 14...,2781838 7 2784378 21 2786918 35 2789458 50 279...,3056,2544
243332,1dede0f2-fe40a0db-5fb8d85a-81dbbfc0-5fedf7cf,0.878087,0.890184,[[ 851 592]\n [ 776 598]\n [ 694 637]\n [ 6...,1502530 9 1505058 28 1507586 48 1510114 67 151...,1506894 8 1509426 23 1511957 39 1514489 54 151...,3338909 83 3341448 100 3343987 117 3346526 135...,3056,2544


In [56]:
#Data Selection
masks_df=mask_df[["dicom_id","Left Lung","Right Lung","Heart", "Height", "Width"]]
masks_df.head()

Unnamed: 0,dicom_id,Left Lung,Right Lung,Heart,Height,Width
0,f4a185f1-db2de1fd-a05b274e-21f07d10-63a30841,1110713 9 1113242 27 1115772 44 1118301 61 112...,1102454 5 1104990 15 1107527 24 1110064 33 111...,3097284 10 3099821 31 3102358 51 3104896 70 31...,3056,2544
1,1534c820-a44d5232-30d7c596-ca557ed1-ef14d4ae,434219 90 436761 94 439303 99 441844 105 44438...,382738 8 385268 25 387799 41 390329 58 392859 ...,2263031 26 2265529 80 2268048 113 2270590 122 ...,3056,2544
2,95aa18d9-27969db3-9c904c3b-7e6eb3fa-95ee0061,935245 4 937785 13 940326 21 942866 30 945407 ...,1087215 4 1089752 13 1092288 23 1094825 32 109...,3249987 98 3252504 133 3255021 169 3257550 192...,3056,2544
3,80321566-1d300fe2-358ad58a-98a6b6d2-d7ad6c3f,1298994 15 1301510 45 1304026 75 1306556 91 13...,1275294 11 1277837 34 1280379 58 1282922 70 12...,3674729 16 3677265 47 3679800 79 3682336 144 3...,3056,2544
4,a9a74a32-490cae66-d7a2fd39-cab8a408-858f665b,1210021 4 1212560 13 1215099 21 1217639 29 122...,1222024 5 1224561 14 1227099 23 1229636 32 123...,3570563 18 3573081 56 3575599 94 3578128 121 3...,3056,2544


In [57]:
masks_df.isna().sum()

dicom_id       0
Left Lung     49
Right Lung    49
Heart         49
Height         0
Width          0
dtype: int64

In [58]:
print("Total number of masks: " + str(len(masks_df)))

Total number of masks: 243334


In [59]:
#Data cleanning
cmasks_df=masks_df.copy()
cmasks_df.dropna(inplace=True)
cmasks_df.head()

Unnamed: 0,dicom_id,Left Lung,Right Lung,Heart,Height,Width
0,f4a185f1-db2de1fd-a05b274e-21f07d10-63a30841,1110713 9 1113242 27 1115772 44 1118301 61 112...,1102454 5 1104990 15 1107527 24 1110064 33 111...,3097284 10 3099821 31 3102358 51 3104896 70 31...,3056,2544
1,1534c820-a44d5232-30d7c596-ca557ed1-ef14d4ae,434219 90 436761 94 439303 99 441844 105 44438...,382738 8 385268 25 387799 41 390329 58 392859 ...,2263031 26 2265529 80 2268048 113 2270590 122 ...,3056,2544
2,95aa18d9-27969db3-9c904c3b-7e6eb3fa-95ee0061,935245 4 937785 13 940326 21 942866 30 945407 ...,1087215 4 1089752 13 1092288 23 1094825 32 109...,3249987 98 3252504 133 3255021 169 3257550 192...,3056,2544
3,80321566-1d300fe2-358ad58a-98a6b6d2-d7ad6c3f,1298994 15 1301510 45 1304026 75 1306556 91 13...,1275294 11 1277837 34 1280379 58 1282922 70 12...,3674729 16 3677265 47 3679800 79 3682336 144 3...,3056,2544
4,a9a74a32-490cae66-d7a2fd39-cab8a408-858f665b,1210021 4 1212560 13 1215099 21 1217639 29 122...,1222024 5 1224561 14 1227099 23 1229636 32 123...,3570563 18 3573081 56 3575599 94 3578128 121 3...,3056,2544


In [60]:
cmasks_df.isna().sum()

dicom_id      0
Left Lung     0
Right Lung    0
Heart         0
Height        0
Width         0
dtype: int64

In [61]:
print("Total number of masks: " + str(len(cmasks_df)))

Total number of masks: 243285


In [None]:
#=====mask_df -> dicom_id,Left Lung,Right Lung,Heart, Height, Width========

In [None]:
#===== [MERGE] image_df_raw: entre la info demografica (demo_patients_df) y la info de labels de metadata (img_data_df) ======

In [62]:
image_df_raw=pd.merge( img_data_df, demo_patients_df, on=["subject_id"], how="inner")
image_df_raw

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax,gender,anchor_age,race
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA,1.0,0.0,0.0,0.0,F,52,WHITE
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA,1.0,0.0,0.0,0.0,F,52,WHITE
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE
3,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,AP,1.0,0.0,0.0,0.0,F,52,WHITE
...,...,...,...,...,...,...,...,...,...,...,...
204995,16b6c70f-6d36bd77-89d2fef4-9c4b8b0a-79c69135,19999442,58708861,AP,1.0,0.0,0.0,0.0,M,41,WHITE
204996,3fcd0406-9b111603-feae7033-96632b3a-111333e5,19999733,57132437,PA,1.0,0.0,0.0,0.0,F,19,WHITE
204997,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,PA,1.0,0.0,0.0,0.0,F,19,WHITE
204998,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,AP,0.0,0.0,0.0,0.0,F,57,UNKNOWN


In [63]:
image_df_raw.isna().sum()

dicom_id            0
subject_id          0
study_id            0
ViewPosition        0
No Finding          0
Pleural Effusion    0
Cardiomegaly        0
Pneumothorax        0
gender              0
anchor_age          0
race                0
dtype: int64

In [64]:
print("Total number of images: " + str(image_df_raw.dicom_id.nunique()))
print("Total number of Patients: " + str(image_df_raw.subject_id.nunique()))
print("Total number of Studies: " + str(image_df_raw.study_id.nunique()))

Total number of images: 205000
Total number of Patients: 49768
Total number of Studies: 183652


In [None]:
#===== [MERGE] image_df_raw: entre la info demografica (demo_patients_df) y la info de labels de metadata (img_data_df) ======

In [None]:
#=====FINAL [MERGE] image_mask_df_raw: demo_patients_df, img_data_df, masks_df========

In [66]:
image_mask_df_raw=pd.merge(image_df_raw, cmasks_df, on=["dicom_id"], how="inner")
image_mask_df_raw

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax,gender,anchor_age,race,Left Lung,Right Lung,Heart,Height,Width
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA,1.0,0.0,0.0,0.0,F,52,WHITE,792824 6 795358 20 797892 34 800426 48 802960 ...,812519 6 815054 18 817588 30 820123 42 822658 ...,3015910 10 3018442 28 3020973 47 3023504 66 30...,3056,2544
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA,1.0,0.0,0.0,0.0,F,52,WHITE,1019278 16 1021792 49 1024306 82 1026834 101 1...,1026209 9 1028738 27 1031267 45 1033797 62 103...,3288188 17 3290708 52 3293228 87 3295759 111 3...,3056,2544
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE,1451211 25 1453702 77 1456216 106 1458753 112 ...,1455655 10 1458178 29 1460700 49 1463222 69 14...,3479554 9 3482082 27 3484611 43 3487139 61 348...,2705,2539
3,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE,963370 5 965624 17 967879 28 970133 40 972388 ...,1375933 5 1378189 13 1380444 22 1382699 31 138...,3085691 8 3087944 25 3090197 42 3092450 59 309...,2906,2258
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,AP,1.0,0.0,0.0,0.0,F,52,WHITE,912403 7 914936 21 917468 37 920001 51 922534 ...,911748 8 914278 24 916808 41 919339 57 921869 ...,3089645 75 3092183 93 3094720 111 3097258 128 ...,3056,2544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204952,16b6c70f-6d36bd77-89d2fef4-9c4b8b0a-79c69135,19999442,58708861,AP,1.0,0.0,0.0,0.0,M,41,WHITE,988942 113 991996 119 995050 124 998104 130 10...,988237 96 991291 101 994345 106 997398 111 100...,3824644 13 3827696 40 3830748 66 3833800 84 38...,2544,3056
204953,3fcd0406-9b111603-feae7033-96632b3a-111333e5,19999733,57132437,PA,1.0,0.0,0.0,0.0,F,19,WHITE,540875 10 543401 32 545927 53 548453 74 550979...,471487 6 474022 19 476557 32 479093 44 481628 ...,2585859 70 2588392 91 2590924 113 2593457 134 ...,3056,2544
204954,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,PA,1.0,0.0,0.0,0.0,F,19,WHITE,566629 16 569145 48 571660 80 574189 99 576731...,537978 6 540514 18 543050 30 545587 41 548123 ...,2306306 14 2308848 43 2311390 71 2313932 88 23...,3056,2544
204955,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,AP,0.0,0.0,0.0,0.0,F,57,UNKNOWN,1739255 9 1741785 27 1744315 45 1746845 63 174...,1799597 6 1802135 16 1804673 26 1807211 36 180...,3697709 8 3700249 24 3702788 40 3705328 56 370...,3056,2544


In [67]:
image_mask_df_raw.isna().sum()

dicom_id            0
subject_id          0
study_id            0
ViewPosition        0
No Finding          0
Pleural Effusion    0
Cardiomegaly        0
Pneumothorax        0
gender              0
anchor_age          0
race                0
Left Lung           0
Right Lung          0
Heart               0
Height              0
Width               0
dtype: int64

In [68]:
print("Total number of images: " + str(image_mask_df_raw.dicom_id.nunique()))
print("Total number of Patients: " + str(image_mask_df_raw.subject_id.nunique()))
print("Total number of Studies: " + str(image_mask_df_raw.study_id.nunique()))

Total number of images: 204957
Total number of Patients: 49766
Total number of Studies: 183617


In [69]:
ViewPosition_image_df_raw = image_df_raw['ViewPosition'].unique()
ViewPosition_image_mask_df_raw = image_mask_df_raw['ViewPosition'].unique()
print("ViewPosition_image_df: " + str(ViewPosition_image_df_raw))
print("ViewPosition_image_mask_df: " + str(ViewPosition_image_mask_df_raw))

ViewPosition_image_df: ['PA' 'AP']
ViewPosition_image_mask_df: ['PA' 'AP']


In [None]:
#TAMAÑOS FINALES -> image_mask_df_raw

In [None]:
Total number of images: 204957
Total number of Patients: 49766
Total number of Studies: 183617

In [None]:
#TAMAÑOS FINALES

In [None]:
#=====FINAL [MERGE] image_mask_df: demo_patients_df_raw, img_data_df, masks_df========

In [None]:
======== CONSTRUCCIÓN, DATA CLEANING/PREPROCESSING ========

In [None]:
======== MAPPING ========

In [None]:
Observaciones: 

En este Notebook se hace el mapeo de las variables según es descrito en el archivo "scripts/process_data.py"
del github del articulo "The Limits". Se guarda en image image_mask_df_raw

Mapeos:

* Se hace el mapeo de la edad en Age y se crea la solumna age_decile, para ver en que rango está la edad
  (elif 0 <= x < 18: return 4
    elif 18 <= x < 40: return 3
    elif 40 <= x < 60: return 2
    elif 60 <= x < 80: return 1)

* Mapeo de race a la columna ethnicity: 
  white -> 0
  black -> 1
  asian -> 2
  null -> 3


* Mapeo sex: 
  F -> 0
  M -> 1


* se hace el mapeo sex_ethnicity:
  etsex_mapping = {'M_0': 0, 'F_0': 1, 'M_1': 2, 'F_1': 3, 'M_2': 4, 'F_2': 5, 'M_3': 6, 'F_3': 7}
  M -> masculino
  F -> Fememino
  white -> 0
  black -> 1
  asian -> 2
  null -> 3



In [None]:
#============ Age =============

In [70]:
#Verifico que todos los valores de columna edad "anchor age" sean mayores a 0
if (image_mask_df_raw['anchor_age'] < 0).any():
    print("Hay valores menores a 0 en la columna.")
else:
    print("No hay valores menores a 0 en la columna.")

No hay valores menores a 0 en la columna.


In [None]:
#mapping age, from anchor_age to age

In [71]:
def bin_age(x):
    if pd.isnull(x): return None
    elif 0 <= x < 18: return 4
    elif 18 <= x < 40: return 3
    elif 40 <= x < 60: return 2
    elif 60 <= x < 80: return 1
    else: return 0

In [72]:
image_mask_df_raw['age'] = image_mask_df_raw['anchor_age'].apply(bin_age)

In [73]:
image_mask_df_raw.head()

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax,gender,anchor_age,race,Left Lung,Right Lung,Heart,Height,Width,age
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA,1.0,0.0,0.0,0.0,F,52,WHITE,792824 6 795358 20 797892 34 800426 48 802960 ...,812519 6 815054 18 817588 30 820123 42 822658 ...,3015910 10 3018442 28 3020973 47 3023504 66 30...,3056,2544,2
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA,1.0,0.0,0.0,0.0,F,52,WHITE,1019278 16 1021792 49 1024306 82 1026834 101 1...,1026209 9 1028738 27 1031267 45 1033797 62 103...,3288188 17 3290708 52 3293228 87 3295759 111 3...,3056,2544,2
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE,1451211 25 1453702 77 1456216 106 1458753 112 ...,1455655 10 1458178 29 1460700 49 1463222 69 14...,3479554 9 3482082 27 3484611 43 3487139 61 348...,2705,2539,2
3,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE,963370 5 965624 17 967879 28 970133 40 972388 ...,1375933 5 1378189 13 1380444 22 1382699 31 138...,3085691 8 3087944 25 3090197 42 3092450 59 309...,2906,2258,2
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,AP,1.0,0.0,0.0,0.0,F,52,WHITE,912403 7 914936 21 917468 37 920001 51 922534 ...,911748 8 914278 24 916808 41 919339 57 921869 ...,3089645 75 3092183 93 3094720 111 3097258 128 ...,3056,2544,2


In [None]:
#============ Age =============

In [None]:
#============Race======

In [74]:
image_mask_df_raw.rename(columns={'race': 'race_string'}, inplace=True)
image_mask_df_raw.head()

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax,gender,anchor_age,race_string,Left Lung,Right Lung,Heart,Height,Width,age
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA,1.0,0.0,0.0,0.0,F,52,WHITE,792824 6 795358 20 797892 34 800426 48 802960 ...,812519 6 815054 18 817588 30 820123 42 822658 ...,3015910 10 3018442 28 3020973 47 3023504 66 30...,3056,2544,2
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA,1.0,0.0,0.0,0.0,F,52,WHITE,1019278 16 1021792 49 1024306 82 1026834 101 1...,1026209 9 1028738 27 1031267 45 1033797 62 103...,3288188 17 3290708 52 3293228 87 3295759 111 3...,3056,2544,2
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE,1451211 25 1453702 77 1456216 106 1458753 112 ...,1455655 10 1458178 29 1460700 49 1463222 69 14...,3479554 9 3482082 27 3484611 43 3487139 61 348...,2705,2539,2
3,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE,963370 5 965624 17 967879 28 970133 40 972388 ...,1375933 5 1378189 13 1380444 22 1382699 31 138...,3085691 8 3087944 25 3090197 42 3092450 59 309...,2906,2258,2
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,AP,1.0,0.0,0.0,0.0,F,52,WHITE,912403 7 914936 21 917468 37 920001 51 922534 ...,911748 8 914278 24 916808 41 919339 57 921869 ...,3089645 75 3092183 93 3094720 111 3097258 128 ...,3056,2544,2


In [75]:
def race_mapping(x):
    if pd.isnull(x):
        return 3
    elif x.startswith("WHITE"):
        return 0
    elif x.startswith("BLACK"):
        return 1
    elif x.startswith("ASIAN"):
        return 2
    else: return 3 

In [76]:
image_mask_df_raw['race']=image_mask_df_raw['race_string'].map(race_mapping) 
image_mask_df_raw.head()

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax,gender,anchor_age,race_string,Left Lung,Right Lung,Heart,Height,Width,age,race
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA,1.0,0.0,0.0,0.0,F,52,WHITE,792824 6 795358 20 797892 34 800426 48 802960 ...,812519 6 815054 18 817588 30 820123 42 822658 ...,3015910 10 3018442 28 3020973 47 3023504 66 30...,3056,2544,2,0
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA,1.0,0.0,0.0,0.0,F,52,WHITE,1019278 16 1021792 49 1024306 82 1026834 101 1...,1026209 9 1028738 27 1031267 45 1033797 62 103...,3288188 17 3290708 52 3293228 87 3295759 111 3...,3056,2544,2,0
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE,1451211 25 1453702 77 1456216 106 1458753 112 ...,1455655 10 1458178 29 1460700 49 1463222 69 14...,3479554 9 3482082 27 3484611 43 3487139 61 348...,2705,2539,2,0
3,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE,963370 5 965624 17 967879 28 970133 40 972388 ...,1375933 5 1378189 13 1380444 22 1382699 31 138...,3085691 8 3087944 25 3090197 42 3092450 59 309...,2906,2258,2,0
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,AP,1.0,0.0,0.0,0.0,F,52,WHITE,912403 7 914936 21 917468 37 920001 51 922534 ...,911748 8 914278 24 916808 41 919339 57 921869 ...,3089645 75 3092183 93 3094720 111 3097258 128 ...,3056,2544,2,0


In [None]:
#============Race======

In [None]:
#===========Sex========

In [77]:
image_mask_df_raw.rename(columns={'gender': 'sex_string'}, inplace=True)
image_mask_df_raw.head()

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax,sex_string,anchor_age,race_string,Left Lung,Right Lung,Heart,Height,Width,age,race
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA,1.0,0.0,0.0,0.0,F,52,WHITE,792824 6 795358 20 797892 34 800426 48 802960 ...,812519 6 815054 18 817588 30 820123 42 822658 ...,3015910 10 3018442 28 3020973 47 3023504 66 30...,3056,2544,2,0
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA,1.0,0.0,0.0,0.0,F,52,WHITE,1019278 16 1021792 49 1024306 82 1026834 101 1...,1026209 9 1028738 27 1031267 45 1033797 62 103...,3288188 17 3290708 52 3293228 87 3295759 111 3...,3056,2544,2,0
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE,1451211 25 1453702 77 1456216 106 1458753 112 ...,1455655 10 1458178 29 1460700 49 1463222 69 14...,3479554 9 3482082 27 3484611 43 3487139 61 348...,2705,2539,2,0
3,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE,963370 5 965624 17 967879 28 970133 40 972388 ...,1375933 5 1378189 13 1380444 22 1382699 31 138...,3085691 8 3087944 25 3090197 42 3092450 59 309...,2906,2258,2,0
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,AP,1.0,0.0,0.0,0.0,F,52,WHITE,912403 7 914936 21 917468 37 920001 51 922534 ...,911748 8 914278 24 916808 41 919339 57 921869 ...,3089645 75 3092183 93 3094720 111 3097258 128 ...,3056,2544,2,0


In [78]:
image_mask_df_raw['sex'] = (image_mask_df_raw['sex_string'] == 'M').astype(int)
image_mask_df_raw.head()

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax,sex_string,anchor_age,race_string,Left Lung,Right Lung,Heart,Height,Width,age,race,sex
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA,1.0,0.0,0.0,0.0,F,52,WHITE,792824 6 795358 20 797892 34 800426 48 802960 ...,812519 6 815054 18 817588 30 820123 42 822658 ...,3015910 10 3018442 28 3020973 47 3023504 66 30...,3056,2544,2,0,0
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA,1.0,0.0,0.0,0.0,F,52,WHITE,1019278 16 1021792 49 1024306 82 1026834 101 1...,1026209 9 1028738 27 1031267 45 1033797 62 103...,3288188 17 3290708 52 3293228 87 3295759 111 3...,3056,2544,2,0,0
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE,1451211 25 1453702 77 1456216 106 1458753 112 ...,1455655 10 1458178 29 1460700 49 1463222 69 14...,3479554 9 3482082 27 3484611 43 3487139 61 348...,2705,2539,2,0,0
3,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE,963370 5 965624 17 967879 28 970133 40 972388 ...,1375933 5 1378189 13 1380444 22 1382699 31 138...,3085691 8 3087944 25 3090197 42 3092450 59 309...,2906,2258,2,0,0
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,AP,1.0,0.0,0.0,0.0,F,52,WHITE,912403 7 914936 21 917468 37 920001 51 922534 ...,911748 8 914278 24 916808 41 919339 57 921869 ...,3089645 75 3092183 93 3094720 111 3097258 128 ...,3056,2544,2,0,0


In [None]:
#===========Sex========

In [None]:
#===========Sex-Race========

In [79]:
sex_race_mapping = {'M_0': 0, 'F_0': 1, 'M_1': 2, 'F_1': 3, 'M_2': 4, 'F_2': 5, 'M_3': 6, 'F_3': 7}

image_mask_df_raw['sex_race'] = (image_mask_df_raw['sex_string'] + '_' + image_mask_df_raw['race'].astype(str)).map(sex_race_mapping)

image_mask_df_raw.head()

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax,sex_string,anchor_age,race_string,Left Lung,Right Lung,Heart,Height,Width,age,race,sex,sex_race
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA,1.0,0.0,0.0,0.0,F,52,WHITE,792824 6 795358 20 797892 34 800426 48 802960 ...,812519 6 815054 18 817588 30 820123 42 822658 ...,3015910 10 3018442 28 3020973 47 3023504 66 30...,3056,2544,2,0,0,1
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA,1.0,0.0,0.0,0.0,F,52,WHITE,1019278 16 1021792 49 1024306 82 1026834 101 1...,1026209 9 1028738 27 1031267 45 1033797 62 103...,3288188 17 3290708 52 3293228 87 3295759 111 3...,3056,2544,2,0,0,1
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE,1451211 25 1453702 77 1456216 106 1458753 112 ...,1455655 10 1458178 29 1460700 49 1463222 69 14...,3479554 9 3482082 27 3484611 43 3487139 61 348...,2705,2539,2,0,0,1
3,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,WHITE,963370 5 965624 17 967879 28 970133 40 972388 ...,1375933 5 1378189 13 1380444 22 1382699 31 138...,3085691 8 3087944 25 3090197 42 3092450 59 309...,2906,2258,2,0,0,1
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,AP,1.0,0.0,0.0,0.0,F,52,WHITE,912403 7 914936 21 917468 37 920001 51 922534 ...,911748 8 914278 24 916808 41 919339 57 921869 ...,3089645 75 3092183 93 3094720 111 3097258 128 ...,3056,2544,2,0,0,1


In [None]:
#===========Sex-Race========

In [None]:
======== MAPPING ========

In [None]:
========= SPLITTING THE DF AT PATIENT LEVEL ==========

In [None]:
# It is done according to the final number of patients in the final df: image_mask_df_raw 
#Total number of Patients: 49766

In [80]:
patients_unique_df = image_mask_df_raw.drop_duplicates(subset='subject_id', keep='first')
patients_unique_df.shape

(49766, 20)

In [81]:
patients_unique_df=patients_unique_df[["subject_id", "sex", "race", "sex_race", "age"]]
patients_unique_df.head()

Unnamed: 0,subject_id,sex,race,sex_race,age
0,10000032,0,0,1,2
5,10000764,1,0,0,0
6,10000935,0,1,3,2
12,10000980,0,1,3,1
21,10001176,0,0,1,1


In [82]:
print(len(patients_unique_df))

49766


In [None]:
#train ->  70%, test -> 15%, val -> 15%

In [83]:
from sklearn.model_selection import train_test_split

In [84]:
def split_patients_mimic_cxr(df, test_pct=0.15, val_pct=0.15):
    
    train_val_idx, test_idx = train_test_split(df.index, test_size=test_pct, random_state=42)
    train_idx, val_idx = train_test_split(
        train_val_idx, test_size=val_pct/(1-test_pct), random_state=42)

    df['split'] = 0  #  train
    df.loc[val_idx, 'split'] = 1  # Validation
    df.loc[test_idx, 'split'] = 2  # Test


In [85]:
split_patients_mimic_cxr(patients_unique_df, 0.15, 0.15)

In [86]:
patients_unique_df

Unnamed: 0,subject_id,sex,race,sex_race,age,split
0,10000032,0,0,1,2,0
5,10000764,1,0,0,0,0
6,10000935,0,1,3,2,0
12,10000980,0,1,3,1,0
21,10001176,0,0,1,1,2
...,...,...,...,...,...,...
204944,19999156,0,0,1,1,0
204945,19999287,0,1,3,1,0
204952,19999442,1,0,0,2,0
204953,19999733,0,0,1,3,0


In [88]:
split_counts = patients_unique_df['split'].value_counts()
split_counts

split
0    34836
2     7465
1     7465
Name: count, dtype: int64

In [89]:
split_counts = patients_unique_df['split'].value_counts()


total = split_counts.sum()
split_percentages = (split_counts / total) * 100


print("\nData distribution in demo_patients_df:")
print(split_percentages)


Data distribution in demo_patients_df:
split
0    69.999598
2    15.000201
1    15.000201
Name: count, dtype: float64


In [90]:
split_df=patients_unique_df[['subject_id','split']]
split_df

Unnamed: 0,subject_id,split
0,10000032,0
5,10000764,0
6,10000935,0
12,10000980,0
21,10001176,2
...,...,...
204944,19999156,0
204945,19999287,0
204952,19999442,0
204953,19999733,0


In [None]:
========= SPLITTING THE DF AT PATIENT LEVEL ==========

In [None]:
==========MERGE SPLIT AND image_mask_df_raw ===========

In [91]:
image_mask_df_raw=pd.merge(image_mask_df_raw, split_df, on=["subject_id"], how="inner")
image_mask_df_raw

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax,sex_string,anchor_age,...,Left Lung,Right Lung,Heart,Height,Width,age,race,sex,sex_race,split
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA,1.0,0.0,0.0,0.0,F,52,...,792824 6 795358 20 797892 34 800426 48 802960 ...,812519 6 815054 18 817588 30 820123 42 822658 ...,3015910 10 3018442 28 3020973 47 3023504 66 30...,3056,2544,2,0,0,1,0
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA,1.0,0.0,0.0,0.0,F,52,...,1019278 16 1021792 49 1024306 82 1026834 101 1...,1026209 9 1028738 27 1031267 45 1033797 62 103...,3288188 17 3290708 52 3293228 87 3295759 111 3...,3056,2544,2,0,0,1,0
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,...,1451211 25 1453702 77 1456216 106 1458753 112 ...,1455655 10 1458178 29 1460700 49 1463222 69 14...,3479554 9 3482082 27 3484611 43 3487139 61 348...,2705,2539,2,0,0,1,0
3,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,AP,1.0,0.0,0.0,0.0,F,52,...,963370 5 965624 17 967879 28 970133 40 972388 ...,1375933 5 1378189 13 1380444 22 1382699 31 138...,3085691 8 3087944 25 3090197 42 3092450 59 309...,2906,2258,2,0,0,1,0
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,AP,1.0,0.0,0.0,0.0,F,52,...,912403 7 914936 21 917468 37 920001 51 922534 ...,911748 8 914278 24 916808 41 919339 57 921869 ...,3089645 75 3092183 93 3094720 111 3097258 128 ...,3056,2544,2,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204952,16b6c70f-6d36bd77-89d2fef4-9c4b8b0a-79c69135,19999442,58708861,AP,1.0,0.0,0.0,0.0,M,41,...,988942 113 991996 119 995050 124 998104 130 10...,988237 96 991291 101 994345 106 997398 111 100...,3824644 13 3827696 40 3830748 66 3833800 84 38...,2544,3056,2,0,1,0,0
204953,3fcd0406-9b111603-feae7033-96632b3a-111333e5,19999733,57132437,PA,1.0,0.0,0.0,0.0,F,19,...,540875 10 543401 32 545927 53 548453 74 550979...,471487 6 474022 19 476557 32 479093 44 481628 ...,2585859 70 2588392 91 2590924 113 2593457 134 ...,3056,2544,3,0,0,1,0
204954,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,PA,1.0,0.0,0.0,0.0,F,19,...,566629 16 569145 48 571660 80 574189 99 576731...,537978 6 540514 18 543050 30 545587 41 548123 ...,2306306 14 2308848 43 2311390 71 2313932 88 23...,3056,2544,3,0,0,1,0
204955,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,AP,0.0,0.0,0.0,0.0,F,57,...,1739255 9 1741785 27 1744315 45 1746845 63 174...,1799597 6 1802135 16 1804673 26 1807211 36 180...,3697709 8 3700249 24 3702788 40 3705328 56 370...,3056,2544,2,3,0,7,0


In [92]:
split_counts = image_mask_df_raw['split'].value_counts()
split_counts

split
0    143526
2     31080
1     30351
Name: count, dtype: int64

In [93]:
split_counts = image_mask_df_raw['split'].value_counts()


total = split_counts.sum()
split_percentages = (split_counts / total) * 100


print("\nData distribution in demo_patients_df:")
print(split_percentages)


Data distribution in demo_patients_df:
split
0    70.027372
2    15.164156
1    14.808472
Name: count, dtype: float64


In [None]:
==========MERGE SPLIT AND image_mask_df_raw ===========

In [None]:
========== SAVE DF ============

In [None]:
#====Save final mapped DF (CXR, attr demo, masks)=====

In [229]:
image_mask_df = image_mask_df_raw[["dicom_id", "subject_id", "study_id", "ViewPosition", "No Finding", "Pleural Effusion", "Cardiomegaly", "Pneumothorax", "sex", "race", "sex_race", "age", "split", "Left Lung", "Right Lung", "Heart", "Width", "Height"]]
image_mask_df

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax,sex,race,sex_race,age,split,Left Lung,Right Lung,Heart,Width,Height
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA,1.0,0.0,0.0,0.0,0,0,1,2,0,792824 6 795358 20 797892 34 800426 48 802960 ...,812519 6 815054 18 817588 30 820123 42 822658 ...,3015910 10 3018442 28 3020973 47 3023504 66 30...,2544,3056
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA,1.0,0.0,0.0,0.0,0,0,1,2,0,1019278 16 1021792 49 1024306 82 1026834 101 1...,1026209 9 1028738 27 1031267 45 1033797 62 103...,3288188 17 3290708 52 3293228 87 3295759 111 3...,2544,3056
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP,1.0,0.0,0.0,0.0,0,0,1,2,0,1451211 25 1453702 77 1456216 106 1458753 112 ...,1455655 10 1458178 29 1460700 49 1463222 69 14...,3479554 9 3482082 27 3484611 43 3487139 61 348...,2539,2705
3,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,AP,1.0,0.0,0.0,0.0,0,0,1,2,0,963370 5 965624 17 967879 28 970133 40 972388 ...,1375933 5 1378189 13 1380444 22 1382699 31 138...,3085691 8 3087944 25 3090197 42 3092450 59 309...,2258,2906
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,AP,1.0,0.0,0.0,0.0,0,0,1,2,0,912403 7 914936 21 917468 37 920001 51 922534 ...,911748 8 914278 24 916808 41 919339 57 921869 ...,3089645 75 3092183 93 3094720 111 3097258 128 ...,2544,3056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204952,16b6c70f-6d36bd77-89d2fef4-9c4b8b0a-79c69135,19999442,58708861,AP,1.0,0.0,0.0,0.0,1,0,0,2,0,988942 113 991996 119 995050 124 998104 130 10...,988237 96 991291 101 994345 106 997398 111 100...,3824644 13 3827696 40 3830748 66 3833800 84 38...,3056,2544
204953,3fcd0406-9b111603-feae7033-96632b3a-111333e5,19999733,57132437,PA,1.0,0.0,0.0,0.0,0,0,1,3,0,540875 10 543401 32 545927 53 548453 74 550979...,471487 6 474022 19 476557 32 479093 44 481628 ...,2585859 70 2588392 91 2590924 113 2593457 134 ...,2544,3056
204954,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,PA,1.0,0.0,0.0,0.0,0,0,1,3,0,566629 16 569145 48 571660 80 574189 99 576731...,537978 6 540514 18 543050 30 545587 41 548123 ...,2306306 14 2308848 43 2311390 71 2313932 88 23...,2544,3056
204955,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,AP,0.0,0.0,0.0,0.0,0,3,7,2,0,1739255 9 1741785 27 1744315 45 1746845 63 174...,1799597 6 1802135 16 1804673 26 1807211 36 180...,3697709 8 3700249 24 3702788 40 3705328 56 370...,2544,3056


In [230]:
image_mask_df.shape

(204957, 18)

In [None]:
#inlcuide "path" and "reduced_path" for each image

In [266]:
subject_id = image_mask_df['subject_id'].astype(str)
study_id = image_mask_df['study_id'].astype(str)
dicom_id = image_mask_df['dicom_id'].astype(str)

#'reduced_path', pos 3
image_mask_df.insert(3, "reduced_path", "")

patient_start = "p" + subject_id.str[:2]  
image_mask_df['reduced_path'] = (
    patient_start + "/p" + subject_id + "/s" + study_id + "/" + dicom_id + ".jpg"
)

# base path - preprocessed imagees -> resize 256x256
base_path = "/home/lchanch/initial_image_prepro/downsample_img_256/"
full_path = base_path + image_mask_df['reduced_path'].str.replace(".jpg", ".png", regex=False)

image_mask_df.insert(4, "path", full_path)

image_mask_df.head(3)


Unnamed: 0,dicom_id,subject_id,study_id,reduced_path,path,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax,sex,race,sex_race,age,split,Left Lung,Right Lung,Heart,Width,Height
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,p10/p10000032/s50414267/02aa804e-bde0afdd-112c...,/home/lchanch/initial_image_prepro/downsample_...,PA,1.0,0.0,0.0,0.0,0,0,1,2,0,792824 6 795358 20 797892 34 800426 48 802960 ...,812519 6 815054 18 817588 30 820123 42 822658 ...,3015910 10 3018442 28 3020973 47 3023504 66 30...,2544,3056
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,p10/p10000032/s53189527/2a2277a9-b0ded155-c0de...,/home/lchanch/initial_image_prepro/downsample_...,PA,1.0,0.0,0.0,0.0,0,0,1,2,0,1019278 16 1021792 49 1024306 82 1026834 101 1...,1026209 9 1028738 27 1031267 45 1033797 62 103...,3288188 17 3290708 52 3293228 87 3295759 111 3...,2544,3056
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,p10/p10000032/s53911762/68b5c4b1-227d0485-9cc3...,/home/lchanch/initial_image_prepro/downsample_...,AP,1.0,0.0,0.0,0.0,0,0,1,2,0,1451211 25 1453702 77 1456216 106 1458753 112 ...,1455655 10 1458178 29 1460700 49 1463222 69 14...,3479554 9 3482082 27 3484611 43 3487139 61 348...,2539,2705


In [267]:
image_mask_df.shape

(204957, 20)

In [None]:
#====Save final mapped DF (CXR, attr demo, masks)=====

In [268]:
image_mask_df.to_csv('image_mask_df', index=False)

In [269]:
value = image_mask_df.loc[0, "path"]
print(value)

/home/lchanch/initial_image_prepro/downsample_img_256/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.png


In [None]:
#======Save final mapped DF without masks info -> for ERM ======

In [270]:
image_df=image_mask_df[["dicom_id", "subject_id", "study_id","reduced_path", "path", "ViewPosition", "No Finding", "Pleural Effusion", "Cardiomegaly", "Pneumothorax", "sex", "race", "sex_race", "age", "split"]]
image_df.head()

Unnamed: 0,dicom_id,subject_id,study_id,reduced_path,path,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax,sex,race,sex_race,age,split
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,p10/p10000032/s50414267/02aa804e-bde0afdd-112c...,/home/lchanch/initial_image_prepro/downsample_...,PA,1.0,0.0,0.0,0.0,0,0,1,2,0
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,p10/p10000032/s53189527/2a2277a9-b0ded155-c0de...,/home/lchanch/initial_image_prepro/downsample_...,PA,1.0,0.0,0.0,0.0,0,0,1,2,0
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,p10/p10000032/s53911762/68b5c4b1-227d0485-9cc3...,/home/lchanch/initial_image_prepro/downsample_...,AP,1.0,0.0,0.0,0.0,0,0,1,2,0
3,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,p10/p10000032/s53911762/fffabebf-74fd3a1f-673b...,/home/lchanch/initial_image_prepro/downsample_...,AP,1.0,0.0,0.0,0.0,0,0,1,2,0
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,p10/p10000032/s56699142/ea030e7a-2e3b1346-bc51...,/home/lchanch/initial_image_prepro/downsample_...,AP,1.0,0.0,0.0,0.0,0,0,1,2,0


In [271]:
image_df.shape

(204957, 15)

In [272]:
image_df.to_csv('image_df', index=False)

In [None]:
#======Save final mapped DF without masks info -> for ERM ======

In [None]:
#============Save patients_unique_df===============

In [235]:
patients_unique_df.head()

Unnamed: 0,subject_id,sex,race,sex_race,age,split
0,10000032,0,0,1,2,0
5,10000764,1,0,0,0,0
6,10000935,0,1,3,2,0
12,10000980,0,1,3,1,0
21,10001176,0,0,1,1,2


In [239]:
patients_unique_df.shape

(49766, 6)

In [236]:
patients_unique_df.to_csv('patients_df', index=False)

In [None]:
#============Save patients_unique_df===============

In [None]:
#============Save split_df===============

In [237]:
split_df.head()

Unnamed: 0,subject_id,split
0,10000032,0
5,10000764,0
6,10000935,0
12,10000980,0
21,10001176,2


In [238]:
split_df.to_csv('split_df', index=False)

In [None]:
#============Save split_df===============

In [None]:
#=============5% from the final mapped DF (image_mask_df)=====

In [273]:
image_mask_df_mini = pd.concat([
    image_mask_df[image_mask_df['split'] == 0].sample(frac=0.05, random_state=42),  # 5% -> train
    image_mask_df[image_mask_df['split'] == 1].sample(frac=0.05, random_state=42),  # 5% -> validation
    image_mask_df[image_mask_df['split'] == 2].sample(frac=0.05, random_state=42)   # 5% -> test
])

image_mask_df_mini

Unnamed: 0,dicom_id,subject_id,study_id,reduced_path,path,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax,sex,race,sex_race,age,split,Left Lung,Right Lung,Heart,Width,Height
177429,0743b02e-0ef83ee8-b31b456e-42821d06-073c9808,18651980,57211617,p18/p18651980/s57211617/0743b02e-0ef83ee8-b31b...,/home/lchanch/initial_image_prepro/downsample_...,AP,1.0,0.0,0.0,0.0,1,0,0,0,0,2118294 3 2120835 9 2123376 14 2125918 19 2128...,1908930 13 1911452 38 1913973 65 1916505 80 19...,3979814 9 3982353 26 3984893 42 3987432 59 398...,2544,3056
159596,c425338e-3ed004d2-c555efb0-c26a933d-17829050,17784248,57417630,p17/p17784248/s57417630/c425338e-3ed004d2-c555...,/home/lchanch/initial_image_prepro/downsample_...,PA,0.0,0.0,0.0,0.0,0,0,1,1,0,996242 77 998784 81 1001326 85 1003867 90 1006...,942189 8 944720 25 947251 41 949782 57 952313 ...,2855643 13 2858183 40 2860723 67 2863263 87 28...,2544,3056
199438,0794432d-d10ca3ec-70e46d72-3b6d26b4-254de0b8,19733031,59774406,p19/p19733031/s59774406/0794432d-d10ca3ec-70e4...,/home/lchanch/initial_image_prepro/downsample_...,AP,0.0,0.0,1.0,0.0,0,1,3,2,0,1159201 8 1161738 25 1164275 42 1166812 58 116...,1181459 6 1183991 17 1186523 29 1189055 41 119...,2466539 10 2469064 31 2471590 51 2474115 73 24...,2539,3050
140654,66184482-05fabbef-42b9124a-f43774fe-f2b90a31,16876797,57655655,p16/p16876797/s57655655/66184482-05fabbef-42b9...,/home/lchanch/initial_image_prepro/downsample_...,AP,0.0,0.0,1.0,0.0,1,0,0,1,0,252377 6 255425 20 258474 32 261522 46 264571 ...,288324 13 291358 41 294393 68 297428 95 300462...,1932811 173 1935839 203 1938866 233 1941906 25...,3056,2544
176596,757baa2a-907e2e45-c02d91ac-46f31040-fa9b1fa0,18616140,51247707,p18/p18616140/s51247707/757baa2a-907e2e45-c02d...,/home/lchanch/initial_image_prepro/downsample_...,AP,1.0,0.0,0.0,0.0,0,0,1,2,0,1245602 8 1248134 23 1250666 38 1253197 54 125...,1194143 11 1196667 34 1199191 56 1201724 70 12...,2809819 9 2812356 26 2814892 45 2817429 62 281...,2544,3056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18077,5504042a-8e7532fe-efd86d71-f8d83408-0ca376da,10922531,54681140,p10/p10922531/s54681140/5504042a-8e7532fe-efd8...,/home/lchanch/initial_image_prepro/downsample_...,AP,0.0,0.0,1.0,1.0,1,0,0,1,2,1844 352 4892 356 7941 359 10989 363 14038 365...,894 400 3942 403 6990 406 10039 408 13087 411 ...,2295196 7 2298237 20 2301278 33 2304319 46 230...,3050,2539
159505,13c91bb9-141b5b73-91fca3cf-28d18c86-93c0713d,17780252,50077267,p17/p17780252/s50077267/13c91bb9-141b5b73-91fc...,/home/lchanch/initial_image_prepro/downsample_...,PA,0.0,0.0,0.0,0.0,0,0,1,2,2,638237 6 639965 19 641694 31 643422 44 645151 ...,717532 6 719260 20 720988 34 722716 48 724444 ...,1745642 21 1747352 63 1749063 97 1750777 120 1...,1736,2022
73910,6073ad46-d6fc60fe-602b2a65-97f8532b-6239a162,13615149,51702602,p13/p13615149/s51702602/6073ad46-d6fc60fe-602b...,/home/lchanch/initial_image_prepro/downsample_...,AP,1.0,0.0,0.0,0.0,0,0,1,3,2,1432067 85 1435121 91 1438175 96 1441229 102 1...,1532267 16 1535294 49 1538321 82 1541361 101 1...,3488480 6 3491530 20 3494581 32 3497632 44 350...,3056,2544
122099,1efa888a-b2dedb24-e193ca00-115b5e8a-cb75fcbf,15951258,56205645,p15/p15951258/s56205645/1efa888a-b2dedb24-e193...,/home/lchanch/initial_image_prepro/downsample_...,AP,0.0,0.0,1.0,0.0,0,3,7,1,2,904673 6 907209 18 909745 29 912280 42 914816 ...,1033782 5 1036320 15 1038857 26 1041395 35 104...,2764127 91 2766666 119 2769205 147 2771744 164...,2544,3056


In [274]:
image_mask_df_mini.shape

(10248, 20)

In [275]:
split_counts_m = image_mask_df_mini['split'].value_counts()

total_m = split_counts_m.sum()
split_percentages_m = (split_counts_m / total_m) * 100

print("\nData distribution in image_mask_df_mini:")
print(split_percentages_m)


Data distribution in image_mask_df_mini:
split
0    70.023419
2    15.163934
1    14.812646
Name: count, dtype: float64


In [276]:
image_mask_df_mini.to_csv('image_mask_df_mini', index=False)

In [None]:
#=============5% from the final mapped DF (image_mask_df)=====

In [277]:
image_mask_df_mini.head(2)

Unnamed: 0,dicom_id,subject_id,study_id,reduced_path,path,ViewPosition,No Finding,Pleural Effusion,Cardiomegaly,Pneumothorax,sex,race,sex_race,age,split,Left Lung,Right Lung,Heart,Width,Height
177429,0743b02e-0ef83ee8-b31b456e-42821d06-073c9808,18651980,57211617,p18/p18651980/s57211617/0743b02e-0ef83ee8-b31b...,/home/lchanch/initial_image_prepro/downsample_...,AP,1.0,0.0,0.0,0.0,1,0,0,0,0,2118294 3 2120835 9 2123376 14 2125918 19 2128...,1908930 13 1911452 38 1913973 65 1916505 80 19...,3979814 9 3982353 26 3984893 42 3987432 59 398...,2544,3056
159596,c425338e-3ed004d2-c555efb0-c26a933d-17829050,17784248,57417630,p17/p17784248/s57417630/c425338e-3ed004d2-c555...,/home/lchanch/initial_image_prepro/downsample_...,PA,0.0,0.0,0.0,0.0,0,0,1,1,0,996242 77 998784 81 1001326 85 1003867 90 1006...,942189 8 944720 25 947251 41 949782 57 952313 ...,2855643 13 2858183 40 2860723 67 2863263 87 28...,2544,3056


In [None]:
#========10 samples from the pathologies=======

In [278]:
# Generate sample subsets for each pathology
list_pat = ["No Finding", "Pleural Effusion", "Cardiomegaly", "Pneumothorax"]

for pat in list_pat:
    # Replace spaces with underscores for file names
    pat_filename = pat.replace(" ", "_")

    # Select rows where pathology == 1
    sub_df_1 = image_mask_df[image_mask_df[pat] == 1]
    sampled_rows_1 = sub_df_1.sample(n=min(10, len(sub_df_1)), random_state=42)
    sampled_rows_1.to_csv(f"{pat_filename}_sample_1", index=False)

    # For 'No Finding', also take samples where label == 0
    if pat == "No Finding":
        sub_df_0 = image_mask_df[image_mask_df[pat] == 0]
        sampled_rows_0 = sub_df_0.sample(n=min(10, len(sub_df_0)), random_state=42)
        sampled_rows_0.to_csv(f"{pat_filename}_sample_0", index=False)


In [None]:
#========10 samples from the pathologies=======