

### Label Mapping

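Rows with `Support_Devices == 1` are excluded. Consolidation, Lung Opacity, and Pneumonia share index 4: that label is positive when any of the three is positive.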

| Condition           | Index |
|---------------------|-------|
| Cardiomegaly        | 0     |
| Pleural Effusion    | 1     |
| Edema               | 2     |
| Fracture            | 3     |
| Consolidation       | 4     |
| Lung Opacity        | 4     |
| Pneumonia           | 4     |
| No Finding          | 5     |


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import io
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import h5py
from tqdm import tqdm, trange

2025-03-02 07:59:14.150748: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-02 07:59:14.162782: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-02 07:59:14.179255: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-02 07:59:14.179278: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-02 07:59:14.190158: I tensorflow/core/platform/cpu_feature_gua

In [2]:
def setup_gpu():
    """Restrict TensorFlow to the first detected GPU and enable memory growth on it.

    Does nothing when no GPU is detected. A RuntimeError raised while
    configuring the device is printed instead of being propagated.
    """
    detected = tf.config.experimental.list_physical_devices('GPU')
    if not detected:
        return

    try:
        primary = detected[0]
        tf.config.experimental.set_visible_devices(primary, 'GPU')
        tf.config.experimental.set_memory_growth(primary, True)
        visible = tf.config.experimental.list_logical_devices('GPU')
        print(f"Physical GPUs: {detected}")
        print(f"Logical GPUs: {visible}")
    except RuntimeError as err:
        print(err)


setup_gpu()

Physical GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
Logical GPUs: [LogicalDevice(name='/device:GPU:0', device_type='GPU')]


2024-09-13 22:10:40.775449: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1762 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:65:00.0, compute capability: 8.6


In [3]:
def preprocess_image(jpg_bytes, target_size=(224, 224)):
    """Decode JPEG bytes and resize the image to a fixed spatial size.

    Args:
        jpg_bytes: Encoded JPEG content, passed to ``tf.io.decode_jpeg``.
        target_size: ``(height, width)`` to resize to. Defaults to
            ``(224, 224)``, the size this function previously hard-coded.

    Returns:
        The resized image as a NumPy array (``Tensor.numpy()``).
    """
    image = tf.io.decode_jpeg(jpg_bytes)
    image = tf.image.resize(image, list(target_size))  # Resize image
    # Convert to numpy array to ease GPU memory usage when not training directly
    return image.numpy()

In [None]:
# Load the label table; the split and labelling cells further down read `df`.
labels_csv = './dataset//mimic_chexpert.csv'
df = pd.read_csv(labels_csv)
# Per-study count of non-null values in each column (shown as the cell output).
df.groupby("study_id").count()

Unnamed: 0_level_0,subject_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged_Cardiomediastinum,Fracture,Lung_Lesion,Lung_Opacity,No_Finding,Pleural_Effusion,Pleural_Other,Pneumonia,Pneumothorax,Support_Devices
study_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
50000014,1,1,1,0,1,0,0,0,1,0,0,0,1,0,0
50000028,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
50000052,1,0,1,0,1,0,0,0,1,0,0,0,0,1,1
50000103,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0
50000125,1,0,1,0,1,0,0,0,1,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59999832,1,0,1,0,1,0,0,0,0,1,1,0,1,0,1
59999849,1,1,1,0,1,1,0,0,1,0,1,0,1,1,1
59999880,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
59999888,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0


In [None]:
# Read every serialized tf.train.Example from the TFRecord file and collect,
# per record, the study id, the preprocessed image and the DICOM id.
# tf.data.TFRecordDataset replaces the deprecated
# tf.compat.v1.python_io.tf_record_iterator; iterated eagerly it yields scalar
# string tensors, so .numpy() recovers the raw record bytes.
record_iterator = tf.data.TFRecordDataset('../../mimic-tf-record-withDicom.tfrecords')
study = list()
image = list()
dicom = list()

for string_record in tqdm(record_iterator, desc="Processing Records"):
    example = tf.train.Example()
    example.ParseFromString(string_record.numpy())
    features = example.features.feature

    dicom.append(features['dicom_id'].bytes_list.value[0].decode('utf-8'))
    study.append(features['study_id'].int64_list.value[0])
    # Records without image bytes keep their row, with None in place of the image.
    jpg_values = features['jpg_bytes'].bytes_list.value
    image.append(preprocess_image(jpg_values[0]) if jpg_values else None)

df_image = pd.DataFrame({"study_id": study, "image": image, "dicom_id": dicom})
df_image

In [None]:
import pickle

# Persist the three parallel lists collected from the TFRecords in one pickle file.
payload = {'study': study, 'image': image, 'dicom': dicom}
with open('./study_image_dicom.pkl', 'wb') as pkl_file:
    pickle.dump(payload, pkl_file)

In [None]:
# Write one HDF5 dataset per image, with the study id packed into an extra
# trailing row so the image and its id are stored together.
# NOTE(review): np.concatenate along axis 0 needs each image to be
# (224, 224, 1); a multi-channel image, or a row whose image is None
# (records without jpg_bytes), would raise here - confirm the inputs.
with h5py.File('j_arrays.h5', 'w') as f:
    rows = zip(df_image["image"], df_image["study_id"])
    for i, (img, study_id) in enumerate(tqdm(rows, total=len(df_image))):
        # Named id_row rather than `study`: rebinding `study` would overwrite
        # the list of study ids collected while reading the TFRecords.
        id_row = np.zeros((1, 224, 1))
        id_row[0][0][0] = study_id
        data = np.concatenate((img, id_row), axis=0)
        f.create_dataset(f"image_{i}", data=data)

# Train, val, test split

In [6]:
def split_by_subject_id(df):
    """Split rows into train, validation and test sets by patient.

    Rows with ``Support_Devices == 1`` are dropped first. The unique
    ``subject_id`` values are then divided roughly 80% / 10% / 10%, and each
    split keeps the rows of the subjects assigned to it.

    Args:
        df: DataFrame with ``subject_id`` and ``Support_Devices`` columns.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.
    """
    kept = df[df['Support_Devices'] != 1]
    subjects = kept['subject_id'].unique()

    # Hold out 20% of the subjects, then halve that hold-out into validation and test.
    train_subjects, holdout_subjects = train_test_split(subjects, test_size=0.2, random_state=42)
    val_subjects, test_subjects = train_test_split(holdout_subjects, test_size=0.5, random_state=42)

    def rows_for(subject_ids):
        return kept[kept['subject_id'].isin(subject_ids)]

    return rows_for(train_subjects), rows_for(val_subjects), rows_for(test_subjects)

In [7]:
train_df, val_df, test_df = split_by_subject_id(df)

# Balance dataset

In [10]:
def create_dataset(df, target_label, balance = False):
    """Add the binary label columns and select studies for one target label.

    Rows with ``Support_Devices == 1`` are dropped. Columns ``Label_0`` ..
    ``Label_5`` are then derived (1 where the source column equals 1, else 0;
    ``Label_4`` is 1 when the row-wise maximum of Consolidation, Lung_Opacity
    and Pneumonia equals 1). Studies are classed as positive or negative for
    ``target_label`` and, when ``balance`` is true, both classes are
    downsampled to the size of the smaller one.

    Args:
        df: DataFrame with ``study_id``, ``Support_Devices`` and the finding
            columns used below. It is not modified.
        target_label: Name of the label column to select on, e.g. ``'Label_0'``.
        balance: If true, downsample so positive and negative studies are
            equal in number (fixed ``random_state=42``).

    Returns:
        DataFrame with the rows of the selected studies, including the added
        ``Label_*`` columns.
    """
    # Work on an explicit copy: adding columns to a filtered slice triggers
    # pandas' SettingWithCopyWarning.
    df = df[df['Support_Devices'] != 1].copy()

    df['Label_0'] = df['Cardiomegaly'].eq(1).astype(int)
    df['Label_1'] = df['Pleural_Effusion'].eq(1).astype(int)
    df['Label_2'] = df['Edema'].eq(1).astype(int)
    df['Label_3'] = df['Fracture'].eq(1).astype(int)
    df['Label_4'] = df[['Consolidation', 'Lung_Opacity', 'Pneumonia']].max(axis=1).eq(1).astype(int)
    df['Label_5'] = df['No_Finding'].eq(1).astype(int)

    # Sum the target label over the rows of each study.
    study_totals = df.groupby('study_id').agg(
        target_label_count=(target_label, 'sum')
    ).reset_index()

    # A study is positive when at least one of its rows is positive. Testing
    # `== 1` would drop studies that have several positive rows from both classes.
    positive_studies = study_totals.loc[study_totals['target_label_count'] > 0, 'study_id']
    negative_studies = study_totals.loc[study_totals['target_label_count'] == 0, 'study_id']

    if balance:
        # Downsample both classes to the size of the smaller one.
        min_count = min(len(positive_studies), len(negative_studies))
        positive_studies = positive_studies.sample(min_count, random_state=42)
        negative_studies = negative_studies.sample(min_count, random_state=42)

    selected_studies = pd.concat([positive_studies, negative_studies])

    # Keep every original row that belongs to a selected study.
    return df[df['study_id'].isin(selected_studies)]

# Create balanced datasets

In [None]:
# For each of the six labels, write the balanced training split and the
# unbalanced validation and test splits as two-column CSVs (study_id, label).
for label_idx in range(6):
    label_col = f'Label_{label_idx}'
    csv_columns = ["study_id", label_col]

    balance_train_df = create_dataset(train_df, label_col, True)
    balance_train_df[csv_columns].to_csv(f'j_train_TEST_Label_{label_idx}.csv', index=False)

    original_val_df = create_dataset(val_df, label_col, False)
    original_val_df[csv_columns].to_csv(f'j_val_Label_{label_idx}.csv', index=False)

    original_test_df = create_dataset(test_df, label_col, False)
    original_test_df[csv_columns].to_csv(f'j_pred_Label_{label_idx}.csv', index=False)