In [None]:
print('ok')
import pandas as pd


ok
/local/home/jchen/code/trimodal-fairness/mfenv/bin/pip


In [15]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchxrayvision as xrv
import numpy as np
import pandas as pd
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Loading CheXpert DenseNet121 model...")
model = xrv.models.DenseNet(weights="densenet121-res224-chex").eval()


Loading CheXpert DenseNet121 model...


In [None]:
import torchxrayvision as xrv
import torch.nn as nn
import torch

def modify_classifier(model):
    """Ensures the classifier is a binary classification head (idempotent)."""
    
    # If classifier is already a Sequential(nn.Linear, nn.Sigmoid), return as-is
    if isinstance(model.classifier, nn.Sequential) and isinstance(model.classifier[-1], nn.Sigmoid):
        print("✅ Classifier is already modified. Skipping re-initialization.")
        return model
    
    # Handle case where classifier is already a Linear layer (not Sequential)
    if isinstance(model.classifier, nn.Linear):
        num_ftrs = model.classifier.in_features  # Extract input features
    else:
        num_ftrs = model.classifier[-1].in_features  # Extract from Sequential

    # Replace classifier with a binary classification head
    model.classifier = nn.Sequential(
        nn.Linear(num_ftrs, 1),
        nn.Sigmoid()
    )
    print("🔄 Classifier modified successfully.")
    
    return model

# Load CheXpert DenseNet121 model
model = xrv.models.DenseNet(weights="densenet121-res224-chex").eval()

# Modify classifier if needed (idempotent)
model = modify_classifier(model)

# Move model to the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.502], std=[0.289])
])

🔄 Classifier modified successfully.


In [38]:
import os
import torch
import pandas as pd
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image

class XRayDataset(Dataset):
    def __init__(self, csv_file, img_dir, label_col="outcome_hospitalization", transform=None, max_rows=None):
        """
        Args:
            csv_file (str): Path to the CSV file.
            img_dir (str): Base directory containing MIMIC-CXR images.
            label_col (str): Column name in CSV that contains labels.
            transform: Transformations to apply to images.
            max_rows (int, optional): Maximum number of rows to load from the CSV.
        """
        self.data = pd.read_csv(csv_file, nrows=max_rows)

        # Drop rows with missing required columns
        self.data = self.data.dropna(subset=["subject_id", "study_id", "dicom_id", label_col])

        # Truncate dataset if max_rows is specified
        if max_rows is not None:
            self.data = self.data.iloc[:max_rows]

        self.img_dir = img_dir
        self.label_col = label_col
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        try:
            # Extract subject_id, study_id, dicom_id
            subject_id = str(int(self.data.iloc[idx]['subject_id']))  # Convert safely
            study_id = str(int(self.data.iloc[idx]['study_id']))  # Convert safely
            dicom_id = str(self.data.iloc[idx]['dicom_id'])  # Keep as string

            # Construct subject_id prefix (first 2 digits of subject_id) with "p" prefix
            subject_id_prefix = f"p{subject_id[:2]}"  # Example: "p12" for subject_id 12000146

            study_id_fixed = f"p{subject_id[:2]}"
            # Construct study_id prefix with "s"
            study_id_prefixed = f"s{study_id}"  # Example: "s52988409"

            # Construct the full path dynamically
            img_path = os.path.join(self.img_dir, subject_id_prefix, study_id_fixed, study_id_prefixed, f"{dicom_id}.jpg")

            # Check if image exists
            if not os.path.exists(img_path):
                raise FileNotFoundError(f"❌ Image not found: {img_path}")

            # Load and transform the image
            img = Image.open(img_path).convert('RGB')
            if self.transform:
                img = self.transform(img)

            # Get label dynamically
            label = torch.tensor(self.data.iloc[idx][self.label_col], dtype=torch.float32)

            return img, label
        
        except Exception as e:
            print(f"❌ Error processing index {idx}: {e}")
            return None  # Return None so DataLoader can handle it properly


In [40]:
# Load Dataset
print("Loading dataset...")
train_dataset = XRayDataset(csv_file="/local/data/jchen/physionet.org/files/torch_dataset/train_multimodal.csv", 
                            img_dir="/local/data/jchen/physionet.org/files/mimic-cxr-jpg/2.1.0/files",
                            label_col='outcome_hospitalization',transform=transform, max_rows=10000)

test_dataset = XRayDataset(csv_file="/local/data/jchen/physionet.org/files/torch_dataset/test_multimodal.csv", 
                           img_dir="/local/data/jchen/physionet.org/files/mimic-cxr-jpg/2.1.0/files", 
                           transform=transform,
                           max_rows=100)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

Loading dataset...


In [1]:

# Freeze Feature Extractor Layers
for param in model.features.parameters():
    param.requires_grad = False

# Define Loss and Optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=1e-4)

# Train the Classifier Head
print("Training classifier head...")
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}")

# Unfreeze Last Dense Block for Fine-Tuning
for param in model.features.denseblock4.parameters():
    param.requires_grad = True

# Fine-Tune Model
print("Fine-tuning the model...")
optimizer = optim.Adam(model.parameters(), lr=1e-5)
num_finetune_epochs = 5

for epoch in range(num_finetune_epochs):
    model.train()
    total_loss = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Fine-tune Epoch [{epoch+1}/{num_finetune_epochs}], Loss: {total_loss / len(train_loader):.4f}")

# Evaluate Model Performance
print("Evaluating model...")
model.eval()
true_labels, predictions = [], []

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)

        outputs = model(images).squeeze()
        true_labels.extend(labels.cpu().numpy())
        predictions.extend(outputs.cpu().numpy())

# Compute AUC-ROC
auc_score = roc_auc_score(true_labels, predictions)
print(f"Final AUC-ROC Score: {auc_score:.4f}")

NameError: name 'model' is not defined

In [9]:
MULTIMODAL_PATH = '/local/data/jchen/physionet.org/files/mimic-iv-ed-oct21-full/master_dataset_multimodal_final_oct21.csv'

from sklearn.model_selection import train_test_split

In [10]:
multimodal_dataset = pd.read_csv(MULTIMODAL_PATH)



# Define the outcome variable
target_col = "outcome_hospitalization"

# Train-test split (80% train, 20% test)
train_df, test_df = train_test_split(multimodal_dataset, test_size=0.2, random_state=42, stratify=multimodal_dataset[target_col])

# Check split sizes
print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")


Train size: 553756, Test size: 138439


In [11]:
# Define directory and file paths
BASE_DIR = "/local/data/jchen/physionet.org/files/torch_dataset"
os.makedirs(BASE_DIR, exist_ok=True)  # Create directory if it doesn't exist

TRAIN_PATH = os.path.join(BASE_DIR, "train_multimodal.csv")
TEST_PATH = os.path.join(BASE_DIR, "test_multimodal.csv")

# Save to CSV
train_df.to_csv(TRAIN_PATH, index=False)
test_df.to_csv(TEST_PATH, index=False)

print(f"Train dataset saved to {TRAIN_PATH}")
print(f"Test dataset saved to {TEST_PATH}")

Train dataset saved to /local/data/jchen/physionet.org/files/torch_dataset/train_multimodal.csv
Test dataset saved to /local/data/jchen/physionet.org/files/torch_dataset/test_multimodal.csv


In [5]:
# Specify the columns to move to the front
cols_to_front = ['study_id', 'subject_id', 'hadm_id']

# Reorder the DataFrame
multimodal_dataset = multimodal_dataset[cols_to_front + [col for col in multimodal_dataset.columns if col not in cols_to_front]]


In [6]:
print(len(multimodal_dataset))

692195


In [7]:
multimodal_dataset

Unnamed: 0,study_id,subject_id,hadm_id,dicom_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,...,note_seq_radiology,charttime_radiology,storetime_radiology,text_radiology,note_id_discharge,note_type_discharge,note_seq_discharge,charttime_discharge,storetime_discharge,text_discharge
0,55012421.0,17195991,23542772.0,c2a3cbe8-ad80fc7b-9696e471-e41fb7a7-b0a2d9d9,CHEST (PORTABLE AP),AP,3056.0,2544.0,21100111.0,220111.359,...,24.0,2110-01-15 15:16:00,2110-01-15 16:40:00,INDICATION: Drop attacks.\n\nCOMPARISON: Non...,17195991-DS-3,DS,3.0,2110-01-18 00:00:00,2110-01-18 10:47:00,\nName: ___ Unit No: ___\n...
1,54821803.0,19558713,,75d7e206-02c947f4-fbb2eb52-ac30500b-bda33bf3,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21100115.0,143845.406,...,,,,,,,,,,
2,54821803.0,19558713,,03eb85f3-96d85327-e813c974-21a4c789-0ee83971,CHEST (PA AND LAT),PA,3056.0,2544.0,21100115.0,143845.406,...,,,,,,,,,,
3,50633975.0,12426170,25971308.0,c634aa41-1709f8d3-d8db1c04-1241dbf1-7f41fe39,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21100127.0,192228.671,...,5.0,2110-01-27 19:37:00,2110-01-27 23:09:00,INDICATION: ___ with recent fall on R. with ...,12426170-DS-10,DS,10.0,2110-02-01 00:00:00,2110-02-01 16:35:00,\nName: ___ Unit No: ___\...
4,50633975.0,12426170,25971308.0,c1471c1c-389f6f29-4e4fb33a-ce0e07de-a91a71d3,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21100127.0,192228.671,...,5.0,2110-01-27 19:37:00,2110-01-27 23:09:00,INDICATION: ___ with recent fall on R. with ...,12426170-DS-10,DS,10.0,2110-02-01 00:00:00,2110-02-01 16:35:00,\nName: ___ Unit No: ___\...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692190,,19999828,25744818.0,,,,,,,,...,24.0,2149-01-08 11:07:00,2149-01-08 11:26:00,EXAMINATION: UNILAT LOWER EXT VEINS RIGHT\n\n...,19999828-DS-7,DS,7.0,2149-01-18 00:00:00,2149-01-19 07:03:00,\nName: ___ Unit No: ___...
692191,,19999828,25744818.0,,,,,,,,...,25.0,2149-01-08 17:05:00,2149-01-08 18:14:00,EXAMINATION: SECOND OPINION CT TORSO\n\nINDIC...,19999828-DS-7,DS,7.0,2149-01-18 00:00:00,2149-01-19 07:03:00,\nName: ___ Unit No: ___...
692192,,19999828,25744818.0,,,,,,,,...,26.0,2149-01-09 21:30:00,2149-01-09 23:08:00,EXAMINATION: CT ABDOMEN AND PELVIS WITH CONTR...,19999828-DS-7,DS,7.0,2149-01-18 00:00:00,2149-01-19 07:03:00,\nName: ___ Unit No: ___...
692193,,19999828,25744818.0,,,,,,,,...,27.0,2149-01-13 18:50:00,2149-01-13 19:13:00,EXAMINATION: UNILAT LOWER EXT VEINS RIGHT\n\n...,19999828-DS-7,DS,7.0,2149-01-18 00:00:00,2149-01-19 07:03:00,\nName: ___ Unit No: ___...


In [8]:
unique_rows = multimodal_dataset.drop_duplicates(subset=['study_id', 'subject_id', 'hadm_id'])

In [9]:
print(len(unique_rows))

331004


In [10]:
from itertools import islice

# Group the DataFrame by the specified columns
grouped = multimodal_dataset.groupby(['study_id', 'subject_id', 'hadm_id'])

# Print the first 5 groups
grouped


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fee840ae270>

In [11]:
len(grouped)

331004

In [12]:
for i, (group_keys, group) in enumerate(grouped):
    if i >= 5:  # Limit to the first 5 groups
        break
    display(f"Group: {group_keys}", group)

'Group: (50000014.0, 11941242, 20712112.0)'

Unnamed: 0,study_id,subject_id,hadm_id,dicom_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,...,note_seq_radiology,charttime_radiology,storetime_radiology,text_radiology,note_id_discharge,note_type_discharge,note_seq_discharge,charttime_discharge,storetime_discharge,text_discharge
108674,50000014.0,11941242,20712112.0,dffc8ab2-ff37704f-2fb29e6d-51e08075-88bca914,CHEST (PORTABLE AP),AP,2544.0,3056.0,21720525.0,132301.281,...,18.0,2172-05-25 13:17:00,2172-05-25 14:15:00,EXAMINATION: Chest radiograph\n\nINDICATION: ...,11941242-DS-14,DS,14.0,2172-05-29 00:00:00,2172-05-30 08:48:00,\nName: ___ Unit No: ___...


'Group: (50000186.0, 14444780, 23700853.0)'

Unnamed: 0,study_id,subject_id,hadm_id,dicom_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,...,note_seq_radiology,charttime_radiology,storetime_radiology,text_radiology,note_id_discharge,note_type_discharge,note_seq_discharge,charttime_discharge,storetime_discharge,text_discharge
5918,50000186.0,14444780,23700853.0,c7dadf13-58bc3fd8-7e7f4ba4-9ac13218-43aae1b2,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21171130.0,144537.781,...,21.0,2117-11-30 14:44:00,2117-11-30 15:10:00,"INDICATION: Fever, sweats, abdominal pain, cr...",14444780-DS-18,DS,18.0,2117-12-05 00:00:00,2117-12-05 13:36:00,\nName: ___ Unit No: __...
5919,50000186.0,14444780,23700853.0,93bcf53f-7c91b330-3738f326-4d31769d-6cff6fe5,CHEST (PA AND LAT),PA,3056.0,2544.0,21171130.0,144537.781,...,21.0,2117-11-30 14:44:00,2117-11-30 15:10:00,"INDICATION: Fever, sweats, abdominal pain, cr...",14444780-DS-18,DS,18.0,2117-12-05 00:00:00,2117-12-05 13:36:00,\nName: ___ Unit No: __...


'Group: (50000511.0, 13658672, 20440549.0)'

Unnamed: 0,study_id,subject_id,hadm_id,dicom_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,...,note_seq_radiology,charttime_radiology,storetime_radiology,text_radiology,note_id_discharge,note_type_discharge,note_seq_discharge,charttime_discharge,storetime_discharge,text_discharge
56738,50000511.0,13658672,20440549.0,5f930e4e-77b45876-8c3d6f4e-b5320395-9bd041f5,CHEST (PORTABLE AP),AP,2438.0,3050.0,21460427.0,203404.359,...,14.0,2146-04-27 20:29:00,2146-04-27 21:45:00,HISTORY: ___ male with seizure.\n\nCOMPARISON...,13658672-DS-13,DS,13.0,2146-05-01 00:00:00,2146-05-02 15:18:00,\nName: ___ Unit No: ___\n...


'Group: (50001064.0, 13987926, 28222327.0)'

Unnamed: 0,study_id,subject_id,hadm_id,dicom_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,...,note_seq_radiology,charttime_radiology,storetime_radiology,text_radiology,note_id_discharge,note_type_discharge,note_seq_discharge,charttime_discharge,storetime_discharge,text_discharge
59122,50001064.0,13987926,28222327.0,ba8dd57c-0f556403-b9de02a9-26e605db-fd582102,CHEST (PA AND LAT),PA,3056.0,2544.0,21470629.0,162406.578,...,70.0,2147-06-30 13:02:00,2147-06-30 14:01:00,INDICATION: ___ female with dyspnea. Evaluate...,13987926-DS-21,DS,21.0,2147-07-01 00:00:00,2147-07-01 15:04:00,\nName: ___ Unit No: ___\...
59123,50001064.0,13987926,28222327.0,830d71e2-be3d4c58-a0681d7c-6464f275-345d0672,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21470629.0,162406.578,...,70.0,2147-06-30 13:02:00,2147-06-30 14:01:00,INDICATION: ___ female with dyspnea. Evaluate...,13987926-DS-21,DS,21.0,2147-07-01 00:00:00,2147-07-01 15:04:00,\nName: ___ Unit No: ___\...


'Group: (50001372.0, 14740030, 26729524.0)'

Unnamed: 0,study_id,subject_id,hadm_id,dicom_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,...,note_seq_radiology,charttime_radiology,storetime_radiology,text_radiology,note_id_discharge,note_type_discharge,note_seq_discharge,charttime_discharge,storetime_discharge,text_discharge
149179,50001372.0,14740030,26729524.0,242cace3-313f3e5c-1ca8bb37-774a4721-063ec0da,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21921030.0,14333.671,...,16.0,2192-10-30 07:13:00,2192-10-30 08:39:00,INDICATION: Pleuritic chest pain and positive...,,,,,,
149180,50001372.0,14740030,26729524.0,50d7d528-db58e775-acfc0b46-f0dac914-9e550434,CHEST (PA AND LAT),PA,3056.0,2544.0,21921030.0,14333.671,...,16.0,2192-10-30 07:13:00,2192-10-30 08:39:00,INDICATION: Pleuritic chest pain and positive...,,,,,,
149181,50001372.0,14740030,26729524.0,b76054dd-3647b2ec-9eb0d921-eb480e79-a141b24f,CHEST (PA AND LAT),PA,3056.0,2544.0,21921030.0,14333.671,...,16.0,2192-10-30 07:13:00,2192-10-30 08:39:00,INDICATION: Pleuritic chest pain and positive...,,,,,,


# deep dive into just PA xrays

In [14]:
len(multimodal_dataset[multimodal_dataset['ViewPosition']=='PA'])

53806

In [16]:
len(multimodal_dataset[multimodal_dataset['ViewPosition']=='LATERAL'])

70636

In [19]:
multimodal_dataset['ViewPosition'].value_counts()

ViewPosition
LATERAL           70636
PA                53806
AP                36871
LL                  156
LAO                   3
RAO                   3
XTABLE LATERAL        1
AP RLD                1
PA LLD                1
Name: count, dtype: int64

In [None]:
# we will need to train our own model
# and predict the hospitalizaiton outcome
# distribution of labels
multimodal_dataset[multimodal_dataset['ViewPosition']=='LATERAL']['outcome_hospitalization'].value_counts()


outcome_hospitalization
True     45306
False    25330
Name: count, dtype: int64

In [1]:
import torchxrayvision as xrv

ModuleNotFoundError: No module named 'torchxrayvision'

In [22]:
# in theory, the studyID should be the most narrow

multimodal_dataset[multimodal_dataset['study_id']==50001064.0]

Unnamed: 0,study_id,subject_id,hadm_id,dicom_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,...,note_seq_radiology,charttime_radiology,storetime_radiology,text_radiology,note_id_discharge,note_type_discharge,note_seq_discharge,charttime_discharge,storetime_discharge,text_discharge
59122,50001064.0,13987926,28222327.0,ba8dd57c-0f556403-b9de02a9-26e605db-fd582102,CHEST (PA AND LAT),PA,3056.0,2544.0,21470629.0,162406.578,...,70.0,2147-06-30 13:02:00,2147-06-30 14:01:00,INDICATION: ___ female with dyspnea. Evaluate...,13987926-DS-21,DS,21.0,2147-07-01 00:00:00,2147-07-01 15:04:00,\nName: ___ Unit No: ___\...
59123,50001064.0,13987926,28222327.0,830d71e2-be3d4c58-a0681d7c-6464f275-345d0672,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21470629.0,162406.578,...,70.0,2147-06-30 13:02:00,2147-06-30 14:01:00,INDICATION: ___ female with dyspnea. Evaluate...,13987926-DS-21,DS,21.0,2147-07-01 00:00:00,2147-07-01 15:04:00,\nName: ___ Unit No: ___\...


In [24]:
multimodal_dataset['study_id'].nunique()
multimodal_dataset['study_size'] = multimodal_dataset.groupby('study_id')['study_id'].transform('size')
# sometimes there will be multiple studyID for a single subject, hadm_id

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multimodal_dataset['study_size'] = multimodal_dataset.groupby('study_id')['study_id'].transform('size')


In [25]:
largest_groups = multimodal_dataset.sort_values(by='study_size', ascending=False)

In [26]:
largest_groups

Unnamed: 0,study_id,subject_id,hadm_id,dicom_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,...,charttime_radiology,storetime_radiology,text_radiology,note_id_discharge,note_type_discharge,note_seq_discharge,charttime_discharge,storetime_discharge,text_discharge,study_size
2883,57841705.0,18948429,20303843.0,1771f20c-803e0d52-aee954fc-0ce6e22b-9c32fecd,CHEST (PORTABLE AP),AP,3056.0,2544.0,21141022.0,202752.312,...,2114-10-22 08:38:00,2114-10-22 12:26:00,REASON FOR THE EXAMINATION: This is a ___ wom...,18948429-DS-22,DS,22.0,2114-10-23 00:00:00,2114-10-23 05:03:00,\nName: ___ Unit No: _...,6.0
107650,53856634.0,18866430,27948099.0,693a687d-cb975d86-137c07d0-498db21b-bad56f79,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21711201.0,121841.734,...,2171-12-01 15:09:00,2171-12-01 17:05:00,INDICATION: ___ year old male patient with pri...,18866430-DS-29,DS,29.0,2171-12-03 00:00:00,2171-12-04 07:23:00,\nName: ___ Unit No: ...,6.0
107649,53856634.0,18866430,27948099.0,0a4aaf2d-87876a72-ac93ec95-0070d25e-aad7c612,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21711201.0,121841.734,...,2171-12-01 15:09:00,2171-12-01 17:05:00,INDICATION: ___ year old male patient with pri...,18866430-DS-29,DS,29.0,2171-12-03 00:00:00,2171-12-04 07:23:00,\nName: ___ Unit No: ...,6.0
19814,52761747.0,13823645,,018887a6-4bbfa4b6-022cab1b-47faa8e6-64c1a2f5,CHEST (PA AND LAT),PA,3056.0,2544.0,21271128.0,82858.125,...,,,,,,,,,,6.0
2879,57841705.0,18948429,20303843.0,3b7b7fdf-09e0c19e-2cb9193e-ff533c79-4d26b220,CHEST (PORTABLE AP),AP,3056.0,2544.0,21141022.0,202752.312,...,2114-10-22 08:38:00,2114-10-22 12:26:00,REASON FOR THE EXAMINATION: This is a ___ wom...,18948429-DS-22,DS,22.0,2114-10-23 00:00:00,2114-10-23 05:03:00,\nName: ___ Unit No: _...,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692190,,19999828,25744818.0,,,,,,,,...,2149-01-08 11:07:00,2149-01-08 11:26:00,EXAMINATION: UNILAT LOWER EXT VEINS RIGHT\n\n...,19999828-DS-7,DS,7.0,2149-01-18 00:00:00,2149-01-19 07:03:00,\nName: ___ Unit No: ___...,
692191,,19999828,25744818.0,,,,,,,,...,2149-01-08 17:05:00,2149-01-08 18:14:00,EXAMINATION: SECOND OPINION CT TORSO\n\nINDIC...,19999828-DS-7,DS,7.0,2149-01-18 00:00:00,2149-01-19 07:03:00,\nName: ___ Unit No: ___...,
692192,,19999828,25744818.0,,,,,,,,...,2149-01-09 21:30:00,2149-01-09 23:08:00,EXAMINATION: CT ABDOMEN AND PELVIS WITH CONTR...,19999828-DS-7,DS,7.0,2149-01-18 00:00:00,2149-01-19 07:03:00,\nName: ___ Unit No: ___...,
692193,,19999828,25744818.0,,,,,,,,...,2149-01-13 18:50:00,2149-01-13 19:13:00,EXAMINATION: UNILAT LOWER EXT VEINS RIGHT\n\n...,19999828-DS-7,DS,7.0,2149-01-18 00:00:00,2149-01-19 07:03:00,\nName: ___ Unit No: ___...,


In [27]:
multimodal_dataset[multimodal_dataset['study_id']==53856634.0]

Unnamed: 0,study_id,subject_id,hadm_id,dicom_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,...,charttime_radiology,storetime_radiology,text_radiology,note_id_discharge,note_type_discharge,note_seq_discharge,charttime_discharge,storetime_discharge,text_discharge,study_size
107647,53856634.0,18866430,27948099.0,ac14d54a-ede29cf8-3efc77fc-3754f0ff-62308ff5,CHEST (PA AND LAT),AP,2544.0,3056.0,21711201.0,121841.734,...,2171-12-01 15:09:00,2171-12-01 17:05:00,INDICATION: ___ year old male patient with pri...,18866430-DS-29,DS,29.0,2171-12-03 00:00:00,2171-12-04 07:23:00,\nName: ___ Unit No: ...,6.0
107648,53856634.0,18866430,27948099.0,50aabb09-42b20c64-e36361f1-242f703d-69fc3175,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21711201.0,121841.734,...,2171-12-01 15:09:00,2171-12-01 17:05:00,INDICATION: ___ year old male patient with pri...,18866430-DS-29,DS,29.0,2171-12-03 00:00:00,2171-12-04 07:23:00,\nName: ___ Unit No: ...,6.0
107649,53856634.0,18866430,27948099.0,0a4aaf2d-87876a72-ac93ec95-0070d25e-aad7c612,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21711201.0,121841.734,...,2171-12-01 15:09:00,2171-12-01 17:05:00,INDICATION: ___ year old male patient with pri...,18866430-DS-29,DS,29.0,2171-12-03 00:00:00,2171-12-04 07:23:00,\nName: ___ Unit No: ...,6.0
107650,53856634.0,18866430,27948099.0,693a687d-cb975d86-137c07d0-498db21b-bad56f79,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21711201.0,121841.734,...,2171-12-01 15:09:00,2171-12-01 17:05:00,INDICATION: ___ year old male patient with pri...,18866430-DS-29,DS,29.0,2171-12-03 00:00:00,2171-12-04 07:23:00,\nName: ___ Unit No: ...,6.0
107651,53856634.0,18866430,27948099.0,27559035-763f9da7-194e7101-1f28f3fc-514a5152,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21711201.0,121841.734,...,2171-12-01 15:09:00,2171-12-01 17:05:00,INDICATION: ___ year old male patient with pri...,18866430-DS-29,DS,29.0,2171-12-03 00:00:00,2171-12-04 07:23:00,\nName: ___ Unit No: ...,6.0
107652,53856634.0,18866430,27948099.0,52e49970-a3911eee-c5602596-55b9b862-10836713,CHEST (PA AND LAT),AP,2544.0,3056.0,21711201.0,121841.734,...,2171-12-01 15:09:00,2171-12-01 17:05:00,INDICATION: ___ year old male patient with pri...,18866430-DS-29,DS,29.0,2171-12-03 00:00:00,2171-12-04 07:23:00,\nName: ___ Unit No: ...,6.0


In [29]:
multimodal_dataset[multimodal_dataset['study_id']==53856634.0]['text_radiology'].tolist()

['INDICATION: ___ year old male patient with prior history of CVAs presenting\nwith lethargy. \n\nTECHNIQUE: Contiguous axial MDCT images were obtained through the brain\nwithout administration of IV contrast. Reformatted coronal and sagittal thin\nsection bone algorithm-reconstructed images were acquired.\n\nTotal DLP: 897.50 mGy-cm\nCTDI: 62.34 mGy\n\nCOMPARISON: NECT of the head on ___.\n\nFINDINGS: There is no evidence of hemorrhage, edema, mass effect, or\ninfarction. There are multiple regions of unchanged encephalomalacia,\nconsistent with prior infarctions. There are areas of low densities within the\nbifrontal lobes likely consistent with prior contusions or less likely\ninfarction. Prominent ventricles and sulci suggest age-related involutional\nchanges or atrophy. Periventricular white matter hypodensities are consistent\nwith chronic small vessel ischemic disease.\n\nNo fracture is identified. Mucosal thickening is noted in the ethmoidal air\ncells. Atherosclerotic mural ca

In [31]:
print(multimodal_dataset.columns.tolist())

['study_id', 'subject_id', 'hadm_id', 'dicom_id', 'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns', 'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning', 'ViewCodeSequence_CodeMeaning', 'PatientOrientationCodeSequence_CodeMeaning', 'PStudyTime', 'PStudyDateTime', 'index', 'stay_id', 'intime', 'outtime', 'gender', 'race', 'arrival_transport', 'disposition', 'anchor_age', 'anchor_year', 'dod', 'admittime', 'dischtime', 'deathtime', 'ethnicity', 'edregtime', 'edouttime', 'insurance', 'in_year', 'age', 'outcome_inhospital_mortality', 'ed_los', 'intime_icu', 'time_to_icu_transfer', 'outcome_icu_transfer_12h', 'outcome_hospitalization', 'outcome_critical', 'n_ed_30d', 'n_ed_90d', 'n_ed_365d', 'next_ed_visit_time', 'next_ed_visit_time_diff', 'outcome_ed_revisit_3d', 'n_hosp_30d', 'n_hosp_90d', 'n_hosp_365d', 'n_icu_30d', 'n_icu_90d', 'n_icu_365d', 'ed_los_hours', 'time_to_icu_transfer_hours', 'next_ed_visit_time_diff_days', 'triage_temperature', 'triage_heartrat

In [37]:
outcome_grouped = multimodal_dataset.groupby(['study_id', 'subject_id', 'hadm_id', 'outcome_hospitalization'])


In [38]:
len(outcome_grouped)

331004

In [47]:
sample_df = multimodal_dataset[multimodal_dataset['study_id']==53856634.0]

In [48]:
sample_df

Unnamed: 0,study_id,subject_id,hadm_id,dicom_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,...,charttime_radiology,storetime_radiology,text_radiology,note_id_discharge,note_type_discharge,note_seq_discharge,charttime_discharge,storetime_discharge,text_discharge,study_size
107647,53856634.0,18866430,27948099.0,ac14d54a-ede29cf8-3efc77fc-3754f0ff-62308ff5,CHEST (PA AND LAT),AP,2544.0,3056.0,21711201.0,121841.734,...,2171-12-01 15:09:00,2171-12-01 17:05:00,INDICATION: ___ year old male patient with pri...,18866430-DS-29,DS,29.0,2171-12-03 00:00:00,2171-12-04 07:23:00,\nName: ___ Unit No: ...,6.0
107648,53856634.0,18866430,27948099.0,50aabb09-42b20c64-e36361f1-242f703d-69fc3175,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21711201.0,121841.734,...,2171-12-01 15:09:00,2171-12-01 17:05:00,INDICATION: ___ year old male patient with pri...,18866430-DS-29,DS,29.0,2171-12-03 00:00:00,2171-12-04 07:23:00,\nName: ___ Unit No: ...,6.0
107649,53856634.0,18866430,27948099.0,0a4aaf2d-87876a72-ac93ec95-0070d25e-aad7c612,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21711201.0,121841.734,...,2171-12-01 15:09:00,2171-12-01 17:05:00,INDICATION: ___ year old male patient with pri...,18866430-DS-29,DS,29.0,2171-12-03 00:00:00,2171-12-04 07:23:00,\nName: ___ Unit No: ...,6.0
107650,53856634.0,18866430,27948099.0,693a687d-cb975d86-137c07d0-498db21b-bad56f79,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21711201.0,121841.734,...,2171-12-01 15:09:00,2171-12-01 17:05:00,INDICATION: ___ year old male patient with pri...,18866430-DS-29,DS,29.0,2171-12-03 00:00:00,2171-12-04 07:23:00,\nName: ___ Unit No: ...,6.0
107651,53856634.0,18866430,27948099.0,27559035-763f9da7-194e7101-1f28f3fc-514a5152,CHEST (PA AND LAT),LATERAL,3056.0,2544.0,21711201.0,121841.734,...,2171-12-01 15:09:00,2171-12-01 17:05:00,INDICATION: ___ year old male patient with pri...,18866430-DS-29,DS,29.0,2171-12-03 00:00:00,2171-12-04 07:23:00,\nName: ___ Unit No: ...,6.0
107652,53856634.0,18866430,27948099.0,52e49970-a3911eee-c5602596-55b9b862-10836713,CHEST (PA AND LAT),AP,2544.0,3056.0,21711201.0,121841.734,...,2171-12-01 15:09:00,2171-12-01 17:05:00,INDICATION: ___ year old male patient with pri...,18866430-DS-29,DS,29.0,2171-12-03 00:00:00,2171-12-04 07:23:00,\nName: ___ Unit No: ...,6.0


In [44]:
grouped = sample_df.groupby(['study_id', 'subject_id', 'hadm_id',])

# Define a function to identify columns with disagreements
def find_disagreements(group):
    """Returns a list of columns that have disagreements within the group."""
    return [col for col in group.columns if group[col].nunique() > 1]

# Apply the disagreement check to each group
disagreement_columns = grouped.apply(find_disagreements)

# Convert the results to a DataFrame for better visualization
disagreement_df = disagreement_columns.reset_index(name='disagreeing_columns')

print(disagreement_df)


     study_id  subject_id     hadm_id  \
0  53856634.0    18866430  27948099.0   

                                 disagreeing_columns  
0  [dicom_id, ViewPosition, Rows, Columns, ViewCo...  


  disagreement_columns = grouped.apply(find_disagreements)


In [46]:
disagreement_df['disagreeing_columns'].tolist()

[['dicom_id',
  'ViewPosition',
  'Rows',
  'Columns',
  'ViewCodeSequence_CodeMeaning']]

In [13]:
#  we can examine the 6 different xrays here
import os 
import matplotlib.pyplot as plt
subject_id_path = '/local/data/jchen/physionet.org/files/mimic-cxr-jpg/2.1.0/files/p18/p18866430'
image_path = os.path.join(subject_id_path, 's53856634', 'ac14d54a-ede29cf8-3efc77fc-3754f0ff-62308ff5.jpg')

from PIL import Image
img = Image.open(image_path)
plt.imshow(img)
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: '/local/data/jchen/physionet.org/files/mimic-cxr-jpg/2.1.0/files/p18/p18866430/s53856634/ac14d54a-ede29cf8-3efc77fc-3754f0ff-62308ff5.jpg'

In [14]:
image_path = os.path.join(subject_id_path, 's53856634', 'ac14d54a-ede29cf8-3efc77fc-3754f0ff-62308ff5.jpg')

from PIL import Image
img = Image.open(image_path)
plt.imshow(img)
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: '/local/data/jchen/physionet.org/files/mimic-cxr-jpg/2.1.0/files/p18/p18866430/s53856634/ac14d54a-ede29cf8-3efc77fc-3754f0ff-62308ff5.jpg'

In [15]:
# instead, now let's start joining the datasets together 

print(image_path)

/local/data/jchen/physionet.org/files/mimic-cxr-jpg/2.1.0/files/p18/p18866430/s53856634/ac14d54a-ede29cf8-3efc77fc-3754f0ff-62308ff5.jpg
