In [1]:
# ============================
# 1. Import Libraries
# ============================
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
import numpy as np

# ============================
# 2. Load Metadata
# ============================
# Metadata table; later sections read its "Image Index" and
# "Finding Labels" columns.
METADATA_CSV = '/content/drive/MyDrive/Data_Entry_2017_v2020.csv'
df = pd.read_csv(METADATA_CSV)

print("Original dataframe:", df.shape[0])

# ============================
# 3. Load train/test split files
# ============================
def _read_name_set(list_path):
    """Return the lines of the text file at *list_path* as a set."""
    with open(list_path) as handle:
        return set(handle.read().splitlines())


train_files = _read_name_set('/content/drive/MyDrive//train_val_list.txt')
test_files = _read_name_set('/content/drive/MyDrive/test_list.txt')


def _split_of(image_name):
    """Return "train", "test" or "none" for *image_name*.

    The train list is checked first, so a name present in both lists is
    tagged "train".
    """
    if image_name in train_files:
        return "train"
    if image_name in test_files:
        return "test"
    return "none"


# Tag every metadata row with the split its image belongs to.
df["split"] = df["Image Index"].apply(_split_of)

# ============================
# 4. Clean Missing Images
# ============================
# Directory (on the mounted Google Drive) that holds the image files.
IMG_DIR = '/content/drive/MyDrive/images'

# List the directory once and test membership in memory, instead of calling
# os.path.exists() for every metadata row (one filesystem round trip per row,
# which is very slow on a network-backed mount).
# Assumes "Image Index" holds bare file names located directly in IMG_DIR.
# A missing or non-directory IMG_DIR yields an empty set, so every row is
# dropped -- the same outcome the per-row existence check produced.
_available_images = set(os.listdir(IMG_DIR)) if os.path.isdir(IMG_DIR) else set()

df = df[df["Image Index"].isin(_available_images)]
print("After removing missing images:", len(df))

# ============================
# 5. Encode Labels
# ============================
# Turn each "A|B|C" label string into a list of label names, store the lists
# back on the frame, and multi-hot encode them (one column per class).
_label_lists = df["Finding Labels"].str.split("|")
df["Finding Labels"] = _label_lists

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(_label_lists)  # row i of y encodes row i of df (by position)
print("Classes:", mlb.classes_)

# ============================
# 6. Dataset Class
# ============================
# Per-channel normalisation constants (presumably the ImageNet statistics
# that go with the pretrained weights loaded in section 7 -- confirm).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Preprocessing applied to every image: fixed 224x224 size, tensor
# conversion, then channel-wise normalisation.
_preprocess_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
transform = transforms.Compose(_preprocess_steps)

class CXRDataset(Dataset):
    """Image/label dataset backed by a metadata frame and a label collection.

    Item ``idx`` pairs the image named in position ``idx`` of
    ``df["Image Index"]`` (resolved against ``img_dir``) with ``labels[idx]``.

    Args:
        df: Frame with an "Image Index" column of image file names. A copy
            with the index reset to 0..len-1 is stored as ``self.df``.
        labels: Indexable, sized collection with one entry per row of ``df``;
            each entry is converted to a float tensor on access.
        img_dir: Directory containing the image files.
        transform: Optional callable applied to the RGB PIL image.

    Raises:
        ValueError: If ``labels`` and ``df`` do not have the same length.
    """

    def __init__(self, df, labels, img_dir, transform=None):
        # Rows and labels are paired purely by position, so a length mismatch
        # would either mislabel images or fail deep inside a training loop.
        if len(labels) != len(df):
            raise ValueError(
                f"labels has {len(labels)} entries but df has {len(df)} rows"
            )
        self.df = df.reset_index(drop=True)
        self.labels = labels
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of samples (rows of the metadata frame)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position ``idx``.

        ``image`` is the transformed image when a transform was given,
        otherwise the RGB PIL image; ``label`` is a float tensor.
        """
        # Read just the file-name cell rather than materialising the whole row.
        file_name = self.df["Image Index"].iloc[idx]
        img_path = os.path.join(self.img_dir, file_name)
        # convert() returns an independent image, so the underlying file can
        # be closed as soon as the conversion is done.
        with Image.open(img_path) as raw:
            image = raw.convert("RGB")
        if self.transform:
            image = self.transform(image)
        label = torch.tensor(self.labels[idx]).float()
        return image, label

# Boolean masks over the rows of df. y was built from df row by row, so the
# same positional masks select the matching label rows.
_train_mask = (df["split"] == "train").to_numpy()
_test_mask = (df["split"] == "test").to_numpy()

train_df = df[_train_mask].reset_index(drop=True)
test_df = df[_test_mask].reset_index(drop=True)

train_dataset = CXRDataset(train_df, y[_train_mask], IMG_DIR, transform)
test_dataset = CXRDataset(test_df, y[_test_mask], IMG_DIR, transform)

# Shuffle only the training data; keep the test order fixed.
_BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=_BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=_BATCH_SIZE, shuffle=False)

print("Train samples:", len(train_dataset))
print("Test samples:", len(test_dataset))

# ============================
# 7. Define Model (DenseNet121)
# ============================
num_classes = len(mlb.classes_)

# Pretrained DenseNet121 whose classifier is replaced by a new head with one
# sigmoid output per class, so the model emits independent per-class
# probabilities (multi-label).
model = models.densenet121(weights="IMAGENET1K_V1")
_feature_dim = model.classifier.in_features
_head_layers = [
    nn.Linear(_feature_dim, num_classes),
    nn.Sigmoid(),   # multi-label
]
model.classifier = nn.Sequential(*_head_layers)

# ============================
# 8. Train Loop
# ============================
# Move the model to its target device BEFORE constructing the optimizer, so
# the optimizer is built from the parameters that are actually trained (the
# ordering recommended by the torch.optim documentation).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# The classifier head ends in a Sigmoid, so BCELoss receives probabilities.
# NOTE(review): nn.BCEWithLogitsLoss on raw logits is the numerically more
# stable pairing, but adopting it means dropping the Sigmoid from the head and
# applying it wherever probabilities are consumed.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

EPOCHS = 5
for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {running_loss/len(train_loader):.4f}")

# ============================
# 9. Evaluation
# ============================
model.eval()
y_true, y_pred = [], []

# Collect labels and model outputs for the whole test set without tracking
# gradients.
with torch.no_grad():
    for images, labels in test_loader:
        # Only the images have to live on the model's device; the labels are
        # just gathered for scoring.
        outputs = model(images.to(device))
        y_true.append(labels.cpu())
        y_pred.append(outputs.cpu())

y_true = torch.cat(y_true).numpy()
y_pred = torch.cat(y_pred).numpy()

# ROC AUC has no defined value for a class whose test labels are all 0 or all
# 1, which is plausible for rare findings in a small test split. Score each
# class separately, skip (and report) the degenerate ones, and average the
# rest. When every class is scorable this equals the macro-averaged AUC.
_n_classes = y_true.shape[1]
_scorable = [c for c in range(_n_classes) if np.unique(y_true[:, c]).size == 2]
_skipped = [c for c in range(_n_classes) if c not in _scorable]
if _skipped:
    print("Skipping classes with a single label value in the test set:",
          [str(mlb.classes_[c]) for c in _skipped])

_per_class_auc = [roc_auc_score(y_true[:, c], y_pred[:, c]) for c in _scorable]
_mean_auc = float(np.mean(_per_class_auc)) if _per_class_auc else float("nan")
print("Mean ROC AUC:", _mean_auc)

Original dataframe: 112120
After removing missing images: 10000
Classes: ['Atelectasis' 'Cardiomegaly' 'Consolidation' 'Edema' 'Effusion'
 'Emphysema' 'Fibrosis' 'Hernia' 'Infiltration' 'Mass' 'No Finding'
 'Nodule' 'Pleural_Thickening' 'Pneumonia' 'Pneumothorax']
Train samples: 8463
Test samples: 1537
Downloading: "https://download.pytorch.org/models/densenet121-a639ec97.pth" to /root/.cache/torch/hub/checkpoints/densenet121-a639ec97.pth


100%|██████████| 30.8M/30.8M [00:00<00:00, 119MB/s]


Epoch 1/5, Loss: 0.2165
Epoch 2/5, Loss: 0.1549
Epoch 3/5, Loss: 0.1385
Epoch 4/5, Loss: 0.1162
Epoch 5/5, Loss: 0.0897
Mean ROC AUC: 0.6562795468381261
