In [12]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from tqdm import tqdm
import random
from glob import glob
from sklearn.metrics import f1_score, roc_auc_score

In [18]:
#First, we must define a class that will handle our dataset.

class XRayDataset(Dataset):
    """Multi-label image dataset driven by a parquet table of ids and labels.

    The table must contain an ``image_id`` column holding each image's file
    name. Every column from the second one onward is read as a numeric label,
    so the label slice is positional (it assumes ``image_id`` is the first
    column -- TODO confirm against the parquet schema).

    Args:
        file_path: Path of the parquet file to load.
        img_dir: Directory containing the files named in ``image_id``.
        transform: Optional callable applied to the RGB ``PIL.Image`` before
            it is returned.
    """

    def __init__(self, file_path, img_dir, transform=None):
        self.df = pd.read_parquet(file_path)
        self.img_dir = img_dir
        self.transform = transform

        self.image_ids = self.df['image_id'].values
        # Everything after the first column is treated as the label vector.
        self.labels = self.df.iloc[:, 1:].values.astype(float)

    def __len__(self):
        """Return the number of samples (one per ``image_id`` row)."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``.

        ``image`` is the RGB image after ``transform`` (if one was given);
        ``label`` is a ``float32`` tensor built from the row's label columns.
        """
        image_path = os.path.join(self.img_dir, self.image_ids[idx])
        # Image.open is lazy and keeps the file handle open; convert() forces
        # the pixel data to load, and the context manager then closes the
        # file so handles do not pile up over a long epoch.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return image, label

In [19]:
# Now it's time to find the params to normalize our images. We will do it using the `compute_mean_std` function.

def compute_mean_std(image_dir, image_filenames, sample_size=1000):
    """Estimate per-channel mean and standard deviation over a set of images.

    Each image is opened, converted to RGB and turned into a channels-first
    tensor with ``transforms.ToTensor``; sums and squared sums are pooled over
    all pixels of all sampled images.

    Args:
        image_dir: Directory that contains the image files.
        image_filenames: Sequence of file names relative to ``image_dir``.
        sample_size: Only the first ``sample_size`` names are used (a plain
            slice, so shorter lists are used in full and ``None`` means all).

    Returns:
        ``(mean, std)``: two lists of three floats, one value per RGB channel.

    Raises:
        ValueError: If there is no image to process. Without this check the
            statistics would silently come back as NaN.
    """
    to_tensor = transforms.ToTensor()

    # Limit to sample_size
    sampled_filenames = image_filenames[:sample_size]
    if len(sampled_filenames) == 0:
        raise ValueError("compute_mean_std needs at least one image filename")

    # float64 accumulators: the running sums cover every pixel of every
    # image, which is far more than float32 can add up without losing digits.
    channel_sum = torch.zeros(3, dtype=torch.float64)
    channel_sum_sq = torch.zeros(3, dtype=torch.float64)
    pixel_count = 0

    for img_name in tqdm(sampled_filenames):
        img_path = os.path.join(image_dir, img_name)
        # Close the file as soon as the pixels have been read.
        with Image.open(img_path) as raw_image:
            pixels = to_tensor(raw_image.convert("RGB")).to(torch.float64)

        channel_sum += pixels.sum(dim=[1, 2])
        channel_sum_sq += (pixels ** 2).sum(dim=[1, 2])
        pixel_count += pixels.shape[1] * pixels.shape[2]

    mean = channel_sum / pixel_count
    # E[x^2] - E[x]^2 can dip marginally below zero through rounding; clamp it
    # so sqrt() cannot return NaN for (near-)constant channels.
    variance = (channel_sum_sq / pixel_count - mean ** 2).clamp(min=0.0)
    std = variance.sqrt()
    return mean.tolist(), std.tolist()


# Estimate normalization statistics on a random sample of the training images.
image_dir = "../data/images/train"
# glob returns files in filesystem-dependent order; sort first so that the
# seeded shuffle below selects the same sample on every run.
image_files = sorted(os.path.basename(p) for p in glob(f"{image_dir}/*.png"))
# A dedicated seeded RNG keeps the sample reproducible without touching the
# global `random` state.
random.Random(42).shuffle(image_files)

mean, std = compute_mean_std(image_dir, image_files, sample_size=2500)
print("Mean:", mean)
print("Std:", std)


# This dataset-specific transform would be used if we created the model from scratch.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])


100%|██████████| 2500/2500 [00:15<00:00, 160.26it/s]

Mean: [0.4965616762638092, 0.4965616762638092, 0.4965616762638092]
Std: [0.24910806119441986, 0.24910806119441986, 0.24910806119441986]





We have to set up the transforms and DataLoaders. After doing so, we can set up our model. It's better to start with a simpler model and see how it performs.

In [20]:

base_path = '../data'

# Pre-processing shared by all splits: PIL image -> tensor, then per-channel
# normalization with fixed statistics (the standard ImageNet values that
# torchvision's pre-trained weights were trained with).
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# One dataset per split; each pairs a parquet label table with its image folder.
train_dataset = XRayDataset(
    file_path=f'{base_path}/train_df.parquet',
    img_dir=f'{base_path}/images/train',
    transform=transform,
)
val_dataset = XRayDataset(
    file_path=f'{base_path}/val_df.parquet',
    img_dir=f'{base_path}/images/val',
    transform=transform,
)
test_dataset = XRayDataset(
    file_path=f'{base_path}/test_df.parquet',
    img_dir=f'{base_path}/images/test',
    transform=transform,
)

# DataLoaders: only the training split is shuffled; all load in the main process.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=0)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False, num_workers=0)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False, num_workers=0)


In [None]:
# Now it's time to choose a model. We start from a pre-trained VGG16 and replace its
# final classifier layer so that it emits one logit per label.

# Must match the width of the label vectors yielded by the datasets, because the
# loss below compares outputs and labels element-wise.
NUM_CLASSES = 18

model = models.vgg16(weights='DEFAULT')
# Take the input width from the layer being replaced instead of hard-coding 4096.
model.classifier[6] = nn.Linear(model.classifier[6].in_features, NUM_CLASSES)

# Use a GPU when one is available; on CPU-only machines this is still "cpu".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# BCEWithLogitsLoss applies the sigmoid itself, so the model outputs raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)




In [1]:
# Number of full passes over the training data made by the training loop.
num_epochs = 5



In [26]:
def _count_correct(logits, targets, threshold=0.5):
    """Count the individual label predictions in a batch that match ``targets``.

    A label is predicted positive when ``sigmoid(logit) > threshold``.
    """
    predicted = (torch.sigmoid(logits) > threshold).float()
    return (predicted == targets).sum().item()


# Train for `num_epochs` epochs; after each one, measure per-label accuracy on the
# validation split. "Accuracy" here is the share of individual label decisions that
# are right, pooled over every sample and every label.
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    total_labels = 0
    correct_labels = 0.0

    train_loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
    for batches_done, (images, labels) in enumerate(train_loop, start=1):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Multi-label accuracy
        correct_labels += _count_correct(outputs, labels)
        total_labels += labels.numel()

        # Update tqdm with dynamic stats. The loss shown is the mean over the batches
        # processed so far; dividing by the total number of batches would understate
        # it until the very last step.
        train_loop.set_postfix(
            loss=running_loss / batches_done,
            acc=100 * correct_labels / max(total_labels, 1),
        )

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    train_acc = 100 * correct_labels / max(total_labels, 1)
    print(f"Train Accuracy: {train_acc:.2f}%")

    # ----- Validation -----
    model.eval()
    val_correct_labels = 0.0
    val_total_labels = 0
    val_loop = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]")
    with torch.no_grad():  # no gradients needed for evaluation
        for images, labels in val_loop:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)

            val_correct_labels += _count_correct(outputs, labels)
            val_total_labels += labels.numel()

            val_loop.set_postfix(acc=100 * val_correct_labels / max(val_total_labels, 1))

    val_acc = 100 * val_correct_labels / max(val_total_labels, 1)
    print(f"Validation Accuracy: {val_acc:.2f}%")

Epoch 1/5 [Train]: 100%|██████████| 2786/2786 [6:53:12<00:00,  8.90s/it, acc=87.1, loss=0.241]   


Train Accuracy: 87.08%


Epoch 1/5 [Val]: 100%|██████████| 370/370 [15:46<00:00,  2.56s/it, acc=87.8]


Validation Accuracy: 87.84%


Epoch 2/5 [Train]: 100%|██████████| 2786/2786 [6:19:29<00:00,  8.17s/it, acc=88, loss=0.221]    


Train Accuracy: 88.03%


Epoch 2/5 [Val]: 100%|██████████| 370/370 [19:02<00:00,  3.09s/it, acc=87.8]


Validation Accuracy: 87.85%


Epoch 3/5 [Train]: 100%|██████████| 2786/2786 [7:50:27<00:00, 10.13s/it, acc=88.1, loss=0.218]   


Train Accuracy: 88.14%


Epoch 3/5 [Val]: 100%|██████████| 370/370 [18:59<00:00,  3.08s/it, acc=87.9]


Validation Accuracy: 87.89%


Epoch 4/5 [Train]: 100%|██████████| 2786/2786 [7:37:57<00:00,  9.86s/it, acc=88.2, loss=0.216]   


Train Accuracy: 88.20%


Epoch 4/5 [Val]: 100%|██████████| 370/370 [19:04<00:00,  3.09s/it, acc=88]  


Validation Accuracy: 88.00%


Epoch 5/5 [Train]: 100%|██████████| 2786/2786 [7:58:15<00:00, 10.30s/it, acc=88.2, loss=0.214]   


Train Accuracy: 88.25%


Epoch 5/5 [Val]: 100%|██████████| 370/370 [17:17<00:00,  2.80s/it, acc=88.1]

Validation Accuracy: 88.11%





In [27]:
# Save only the model's state_dict (not the whole module object) to a file in the
# current working directory.
torch.save(model.state_dict(), "vgg16_lungscan_v1.pth")


In [None]:
# There is a lot of room for improvement: the model barely learns anything. We will evaluate on the test set to see what the issue is, and then try a different model.