In [2]:
# Libraries
import os
import tarfile
import warnings

import pandas as pd
import torch
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import ImageFolder

In [3]:
# Mount Google Drive inside the Colab runtime at /content/drive/ so the
# dataset and label paths used later in this notebook can be read.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
# # Extract the downloaded image archives (.tar) -- this cell is currently disabled

# output_folder = '/content/extracted_xray_images/'  # superseded by the assignment below
# # Extract every archive found in image_folder into image_folder/images

# image_folder = '/content/drive/MyDrive/Chest-disease-detection/NIH-chest-xray-14/datasets/'
# output_folder = f'{image_folder}/images'
# for filename in os.listdir(image_folder):
#   tar_file_path = f'{image_folder}/{filename}'  # Full path of the archive being extracted
#   with tarfile.open(tar_file_path, 'r') as tar:
#       tar.extractall(path=output_folder)

In [5]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
class ChestXrayDataset(Dataset):
    """Multi-label image dataset backed by a labels CSV and an image directory.

    The CSV is expected to hold the image filename in its first column and one
    numeric label column per disease after it (per the original author's
    notes: 0 or 1 for each class).

    The class relies on ``os``, ``pd``, ``torch`` and ``PIL.Image`` being
    available in the notebook namespace by the time an item is fetched.
    """

    def __init__(self, labels_file, img_dir, transform=None):
        """Read the labels CSV and remember where the images live.

        Args:
            labels_file: Path to the CSV with image names and labels.
            img_dir: Directory that contains the image files named in the CSV.
            transform: Optional callable applied to each loaded RGB image.
        """
        self.labels_df = pd.read_csv(labels_file)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the labels CSV."""
        return len(self.labels_df)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx: Positional row index into the labels table.

        Returns:
            A ``(image, labels)`` pair: the (optionally transformed) RGB image
            and a ``float32`` tensor built from every column after the first.
        """
        row = self.labels_df.iloc[idx]

        # First column: image filename, resolved relative to img_dir.
        img_path = os.path.join(self.img_dir, row.iloc[0])

        # Use a context manager so the underlying file handle is released
        # deterministically; convert() returns an independent in-memory copy.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')

        # Second column onward: one label value per class.
        labels = row.iloc[1:].to_numpy().astype(float)

        # Compare against None rather than relying on truthiness, so a
        # transform object that happens to be falsy is still applied.
        if self.transform is not None:
            image = self.transform(image)

        return image, torch.tensor(labels, dtype=torch.float32)


In [6]:


# Set paths
project_root = '/content/drive/MyDrive/Chest-disease-detection/NIH-chest-xray-14'
data_dir = f'{project_root}/datasets/images/'  # Directory with images
train_labels_path = f'{project_root}/labels/train_labels.csv'  # Training labels CSV

# Fix: the validation path used to be a copy of the training path, so the
# "validation" loss was computed on the very samples the model was trained on.
# NOTE(review): the file name val_labels.csv is an assumption -- confirm it
# matches the validation split stored on Drive.
val_labels_path = f'{project_root}/labels/val_labels.csv'  # Validation labels CSV

if not os.path.exists(val_labels_path):
    # Keep the notebook runnable (previous behaviour) but make the problem visible.
    warnings.warn(
        f"Validation labels not found at {val_labels_path}; falling back to the "
        "training labels, so the reported validation loss is measured on training data."
    )
    val_labels_path = train_labels_path

# Load labels (kept as DataFrames for inspection; the datasets below read the CSVs themselves)
train_labels = pd.read_csv(train_labels_path)
val_labels = pd.read_csv(val_labels_path)

# Define transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to fit model input
    transforms.ToTensor(),
    # Per-channel normalisation constants (the standard ImageNet statistics)
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Create datasets
train_dataset = ChestXrayDataset(labels_file=train_labels_path, img_dir=data_dir, transform=transform)
val_dataset = ChestXrayDataset(labels_file=val_labels_path, img_dir=data_dir, transform=transform)

# Create DataLoaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2)

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models
from tqdm import tqdm
from PIL import Image


# Prefer the GPU when PyTorch can see one; otherwise run on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# ChestX-ray14 has 14 classes, so the network needs 14 outputs
num_classes = 14

# Start from a ResNet50 with pretrained weights (downloaded on first use)
model = models.resnet50(pretrained=True)

# Swap the final fully connected layer for one sized to our label set
fc_in_features = model.fc.in_features
model.fc = nn.Linear(in_features=fc_in_features, out_features=num_classes)

# Place the model on the selected device before building the optimizer
model = model.to(device)

# Multi-label setup: sigmoid + binary cross-entropy on the raw logits
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)


Using device: cuda


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 173MB/s]


In [None]:
def _train_one_epoch(net, loader, loss_fn, opt, target_device):
    """Run one optimisation pass over ``loader`` and return the summed batch losses."""
    net.train()
    loss_sum = 0.0

    for batch_images, batch_targets in tqdm(loader):
        # Inputs and targets must live on the same device as the model
        batch_images = batch_images.to(target_device)
        batch_targets = batch_targets.to(target_device)

        # Clear gradients left over from the previous step
        opt.zero_grad()

        # Forward pass and loss
        batch_loss = loss_fn(net(batch_images), batch_targets)

        # Backward pass and parameter update
        batch_loss.backward()
        opt.step()

        loss_sum += batch_loss.item()

    return loss_sum


def _sum_validation_loss(net, loader, loss_fn, target_device):
    """Return the summed batch losses over ``loader`` in eval mode, without gradients."""
    net.eval()
    loss_sum = 0.0

    with torch.no_grad():
        for batch_images, batch_targets in loader:
            batch_images = batch_images.to(target_device)
            batch_targets = batch_targets.to(target_device)
            loss_sum += loss_fn(net(batch_images), batch_targets).item()

    return loss_sum


# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    running_loss = _train_one_epoch(model, train_loader, criterion, optimizer, device)

    # Report the mean loss per batch for this epoch
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}")

# Validation loop (runs once, after the final training epoch)
val_loss = _sum_validation_loss(model, val_loader, criterion, device)
print(f"Validation Loss: {val_loss / len(val_loader):.4f}")


 89%|████████▉ | 588/662 [1:59:49<13:52, 11.25s/it]

In [None]:
# Persist only the model's parameters (its state_dict) to Drive
checkpoint_path = "/content/drive/MyDrive/Chest-disease-detection/NIH-chest-xray-14.pth"
torch.save(model.state_dict(), checkpoint_path)
