In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk every directory under the Kaggle input root. The per-file listing is
# commented out, so the loop body is a no-op and nothing is printed.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # print(os.path.join(dirname, filename))
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score
from torch.utils.data import Dataset
from PIL import Image
import pandas as pd

In [6]:
# Notebook-wide configuration.

# Label names: 14 findings plus a catch-all "none" class.
# NOTE(review): presumably in the same order as the label columns of the CSV
# files below -- confirm against the CSV header.
CLASS_NAMES = [
    'Atelectasis',
    'Cardiomegaly',
    'Effusion',
    'Infiltration',
    'Mass',
    'Nodule',
    'Pneumonia',
    'Pneumothorax',
    'Consolidation',
    'Edema',
    'Emphysema',
    'Fibrosis',
    'Pleural_Thickening',
    'Hernia',
    'None_Of_these_diseases',
]
N_CLASSES = len(CLASS_NAMES)  # 15

# Root directories searched for image folders (train and validation share one root).
DATA_DIR_TRAIN = '/kaggle/input/'
DATA_DIR_VALID = '/kaggle/input/'

# Label CSVs: image file name followed by one integer column per class.
TRAIN_IMAGE_LIST = '/kaggle/input/dataset-v3-0/training_only_labels.csv'
VAL_IMAGE_LIST = '/kaggle/input/dataset-v3-0/testing_only_labels.csv'

# Optimisation hyper-parameters.
BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 1e-4

In [None]:
# Load the training label table and keep only the label columns.
# Column 0 is dropped because it holds the image file name (ChestXrayDataSet
# reads the same CSV layout: name first, integer labels after).
# Reading from disk in the same statement keeps this cell runnable on a fresh
# kernel and idempotent: re-running it never strips an additional column.
train_data = pd.read_csv(TRAIN_IMAGE_LIST).iloc[:, 1:]

In [None]:
# Column-wise totals of the label table: one value per label column.
# (With 0/1 labels this is the number of positive samples per class -- the
# 0/1 encoding is an assumption, confirm against the CSV.)
class_distribution = train_data.sum(axis=0)
print("Class Distribution:\n", class_distribution)

In [None]:
# Plain-dict view of the per-class totals ({label column name: total}); the
# plotting and class-weight cells below iterate over this dict.
class_distribution_dict = class_distribution.to_dict()
class_distribution_dict  # bare last expression: rendered as the cell's output

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the raw per-class totals, one bar per label column.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(class_distribution_dict.keys(), class_distribution_dict.values(), color='skyblue')
ax.set_xlabel('Disease')
ax.set_ylabel('Number of Images')
ax.set_title('Class Distribution of Diseases in the Training Data')
# Slant the class names so the long ones do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()

# Render the figure
plt.show()

In [None]:
# The sample count is the number of rows in the label table, not the sum of
# the per-class totals (a row can contribute to several classes).
total_samples = len(train_data)
print("Total number of samples: ", total_samples)

# Inverse-frequency weight per class: total / (number of classes * class total).
n_label_columns = len(class_distribution_dict)
class_weights_dict = {
    name: total_samples / (n_label_columns * count)
    for name, count in class_distribution_dict.items()
}

print(total_samples)
print("Class Weights:\n", class_weights_dict)

# Weight tensor in dict (i.e. label-column) order, so position k lines up with
# label column k of the targets.
class_weights = torch.tensor(
    [class_weights_dict[name] for name in class_weights_dict],
    dtype=torch.float32,
)

In [None]:
# Scale each class total by its class weight.
# With weights defined as total / (n_classes * class total), every product
# comes out to the same value, so a flat chart is the expected result here.
weighted_values = {
    name: count * class_weights_dict[name]
    for name, count in class_distribution_dict.items()
}

# Bar chart of the weighted totals
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(weighted_values.keys(), weighted_values.values(), color='skyblue')
ax.set_xlabel('Disease')
ax.set_ylabel('Weighted Number of Images')
ax.set_title('Weighted Class Distribution of Diseases in the Training Data')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()

# Render the figure
plt.show()

In [None]:
class ChestXrayDataSet(Dataset):
    """Multi-label image dataset driven by a label CSV.

    The first line of the CSV is treated as a header and skipped. Every other
    non-blank line is ``<image name>,<int>,<int>,...``; the integers become the
    label vector of that image. Images are looked up in numbered sub-folders
    of ``data_dir`` (see ``find_image_path``).

    Attributes:
        image_names: full paths of the images that were found on disk.
        labels: list of integer label lists, aligned with ``image_names``.
        missing_images: names listed in the CSV that could not be located
            (those rows are excluded from the dataset).
        transform: optional callable applied to each loaded PIL image.
    """

    def __init__(self, data_dir, csv_file, transform=None):
        """
        Args:
            data_dir: path to the directory containing the image folders.
            csv_file: path to the CSV file containing image filenames and corresponding labels.
            transform: optional transform to be applied on a sample.

        Raises:
            ValueError: if a label field cannot be parsed as an integer.
        """
        self.image_names = []
        self.labels = []
        self.missing_images = []
        self.transform = transform

        with open(csv_file, "r") as f:
            next(f, None)  # skip the header row (no-op on an empty file)
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines carry no sample; without this guard they
                    # would be parsed as an empty image name with no labels.
                    continue
                items = line.split(",")
                image_name = items[0]
                label = [int(i) for i in items[1:]]
                image_path = self.find_image_path(data_dir, image_name)
                if image_path:
                    self.image_names.append(image_path)
                    self.labels.append(label)
                else:
                    self.missing_images.append(image_name)

        if self.missing_images:
            # Make dropped rows visible instead of silently shrinking the dataset.
            print(f'{len(self.missing_images)} image(s) listed in {csv_file} '
                  f'were not found under {data_dir} and were skipped')

    @staticmethod
    def find_image_path(data_dir, image_name):
        """Return the path of ``image_name`` inside ``data_dir``, or None.

        Probes ``<data_dir>/<folder>/images/<image_name>`` for folder indices
        1..11. For indices 1-9 the folder is ``images-0<i>``. For 10 and 11
        the original spelling ``images-0<i>`` (``images-010``) is tried first
        and the two-digit spelling (``images-10``) second.
        NOTE(review): the real folder naming for indices >= 10 is not visible
        from this notebook, hence both spellings -- confirm and drop one.

        Only regular files match, so an empty ``image_name`` (which would
        resolve to the ``images`` directory itself) is not reported as found.
        """
        for i in range(1, 12):
            candidates = [f'images-0{i}']
            if i >= 10:
                candidates.append(f'images-{i}')
            for folder_name in candidates:
                image_path = os.path.join(data_dir, folder_name, 'images', image_name)
                if os.path.isfile(image_path):
                    return image_path
        return None

    def __getitem__(self, index):
        """
        Args:
            index: the index of item

        Returns:
            (image, labels): the RGB image (after ``transform`` if one was
            given) and its labels as a float tensor.
        """
        image_path = self.image_names[index]
        # Context manager closes the underlying file once the pixel data has
        # been converted, instead of leaving the handle to the GC.
        with Image.open(image_path) as img:
            image = img.convert('RGB')
        label = self.labels[index]
        if self.transform is not None:
            image = self.transform(image)
        return image, torch.FloatTensor(label)

    def __len__(self):
        """Number of samples whose image was found on disk."""
        return len(self.image_names)

In [None]:
# Define the model architecture
class DenseNet121(nn.Module):
    """DenseNet-121 backbone with a multi-label classification head.

    The stock classifier is replaced by a linear layer with ``out_size``
    outputs. ``forward`` returns raw logits (no sigmoid): this notebook trains
    with ``nn.BCEWithLogitsLoss``, which applies the sigmoid itself, so a
    sigmoid in the head would be applied twice. Apply ``torch.sigmoid`` to the
    output when probabilities are needed; ranking metrics such as AUROC are
    unaffected because the sigmoid is monotonic.

    Args:
        out_size: number of output units (one per class).
    """
    def __init__(self, out_size):
        super().__init__()
        self.densenet121 = torchvision.models.densenet121(pretrained=True)
        num_ftrs = self.densenet121.classifier.in_features
        # Kept as a one-element Sequential so parameter names remain
        # `classifier.0.weight` / `classifier.0.bias`, i.e. the state_dict
        # layout is the same as with the former Linear+Sigmoid head.
        self.densenet121.classifier = nn.Sequential(
            nn.Linear(num_ftrs, out_size),
        )

    def forward(self, x):
        """Return per-class logits for the input image batch ``x``."""
        return self.densenet121(x)

In [None]:
# Function to compute AUCs
def compute_AUCs(gt, pred):
    """Return the per-class ROC AUC scores as a list of length N_CLASSES.

    Args:
        gt: tensor of ground-truth labels, indexed as [sample, class].
        pred: tensor of prediction scores with the same layout as ``gt``.

    Returns:
        List where entry ``k`` is ``roc_auc_score`` of column ``k``.
    """
    targets_np = gt.cpu().numpy()
    scores_np = pred.cpu().numpy()
    return [
        roc_auc_score(targets_np[:, k], scores_np[:, k])
        for k in range(N_CLASSES)
    ]

In [None]:
# Validation function
def validate(model, val_loader, criterion):
    """Evaluate ``model`` on ``val_loader``.

    Puts the model in eval mode, runs every batch on the GPU without gradient
    tracking, and gathers targets and outputs for the AUROC computation.

    Args:
        model: network to evaluate.
        val_loader: iterable of ``(inputs, targets)`` batches; must support ``len``.
        criterion: loss called as ``criterion(outputs, targets)``.

    Returns:
        (val_loss, val_auc): loss averaged over batches, and the mean of the
        per-class AUROCs returned by ``compute_AUCs``.

    Raises:
        ValueError: if ``val_loader`` contains no batches.
    """
    num_batches = len(val_loader)
    if num_batches == 0:
        # Fail with an explicit message rather than a bare division by zero.
        raise ValueError("val_loader is empty; cannot compute validation metrics")

    model.eval()
    val_loss = 0.0
    # Collect per-batch tensors and concatenate once at the end; concatenating
    # inside the loop re-copies everything accumulated so far on every step.
    gt_batches = []
    pred_batches = []

    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.cuda(), targets.cuda()

            outputs = model(inputs)
            val_loss += criterion(outputs, targets).item()

            gt_batches.append(targets)
            pred_batches.append(outputs)

    val_loss /= num_batches
    gt = torch.cat(gt_batches, 0)
    pred = torch.cat(pred_batches, 0)
    val_auc = np.array(compute_AUCs(gt, pred)).mean()

    return val_loss, val_auc

In [None]:
# Let cuDNN auto-tune its kernels.
cudnn.benchmark = True

# Build the network on the GPU and wrap it for multi-GPU data parallelism.
model = torch.nn.DataParallel(DenseNet121(out_size=N_CLASSES).cuda()).cuda()

# Loss: binary cross-entropy on logits, positives re-weighted per class.
# NOTE(review): BCEWithLogitsLoss applies its own sigmoid, so it expects the
# model to emit raw logits -- confirm the classifier head matches.
criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights).cuda()

# Optimizer over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Per-channel normalisation shared by the training and validation transforms.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

In [None]:
# Training-time preprocessing: resize, random crop and flip for augmentation,
# then tensor conversion and normalisation.
train_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

train_dataset = ChestXrayDataSet(
    data_dir=DATA_DIR_TRAIN,
    csv_file=TRAIN_IMAGE_LIST,
    transform=train_transform,
)

In [None]:
# Shuffled mini-batches for training.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=3,
    pin_memory=True,
)

In [None]:
# Validation-time preprocessing: deterministic resize + centre crop (no
# augmentation), then tensor conversion and normalisation.
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])

val_dataset = ChestXrayDataSet(
    data_dir=DATA_DIR_VALID,
    csv_file=VAL_IMAGE_LIST,
    transform=val_transform,
)

In [None]:
# Validation batches in fixed (unshuffled) order.
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=3,
    pin_memory=True,
)

In [None]:
# Train for EPOCHS epochs; after each epoch run validation and write a checkpoint.
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    model.train()
    running_loss = 0.0

    for i, (inputs, targets) in enumerate(train_loader):
        inputs = inputs.cuda()
        targets = targets.cuda()

        # One optimisation step: clear gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # After every 10th batch, report the mean loss of that window and reset it.
        if (i + 1) % 10 == 0:
            print(f'Batch {i + 1}, Loss: {running_loss / 10:.4f}')
            running_loss = 0.0

    # End-of-epoch evaluation on the validation loader.
    val_loss, val_auc = validate(model, val_loader, criterion)
    print(f'Validation Loss: {val_loss:.4f}, Validation AUC: {val_auc:.4f}')

    # Checkpoint named after the epoch number and the validation AUC.
    # NOTE(review): `model` is DataParallel-wrapped in the setup cell, so the
    # saved keys are expected to carry a `module.` prefix -- keep that in mind
    # when loading into an unwrapped model.
    model_filename = f'chexnet_2nd_epoch_{epoch + 1}_auc_{val_auc:.4f}.pth'
    torch.save(model.state_dict(), model_filename)
    print(f'Model saved as {model_filename}')