In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walks every file under /kaggle/input but does nothing with the results:
# the inner loop body is a bare `continue`.
# NOTE(review): presumably a leftover from a file-listing starter cell whose
# print was removed — confirm and delete if so.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        continue

In [None]:
import fastai
from fastai.vision.all import *
from tqdm import tqdm
from glob import glob
from torch.utils.data import DataLoader
from torchvision import transforms
from sklearn.utils.class_weight import compute_class_weight
import os
import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torchvision
import torchvision.transforms as transforms
from sklearn.metrics import roc_auc_score
from torch.utils.data import Dataset
from PIL import Image
import pandas as pd
from imblearn.over_sampling import SMOTE

In [None]:
# Hyperparameters shared by the cells below.
# Mini-batch size handed to the train and validation DataLoaders.
BATCH_SIZE = 32
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 1e-4
# Number of model outputs / AUC columns; matches the 14 entries of disease_labels.
N_CLASSES = 14

In [None]:
SEED = 85


def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Args:
        seed: integer seed applied to every generator.
    """
    # Imported locally so the function does not depend on `random` leaking
    # into the namespace through a wildcard import elsewhere in the notebook.
    import random

    random.seed(seed)
    # Only influences interpreters started after this point; the hash seed of
    # the running kernel is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (the notebook trains
    # with DataParallel). Safe to call when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

In [None]:
# Official split lists: one image filename per line.
# header=None stops pandas from consuming the first filename as the column
# header (which silently dropped one image from each list).
labels_train_val = pd.read_csv('/kaggle/input/data/train_val_list.txt', header=None, names=['Image_Index'])
labels_test = pd.read_csv('/kaggle/input/data/test_list.txt', header=None, names=['Image_Index'])
disease_labels = ['Atelectasis', 'Consolidation', 'Infiltration', 'Pneumothorax', 'Edema', 'Emphysema', 'Fibrosis', 'Effusion', 'Pneumonia', 'Pleural_Thickening',
'Cardiomegaly', 'Nodule', 'Mass', 'Hernia']

# NIH Dataset Labels CSV File
labels_df = pd.read_csv('/kaggle/input/data-entry-2017-v2020/Data_Entry_2017_v2020.csv')
print(labels_df.shape)
labels_df.columns = ['Image Index', 'Finding_Labels', 'Follow-Up #', 'Patient ID',
                  'Patient Age', 'Patient Gender', 'View Position',
                  'OriginalImage[Width', 'Height]',
                  'OriginalImagePixelSpacing[x',
                  'y]']

# One hot encoding: one 0/1 column per disease.
# Findings are '|'-separated, so split once and test membership against the
# exact tokens instead of substring-matching the raw string.
finding_tokens = labels_df['Finding_Labels'].str.split('|')
for diseases in tqdm(disease_labels):
    labels_df[diseases] = finding_tokens.map(lambda found, name=diseases: 1 if name in found else 0)

# Keep only images with at least one finding. .copy() detaches the result from
# the full frame so later column assignments do not write into a slice.
labels_df = labels_df[labels_df.Finding_Labels != 'No Finding'].copy()
print(labels_df.shape)

In [None]:
# Inspect the column names after the rename and the added one-hot disease columns.
labels_df.columns

In [None]:
# Peek at the de-duplicated patient IDs (np.unique returns them sorted).
# The same array is rebuilt in a later cell right before the split.
unique_patients = np.unique(labels_df['Patient ID'])
unique_patients

In [None]:

# Turn the '|'-separated finding string into a list of labels. The isinstance
# guard keeps the cell idempotent: re-running it leaves existing lists alone
# instead of stringifying and re-splitting them.
finding_lists = labels_df['Finding_Labels'].apply(
    lambda s: s if isinstance(s, list) else str(s).split('|')
)

# Map every image file name found on disk to its full path.
num_glob = glob('/kaggle/input/data/*/images/*.png')
img_path = {os.path.basename(x): x for x in num_glob}

# assign() returns a new frame, so nothing is written into a filtered slice
# of the original label table.
labels_df = labels_df.assign(
    Finding_Labels=finding_lists,
    Paths=labels_df['Image Index'].map(img_path.get),
)

# Rows without a resolved path would only fail later, inside a DataLoader
# worker; surface them here instead.
n_missing_paths = int(labels_df['Paths'].isna().sum())
if n_missing_paths:
    print(f'WARNING: {n_missing_paths} images listed in the CSV were not found on disk')

labels_df.head()

In [None]:
# Sorted, de-duplicated patient IDs of the images that remain after the
# 'No Finding' rows were dropped; this array is what the train/val/test split
# below partitions.
unique_patients = np.unique(labels_df['Patient ID'])
len(unique_patients)

In [None]:
from sklearn.model_selection import train_test_split

# The split is made over patient IDs, not image rows, so a patient never
# appears in more than one subset. Target share of patients:
# train-70
# val-10
# test-20
# NOTE(review): image-level proportions will only approximate these figures
# if patients contribute different numbers of images — confirm if it matters.
train_val_df_patients, test_df_patients = train_test_split(unique_patients, test_size = 0.2,random_state = SEED, shuffle= True)

# Second split carves the validation patients out of the 80% train+val pool.
train_df_patients, valid_df_patients = train_test_split(train_val_df_patients, 
                                                        test_size=0.125,  # 0.125 of 80% is 10%
                                                        random_state=SEED, 
                                                        shuffle=True)
len(train_val_df_patients)

In [None]:
# Select the image rows whose patient belongs to each patient-level subset.
# NOTE(review): no frame is built from test_df_patients in this part of the
# notebook — confirm the held-out test set is materialised elsewhere.
train_df = labels_df[labels_df['Patient ID'].isin(train_df_patients)]
val_df = labels_df[labels_df['Patient ID'].isin(valid_df_patients)]

# train_df.to_csv('train_image_list.csv')
# val_df.to_csv('val_image_list.csv')

In [None]:
# One-hot label matrix of the training rows, columns ordered as disease_labels.
# NOTE(review): appears unused as-is; the name `labels` is rebound to batch
# targets by the training loops further down.
labels = train_df[disease_labels].values

In [None]:
# Positives per disease: column sums of the one-hot label columns
original_class_distribution_ones = train_df[disease_labels].sum()

# Negatives per disease: every remaining training row
original_class_distribution_zeros = len(train_df) - original_class_distribution_ones

In [None]:
# Combine the counts into a DataFrame for easier plotting
# The two count Series are indexed by disease name, so the resulting frame
# carries that index; the plain 'Disease' list is attached positionally and
# must therefore have the same length and order as the Series.
distribution_df = pd.DataFrame({
    'Disease': disease_labels,
    'Count of Ones': original_class_distribution_ones,
    'Count of Zeros': original_class_distribution_zeros
})

In [None]:
import matplotlib.pyplot as plt

# Grouped bar chart: positives vs. negatives per disease in the training split
fig, ax = plt.subplots(figsize=(14, 7))
distribution_df.plot(
    x='Disease',
    y=['Count of Ones', 'Count of Zeros'],
    kind='bar',
    width=0.8,
    ax=ax,
)
ax.set(
    xlabel='Disease',
    ylabel='Number of Samples',
    title='Count of Ones and Zeros for Each Disease in Training Data',
)
# Slant the disease names so they do not overlap
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Image counts of the training and validation subsets.
# (val_df is the validation split; it was previously printed as 'test size'.)
print('train size', train_df.shape[0])
print('val size', val_df.shape[0])

In [None]:
class ChestXrayDataSet(Dataset):
    """Map-style dataset yielding (image, multi-hot label) pairs from a DataFrame.

    Image paths are read from the 'Paths' column and labels from the columns
    named in the module-level `disease_labels` list.
    """

    def __init__(self, df, transform=None):
        """
        Args:
            df: DataFrame containing image paths and labels.
            transform: optional transform to be applied on a sample.
        """
        self.df = df
        self.image_paths = df['Paths'].values
        self.labels = df[disease_labels].values
        self.transform = transform

    def __getitem__(self, index):
        """
        Args:
            index: the index of the item.

        Returns:
            image and its labels.
        """
        image_path = self.image_paths[index]
        # Close the underlying file as soon as the pixels are decoded instead
        # of waiting for garbage collection; convert() returns an independent
        # in-memory image, so it stays valid after the file is closed.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        label = self.labels[index]
        if self.transform:
            image = self.transform(image)
        return image, torch.FloatTensor(label)

    def __len__(self):
        """Number of samples, i.e. number of image paths."""
        return len(self.image_paths)

In [None]:
class DenseNet121(nn.Module):
    """Pretrained DenseNet-121 with a new linear head for multi-label classification.

    The forward pass returns the raw outputs of the linear head (no sigmoid is
    applied here).

    Args:
        out_size: number of output units of the replacement classifier.
        freeze_layers: when True, all pretrained parameters are frozen except
            those of dense blocks 3 and 4. The new classifier is always
            trainable.
    """
    def __init__(self, out_size, freeze_layers=True):
        super(DenseNet121, self).__init__()
        # `pretrained=True` is deprecated in newer torchvision releases; use
        # the `weights=` API when it exists and fall back otherwise.
        weights_enum = getattr(torchvision.models, 'DenseNet121_Weights', None)
        if weights_enum is not None:
            self.densenet121 = torchvision.models.densenet121(weights=weights_enum.IMAGENET1K_V1)
        else:
            self.densenet121 = torchvision.models.densenet121(pretrained=True)

        if freeze_layers:
            # Freeze the whole pretrained network ...
            for param in self.densenet121.parameters():
                param.requires_grad = False
            # ... then re-enable the last two dense blocks for fine-tuning.
            for block in (self.densenet121.features.denseblock3,
                          self.densenet121.features.denseblock4):
                for param in block.parameters():
                    param.requires_grad = True

        # Swap the pretrained head for a fresh one. A newly created nn.Linear
        # is trainable by default, so no explicit unfreezing is needed. The
        # nn.Sequential wrapper is kept so checkpoint keys stay
        # `densenet121.classifier.0.*`.
        num_ftrs = self.densenet121.classifier.in_features
        self.densenet121.classifier = nn.Sequential(
            nn.Linear(num_ftrs, out_size),
        )

    def forward(self, x):
        """Run the backbone and the linear head on a batch of images."""
        x = self.densenet121(x)
        return x

In [None]:
class WeightedBCELoss(nn.Module):
    """Binary cross-entropy on logits with separate positive/negative weights.

    Args:
        pos_weights: weight applied to the positive-target term; must
            broadcast against the logits.
        neg_weights: weight applied to the negative-target term; must
            broadcast against the logits.
    """
    def __init__(self, pos_weights, neg_weights):
        super().__init__()
        self.pos_weights = pos_weights
        self.neg_weights = neg_weights

    def forward(self, inputs, targets):
        """Return the mean weighted BCE over all elements.

        Args:
            inputs: raw logits (the sigmoid is applied inside this loss).
            targets: 0/1 targets with the same shape as `inputs`.
        """
        # log(sigmoid(x)) and log(1 - sigmoid(x)) == log(sigmoid(-x)) are
        # evaluated with logsigmoid so saturated logits give large finite
        # values instead of -inf, which turned into NaN when multiplied by a
        # zero target mask.
        log_prob_pos = nn.functional.logsigmoid(inputs)
        log_prob_neg = nn.functional.logsigmoid(-inputs)
        loss = targets * self.pos_weights * log_prob_pos + \
               (1 - targets) * self.neg_weights * log_prob_neg
        return -loss.mean()

In [None]:
# Per-class ROC AUC helper
def compute_AUCs(gt, pred):
    """Return a list with one ROC AUC per class column.

    Args:
        gt: tensor of ground-truth labels, indexed as [sample, class].
        pred: tensor of prediction scores with the same layout as `gt`.

    Returns:
        List of N_CLASSES AUC values, in column order.
    """
    targets_np = gt.cpu().numpy()
    scores_np = pred.cpu().numpy()
    return [
        roc_auc_score(targets_np[:, class_idx], scores_np[:, class_idx])
        for class_idx in range(N_CLASSES)
    ]

In [None]:
def validate(model, val_loader, criterion):
    """Evaluate `model` on a ten-crop validation loader.

    Each batch arrives as (batch, crops, channels, height, width). The crops
    are flattened into the batch dimension for the forward pass and the model
    outputs are averaged per image afterwards.

    Args:
        model: network to evaluate; switched to eval mode here.
        val_loader: loader yielding (inputs, targets) batches.
        criterion: loss applied to the crop-averaged outputs.

    Returns:
        Tuple (mean batch loss, mean AUC over classes).
    """
    model.eval()
    loss_total = 0.0
    all_targets = torch.FloatTensor().cuda()
    all_outputs = torch.FloatTensor().cuda()

    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.cuda()
            targets = targets.cuda()

            # Fold the crop dimension into the batch dimension for the model
            batch_size, n_crops, channels, height, width = inputs.size()
            crop_outputs = model(inputs.view(-1, channels, height, width))

            # Restore (batch, crops, classes) and average across the crops
            outputs = crop_outputs.view(batch_size, n_crops, -1).mean(1)

            loss_total += criterion(outputs, targets).item()

            # Keep everything for the epoch-level AUC
            all_targets = torch.cat((all_targets, targets), 0)
            all_outputs = torch.cat((all_outputs, outputs), 0)

    mean_loss = loss_total / len(val_loader)
    mean_auc = np.array(compute_AUCs(all_targets, all_outputs)).mean()

    return mean_loss, mean_auc

In [None]:
# Per-channel (mean, std) normalisation shared by the train and validation
# transforms. The values are the ImageNet channel statistics conventionally
# paired with torchvision's pretrained backbones.
normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

In [None]:
# Compute weights
# label_counts is a Series indexed by disease name, in disease_labels order.
label_counts = train_df[disease_labels].sum()
total_samples = len(train_df)

# Inverse-frequency weights; the positive weight is capped at 30 so very rare
# classes do not dominate the loss.
pos_weights = np.minimum(total_samples / (label_counts), 30)
neg_weights = total_samples / (total_samples - label_counts)

# Convert the class weights to tensors.
# Go through NumPy explicitly: handing a label-indexed Series straight to the
# tensor constructor depends on how pandas resolves integer keys on a string
# index, which is not stable across pandas versions.
pos_weights_tensor = torch.tensor(pos_weights.to_numpy(), dtype=torch.float32).cuda()
neg_weights_tensor = torch.tensor(neg_weights.to_numpy(), dtype=torch.float32).cuda()

# Initialize the custom loss function with these weights
criterion = WeightedBCELoss(pos_weights=pos_weights_tensor, neg_weights=neg_weights_tensor).cuda()

In [None]:
# Training-time augmentation pipeline
train_transform = transforms.Compose(
    [
        transforms.RandomResizedCrop(224),
        # Keep rotations small for medical imaging
        transforms.RandomRotation(degrees=5),
        transforms.ColorJitter(brightness=0.1, contrast=0.1),
        transforms.ToTensor(),
        normalize,
    ]
)


def _normalized_crop_stack(crops):
    """Convert each PIL crop to a tensor, normalise it, and stack along dim 0."""
    to_tensor = transforms.ToTensor()
    return torch.stack([normalize(to_tensor(crop)) for crop in crops])


# Validation pipeline: resize, take the ten standard crops, and return them
# as one (crops, channels, height, width) tensor per image.
val_transform = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.TenCrop(224),
        transforms.Lambda(_normalized_crop_stack),
    ]
)

In [None]:
# One full pass over the training loader
def train_one_epoch(model, train_loader, criterion, optimizer):
    """Train `model` for a single epoch and return the mean batch loss.

    Args:
        model: network to optimise; switched to train mode here.
        train_loader: loader yielding (inputs, labels) batches.
        criterion: loss applied to the model outputs and labels.
        optimizer: optimizer stepped once per batch.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    loss_sum = 0.0

    for batch_inputs, batch_labels in train_loader:
        batch_inputs = batch_inputs.cuda()
        batch_labels = batch_labels.cuda()

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        batch_loss = criterion(model(batch_inputs), batch_labels)

        # Backpropagate and update the weights
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()

    return loss_sum / len(train_loader)

In [None]:
from sklearn.model_selection import KFold

# Number of folds for cross-validation
N_FOLDS = 5
# Shuffled K-fold splitter; random_state=SEED fixes the fold assignment.
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# List to store results for each fold
# (the cross-validation loop appends one validation AUC per fold)
fold_results = []

In [None]:
# Number of training epochs run for every cross-validation fold.
EPOCHS = 20

In [None]:
from sklearn.model_selection import KFold

# NOTE(review): this cell repeats the K-fold setup of an earlier cell
# verbatim. Re-running it recreates `kf` and empties `fold_results`; one of
# the two cells can be removed.
# Number of folds for cross-validation
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# List to store AUC results for each fold
fold_results = []

In [None]:
# Cross-validation loop
#
# Folds are formed over patient IDs, not image rows. A patient can have
# several images, so splitting rows would put images of the same patient in
# both the training and the validation fold and inflate the validation AUC.
fold_patient_ids = np.unique(train_df['Patient ID'])

# Start from a clean list so re-running this cell does not mix in old results.
fold_results = []

for fold, (train_patient_idx, val_patient_idx) in enumerate(kf.split(fold_patient_ids)):
    print(f'\nFold {fold + 1}/{N_FOLDS}')

    # Create train and validation sets for the current fold
    train_fold_df = train_df[train_df['Patient ID'].isin(fold_patient_ids[train_patient_idx])]
    val_fold_df = train_df[train_df['Patient ID'].isin(fold_patient_ids[val_patient_idx])]

    # Create datasets and data loaders for this fold
    train_dataset = ChestXrayDataSet(train_fold_df, transform=train_transform)
    val_dataset = ChestXrayDataSet(val_fold_df, transform=val_transform)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

    # Fresh model and optimizer per fold so no weights carry over.
    # The loss (`criterion`) is the shared instance built earlier.
    model = DenseNet121(N_CLASSES, freeze_layers=True).cuda()
    model = torch.nn.DataParallel(model).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Training loop for this fold; the per-epoch work is delegated to the
    # train_one_epoch / validate helpers defined above instead of being
    # repeated inline.
    for epoch in range(EPOCHS):
        average_train_loss = train_one_epoch(model, train_loader, criterion, optimizer)
        average_val_loss, val_auc = validate(model, val_loader, criterion)

        # Print epoch results
        print(f'Fold {fold + 1}, Epoch {epoch+1}, Average Train Loss: {average_train_loss:.4f}, Average Val Loss: {average_val_loss:.4f}, Val AUC: {val_auc:.4f}')

        # Save the model for the current epoch.
        # The state dict comes from the DataParallel wrapper, so its keys
        # carry the 'module.' prefix.
        model_filename = f'chexnet_fold_{fold + 1}_epoch_{epoch + 1}_auc_{val_auc:.4f}.pth'
        torch.save(model.state_dict(), model_filename)
        print(f'Model for Fold {fold + 1}, Epoch {epoch + 1} saved as {model_filename}')

    # Store the AUC result for this fold (value of the final epoch)
    fold_results.append(val_auc)
    print(f'Fold {fold + 1} AUC: {val_auc:.4f}')

# Print the average AUC over all folds
mean_auc = np.mean(fold_results)
print(f'\nCross-Validation Mean AUC: {mean_auc:.4f}')

In [None]:
# # Create datasets and data loaders with the updated transformations
# train_dataset = ChestXrayDataSet(train_df, transform=train_transform)
# val_dataset = ChestXrayDataSet(val_df, transform=val_transform)

# train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
# val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

In [None]:
# # Set up the model
# cudnn.benchmark = True

# # Initialize the model
# model = DenseNet121(N_CLASSES, freeze_layers=True).cuda()
# model = torch.nn.DataParallel(model).cuda()

In [None]:
# optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# EPOCHS = 20
# for epoch in range(EPOCHS):
#     model.train()
#     running_loss = 0.0
#     total_batches = len(train_loader)

#     # Training loop
#     for batch_idx, (inputs, labels) in enumerate(train_loader):
#         inputs, labels = inputs.cuda(), labels.cuda()

#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()

#         running_loss += loss.item()

#         # Print loss for every 10 batches
#         if (batch_idx + 1) % 10 == 0:
#             print(f'Epoch {epoch+1}, Batch {batch_idx+1}/{total_batches}, Batch Loss: {loss.item():.4f}')

#     average_train_loss = running_loss / total_batches

#     # Inline validation step
#     model.eval()
#     val_loss = 0.0
#     gt = torch.FloatTensor().cuda()
#     pred = torch.FloatTensor().cuda()

#     with torch.no_grad():
#         for inputs, labels in val_loader:
#             inputs, labels = inputs.cuda(), labels.cuda()

#             # Handle TenCrop: reshape and process
#             bs, ncrops, c, h, w = inputs.size()
#             inputs = inputs.view(-1, c, h, w)  # reshape for model input
#             outputs = model(inputs)

#             # Average the results from the crops
#             outputs = outputs.view(bs, ncrops, -1).mean(1)

#             loss = criterion(outputs, labels)
#             val_loss += loss.item()

#             # Accumulate for AUC calculation
#             gt = torch.cat((gt, labels), 0)
#             pred = torch.cat((pred, outputs), 0)

#     average_val_loss = val_loss / len(val_loader)
#     AUROCs = compute_AUCs(gt, pred)
#     val_auc = np.array(AUROCs).mean()

#     # Print epoch results and save the model
#     print(f'Epoch {epoch+1}, Average Train Loss: {average_train_loss:.4f}, Average Val Loss: {average_val_loss:.4f}, Val AUC: {val_auc:.4f}')
#     model_filename = f'chexnet_epoch_{epoch + 1}_auc_{val_auc:.4f}.pth'
#     torch.save(model.state_dict(), model_filename)
#     print(f'Model saved as {model_filename}')