In [7]:
import copy
import os
import random
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from sklearn.metrics import cohen_kappa_score, precision_score, recall_score, accuracy_score
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from torchvision.transforms.functional import to_pil_image
from tqdm import tqdm
import torch.nn.functional as F
# from library import *

In [8]:
import copy
import os
import random
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from sklearn.metrics import cohen_kappa_score, precision_score, recall_score, accuracy_score
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from torchvision.transforms.functional import to_pil_image
from tqdm import tqdm

class RetinopathyDataset(Dataset):
    """Image dataset described by a CSV annotation file.

    Two layouts are supported, selected by ``mode``:

    * ``'single'`` - every CSV row is one sample (one image).
    * any other value - rows are grouped by patient id and eye (both parsed
      from the ``image_id`` column) and the first two images of each group
      form one sample.

    The record list is stored twice, back to back. Indices ``[0, sp)`` are
    served through ``default_transform``; indices ``[sp, 2 * sp)`` are served
    through the caller supplied ``transform`` when one was given. As a result
    ``len(dataset)`` is twice the number of annotated samples.

    NOTE(review): the duplication is unconditional, so it also applies when
    ``test=True``; confirm that duplicated rows are wanted for validation
    metrics and for exported test predictions.

    Args:
        ann_file: Path of the CSV annotation file.
        image_dir: Directory that the CSV ``img_path`` values are relative to.
        transform: Optional callable applied to the second copy of each sample.
        mode: ``'single'`` for one image per sample, otherwise dual-image.
        test: When True no label is read or returned.
    """

    def __init__(self, ann_file, image_dir, transform=None, mode='single', test=False):
        self.ann_file = ann_file
        self.image_dir = image_dir
        self.transform = transform

        self.test = test
        self.mode = mode
        self.default_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        records = self.load_data() if self.mode == 'single' else self.load_data_dual()
        # `sp` is the split point between the plain copy and the augmented copy.
        self.sp = len(records)
        self.data = records + records

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        if self.mode == 'single':
            return self.get_item(index)
        return self.get_item_dual(index)

    def _transform_for(self, index):
        """Return the transform that serves ``index`` (see the class docstring)."""
        if self.transform and index >= self.sp:
            return self.transform
        return self.default_transform

    # 1. single image
    def load_data(self):
        """Read the annotation CSV into a list of one-image records."""
        df = pd.read_csv(self.ann_file)

        data = []
        for _, row in df.iterrows():
            file_info = {'img_path': os.path.join(self.image_dir, row['img_path'])}
            if not self.test:
                file_info['dr_level'] = int(row['patient_DR_Level'])
            data.append(file_info)
        return data

    def get_item(self, index):
        """Return ``(image, label)``, or just ``image`` when ``test`` is set."""
        data = self.data[index]
        img = Image.open(data['img_path']).convert('RGB')
        # No RNG draw here: the custom transforms share the global `random`
        # stream, so an unused draw would only perturb their sequence.
        img = self._transform_for(index)(img)

        if self.test:
            return img
        label = torch.tensor(data['dr_level'], dtype=torch.int64)
        return img, label

    # 2. dual image
    def load_data_dual(self):
        """Read the annotation CSV into a list of two-image records.

        Rows are grouped by the part of ``image_id`` before the first ``_``
        (patient id) and the first character after it (eye).
        """
        df = pd.read_csv(self.ann_file)

        df['prefix'] = df['image_id'].str.split('_').str[0]  # The patient id of each image
        df['suffix'] = df['image_id'].str.split('_').str[1].str[0]  # The left or right eye
        grouped = df.groupby(['prefix', 'suffix'])

        data = []
        for (prefix, suffix), group in grouped:
            # NOTE(review): assumes every (patient, eye) group has at least two
            # rows; a single-row group raises IndexError on iloc[1], and rows
            # beyond the second are ignored.
            file_info = {
                'img_path1': os.path.join(self.image_dir, group.iloc[0]['img_path']),
                'img_path2': os.path.join(self.image_dir, group.iloc[1]['img_path']),
            }
            if not self.test:
                file_info['dr_level'] = int(group.iloc[0]['patient_DR_Level'])
            data.append(file_info)
        return data

    def get_item_dual(self, index):
        """Return ``([img1, img2], label)``, or ``[img1, img2]`` when ``test`` is set."""
        data = self.data[index]
        img1 = Image.open(data['img_path1']).convert('RGB')
        img2 = Image.open(data['img_path2']).convert('RGB')

        # The transform is invoked once per image, so random augmentations are
        # sampled independently for the two images of a pair.
        transform = self._transform_for(index)
        img1 = transform(img1)
        img2 = transform(img2)

        if self.test:
            return [img1, img2]
        label = torch.tensor(data['dr_level'], dtype=torch.int64)
        return [img1, img2], label

class SLORandomPad:
    """Pad an image up to a minimum size with a randomly placed offset.

    ``size`` is indexed as ``(width, height)``. Any missing width is split at a
    random point between the left and right edges, and any missing height
    between the top and bottom edges. Images already at least as large as
    ``size`` along an axis receive no padding on that axis.
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, img):
        missing_w = max(0, self.size[0] - img.width)
        missing_h = max(0, self.size[1] - img.height)

        # Draw order (left first, then top) is kept so the RNG stream is unchanged.
        left = random.randint(0, missing_w)
        top = random.randint(0, missing_h)

        padding = (left, top, missing_w - left, missing_h - top)
        return transforms.functional.pad(img, padding)

class FundRandomRotate:
    """Rotate an image by a random angle with a given probability.

    With probability ``prob`` the image is rotated by an angle drawn uniformly
    from ``[-degree, degree]``; otherwise it is returned unchanged.
    """

    def __init__(self, prob, degree):
        self.prob = prob
        self.degree = degree

    def __call__(self, img):
        # Guard clause: leave the image alone when the coin flip fails.
        if not random.random() < self.prob:
            return img
        rotation = random.uniform(-self.degree, self.degree)
        return transforms.functional.rotate(img, rotation)

def train_model(model, train_loader, val_loader, device, criterion, optimizer, lr_scheduler, num_epochs=25,
                checkpoint_path='model.pth'):
    """Train ``model`` and keep the weights with the best validation kappa.

    After every epoch the model is evaluated on ``val_loader``. Whenever the
    validation kappa improves, the weights are snapshotted and written to
    ``checkpoint_path``. Before returning, the best snapshot is loaded back
    into ``model`` so the returned model matches the saved checkpoint and the
    reported best kappa.

    Args:
        model: Network to train; called as ``model(images)``.
        train_loader: Yields ``(images, labels)`` where ``images`` is a tensor
            (single-image mode) or a list of tensors (dual-image mode).
        val_loader: Loader passed to ``evaluate_model`` after each epoch.
        device: Device the batches are moved to.
        criterion: Loss called as ``criterion(outputs, labels.long())``.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Scheduler stepped once per epoch.
        num_epochs: Number of epochs to run.
        checkpoint_path: File the best ``state_dict`` is saved to.

    Returns:
        ``(model, kappas)`` where ``kappas`` is a numpy array holding the
        validation kappa of each epoch.
    """
    # A deep copy is required: model.state_dict() returns references to the
    # live parameters, which keep changing as training continues.
    best_state = None
    best_epoch = None
    best_val_kappa = -1.0  # Initialize the best kappa score
    kappas = np.zeros(num_epochs)

    for epoch in range(1, num_epochs + 1):
        print(f'\nEpoch {epoch}/{num_epochs}')
        running_loss = []
        all_preds = []
        all_labels = []

        model.train()

        with tqdm(total=len(train_loader), desc=f'Training', unit=' batch', file=sys.stdout) as pbar:
            for images, labels in train_loader:
                if not isinstance(images, list):
                    images = images.to(device)  # single image case
                else:
                    images = [x.to(device) for x in images]  # dual images case

                labels = labels.to(device)

                optimizer.zero_grad()

                outputs = model(images)
                loss = criterion(outputs, labels.long())

                loss.backward()
                optimizer.step()

                preds = torch.argmax(outputs, 1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

                running_loss.append(loss.item())

                pbar.set_postfix({'lr': f'{optimizer.param_groups[0]["lr"]:.1e}', 'Loss': f'{loss.item():.4f}'})
                pbar.update(1)

        lr_scheduler.step()

        epoch_loss = sum(running_loss) / len(running_loss)

        train_metrics = compute_metrics(all_preds, all_labels, per_class=True)
        kappa, accuracy, precision, recall = train_metrics[:4]

        print(f'[Train] Kappa: {kappa:.4f} Accuracy: {accuracy:.4f} '
              f'Precision: {precision:.4f} Recall: {recall:.4f} Loss: {epoch_loss:.4f}')

        if len(train_metrics) > 4:
            precision_per_class, recall_per_class = train_metrics[4:]
            # Distinct loop names so the aggregate precision/recall are not shadowed.
            for i, (class_precision, class_recall) in enumerate(zip(precision_per_class, recall_per_class)):
                print(f'[Train] Class {i}: Precision: {class_precision:.4f}, Recall: {class_recall:.4f}')

        # Evaluation on the validation set at the end of each epoch
        val_metrics = evaluate_model(model, val_loader, device)
        val_kappa, val_accuracy, val_precision, val_recall = val_metrics[:4]
        print(f'[Val] Kappa: {val_kappa:.4f} Accuracy: {val_accuracy:.4f} '
              f'Precision: {val_precision:.4f} Recall: {val_recall:.4f}')
        kappas[epoch - 1] = val_kappa

        if val_kappa > best_val_kappa:
            best_val_kappa = val_kappa
            best_epoch = epoch
            best_state = copy.deepcopy(model.state_dict())
            torch.save(best_state, checkpoint_path)

    print(f'[Val] Best kappa: {best_val_kappa:.4f}, Epoch {best_epoch}')

    # Restore the best weights; when no epoch improved on the initial kappa
    # (best_epoch is None) the model is returned as trained.
    if best_epoch is not None:
        model.load_state_dict(best_state)

    return model, kappas

def evaluate_model(model, test_loader, device, test_only=False, prediction_path='./test_predictions.csv'):
    """Run ``model`` over ``test_loader`` and either score or export the predictions.

    Image file names are recovered from ``test_loader.dataset.data`` by batch
    position, so the loader must iterate the dataset in order (no shuffling).
    In dual-image mode the batch prediction (and label) is recorded once per
    image of the pair.

    Args:
        model: Network called as ``model(images)``.
        test_loader: Yields ``images`` when ``test_only`` is set, otherwise
            ``(images, labels)``; ``images`` is a tensor or a list of tensors.
        device: Device the images are moved to.
        test_only: When True, write predictions to ``prediction_path`` and
            return ``None``; otherwise return ``compute_metrics`` of the
            collected predictions and labels.
        prediction_path: CSV file written in ``test_only`` mode.
    """
    model.eval()

    predictions = []
    targets = []
    image_names = []

    with tqdm(total=len(test_loader), desc=f'Evaluating', unit=' batch', file=sys.stdout) as pbar:
        for batch_idx, batch in enumerate(test_loader):
            if test_only:
                images = batch
            else:
                images, labels = batch

            dual = isinstance(images, list)
            if dual:
                images = [x.to(device) for x in images]  # dual images case
            else:
                images = images.to(device)  # single image case

            with torch.no_grad():
                outputs = model(images)
                preds = torch.argmax(outputs, 1)

            # Each stream is (record key holding the file path, index into `images`).
            if dual:
                streams = [(f'img_path{k + 1}', k) for k in range(2)]
            else:
                streams = [('img_path', None)]

            for path_key, position in streams:
                predictions.extend(preds.cpu().numpy())
                view = images if position is None else images[position]
                first = batch_idx * test_loader.batch_size
                image_names.extend(
                    os.path.basename(test_loader.dataset.data[idx][path_key])
                    for idx in range(first, first + len(view))
                )
                if not test_only:
                    targets.extend(labels.numpy())

            pbar.update(1)

    if not test_only:
        return compute_metrics(predictions, targets)

    # Save predictions to csv file for Kaggle online evaluation
    submission = pd.DataFrame({
        'ID': image_names,
        'TARGET': predictions
    })
    submission.to_csv(prediction_path, index=False)
    print(f'[Test] Save predictions to {os.path.abspath(prediction_path)}')

def compute_metrics(preds, labels, per_class=False):
    """Score predictions against labels.

    Returns ``(kappa, accuracy, precision, recall)`` where kappa is the
    quadratic-weighted Cohen's kappa and precision/recall are weighted
    averages. With ``per_class=True`` two more entries are appended: the
    per-class precision and the per-class recall.
    """
    summary = (
        cohen_kappa_score(labels, preds, weights='quadratic'),
        accuracy_score(labels, preds),
        precision_score(labels, preds, average='weighted', zero_division=0),
        recall_score(labels, preds, average='weighted', zero_division=0),
    )
    if not per_class:
        return summary

    # average=None yields one score per class instead of an aggregate.
    class_precision = precision_score(labels, preds, average=None, zero_division=0)
    class_recall = recall_score(labels, preds, average=None, zero_division=0)
    return summary + (class_precision, class_recall)

In [9]:
# Seed every RNG the notebook draws from. torch covers DataLoader shuffling and
# the torchvision transforms; the stdlib `random` module is what SLORandomPad
# and FundRandomRotate use; numpy is seeded as a precaution for numpy-based code.
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

batch_size = 24  # samples per DataLoader batch
num_classes = 5  # 5 DR levels
learning_rate = 0.0001  # Adam learning rate used by train_and_save
num_epochs = 20
# Root folder of the Kaggle input dataset (contains the DeepDRiD directory).
dataset_folder = "/kaggle/input/finaldeeplearning/521153S-3005-final-project"

In [10]:
# Training-time augmentation pipeline. The steps run in the order listed.
transform_train = transforms.Compose([
    # Resize, take a random 210x210 crop, then pad back up to 224x224 at a
    # random offset (see SLORandomPad).
    transforms.Resize((256, 256)),
    transforms.RandomCrop((210, 210)),
    SLORandomPad((224, 224)),
    # Geometric augmentations: with probability 0.5 a rotation of up to 30
    # degrees, random flips, then a further random rotation (argument 90).
    FundRandomRotate(prob=0.5, degree=30),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomRotation(90),
    # Photometric augmentations.
    # NOTE(review): the brightness range (0.1, 0.9) lies entirely below 1, so
    # this presumably only ever darkens the image - confirm that is intended.
    transforms.ColorJitter(brightness=(0.1, 0.9)),
    transforms.RandomInvert(p=0.7),  # p=0.7
    transforms.RandomGrayscale(p=0.5),  # p=0.5
    transforms.RandomPerspective(p=0.3),  # p=0.3
    transforms.GaussianBlur(kernel_size=5),  # kernel size 5; no probability argument is passed
    transforms.ToTensor(),
    # Same mean/std as RetinopathyDataset.default_transform (presumably the
    # ImageNet channel statistics - TODO confirm).
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Deterministic evaluation pipeline: resize and normalise only. It performs the
# same three steps as RetinopathyDataset.default_transform.
transform_test = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [11]:
def train_and_save(model, name, num_epochs=10, data_root=None, mode='single'):
    """Fine-tune ``model`` on DeepDRiD, then export test-set predictions.

    The best checkpoint (by validation kappa) is written to
    ``./aptos-resnet-deepdrip_{name}.pth`` and reloaded before predictions are
    written to ``./test_predictions_aptos-{name}-deepdrip.csv``.

    Uses the notebook-level ``batch_size``, ``learning_rate``,
    ``transform_train`` and ``transform_test``.

    Args:
        model: Network to train.
        name: Tag inserted into the checkpoint and prediction file names.
        num_epochs: Number of training epochs.
        data_root: Folder containing the ``DeepDRiD`` directory. Defaults to
            the notebook-level ``dataset_folder``.
        mode: Dataset mode passed to ``RetinopathyDataset``. It is an explicit
            parameter because the notebook-level name ``mode`` is also bound
            by ``from scipy.stats import mode`` in a later cell.

    Returns:
        ``(model, kappas)`` as returned by ``train_model``, with the best
        checkpoint loaded into ``model``.
    """
    if data_root is None:
        data_root = dataset_folder
    deepdrid_dir = os.path.join(data_root, 'DeepDRiD')

    # os.path.join keeps the paths valid whether or not data_root ends in a slash.
    train_dataset = RetinopathyDataset(os.path.join(deepdrid_dir, 'train.csv'),
                                       os.path.join(deepdrid_dir, 'train'), transform_train, mode)
    val_dataset = RetinopathyDataset(os.path.join(deepdrid_dir, 'val.csv'),
                                     os.path.join(deepdrid_dir, 'val'), transform_test, mode)
    test_dataset = RetinopathyDataset(os.path.join(deepdrid_dir, 'test.csv'),
                                      os.path.join(deepdrid_dir, 'test'), transform_test, mode, test=True)

    # Create dataloaders; evaluation loaders must not shuffle because
    # evaluate_model maps predictions back to files by position.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Plain (unweighted) cross-entropy loss
    criterion = nn.CrossEntropyLoss()

    # Use the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Device:', device)

    # Move the model to the device
    model = model.to(device)

    # Optimizer and Learning rate scheduler
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    checkpoint_path = './aptos-resnet-deepdrip_{}.pth'.format(name)

    # Train and evaluate the model with the training and validation set
    model, kappas = train_model(
        model, train_loader, val_loader, device, criterion, optimizer,
        lr_scheduler=lr_scheduler, num_epochs=num_epochs,
        checkpoint_path=checkpoint_path
    )

    # Reload the best checkpoint written during training
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state_dict, strict=True)

    # Make predictions on testing set and save the prediction results
    evaluate_model(model, test_loader, device, test_only=True,
                   prediction_path="./test_predictions_aptos-{}-deepdrip.csv".format(name))
    return model, kappas

In [35]:
# Checkpoints of the three fine-tuned base models.
densenet_model_path = "/kaggle/input/models/models/aptos-resnet-deepdrip_densenet.pth"
resnet_model_path = "/kaggle/input/models/models/aptos-resnet-deepdrip_resnet.pth"
vgg_model_path = "/kaggle/input/models/models/aptos-resnet-deepdrip_vgg.pth"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Each model is built with torchvision defaults and then filled from its
# checkpoint. strict=True means the checkpoint keys and shapes must match the
# architecture exactly. weights_only takes a bool (not the string "True").
resnet_model = models.resnet18()
name = "resnet"
state_dict = torch.load(resnet_model_path, map_location=device, weights_only=True)
resnet_model.load_state_dict(state_dict, strict=True)

vgg_model = models.vgg16()
name = "vgg"
state_dict = torch.load(vgg_model_path, map_location=device, weights_only=True)
vgg_model.load_state_dict(state_dict, strict=True)

densenet_model = models.densenet161()
name = "densenet"
state_dict = torch.load(densenet_model_path, map_location=device, weights_only=True)
densenet_model.load_state_dict(state_dict, strict=True)


# Define dataset and loaders
mode = 'single'
data_path = "/kaggle/input/finaldeeplearning/521153S-3005-final-project/"
train_dataset = RetinopathyDataset(data_path+'DeepDRiD/train.csv', data_path+'DeepDRiD/train/', transform_train, mode)
val_dataset = RetinopathyDataset(data_path+'DeepDRiD/val.csv', data_path+'DeepDRiD/val/', transform_test, mode)
test_dataset = RetinopathyDataset(data_path+'DeepDRiD/test.csv', data_path+'DeepDRiD/test/', transform_test, mode, test=True)
# No loader shuffles (including train): the ensemble code below pairs
# predictions with labels by position.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# Base models of the ensemble, moved to the evaluation device.
candidate_models = [resnet_model,vgg_model,densenet_model]
candidate_models = [model.to(device) for model in candidate_models]
    

In [36]:
def generate_predictions(models, data_loader, device):
    """
    Generate class probabilities for a list of models and collect the ground truths.

    The loader is iterated exactly once and every batch is fed to every model.
    Labels and predictions therefore stay aligned even if the loader shuffles,
    and all models see identical inputs when the dataset applies random
    augmentation.

    Args:
        models (list): List of models to generate predictions from.
        data_loader (DataLoader): Yields ``(images, labels)``; ``images`` is a
            tensor or, in dual-image mode, a list of tensors.
        device (torch.device): Device to run the models on.

    Returns:
        model_predictions (list): One ``(num_samples, num_outputs)`` numpy array of
            softmax probabilities per model, in the order of ``models``.
        ground_truths (np.array): Ground truth labels for the data loader.

    Raises:
        ValueError: From ``np.vstack`` when ``models`` is non-empty and the
            loader yields no batches.
    """
    models = list(models)
    for model in models:
        model.eval()  # Set model to evaluation mode

    ground_truths = []
    batches_per_model = [[] for _ in models]  # probabilities, one list per model

    with torch.no_grad():
        for images, labels in data_loader:
            if isinstance(labels, torch.Tensor):
                labels = labels.cpu().numpy()
            ground_truths.extend(labels)

            if isinstance(images, list):
                images = [x.to(device) for x in images]  # dual images case
            else:
                images = images.to(device)  # single image case

            for collected, model in zip(batches_per_model, models):
                outputs = model(images)
                # Convert logits to probabilities
                collected.append(F.softmax(outputs, dim=1).cpu().numpy())

    # Combine all batch predictions of each model into one array
    model_predictions = [np.vstack(collected) for collected in batches_per_model]
    return model_predictions, np.array(ground_truths)
# Per-model validation probabilities (a) and validation labels (b). Neither
# name is referenced again in the visible cells; the ensemble functions below
# call generate_predictions themselves.
a,b = generate_predictions(candidate_models, val_loader, device)


In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import numpy as np
import torch

def stacking(candidate_models, train_loader, val_loader, device):
    """Stacked ensemble: a logistic-regression meta-model over base-model probabilities.

    The probabilities of all base models are concatenated per sample and used
    as features. The meta-model is fitted on the training loader, its training
    accuracy is printed, and its predictions for the validation loader are
    returned.
    """
    # Meta-features and targets from the training loader
    base_train_probs, y_train = generate_predictions(candidate_models, train_loader, device)
    X_train = np.hstack(base_train_probs)

    # Meta-features from the validation loader (its labels are not needed here)
    base_val_probs, _ = generate_predictions(candidate_models, val_loader, device)
    X_val = np.hstack(base_val_probs)

    meta_model = LogisticRegression(max_iter=1000)
    meta_model.fit(X_train, y_train)

    train_fit = meta_model.predict(X_train)
    val_fit = meta_model.predict(X_val)

    # Report how well the meta-model fits its own training data
    train_accuracy = accuracy_score(y_train, train_fit)
    print(f"Meta-Model Training Accuracy: {train_accuracy:.4f}")

    return val_fit

def weighted_average(candidate_models, train_loader, val_loader, device):
    """Soft-voting ensemble weighted by each model's training accuracy.

    Every model's validation probabilities are scaled by its normalised
    accuracy on the training loader, the scaled probabilities are summed, and
    the arg-max class per sample is returned.

    Raises:
        ValueError: If ``candidate_models`` produces no predictions.
    """
    train_predictions, train_labels = generate_predictions(candidate_models, train_loader, device)
    accuracies = np.array(
        [accuracy_score(train_labels, np.argmax(preds, axis=1)) for preds in train_predictions],
        dtype=float,
    )
    if accuracies.size == 0:
        raise ValueError("weighted_average needs at least one candidate model")

    total = accuracies.sum()
    if total > 0:
        weights = accuracies / total
    else:
        # Every model scored 0: dividing by the sum would give NaN weights (and
        # all-zero predictions), so fall back to a plain average.
        weights = np.full(accuracies.size, 1.0 / accuracies.size)

    val_predictions, _ = generate_predictions(candidate_models, val_loader, device)
    weighted_val_preds = sum(w * preds for w, preds in zip(weights, val_predictions))
    final_predictions = np.argmax(weighted_val_preds, axis=1)
    return final_predictions

def max_voting(candidate_models, val_loader, device):
    """Hard-voting ensemble: the most frequent predicted class per sample.

    Each model votes with its arg-max class and the mode of the votes is
    returned as a 1-D array with one entry per validation sample.
    """
    # Imported locally under an alias: the notebook also binds the global name
    # `mode` to the dataset mode string ('single'), which would otherwise make
    # this function depend on cell execution order.
    from scipy.stats import mode as scipy_mode

    val_predictions, _ = generate_predictions(candidate_models, val_loader, device)
    class_predictions = [np.argmax(preds, axis=1) for preds in val_predictions]
    votes = np.column_stack(class_predictions)  # shape: (num_samples, num_models)
    final_predictions = scipy_mode(votes, axis=1).mode.flatten()
    return final_predictions

def evaluate_ensemble_methods(candidate_models, train_loader, val_loader, device):
    """
    Score the stacking, weighted-average and max-voting ensembles on the validation set.

    Validation labels are read from ``val_loader`` once. Each ensemble is then
    run in turn and its metrics (including per-class precision/recall) are printed.
    """
    print("\nExtracting ground truth labels from the validation loader...")
    val_labels = np.array([
        label
        for _, batch_labels in val_loader
        for label in batch_labels.numpy()
    ])

    # (display name, zero-argument runner) for every ensemble, in report order.
    ensembles = (
        ("Stacking", lambda: stacking(candidate_models, train_loader, val_loader, device)),
        ("Weighted Average", lambda: weighted_average(candidate_models, train_loader, val_loader, device)),
        ("Max Voting", lambda: max_voting(candidate_models, val_loader, device)),
    )

    for method_name, run in ensembles:
        print(f"\nEvaluating {method_name}...")
        method_preds = run()
        method_metrics = compute_metrics(method_preds, val_labels, per_class=True)
        print_metrics(method_name, method_metrics)

def print_metrics(method_name, metrics):
    """
    Print a metrics tuple as produced by ``compute_metrics``.

    The first four entries are reported as kappa, accuracy, weighted precision
    and weighted recall. When more entries are present they are taken to be the
    per-class precision and per-class recall and are printed class by class.
    """
    kappa, accuracy, precision, recall = metrics[:4]

    print(f"=== {method_name} ===")
    headline = (
        ("Kappa", kappa),
        ("Accuracy", accuracy),
        ("Precision (Weighted)", precision),
        ("Recall (Weighted)", recall),
    )
    for label, value in headline:
        print(f"{label}: {value:.4f}")

    if len(metrics) <= 4:
        return

    # Per-class breakdown
    precision_per_class, recall_per_class = metrics[4:]
    for class_idx, (p, r) in enumerate(zip(precision_per_class, recall_per_class)):
        print(f"Class {class_idx}: Precision: {p:.4f}, Recall: {r:.4f}")
# Run all three ensembles on the validation set; the printed report follows.
evaluate_ensemble_methods(candidate_models, train_loader, val_loader, device)


Extracting ground truth labels from the validation loader...

Evaluating Stacking...
Meta-Model Training Accuracy: 0.8662
=== Stacking ===
Kappa: 0.8430
Accuracy: 0.6900
Precision (Weighted): 0.6964
Recall (Weighted): 0.6900
Class 0: Precision: 0.8478, Recall: 0.9750
Class 1: Precision: 0.7037, Recall: 0.4750
Class 2: Precision: 0.4623, Recall: 0.6125
Class 3: Precision: 0.7778, Recall: 0.7000
Class 4: Precision: 0.5333, Recall: 0.4000

Evaluating Weighted Average...
=== Weighted Average ===
Kappa: 0.8590
Accuracy: 0.7050
Precision (Weighted): 0.7026
Recall (Weighted): 0.7050
Class 0: Precision: 0.8500, Recall: 0.9917
Class 1: Precision: 0.7321, Recall: 0.5125
Class 2: Precision: 0.5000, Recall: 0.6000
Class 3: Precision: 0.7143, Recall: 0.7500
Class 4: Precision: 0.5833, Recall: 0.3500

Evaluating Max Voting...
=== Max Voting ===
Kappa: 0.8539
Accuracy: 0.6875
Precision (Weighted): 0.6792
Recall (Weighted): 0.6875
Class 0: Precision: 0.8500, Recall: 0.9917
Class 1: Precision: 0.6562,

In [None]:
# Same call, with the same arguments, as the one at the end of the previous code cell.
evaluate_ensemble_methods(candidate_models,train_loader, val_loader, device)