In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import torch

from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score

from tqdm import trange, tqdm

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Cohort table: supplies the prediction targets used by the tasks below.
filename = "ckd_cohort"
cohort = pd.read_csv(f"../eval_datasets/{filename}.csv")

# Load the precomputed med-gte-simcse and med-gte-tsdae embeddings for this cohort.
loaded_embeddings = {}
for modelname in ('simcse', 'tsdae'):
    fp = f'../data/embeddings/{modelname}_{filename}.npy'
    loaded_embeddings[modelname] = np.load(fp)

embeddings_simcse = loaded_embeddings['simcse']
embeddings_tsdae = loaded_embeddings['tsdae']

# med-gte-hybrid: the two embeddings concatenated along the last (feature) axis.
ensemble_embeddings = np.concatenate([embeddings_simcse, embeddings_tsdae], axis=-1)

In [7]:
# CKD prognosis task setup

class CKDPredictor(nn.Module):
    """Feed-forward binary classifier head applied to an embedding vector.

    The network is input_size -> 128 -> 64 -> 32 -> 1 with ReLU between the
    linear layers and a sigmoid on the single output unit.
    """

    def __init__(self, input_size):
        """Build the layer stack for inputs with `input_size` features."""
        super().__init__()

        stack = []
        fan_in = input_size
        # Hidden layers are created in order so parameter names stay layers.0, layers.2, ...
        for width in (128, 64, 32):
            stack.append(nn.Linear(fan_in, width))
            stack.append(nn.ReLU())
            fan_in = width

        # Single sigmoid-activated output unit.
        stack.append(nn.Linear(fan_in, 1))
        stack.append(nn.Sigmoid())

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the sigmoid output of the stack for `x`."""
        return self.layers(x)

def ckd_prognosis(embeddings, name='default'):
    """Train and score a CKDPredictor on `embeddings` with stratified 5-fold CV.

    Targets are read from the module-level `cohort['label']`. For each fold a
    fresh CKDPredictor is trained for 150 epochs with Adam (lr=0.001) and
    BCELoss, then scored on the held-out fold with AUROC and AUPRC. Per-fold
    and mean (+/- std) metrics are printed; nothing is returned.

    Side effect: after training, each fold writes the model's state dict to
    '../models/ckd_pred.pth', so the file ends up holding the last fold's weights.

    Args:
        embeddings: 2-D array of features, indexable by an integer index array;
            rows are assumed to be aligned with the rows of `cohort`.
        name: Label used only in the progress message.
    """
    print(f'\nRunning CKD prognosis for model {name} with 5-fold cross-validation')

    X = embeddings
    y = cohort['label']

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    aurocs = []
    auprcs = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y), 1):
        print(f"\nFold {fold}")

        # Split the data
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Convert to PyTorch tensors
        X_train_tensor = torch.FloatTensor(X_train)
        y_train_tensor = torch.FloatTensor(y_train.values)
        X_val_tensor = torch.FloatTensor(X_val)

        # Create DataLoader
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

        # Initialize the model
        model = CKDPredictor(input_size=len(X[0]))
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        # Training loop
        num_epochs = 150
        t = trange(num_epochs, desc="Epochs")
        for epoch in t:
            model.train()
            epoch_loss = 0.0
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                # squeeze(-1) removes only the trailing output axis. A bare
                # squeeze() would also collapse a final batch of size 1 to a
                # 0-d tensor, and BCELoss then raises because the prediction
                # shape no longer matches batch_y's shape (1,).
                outputs = model(batch_X).squeeze(-1)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
            avg_loss = epoch_loss / len(train_loader)
            t.set_description(f"Loss: {avg_loss:.4f}")

        # Evaluation
        model.eval()
        # Same path for every fold: each fold overwrites the previous checkpoint.
        torch.save(model.state_dict(), '../models/ckd_pred.pth')
        with torch.no_grad():
            # Keep the sample axis even for a single-row validation fold.
            y_pred_proba = model(X_val_tensor).squeeze(-1).numpy()

        auroc = roc_auc_score(y_val, y_pred_proba)
        auprc = average_precision_score(y_val, y_pred_proba)

        aurocs.append(auroc)
        auprcs.append(auprc)

        # Print fold results
        print(f"Fold {fold} - AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")

    # Print average results
    print("\nAverage results across 5 folds:")
    print(f"AUROC: {np.mean(aurocs):.4f} (+/- {np.std(aurocs):.4f})")
    print(f"AUPRC: {np.mean(auprcs):.4f} (+/- {np.std(auprcs):.4f})")

In [4]:
# eGFR prediction setup

class EGFRPredictor(nn.Module):
    """Feed-forward regression head applied to an embedding vector.

    The network is input_dim -> 256 -> 128 -> 64 -> 1 with ReLU between the
    linear layers and no activation on the single output unit.
    """

    def __init__(self, input_dim):
        """Build the layer stack for inputs with `input_dim` features."""
        super().__init__()

        stack = []
        fan_in = input_dim
        # Hidden layers are created in order so parameter names stay layers.0, layers.2, ...
        for width in (256, 128, 64):
            stack.append(nn.Linear(fan_in, width))
            stack.append(nn.ReLU())
            fan_in = width

        # Linear output: one unbounded value per sample.
        stack.append(nn.Linear(fan_in, 1))

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw output of the stack for `x`."""
        return self.layers(x)

def train(model, train_loader, criterion, optimizer, num_epochs):
    """Run `num_epochs` passes of gradient descent over `train_loader`.

    Each batch is pushed through `model`, scored with `criterion`, and used for
    one `optimizer` step. The progress bar description is updated after every
    epoch with the mean batch loss. The model is updated in place; nothing is
    returned.
    """
    progress = trange(num_epochs, desc="Epochs")
    for _ in progress:
        model.train()
        batch_losses = []
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())

        # Mean over batches (not over samples), matching len(train_loader).
        mean_loss = sum(batch_losses, 0.0) / len(train_loader)
        progress.set_description(f"Loss: {mean_loss:.4f}")

def evaluate(model, test_loader):
    """Score `model` on all batches of `test_loader`.

    The model is switched to eval mode and run without gradient tracking.
    Targets and predictions from every batch are gathered and compared.

    Returns:
        Tuple `(mae, r2)`: mean absolute error and R-squared score.
    """
    model.eval()
    targets, predictions = [], []
    with torch.no_grad():
        for inputs, batch_targets in test_loader:
            batch_preds = model(inputs)
            # Collect row by row so all batches end up in one flat sequence.
            targets += list(batch_targets.numpy())
            predictions += list(batch_preds.numpy())

    y_true = np.array(targets)
    y_pred = np.array(predictions)

    return mean_absolute_error(y_true, y_pred), r2_score(y_true, y_pred)

def egfr_pred(embeddings, name='default'):
    """Train and score an EGFRPredictor on `embeddings` with 5-fold CV.

    Targets are read from the module-level `cohort['egfr']`. For each fold a
    fresh EGFRPredictor is trained for 50 epochs with Adam (lr=0.001) and MSE
    loss, then scored on the held-out fold. Per-fold and mean (+/- std) MAE and
    R-squared are printed.

    Args:
        embeddings: 2-D array of features, indexable by an integer index array;
            rows are assumed to be aligned with the rows of `cohort`.
        name: Label used only in the progress message.

    Returns:
        The model trained on the last fold.
    """
    print(f'\nRunning EGFR prediction for model {name} with 5-fold cross-validation')
    features = embeddings
    targets = cohort['egfr'].values

    def build_loader(indices, shuffle):
        # Targets become a column vector to match the model's (batch, 1) output.
        inputs = torch.FloatTensor(features[indices])
        labels = torch.FloatTensor(targets[indices]).view(-1, 1)
        return DataLoader(TensorDataset(inputs, labels), batch_size=64, shuffle=shuffle)

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_maes, fold_r2s = [], []

    for fold, (train_idx, val_idx) in enumerate(splitter.split(features), start=1):
        print(f"\nFold {fold}")

        # Only the training loader is shuffled; validation order is fixed.
        train_loader = build_loader(train_idx, shuffle=True)
        val_loader = build_loader(val_idx, shuffle=False)

        # Fresh model and optimizer for every fold.
        model = EGFRPredictor(input_dim=len(features[0]))
        loss_fn = nn.MSELoss()
        adam = optim.Adam(model.parameters(), lr=0.001)
        train(model, train_loader, loss_fn, adam, 50)

        mae, r2 = evaluate(model, val_loader)
        fold_maes.append(mae)
        fold_r2s.append(r2)

        print(f"Fold {fold} - Mean Absolute Error: {mae:.4f}, R-squared Score: {r2:.4f}")

    print("\nAverage results across 5 folds:")
    print(f"Mean Absolute Error: {np.mean(fold_maes):.4f} (+/- {np.std(fold_maes):.4f})")
    print(f"R-squared Score: {np.mean(fold_r2s):.4f} (+/- {np.std(fold_r2s):.4f})")

    return model

In [10]:
# Prognosis task: is CKD going to be cured (class 0) or not (class 1)?
ckd_prognosis(ensemble_embeddings, name='ensemble')

# Predict eGFR values for an admission
egfr_pred(ensemble_embeddings, name='ensemble')