In [None]:
import os

import numpy as np
import torch
from sklearn import metrics
from sklearn import metrics as sk_metrics  # stable alias: the name `metrics` is later rebound to a list
from torch.nn import BCELoss
from torch.optim import RMSprop
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.sampler import SubsetRandomSampler
from tqdm.notebook import tqdm

from utils import Config, build_model, Metric, Standardizer, EarlyStopping

In [None]:
# Input data directory and output directory for this experiment.
data_folder = './data/mimic3_17f_24h/'
results_folder = './results/mimic3_17f_24h/'

# Input files, all resolved relative to data_folder.
data_filename, folds_filename, features_filename = (
    os.path.join(data_folder, basename)
    for basename in ('imputed-normed-ep_1_24.npz', '5-folds.npz', 'input.csv')
)

In [None]:
# Fail fast with a real exception when the input data is missing. In a notebook,
# `exit(1)` does not surface an error in the cell output, so "Run All" would not
# stop with a clear message; raising does.
if not os.path.exists(data_folder):
    raise FileNotFoundError(
        f'Wrong data_folder specified. This folder must exist: {data_folder}'
    )

# Create the output directory if needed; exist_ok avoids a check-then-create race.
os.makedirs(results_folder, exist_ok=True)

In [None]:
# Experiment settings; Config comes from the project-local `utils` module.
# Later cells read label_type, validation_split, batch_size, learning_rate,
# early_stopping_patience and epochs from this object.
config = Config()
# Bare expression so the notebook displays the configuration as the cell output.
config

In [None]:
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code. Only load archives from a trusted source.

# Cross-validation fold indices for the selected label type. The archive is opened
# in a `with` block so the underlying file handle is closed once the array is read.
with np.load(folds_filename, allow_pickle=True) as folds_file:
    folds = folds_file['folds_ep_mor'][config.label_type][0]

# Labels: take the column for the selected label type and binarize it (> 0 -> 1.0).
with np.load(data_filename, allow_pickle=True) as data_file:
    y = data_file['adm_labels_all'][:, config.label_type]
y = (y > 0).astype(float)

# Feature matrix read from a comma-separated text file.
# NOTE(review): assumes the CSV has no header row and one row per label in `y` — confirm.
X = np.genfromtxt(features_filename, delimiter=',')

In [None]:
# Labels identifying this experiment; they form the results filename and the
# header line written to the results file.
TASK_NAME = 'Mortality'
CLF_NAME = 'TraditionalFeedForwardNetwork'

In [None]:
# Evaluation metrics tracked across folds.
#
# The scoring functions are taken from the `sk_metrics` alias rather than from
# `metrics`, because this cell rebinds `metrics` to the list below. Reading them
# from `metrics` made the cell fail with AttributeError when run a second time.
#
# use_soft=True  -> the metric is scored on the model's raw outputs.
# use_soft=False -> the metric is scored on predictions thresholded at 0.5.
metrics = [
    Metric('Accuracy', sk_metrics.accuracy_score, use_soft=False),
    Metric('Precision', sk_metrics.precision_score, use_soft=False),
    Metric('Recall', sk_metrics.recall_score, use_soft=False),
    Metric('F1 score', sk_metrics.f1_score, use_soft=False),
    Metric('ROC AUC', sk_metrics.roc_auc_score, use_soft=True),
    Metric('Average precision', sk_metrics.average_precision_score, use_soft=True),
]

In [None]:
def create_datasets(X, y, train_idx, test_idx):
    """Build train, validation and test DataLoaders over one shared dataset.

    The first ``config.validation_split`` fraction of ``train_idx`` (rounded
    down) becomes the validation subset; the remainder is used for training.
    Every loader draws its own index subset in random order.

    Args:
        X: Feature array, one row per sample.
        y: Label array; reshaped to a column so targets have shape (N, 1).
        train_idx: Indices available for training and validation.
        test_idx: Indices held out for testing.

    Returns:
        Tuple ``(train_loader, valid_loader, test_loader)``.
    """
    features = torch.Tensor(X)
    labels = torch.Tensor(y).view(-1, 1)
    full_dataset = TensorDataset(features, labels)

    # Leading slice of train_idx is held out for validation.
    n_valid = int(np.floor(config.validation_split * len(train_idx)))
    index_subsets = (train_idx[n_valid:], train_idx[:n_valid], test_idx)

    def make_loader(indices):
        # All loaders share the dataset and batch size; only the indices differ.
        return DataLoader(full_dataset,
                          batch_size=config.batch_size,
                          sampler=SubsetRandomSampler(indices),
                          num_workers=0)

    train_loader, valid_loader, test_loader = map(make_loader, index_subsets)
    return train_loader, valid_loader, test_loader

In [None]:
def train(model, train_loader, valid_loader):
    """Train ``model`` with RMSprop on binary cross-entropy, with early stopping.

    Each epoch runs one optimization pass over ``train_loader`` and then computes
    the mean per-batch loss over ``valid_loader``. That validation loss is passed
    to ``EarlyStopping.should_early_stop``; training stops when it returns True or
    after ``config.epochs`` epochs. Finally the state held in
    ``early_stopping.best_model_state`` is loaded back into the model.

    Args:
        model: Module whose output is compared to the targets with ``BCELoss``.
        train_loader: Iterable of ``(data, target)`` batches used for optimization.
        valid_loader: Iterable of ``(data, target)`` batches used for early stopping.

    Returns:
        The same ``model`` object, with the early-stopping state loaded.
    """
    criterion = BCELoss()  # binary cross-entropy
    optimizer = RMSprop(model.parameters(), lr=config.learning_rate)
    early_stopping = EarlyStopping(patience=config.early_stopping_patience)

    for _epoch in tqdm(range(config.epochs)):

        model.train()
        for data, target in train_loader:
            optimizer.zero_grad()

            output = model(data)

            loss = criterion(output, target)
            loss.backward()

            optimizer.step()

        model.eval()
        valid_losses = []
        # No gradients are needed for validation; skipping autograd avoids
        # building a graph per batch and matches what predict() does.
        with torch.no_grad():
            for data, target in valid_loader:
                output = model(data)
                loss = criterion(output, target)
                valid_losses.append(loss.item())
        valid_loss = np.average(valid_losses)

        if early_stopping.should_early_stop(valid_loss, model):
            break

    # NOTE(review): presumably best_model_state is the state dict from the epoch
    # with the lowest validation loss — confirm against utils.EarlyStopping.
    model.load_state_dict(early_stopping.best_model_state)

    return model

In [None]:
def predict(model, data_loader):
    """Run ``model`` over ``data_loader`` and collect outputs with their targets.

    Batches are written into preallocated buffers at a running offset, so the
    function does not depend on ``data_loader.batch_size`` or on the number of
    batches. It therefore also works with ``drop_last=True`` and with a custom
    ``batch_sampler`` (where ``batch_size`` is ``None``).

    Args:
        model: Module called on each data batch; its output is flattened to 1-D.
        data_loader: DataLoader yielding ``(data, target)`` batches.

    Returns:
        Tuple ``(predictions, targets)`` of 1-D tensors in iteration order, with
        one entry per sample actually yielded by the loader.
    """
    model.eval()

    # len(sampler) is an upper bound on the number of samples the loader yields.
    num_elements = len(data_loader.sampler)
    predictions = torch.zeros(num_elements)
    targets = torch.zeros(num_elements)

    filled = 0
    with torch.no_grad():
        for data, target in data_loader:
            flat_target = target.view(-1)
            end = filled + flat_target.numel()

            targets[filled:end] = flat_target
            predictions[filled:end] = model(data).view(-1)
            filled = end

    # Trim unused slots (e.g. samples skipped because of drop_last).
    return predictions[:filled], targets[:filled]

In [None]:
# Start from empty score lists so re-running this cell does not append to the
# scores of a previous run (which would skew the reported mean and std).
for metric in metrics:
    metric.scores.clear()

for train_idx, valid_idx, test_idx in folds:
    # The fold's own validation indices are merged into the training indices;
    # create_datasets() carves a validation subset out of the merged set.
    train_idx = np.concatenate((train_idx, valid_idx))

    # Fit the standardizer on training rows only, then transform every row, so
    # no test-set statistics leak into the scaling.
    standardizer = Standardizer()
    standardizer.fit(X[train_idx])
    X_transformed = standardizer.transform(X)

    # A fresh model is built for every fold.
    model = build_model(config, n_features=X_transformed.shape[1])

    train_loader, valid_loader, test_loader = create_datasets(X_transformed, y, train_idx, test_idx)

    model = train(model, train_loader, valid_loader)

    y_soft, y_true = predict(model, test_loader)
    # Hand plain NumPy arrays to the metric functions instead of relying on
    # implicit tensor-to-array coercion.
    y_soft, y_true = y_soft.numpy(), y_true.numpy()
    y_pred = (y_soft > 0.5).astype(int)  # hard labels at a 0.5 threshold

    for metric in metrics:
        if metric.use_soft:
            score = metric.function(y_true, y_soft)
        else:
            score = metric.function(y_true, y_pred)
        metric.scores.append(score)

In [None]:
# Results for this task/classifier pair are written to
# <results_folder>/<TASK_NAME>_<CLF_NAME>.txt
results_filename = os.path.join(results_folder, f'{TASK_NAME}_{CLF_NAME}.txt')

In [None]:
# Write a header, the configuration, and mean ± std of every metric across folds.
# The encoding is set explicitly because the report contains the non-ASCII
# character '±'; the platform default encoding may not be able to encode it.
with open(results_filename, 'w', encoding='utf-8') as f:
    f.write(f'{TASK_NAME} {CLF_NAME}\n\n')
    f.write(f'{str(config)}\n\n')
    for metric in metrics:
        mean, std = np.mean(metric.scores), np.std(metric.scores)
        # Format once so the console output and the file cannot drift apart.
        line = f'{metric.name}: {mean:.5f} ± {std:.5f}'
        print(line)
        f.write(f'{line}\n')