In [None]:
import json
import numpy as np
import os
import time

from sklearn import metrics
from tqdm.notebook import tqdm

import torch
from torch.nn import BCELoss
from torch.optim import RMSprop
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.sampler import SubsetRandomSampler

from utils import build_model, Config, config_to_dict, EarlyStopping, Metric, MORTALITY_SETUP, Standardizer

In [None]:
# Task-specific settings for this run. Later cells read setup.folds_file,
# setup.y_label, setup.output_size and setup.results_filename from it.
setup = MORTALITY_SETUP

In [None]:
# Human-readable description of the experiment. These label/value pairs form
# the header of the text summary and are stored in the results JSON.
experiment_configuration = {
    'Task': 'Mortality prediction',
    'Approach': 'Centralized ML',
    'Classifier': 'Feed-forward network'
}

In [None]:
# Input locations, relative to the notebook's working directory.
data_folder = './data/'
# .npz archive from which the label array is read (key: setup.y_label).
data_filename = os.path.join(data_folder, 'imputed-normed-ep_1_24.npz')
# .npz archive from which the cross-validation folds are read (key: setup.folds_file).
folds_filename = os.path.join(data_folder, '5-folds.npz')
# Comma-separated file holding the feature matrix.
features_filename = os.path.join(data_folder, 'input.csv')
# Output folder and the filename prefix shared by the summary and results files.
results_folder = './results/'
results_id = f'centralized_{setup.results_filename}'

In [None]:
# The input data must already be on disk. Raise instead of calling exit():
# an exception reliably aborts "Run All" and shows a traceback in the cell,
# whereas exit() is meant for interactive shells/scripts.
if not os.path.isdir(data_folder):
    raise FileNotFoundError(
        f'Wrong data_folder specified: {data_folder!r}. This folder must exist'
    )

# Create the output folder on demand. exist_ok=True makes the call idempotent
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(results_folder, exist_ok=True)

In [None]:
# Experiment configuration object. Later cells read label_type,
# validation_split, batch_size, learning_rate, early_stopping_patience and
# epochs from it, and it is exported via config_to_dict() in the results.
config = Config()
# Bare expression so the notebook displays the configuration as cell output.
config

In [None]:
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code. Only use it with trusted data files.

# np.load on an .npz archive returns a lazily-read NpzFile that keeps the
# underlying file open; the context managers close it once the arrays needed
# here have been materialised.
with np.load(folds_filename, allow_pickle=True) as folds_file:
    # Fold definitions for the selected label; iterated later as
    # (train_idx, valid_idx, test_idx) triples.
    folds = folds_file[setup.folds_file][config.label_type][0]

with np.load(data_filename, allow_pickle=True) as data_file:
    # Select the label column indexed by config.label_type.
    y = data_file[setup.y_label][:, config.label_type]
# Binarise: any strictly positive value becomes 1.0, everything else 0.0.
y = (y > 0).astype(float)

# Feature matrix parsed from the comma-separated text file.
X = np.genfromtxt(features_filename, delimiter=',')

In [None]:
def create_datasets(X, y, train_idx, test_idx):
    """Build the train, validation and test loaders over one shared dataset.

    X and y are converted to torch tensors (y reshaped to a single column) and
    wrapped in one TensorDataset. The first
    ``floor(config.validation_split * len(train_idx))`` entries of
    ``train_idx`` become the validation subset; the remaining entries are the
    training subset. Every loader draws its indices through a
    SubsetRandomSampler and uses ``config.batch_size``.

    Args:
        X: feature array, indexable by the given indices along its first axis.
        y: label array aligned with X.
        train_idx: indices available for training and validation.
        test_idx: indices of the test subset.

    Returns:
        Tuple ``(train_loader, valid_loader, test_loader)``.
    """
    features = torch.Tensor(X)
    labels = torch.Tensor(y).view(-1, 1)
    full_dataset = TensorDataset(features, labels)

    # Size of the validation subset carved off the front of train_idx.
    n_valid = int(np.floor(config.validation_split * len(train_idx)))
    valid_part = train_idx[:n_valid]
    train_part = train_idx[n_valid:]

    def _loader_for(indices):
        # All three loaders differ only in the index subset they sample from.
        return DataLoader(full_dataset,
                          batch_size=config.batch_size,
                          sampler=SubsetRandomSampler(indices),
                          num_workers=0)

    return _loader_for(train_part), _loader_for(valid_part), _loader_for(test_idx)

In [None]:
def train(model, train_loader, valid_loader):
    """Train `model` with RMSprop on binary cross-entropy, using early stopping.

    Each epoch performs one optimisation pass over `train_loader`, then
    averages the per-batch losses over `valid_loader`. That validation loss is
    handed to `EarlyStopping.should_early_stop`; training ends as soon as it
    returns a truthy value, or after `config.epochs` epochs. Afterwards the
    state stored in `early_stopping.best_model_state` is loaded into the model.

    Args:
        model: torch module whose output is passed to BCELoss with the target.
        train_loader: iterable of (data, target) batches used for optimisation.
        valid_loader: iterable of (data, target) batches used for validation.

    Returns:
        Tuple ``(model, finished_epochs)`` where finished_epochs is the number
        of epochs that were run (1-based).

    NOTE(review): if config.epochs is 0 the loop body never runs, so `epoch`
    is unbound and no best state has been recorded; callers are assumed to
    configure at least one epoch.
    """
    criterion = BCELoss()  # binary cross-entropy
    optimizer = RMSprop(model.parameters(), lr=config.learning_rate)
    early_stopping = EarlyStopping(patience=config.early_stopping_patience)

    for epoch in tqdm(range(config.epochs)):

        # --- optimisation pass ---
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad()

            output = model(data)

            loss = criterion(output, target)
            loss.backward()

            optimizer.step()

        # --- validation pass ---
        model.eval()
        valid_losses = []
        # No gradients are needed for validation; disabling autograd avoids
        # building a graph for every batch (same as predict() does).
        with torch.no_grad():
            for data, target in valid_loader:
                output = model(data)
                loss = criterion(output, target)
                valid_losses.append(loss.item())
        valid_loss = np.average(valid_losses)

        if early_stopping.should_early_stop(valid_loss, model):
            break

    # Restore the state recorded by the early-stopping helper.
    model.load_state_dict(early_stopping.best_model_state)

    return model, epoch + 1

In [None]:
def predict(model, data_loader):
    """Run `model` over `data_loader` and collect outputs with their targets.

    The model is switched to eval mode and evaluated without gradient
    tracking. Outputs and targets are flattened and written into float buffers
    in the order the loader yields them, so position k of both returned
    tensors refers to the same sample.

    Batches are placed with a running offset rather than `i * batch_size`, so
    the function also works when the loader drops its last incomplete batch or
    has no fixed `batch_size`. Only the filled part of the buffers is returned.

    Args:
        model: torch module producing one value per sample.
        data_loader: DataLoader yielding (data, target) batches; its sampler
            must support len().

    Returns:
        Tuple ``(predictions, targets)`` of 1-D tensors of equal length.
    """
    model.eval()

    # Upper bound on the number of samples the loader can yield.
    num_elements = len(data_loader.sampler)

    predictions = torch.zeros(num_elements)
    targets = torch.zeros(num_elements)

    offset = 0
    with torch.no_grad():
        for data, target in data_loader:
            flat_target = target.view(-1)
            end = offset + flat_target.numel()

            targets[offset:end] = flat_target
            predictions[offset:end] = model(data).view(-1)

            offset = end

    # offset == num_elements unless the loader skipped samples (e.g. drop_last).
    return predictions[:offset], targets[:offset]

In [None]:
# Metrics evaluated on the test split of every fold; each fold's score is
# appended to the corresponding Metric's `scores`.
# use_soft=False: the metric receives the 0/1 predictions obtained by
#                 thresholding the model output at 0.5.
# use_soft=True:  the metric receives the raw model output.
metric_list = [
    Metric('Accuracy', metrics.accuracy_score, use_soft=False),
    Metric('Precision', metrics.precision_score, use_soft=False),
    Metric('Recall', metrics.recall_score, use_soft=False),
    Metric('F1 score', metrics.f1_score, use_soft=False),
    Metric('ROC AUC', metrics.roc_auc_score, use_soft=True),
    Metric('Average precision', metrics.average_precision_score, use_soft=True),
]

In [None]:
# Wall-clock durations (seconds, from time.time()) collected once per fold.
# Each key owns its own independent list.
time_measurements = {
    'creating_datasets': [],
    'training': [],
    'training_per_epoch': [],
    'prediction': [],
}

In [None]:
# Cross-validation: for every fold, train a fresh model and record its test
# metrics and timings.
# NOTE(review): scores and timings are appended to metric.scores and
# time_measurements, so re-running this cell without re-creating those
# containers accumulates entries from earlier runs.
# NOTE(review): no random seed is set in the visible cells, so samplers and
# weight initialisation presumably differ between runs - confirm whether
# reproducibility is required.
for train_idx, valid_idx, test_idx in folds:
    # Merge the fold's train and validation indices; create_datasets() carves
    # its own validation subset out of the merged array.
    train_idx = np.concatenate((train_idx, valid_idx))
    
    # Fit the standardizer on the merged training rows only, then apply it to
    # every row of X (test rows included).
    standardizer = Standardizer()
    standardizer.fit(X[train_idx])
    X_transformed = standardizer.transform(X)
    
    # Timed: building the dataset and the three loaders.
    start = time.time()
    train_loader, valid_loader, test_loader = create_datasets(X_transformed, y, train_idx, test_idx)
    time_measurements['creating_datasets'].append(time.time() - start)
    
    # A new model per fold (model construction is not timed).
    model = build_model(config, n_features=X_transformed.shape[1], output_size=setup.output_size)
    
    # Timed: the whole training run, plus the average time per finished epoch.
    start = time.time()
    model, finished_epochs = train(model, train_loader, valid_loader)
    training_time = time.time() - start
    time_measurements['training'].append(training_time)
    time_measurements['training_per_epoch'].append(training_time / finished_epochs)
    
    # Timed: inference on the test loader.
    start = time.time()
    y_soft, y_true = predict(model, test_loader)
    time_measurements['prediction'].append(time.time() - start)
    # Hard 0/1 predictions: model output strictly above 0.5 counts as positive.
    y_pred = (y_soft > 0.5).type(torch.int)

    # Score the fold: soft metrics get the raw outputs, the others the
    # thresholded predictions.
    for metric in metric_list:
        if metric.use_soft:
            score = metric.function(y_true, y_soft)
        else:
            score = metric.function(y_true, y_pred)
        metric.scores.append(score)

In [None]:
def create_summary(configuration, metric_list, time_measurements):
    """Render the experiment description, metric scores and timings as text.

    The report has three parts: one "label: value" line per configuration
    entry, a METRICS section and a TIME MEASUREMENTS section. Every metric and
    timing line shows "mean ± standard deviation" (numpy defaults) with five
    decimals. Labels are left-aligned and padded to 20 characters.

    Args:
        configuration: mapping of label -> value describing the experiment.
        metric_list: iterable of objects exposing `name` and `scores`.
        time_measurements: mapping of label -> sequence of durations.

    Returns:
        The report as a single newline-terminated string.
    """
    lines = [f'{label + ":": <20} {value}\n' for label, value in configuration.items()]

    lines.append('\nMETRICS\n')
    for metric in metric_list:
        scores = metric.scores
        lines.append(f'{metric.name + ":": <20} {np.mean(scores):.5f} ± {np.std(scores):.5f}\n')

    lines.append('\nTIME MEASUREMENTS\n')
    for label, times in time_measurements.items():
        lines.append(f'{label+":": <20} {np.mean(times):.5f} ± {np.std(times):.5f}\n')

    return ''.join(lines)

In [None]:
# Build the text report from the collected scores and timings and show it.
summary = create_summary(experiment_configuration, metric_list, time_measurements)
print(summary)

In [None]:
# Persist the text report next to the other results of this run.
summary_filename = os.path.join(results_folder, f'{results_id}_summary.txt')
# The report contains the non-ASCII '±' sign, so the encoding is pinned
# instead of relying on the platform's default text encoding.
with open(summary_filename, 'w', encoding='utf-8') as f:
    f.write(summary)

In [None]:
# Everything persisted for this run: the experiment description, the per-fold
# scores keyed by metric name, the per-fold timings and the model
# configuration as returned by config_to_dict().
results = {
    'experiment_configuration': experiment_configuration,
    'metrics': {m.name: m.scores for m in metric_list},
    'time_measurements': time_measurements,
    'model_configuration': config_to_dict(config)
}

In [None]:
# Write the raw results as indented JSON into the results folder.
# NOTE(review): json.dump requires every value to be JSON-serialisable; the
# metric scores come straight from the metric functions - confirm their type.
results_filename = os.path.join(results_folder, f'{results_id}_results.json')
with open(results_filename, 'w') as f:
    json.dump(results, f, indent=4)