# MIMIC-III Length-of-stay Task Logistic Regression Model
This notebook uses simplified dataloaders to report the performance of a logistic regression model.

We use a pre-processed benchmark version of MIMIC-III. To produce this dataset, please follow the instructions at https://github.com/ratschlab/ncl and https://github.com/YerevaNN/mimic3-benchmarks. We do not provide the original or the preprocessed MIMIC-III dataset here; you must request access directly from https://mimic.physionet.org/.

In [None]:
import numpy as np
import torch
import sys
import argparse
from tqdm import tqdm

In [None]:
# Directory handed to load_mimic3 as the location of the MIMIC-III data.
data_path = '../data/mimic3'
from utils.load_data import load_mimic3

In [None]:
# Build an argparse namespace inside the notebook. There is no real command
# line here, so a fixed argument string is parsed further down; the resulting
# `args` object is later handed to load_mimic3 and read by the evaluation cell.
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str,
                    help='path to dataset or torch dataset name')
parser.add_argument('-b', '--batch-size', default=512, type=int,
                    metavar='N',
                    help='mini-batch size (default: 512), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--method', type=str, default='lfr',
                    help='training method',
                    choices=['lfr', 'supervised', 'supervised-aug',
                             'autoencoder', 'simsiam',
                             'simclr', 'diet', 'diet-aug'])
# The help text previously advertised "default: 1" although the default is 4.
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
# argparse derives the attribute name from the first long option, so this
# value is exposed as args.eval_bs.
parser.add_argument('--eval_bs', '--eval-batch-size', default=20480, type=int)


# Stand-in for sys.argv[1:]: the notebook's fixed configuration, split into
# individual tokens.
args_string = ' --batch-size 20480 --dataset mimic3-los --method supervised'.split()
args = parser.parse_args(args_string)

In [None]:
# Create the train and test dataloaders from the parsed args and data_path.
# NOTE(review): 'los' presumably selects the length-of-stay task and the
# trailing True is an opaque positional flag -- confirm both against
# utils.load_data.load_mimic3.
train_loader, test_loader = load_mimic3(args, data_path, 'los', True)

In [None]:

def get_data(dataloader):
    """Drain a dataloader into two flat Python lists.

    Args:
        dataloader: sized iterable yielding ``(x, y)`` pairs, where both
            items expose ``tolist()`` returning a list (presumably batched
            torch tensors -- confirm against load_mimic3).

    Returns:
        A ``(features, labels)`` tuple of lists holding the ``tolist()``
        entries of every batch, in iteration order.
    """
    all_features = []
    all_labels = []
    # tqdm only wraps the iteration to display a progress bar.
    progress = tqdm(dataloader, total=len(dataloader), desc='loading data')
    for batch_x, batch_y in progress:
        # tolist() converts the whole batch to nested Python lists; this is
        # memory-hungry for large datasets.
        all_features += batch_x.tolist()
        all_labels += batch_y.tolist()
    return all_features, all_labels

# Materialise both splits in memory as Python lists of features and labels.
# Kept as two separate statements so the train lists stay bound even if
# loading the test split fails.
x_train, y_train = get_data(train_loader)
x_test, y_test = get_data(test_loader)


In [28]:
# Convert the collected Python lists into NumPy arrays.
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

# Flatten every axis after the first so each sample becomes a single row of
# features.
x_train = x_train.reshape((x_train.shape[0], -1))
x_test = x_test.reshape((x_test.shape[0], -1))
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(2377738, 2016) (2377738,) (523200, 2016) (523200,)


In [None]:
from linear_eval import report_auprc, report_auroc, report_kappa
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

# Choose the metrics to report for the selected task. Accuracy is always
# computed in the loop below, whether or not 'acc' is listed.
if args.dataset == 'mimic3-decomp':
    args.metrics = ['acc', 'auprc', 'auroc']
elif args.dataset == 'mimic3-los':
    args.metrics = ['acc', 'kappa']
else:
    # Fail before the expensive fit instead of hitting an AttributeError on
    # the missing args.metrics afterwards.
    raise ValueError(
        "unsupported dataset {!r}: expected 'mimic3-decomp' or "
        "'mimic3-los'".format(args.dataset))

# One list per metric; every seed appends one score to the requested ones.
acc = []
kappa = []
auprc = []
auroc = []
for rand_seed in [0]:
    # Seeds NumPy's global RNG only.
    # NOTE(review): the seed is not passed to LogisticRegression via
    # random_state -- confirm whether the solver in use draws from the
    # global RNG at all.
    np.random.seed(rand_seed)
    print('start training')
    lr = LogisticRegression(max_iter=1000)
    lr.fit(x_train, y_train)
    print('finished training')
    predictions = lr.predict_proba(x_test)
    predicted_labels = lr.predict(x_test)
    acc.append(accuracy_score(y_test, predicted_labels))
    if 'kappa' in args.metrics:
        kappa.append(report_kappa(y_test, predicted_labels))
    # Column 1 of predict_proba is used as the score for the ranking metrics
    # (presumably the positive class of a binary task -- confirm against
    # lr.classes_).
    if 'auprc' in args.metrics:
        auprc.append(report_auprc(y_test, predictions[:, 1]))
    if 'auroc' in args.metrics:
        auroc.append(report_auroc(y_test, predictions[:, 1]))
    print(acc, kappa, auprc, auroc)
acc = np.array(acc)
kappa = np.array(kappa)
auprc = np.array(auprc)
auroc = np.array(auroc)

# Report mean and standard deviation over seeds. Metrics that were not
# requested have no scores and are skipped: the mean of an empty array is nan
# and triggers a RuntimeWarning.
for metric_name, metric_scores in (('acc', acc), ('kappa', kappa),
                                   ('auprc', auprc), ('auroc', auroc)):
    if metric_scores.size:
        print(metric_name + ':', metric_scores.mean(), metric_scores.std())
