# Extract subset of MIMIC data and create training and holdout data

In [1]:
import random
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold

## Selecting a random subset of the original MIMIC summary data

This section should be executed only once, to subset the original data. After that, load the subsetted data from the file as described in the next section. The file `condensed_summary_data.csv` is generated using the [MIMIC-extract](https://github.com/juliangilbey/MIMIC_Extract/releases/tag/v1.0) repository; see [README-imputation](https://github.com/juliangilbey/MIMIC_Extract/blob/v1.0/README-imputation.md) for more details.

In [2]:
# Load the full MIMIC summary table; the relative path means the CSV must be in the working directory.
mimic_full = pd.read_csv("condensed_summary_data.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'condensed_summary_data.csv'

In [None]:
# (number of rows, number of columns) of the full table.
mimic_full.shape

In [None]:
# One boolean per column: True where the column contains at least one missing value.
mimic_full.isnull().any()

In [None]:
# Row positions 0 .. n_rows-1 of the full table, used as the sampling population.
samp_vec = np.arange(mimic_full.shape[0])

Draw 7214 row positions at random, without replacement (a third of the samples).

In [None]:
# Draw from a dedicated, explicitly seeded generator so that re-running this
# cell selects the same rows every time (the seed value itself is arbitrary).
# NOTE: a subset file produced before the seed was introduced cannot be
# regenerated from it.
subset_rng = random.Random(20230117)
random_subset_ind = subset_rng.sample(list(samp_vec), 7214)

In [None]:
# Sort the sampled positions in ascending order so the subset keeps the row
# order of the full table; the result is a NumPy array.
random_subset_ind = np.sort(random_subset_ind)
random_subset_ind

In [None]:
# Positional row selection of the sampled subset; all columns are kept.
mimic_df = mimic_full.iloc[random_subset_ind]
mimic_df

In [None]:
# Persist the subset; index=False leaves the row index out of the CSV.
mimic_df.to_csv("MIMIC_rand_sub.csv", index=False)

## Reloading the random subset of the original MIMIC summary data

This section is executed to load previously subsetted data.

In [2]:
# Read the previously saved random subset from the working directory.
mimic_df = pd.read_csv("MIMIC_rand_sub.csv")

In [3]:
# Display the loaded subset.
mimic_df

Unnamed: 0,Diastolic blood pressure mean,Diastolic blood pressure std,Heart Rate mean,Heart Rate std,Mean blood pressure mean,Mean blood pressure std,Oxygen saturation mean,Oxygen saturation std,Respiratory rate mean,Respiratory rate std,Systolic blood pressure mean,Systolic blood pressure std,Temperature mean,Temperature std,outcome
0,71.461538,8.985458,90.354839,7.884789,88.641020,9.955409,97.233333,1.959650,25.333333,2.890146,123.000000,13.111827,36.703707,0.391439,0
1,76.595855,13.123706,85.994792,6.426791,96.403141,12.537934,97.325000,2.161652,14.618321,5.565298,149.518135,22.271079,36.504636,0.698926,1
2,65.219298,9.343739,98.017241,15.294775,88.174712,13.146849,98.565217,1.562333,17.625000,5.229301,132.271930,23.364100,37.474999,0.476677,0
3,47.379310,8.715396,77.812500,8.093954,67.517245,8.126275,93.709677,2.410996,20.531250,3.026596,107.793103,10.248273,36.351850,0.245414,0
4,54.276042,7.031853,54.814208,8.096154,70.626528,8.192074,97.805000,1.723901,9.461255,4.619341,102.151042,13.371968,37.144929,1.024559,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,93.215190,11.757890,84.614286,9.940829,112.487179,13.296561,98.072464,1.743212,16.034091,5.919865,152.278481,14.627807,36.903292,0.611420,0
7210,54.474747,12.406667,76.218182,13.452862,73.303030,11.008767,94.733333,2.913056,22.676190,4.193620,132.454545,15.091742,36.754274,0.794452,0
7211,67.557143,8.303371,76.250000,13.731582,83.614286,8.061885,94.833333,1.556450,23.014286,4.437955,136.300000,11.714416,36.419753,0.331817,0
7212,66.450000,20.388120,89.761905,3.904977,79.983871,20.436621,97.349206,2.919118,18.531250,2.794375,121.200000,19.891059,36.913194,0.399572,0


## Create training and test sets with missingness completely at random (MCAR)

In [4]:
def binary_sampler(p, rows, cols, seed=6289278):
    """Return a reproducible ``rows`` x ``cols`` integer matrix of 0s and 1s.

    An entry is 1 when its uniform draw on [0, 1) is strictly below ``p`` and
    0 otherwise, so ``p`` is the per-entry probability of a 1.

    The draws come from a private ``RandomState`` seeded with ``seed`` instead
    of reseeding NumPy's global generator, so calling this function no longer
    resets the random state used by other code.  The generated values are the
    same as those obtained by seeding the global generator with ``seed``.

    Because the generator is re-created on every call, two calls with the
    same ``rows``, ``cols`` and ``seed`` use the same underlying uniform
    draws; only the threshold ``p`` differs.

    Parameters
    ----------
    p : float
        Threshold applied to the uniform draws.
    rows, cols : int
        Shape of the returned matrix.
    seed : int, optional
        Seed of the private generator.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(rows, cols)`` containing only 0 and 1.
    """
    rng = np.random.RandomState(seed)
    unif_random_matrix = rng.uniform(0., 1., size=(rows, cols))
    # Multiplying the boolean comparison by 1 converts it to integers.
    binary_random_matrix = 1 * (unif_random_matrix < p)
    return binary_random_matrix


def make_missing_mcar(data_df, miss_rate=0.25, outcome_column='outcome'):
    """Return a copy of ``data_df`` with feature values replaced by NaN at random.

    Every column except ``outcome_column`` is treated as a feature.  A 0/1
    mask of the same shape as the feature block is obtained from
    ``binary_sampler(1 - miss_rate, ...)``; feature entries whose mask value
    is 0 are set to NaN.  The outcome column is never masked.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Input table containing the features and ``outcome_column``.
    miss_rate : float, optional
        Per-entry probability that a feature value is made missing.
    outcome_column : str, optional
        Name of the column that is left untouched.

    Returns
    -------
    pandas.DataFrame
        Masked features followed by the outcome column, on a fresh 0-based
        row index.  The feature columns are labelled by position
        (0, 1, ...), not by their original names.
    """
    data_features = data_df.drop(columns=[outcome_column])
    data_features_arr = np.array(data_features)

    # Integer and boolean arrays cannot represent NaN: assigning it raises for
    # integers and is silently stored as True for booleans.  Promote those
    # dtypes to float; other dtypes are only copied, as before.
    if data_features_arr.dtype.kind in 'iub':
        miss_data_features_arr = data_features_arr.astype(float)
    else:
        miss_data_features_arr = data_features_arr.copy()

    n_rows, n_cols = data_features_arr.shape

    # Mask convention: 1 keeps the value, 0 marks it as missing.
    data_features_mask = binary_sampler(1 - miss_rate, n_rows, n_cols)
    miss_data_features_arr[data_features_mask == 0] = np.nan

    # NOTE(review): the frame is rebuilt from a bare array, so the original
    # feature names are dropped; kept as-is so existing output headers do not
    # change.
    miss_data_features = pd.DataFrame(miss_data_features_arr)
    # Reset the outcome index so it lines up with the new 0-based feature index.
    outcome = pd.DataFrame(data_df[outcome_column].reset_index(drop=True))

    miss_data = pd.concat([miss_data_features, outcome], axis=1)

    return miss_data

In [5]:
# Directory that receives every generated CSV file.
outdir = Path('MIMIC_subset_mcar')
outdir.mkdir(exist_ok=True)

n_splits = 3
n_folds = 5
idx = np.arange(len(mimic_df))

kf_splits = KFold(n_splits=n_splits, random_state=1896, shuffle=True)

# Missingness rates applied to the development data and to the holdout data;
# every (train, test) combination is exported.
miss_rates = [0, 0.25, 0.50]

# Outer split: each iteration yields one development set and one holdout set.
for holdout_num, (devel_pos, holdout_pos) in enumerate(kf_splits.split(idx)):
    idx_train, idx_test = idx[devel_pos], idx[holdout_pos]
    devel_fold = mimic_df.iloc[idx_train]
    test_fold = mimic_df.iloc[idx_test]

    for train_percentage in miss_rates:
        for test_percentage in miss_rates:
            percent_str = f'train_missing_{train_percentage}_test_missing_{test_percentage}'
            train_data = make_missing_mcar(devel_fold, train_percentage)
            test_data = make_missing_mcar(test_fold, test_percentage)

            holdout_path = outdir / f'holdout_{holdout_num}_{percent_str}.csv'
            test_data.to_csv(holdout_path, index=False)

            # Inner split of the development data into train/validation folds;
            # the shuffle seed depends only on the holdout number.
            kf_folds = KFold(n_splits=n_folds, random_state=165782 * holdout_num, shuffle=True)
            idx_folds = np.arange(len(train_data))
            for fold_num, (fit_pos, val_pos) in enumerate(kf_folds.split(idx_folds)):
                train_fold = train_data.iloc[fit_pos]
                val_fold = train_data.iloc[val_pos]
                train_path = outdir / f'devel_{holdout_num}_train_{fold_num}_{percent_str}.csv'
                val_path = outdir / f'devel_{holdout_num}_val_{fold_num}_{percent_str}.csv'
                train_fold.to_csv(train_path, index=False)
                val_fold.to_csv(val_path, index=False)