In [1]:
# only for taurus
import os

print(os.getcwd())
# Enter the "notebooks" directory only when it exists and we are not already
# inside it, so that re-running this cell (or running it outside the cluster
# layout) neither raises FileNotFoundError nor descends into a nested folder.
if os.path.basename(os.getcwd()) != "notebooks" and os.path.isdir("notebooks"):
    os.chdir("notebooks")
print(os.getcwd())

/data/horse/ws/jori152b-medinf/KP_MedInf
/data/horse/ws/jori152b-medinf/KP_MedInf/notebooks


In [2]:
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from matplotlib import pyplot as plt
import seaborn as sns

In [7]:
# File paths and separator
# Every extracted table is read from the same directory, so the individual
# paths are assembled from one shared prefix.
EXTRACTED_DIR = "../data/extracted/"
SEPARATOR = ";"  # delimiter handed to pd.read_csv for the extracted tables

DATA_PATH_stages = EXTRACTED_DIR + "kdigo_stages_measured.csv"
DATA_PATH_labs = EXTRACTED_DIR + "labs_original.csv"
DATA_PATH_labs_extended = EXTRACTED_DIR + "labs_extended.csv"
DATA_PATH_labs_new = EXTRACTED_DIR + "labs_new.csv"
DATA_PATH_vitals = EXTRACTED_DIR + "vitals.csv"  # earlier export: vitals-kdigo_stages_measured.csv
DATA_PATH_vents = EXTRACTED_DIR + "vents_vasopressor_sedatives.csv"  # earlier export: vents-vasopressor-sedatives-kdigo_stages_measured.csv
DATA_PATH_detail = EXTRACTED_DIR + "icustay_detail.csv"  # earlier export: icustay_detail-kdigo_stages_measured.csv
DATA_PATH_heightweight = EXTRACTED_DIR + "heightweight.csv"
DATA_PATH_calcium = EXTRACTED_DIR + "calcium.csv"
DATA_PATH_inr_max = EXTRACTED_DIR + "inr_max.csv"

# Constants
# --- Cohort selection ---
ADULTS_MIN_AGE = 18   # lower bound applied to admission_age
ADULTS_MAX_AGE = 120  # not referenced in the visible cells
MAX_DAYS = 35         # stays longer than this keep only their final MAX_DAYS days

# --- Labelling ---
HOURS_AHEAD = 48      # label shift horizon; also the minimum stay span (HOURS_AHEAD / 24 days)
CLASS1 = True         # not referenced in the visible cells
ALL_STAGES = False    # not referenced in the visible cells
FIRST_TURN_POS = True # not referenced in the visible cells

# --- Feature set and resampling ---
MAX_FEATURE_SET = True    # merge labs/vitals/vents/calcium and aggregate the discrete features
TIME_SAMPLING = True      # resample each stay onto a regular time grid
SAMPLING_INTERVAL = '6H'  # resample rule for the single-interval cells
RESAMPLE_LIMIT = 16       # maximum number of resampled steps the label is forward-filled
MOST_COMMON = False       # not referenced in the visible cells

# --- Imputation ---
IMPUTE_EACH_ID = False
IMPUTE_COLUMN = False
IMPUTE_METHOD = 'most_frequent'
FILL_VALUE = 0            # value used by fillna for the remaining gaps

# --- Normalisation ---
# NOTE(review): NORMALIZATION ('min-max') and NORM_TYPE ('min_max') look like
# the same setting spelled two ways; neither is read in the visible cells, so
# confirm which one downstream code expects.
NORMALIZATION = 'min-max'
NORM_TYPE = 'min_max'

# --- Splitting / reproducibility (not referenced in the visible cells) ---
TESTING = False
TEST_SIZE = 0.05
SPLIT_SIZE = 0.2
RANDOM = 42

def filter_by_length_of_stay(X):
    drop_list = []
    long_stays = X.groupby(['icustay_id']).apply(lambda group: (group['charttime'].max() - group['charttime'].min()).total_seconds() / (24 * 60 * 60) > MAX_DAYS)

    for icustay_id, is_long in long_stays.items():
        if is_long:
            max_time = X[X['icustay_id'] == icustay_id]['charttime'].max() - pd.to_timedelta(MAX_DAYS, unit='D')
            X = X[~((X['icustay_id'] == icustay_id) & (X['charttime'] < max_time))]

    short_stays = X.groupby(['icustay_id']).apply(lambda group: (group['charttime'].max() - group['charttime'].min()).total_seconds() / (24 * 60 * 60) < (HOURS_AHEAD/24))
    drop_list = short_stays[short_stays].index.tolist()

    X = X[~X.icustay_id.isin(drop_list)]
    return X

In [12]:
# Load datasets
print("Loading datasets...")

# KDIGO stage table: remove the aki_stage_creat / aki_stage_uo columns and
# every row that carries neither a measurement nor a stage, then parse the
# timestamps.
X = (
    pd.read_csv(DATA_PATH_stages, sep=SEPARATOR)
    .drop(columns=["aki_stage_creat", "aki_stage_uo"])
    .dropna(how='all', subset=['creat', 'uo_rt_6hr', 'uo_rt_12hr', 'uo_rt_24hr', 'aki_stage'])
)
X['charttime'] = pd.to_datetime(X['charttime'])

print(len(X))
print(X['aki_stage'].value_counts())

# Static per-stay details, without the columns that are not used further on.
unused_detail_cols = ['dod', 'admittime', 'dischtime', 'los_hospital', 'ethnicity',
                      'hospital_expire_flag', 'hospstay_seq', 'first_hosp_stay', 'intime',
                      'outtime', 'los_icu', 'icustay_seq', 'first_icu_stay']
dataset_detail = pd.read_csv(DATA_PATH_detail, sep=SEPARATOR).drop(columns=unused_detail_cols)

# Labs: rows need a charttime and at least one value among the columns from
# position 4 onwards. That check runs on the full column set, i.e. before the
# min/max pairs listed below are removed.
dataset_labs = pd.read_csv(DATA_PATH_labs, sep=SEPARATOR)
lab_value_cols = dataset_labs.columns[4:]
dataset_labs = dataset_labs.dropna(subset=['charttime']).dropna(subset=lab_value_cols, how='all')
dataset_labs['charttime'] = pd.to_datetime(dataset_labs['charttime'])
excluded_lab_cols = ['albumin_min', 'albumin_max', 'bilirubin_min', 'bilirubin_max', 'bands_min', 'bands_max',
                     'lactate_min', 'lactate_max', 'platelet_min', 'platelet_max', 'ptt_min', 'ptt_max',
                     'inr_min', 'inr_max', 'pt_min', 'pt_max']
dataset_labs = (
    dataset_labs
    .sort_values(by=['icustay_id', 'charttime'])
    .drop(columns=excluded_lab_cols)
)

# The labs_new / labs_extended tables (DATA_PATH_labs_new,
# DATA_PATH_labs_extended) were previously loaded with the same
# dropna -> to_datetime -> sort steps; they are currently disabled.

# Vitals: only the *_mean columns are kept; rows without any remaining value
# (columns from position 4 onwards, after the drop) are discarded.
vitals_min_max_cols = ["heartrate_min", "heartrate_max", "sysbp_min", "sysbp_max", "diasbp_min", "diasbp_max",
                       'meanbp_min', 'meanbp_max', 'tempc_min', 'tempc_max', "resprate_min", "resprate_max",
                       "spo2_min", "spo2_max", "glucose_min", "glucose_max"]
dataset_vitals = pd.read_csv(DATA_PATH_vitals, sep=SEPARATOR).drop(columns=vitals_min_max_cols)
dataset_vitals['charttime'] = pd.to_datetime(dataset_vitals['charttime'])
dataset_vitals = (
    dataset_vitals
    .dropna(subset=dataset_vitals.columns[4:], how='all')
    .sort_values(by=['icustay_id', 'charttime'])
)

# Ventilation / vasopressor / sedative table.
dataset_vents = pd.read_csv(DATA_PATH_vents, sep=SEPARATOR)
dataset_vents['charttime'] = pd.to_datetime(dataset_vents['charttime'])
dataset_vents = dataset_vents.sort_values(by=['icustay_id', 'charttime'])

# Height / weight: a row is dropped only when id, height and weight are all missing.
dataset_heightweight = (
    pd.read_csv(DATA_PATH_heightweight, sep=SEPARATOR)
    .dropna(subset=['icustay_id', 'height_first', 'weight_first'], how='all')
    .sort_values(by=['icustay_id'])
)

# Calcium measurements (time-dependent).
dataset_calcium = pd.read_csv(DATA_PATH_calcium, sep=SEPARATOR).drop(columns=["hadm_id"])
dataset_calcium['charttime'] = pd.to_datetime(dataset_calcium['charttime'])
dataset_calcium = dataset_calcium.sort_values(by=['icustay_id', 'charttime'])

# INR maximum per stay (not time-dependent).
dataset_inr_max = (
    pd.read_csv(DATA_PATH_inr_max, sep=SEPARATOR)
    .drop(columns=["hadm_id", "subject_id"])
    .sort_values(by=['icustay_id'])
)

# Calculate mean for each pair and drop original columns
column_pairs = [('aniongap_min', 'aniongap_max'), ('albumin_min', 'albumin_max'), 
                ('bands_min', 'bands_max'), ('bicarbonate_min', 'bicarbonate_max'), 
                ('bilirubin_min', 'bilirubin_max'), ('creatinine_min', 'creatinine_max'), 
                ('chloride_min', 'chloride_max'), ('glucose_min', 'glucose_max'), 
                ('hematocrit_min', 'hematocrit_max'), ('hemoglobin_min', 'hemoglobin_max'), 
                ('lactate_min', 'lactate_max'), ('platelet_min', 'platelet_max'), 
                ('potassium_min', 'potassium_max'), ('ptt_min', 'ptt_max'), 
                ('inr_min', 'inr_max'), ('pt_min', 'pt_max'), ('sodium_min', 'sodium_max'), 
                ('bun_min', 'bun_max'), ('wbc_min', 'wbc_max')]

for min_col, max_col in column_pairs:
    # Several pairs were already removed from dataset_labs when it was loaded.
    # Skip those explicitly instead of relying on a bare `except`, which would
    # also hide genuine failures (wrong dtypes, KeyboardInterrupt, ...).
    if min_col not in dataset_labs.columns or max_col not in dataset_labs.columns:
        continue
    # '<name>_min' / '<name>_max' are replaced by a single '<name>_mean'.
    mean_col = min_col.rsplit('_', 1)[0] + '_mean'
    dataset_labs[mean_col] = dataset_labs[[min_col, max_col]].mean(axis=1)
    dataset_labs = dataset_labs.drop(columns=[min_col, max_col])

# column_pairs_new = [('aniongap_min', 'aniongap_max'), ('albumin_min', 'albumin_max'), 
#                 ('bands_min', 'bands_max'), ('bicarbonate_min', 'bicarbonate_max'), 
#                 ('bilirubin_min', 'bilirubin_max'), ('creatinine_min', 'creatinine_max'), 
#                 ('chloride_min', 'chloride_max'), ('glucose_min', 'glucose_max'), 
#                 ('hematocrit_min', 'hematocrit_max'), ('hemoglobin_min', 'hemoglobin_max'), 
#                 ('lactate_min', 'lactate_max'), ('platelet_min', 'platelet_max'), 
#                 ('potassium_min', 'potassium_max'), ('ptt_min', 'ptt_max'), 
#                 ('inr_min', 'inr_max'), ('pt_min', 'pt_max'), ('sodium_min', 'sodium_max'), 
#                 ('bun_min', 'bun_max'), ('wbc_min', 'wbc_max'), ('bilirubin_total_min', 'bilirubin_total_max'), 
#                 ('estimated_gfr_min', 'estimated_gfr_max'), ('phosphate_min', 'phosphate_max'), 
#                 ('urea_nitrogen_min', 'urea_nitrogen_max'), ('uric_acid_min', 'uric_acid_max'), 
#                 ('calcium_total_min', 'calcium_total_max'), ('inr_pt_min', 'inr_pt_max'), 
#                 ('platelet_count_min', 'platelet_count_max')]

# for min_col, max_col in column_pairs_new:
#     mean_col = min_col.rsplit('_', 1)[0] + '_mean'
#     dataset_labs_new[mean_col] = dataset_labs_new[[min_col, max_col]].mean(axis=1)
#     dataset_labs_new.drop([min_col, max_col], axis=1, inplace=True)

# column_pairs_extended = [('aniongap_min', 'aniongap_max'), ('albumin_min', 'albumin_max'), 
#                 ('bands_min', 'bands_max'), ('bicarbonate_min', 'bicarbonate_max'), 
#                 ('bilirubin_min', 'bilirubin_max'), ('creatinine_min', 'creatinine_max'), 
#                 ('chloride_min', 'chloride_max'), ('glucose_min', 'glucose_max'), 
#                 ('hematocrit_min', 'hematocrit_max'), ('hemoglobin_min', 'hemoglobin_max'), 
#                 ('lactate_min', 'lactate_max'), ('platelet_min', 'platelet_max'), 
#                 ('potassium_min', 'potassium_max'), ('ptt_min', 'ptt_max'), 
#                 ('inr_min', 'inr_max'), ('pt_min', 'pt_max'), ('sodium_min', 'sodium_max'), 
#                 ('bun_min', 'bun_max'), ('wbc_min', 'wbc_max'), 
#                 ('gfr_min', 'gfr_max'), ('phosphate_min', 'phosphate_max'),('uric_acid_min', 'uric_acid_max'), 
#                 ('calcium_min', 'calcium_max')]


# for min_col, max_col in column_pairs_extended:
#     mean_col = min_col.rsplit('_', 1)[0] + '_mean'
#     dataset_labs_extended[mean_col] = dataset_labs_extended[[min_col, max_col]].mean(axis=1)
#     dataset_labs_extended.drop([min_col, max_col], axis=1, inplace=True)
    
# dataset_labs_extended.drop(['gfr_mean'], axis=1, inplace=True)

# Merge datasets
if MAX_FEATURE_SET:
    # All time-dependent tables are outer-joined on stay and timestamp, so a
    # row exists for every timestamp seen in any of them.
    stay_time_keys = ["icustay_id", "charttime"]
    X = (
        X.merge(dataset_labs, on=stay_time_keys, how="outer")
        # (merges of dataset_labs_new / dataset_labs_extended are disabled)
        .merge(dataset_vitals, on=stay_time_keys + ["subject_id", "hadm_id"], how="outer")
        .merge(dataset_vents, on=stay_time_keys, how="outer")
        .drop(columns=["subject_id"])
        .merge(dataset_calcium, on=stay_time_keys, how="outer")
    )

Loading datasets...
3737147
aki_stage
0    3000795
2     353266
3     207759
1     175327
Name: count, dtype: int64


In [16]:
# Summary statistics of the cleaned vitals table (sanity check).
# NOTE(review): the recorded output shows glucose_mean with max = 999999.0,
# which looks like a sentinel or data-entry outlier — confirm whether such
# values should be filtered before averaging.
dataset_vitals.describe()

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,heartrate_mean,sysbp_mean,diasbp_mean,meanbp_mean,resprate_mean,tempc_mean,spo2_mean,glucose_mean
count,9067143.0,9067143.0,9067143.0,9067143,7859472.0,5776911.0,5775124.0,5792180.0,6271224.0,1733080.0,6078894.0,1261644.0
mean,33239.94,149879.9,250610.3,2152-04-21 23:31:49.216401408,101.0353,121.5073,60.27826,79.1496,20.20792,37.00486,97.08057,142.5699
min,3.0,100001.0,200001.0,2100-06-08 00:18:00,0.146,0.06,0.32,0.43,0.17,15.0,0.5,0.106
25%,11722.0,125333.0,225825.0,2127-12-28 14:00:00,77.0,104.0,50.0,68.0,16.0,36.44444,96.0,107.0
50%,23695.0,149237.0,250901.0,2153-01-22 13:15:00,92.0,119.0,59.0,77.0,20.0,37.0,98.0,129.0
75%,53119.0,175020.0,275488.0,2177-08-03 22:00:00,117.0,137.0,69.0,88.6667,24.0,37.55555,99.0,160.0
max,99999.0,199999.0,299999.0,2209-08-07 16:00:00,292.0,355.0,298.0,299.0,69.0,46.5,100.0,999999.0
std,27924.36,28768.94,28871.92,,32.65558,23.6875,14.55439,16.73071,6.043288,0.8432392,3.493982,1260.194


In [17]:
# Show every column of wide frames from here on (the option stays set for the
# rest of the session), then summarise the merged table.
# NOTE(review): the recorded output contains glucose_mean_x / glucose_mean_y,
# i.e. two merged tables each contributed a glucose_mean column and pandas
# suffixed them — confirm both are wanted as separate features.
pd.set_option('display.max_columns', None)
X.describe()    

Unnamed: 0,icustay_id,charttime,creat,uo_rt_6hr,uo_rt_12hr,uo_rt_24hr,aki_stage,hadm_id,aniongap_mean,bicarbonate_mean,creatinine_mean,chloride_mean,glucose_mean_x,hematocrit_mean,hemoglobin_mean,potassium_mean,sodium_mean,bun_mean,wbc_mean,heartrate_mean,sysbp_mean,diasbp_mean,meanbp_mean,resprate_mean,tempc_mean,spo2_mean,glucose_mean_y,vent,vasopressor,sedative,subject_id,calcium
count,13604200.0,13589169,386396.0,3267035.0,3267035.0,3267035.0,3737147.0,10186210.0,727962.0,739479.0,741939.0,807392.0,936984.0,921519.0,782635.0,1012462.0,845254.0,739446.0,687683.0,7859472.0,5776911.0,5775124.0,5792180.0,6271224.0,1733080.0,6078894.0,1261645.0,13528100.0,13528100.0,13528100.0,239028.0,91775.0
mean,250380.9,2152-01-27 20:04:09.496556544,1.524148,1.643963,1.491771,1.438268,0.40275,149883.7,13.746018,25.259717,1.542183,103.824401,134.087883,30.41102,10.267063,4.138491,138.393092,30.991901,11.251299,101.0353,121.5073,60.27826,79.1496,20.20792,37.00486,97.08057,142.5699,0.4075521,0.1529394,0.2183027,33145.886348,1.135195
min,200001.0,2100-06-07 09:20:00,0.0,-66.3717,-66.3717,-46.9484,0.0,100001.0,1.0,2.0,0.05,1.6,0.139,2.0,1.2,0.6,1.07,1.0,0.1,0.146,0.06,0.32,0.43,0.17,15.0,0.5,0.106,0.0,0.0,0.0,3.0,0.09
25%,225536.0,2127-07-12 19:04:00,0.7,0.596,0.625,0.6612,0.0,125285.0,11.0,22.0,0.7,100.0,101.0,26.9,9.0,3.7,135.0,15.0,7.0,77.0,104.0,50.0,68.0,16.0,36.44444,96.0,107.0,0.0,0.0,0.0,11789.0,1.07
50%,250678.0,2152-07-28 18:00:00,1.0,1.0592,1.0717,1.0913,0.0,149256.0,13.0,25.0,1.0,104.0,121.0,29.8,10.1,4.1,138.0,23.0,9.9,92.0,119.0,59.0,77.0,20.0,37.0,98.0,129.0,0.0,0.0,0.0,23657.0,1.13
75%,275270.0,2177-04-29 07:26:00,1.6,1.8826,1.7833,1.7292,0.0,175023.0,16.0,28.0,1.7,108.0,150.0,33.3,11.3,4.4,141.0,40.0,13.7,117.0,137.0,69.0,88.6667,24.0,37.55555,99.0,160.0,1.0,0.0,0.0,52641.0,1.19
max,299999.0,2210-08-24 05:53:00,138.0,900.0,900.0,900.0,3.0,199999.0,118.0,65.0,138.0,198.0,3565.0,77.7,43.0,27.5,184.0,290.0,846.7,292.0,355.0,298.0,299.0,69.0,46.5,100.0,999999.0,1.0,1.0,1.0,99999.0,96.0
std,28862.92,,1.620051,3.591551,3.016078,2.875846,0.8735885,28783.83,3.876696,5.07716,1.57959,6.351784,65.007258,5.317715,1.863787,0.6698996,5.206882,24.166935,9.247759,32.65558,23.6875,14.55439,16.73071,6.043288,0.8432392,3.493982,1260.193,0.4913791,0.3599291,0.413094,27859.399825,0.555198


In [5]:
# take only head of X
# Development shortcut: truncate X only when TESTING is enabled, so that a
# full top-to-bottom run does not silently discard all but the first rows.
TESTING_MAX_ROWS = 50000
if TESTING:
    X = X.head(TESTING_MAX_ROWS)

In [18]:
print("Filtering patients by age and length of stay...")
# Keep adult admissions only and restrict the time series to those stays,
# ordered by stay and time.
is_adult = dataset_detail['admission_age'] >= ADULTS_MIN_AGE
dataset_detail = dataset_detail.loc[is_adult]
adults_icustay_id_list = dataset_detail['icustay_id'].unique()
X = X.loc[X['icustay_id'].isin(adults_icustay_id_list)].sort_values(by=['icustay_id', 'charttime'])

# Trim over-long stays / drop too-short ones, then keep the static table in
# sync with the stays that survived.
X = filter_by_length_of_stay(X)
remaining_stay_ids = X['icustay_id'].unique()
dataset_detail = dataset_detail.loc[dataset_detail['icustay_id'].isin(remaining_stay_ids)].sort_values(by=['icustay_id'])

Filtering patients by age and length of stay...


  long_stays = X.groupby(['icustay_id']).apply(lambda group: (group['charttime'].max() - group['charttime'].min()).total_seconds() / (24 * 60 * 60) > MAX_DAYS)


In [17]:
# Column roles for resampling: the label, the discrete features (aggregated
# with max) and everything else as numeric features (aggregated with mean).
label = ['aki_stage']
discrete_feat = ['sedative', 'vasopressor', 'vent', 'hadm_id']
# Keys, label and discrete features are excluded from the numeric set.
skip = ['icustay_id', 'charttime', 'aki_stage'] + discrete_feat
# NOTE(review): the recorded X.describe() output lists a subject_id column,
# which is not in `skip` and therefore ends up in numeric_feat — confirm that
# this identifier is really meant to be averaged as a feature.
numeric_feat = list(X.columns.difference(skip))

In [10]:
# Confirm the working directory before writing files with relative paths.
print(os.getcwd())


/data/horse/ws/jori152b-medinf/KP_MedInf/notebooks


In [13]:
# save to csv
# Checkpoint of the merged and filtered table before any resampling. The CSV
# round trip stores charttime as text, so it has to be parsed again after
# reloading.
X.to_csv('../data/analysis/data_preprocessed_extended__filtered_before_resampling.csv', index=False)

In [20]:
# Reload the filtered checkpoint; read with the default separator, matching
# how it was written by to_csv. charttime comes back unparsed.
X = pd.read_csv('../data/analysis/data_preprocessed_extended__filtered_before_resampling.csv')

In [21]:
# Restore the datetime dtype of charttime; the resampling cells use it as the
# time index.
X['charttime'] = pd.to_datetime(X['charttime'])


In [24]:
# In-memory snapshot of X taken before the resampling cells, which rebind X.
X_old = X.copy()

In [42]:
# Restore X from the snapshot (e.g. after an interrupted resampling run).
X = X_old.copy()

In [14]:
# Resample rules for which a preprocessed file is written. The label shift
# parses these as "<integer>H", so every entry must keep that form.
SAMPLING_INTERVALS = ['1H', '2H', '3H', '4H', '6H']


In [None]:
# Restrict the static table to the stays still present in X.
dataset_detail = dataset_detail[dataset_detail['icustay_id'].isin(X['icustay_id'].unique())].sort_values(by=['icustay_id'])
# One-hot encode the categorical columns and drop the identifiers. Both steps
# tolerate columns that are already gone, so running this cell a second time
# does not raise KeyError.
categorical_cols = [col for col in ('gender', 'ethnicity_grouped') if col in dataset_detail.columns]
if categorical_cols:
    dataset_detail = pd.get_dummies(dataset_detail, columns=categorical_cols)
dataset_detail = dataset_detail.drop(columns=['subject_id', 'hadm_id'], errors='ignore')

In [24]:
# saving every sampling interval
# Resampling
# The loop variable is deliberately not called SAMPLING_INTERVAL, so the
# configured single-run interval is not overwritten by this cell.
for sampling_interval in SAMPLING_INTERVALS:
    if TIME_SAMPLING:
        # Set index and group by 'icustay_id' before resampling
        resampler = X.set_index('charttime').groupby('icustay_id').resample(sampling_interval)

        # Resample and aggregate features: mean for the numeric columns, max
        # for the discrete columns and for the label.
        X_numeric = resampler[numeric_feat].mean()
        X_label = resampler['aki_stage'].max()
        sampled_parts = [X_numeric]
        if MAX_FEATURE_SET:
            X_discrete = resampler[discrete_feat].max().fillna(FILL_VALUE).astype(np.int64)
            sampled_parts.append(X_discrete)
        sampled_parts.append(X_label)

        # The parts are chosen explicitly instead of through a bare `except`,
        # so a failing concat surfaces rather than silently dropping the
        # discrete features.
        print("Merging sampled features")
        X_resampled = pd.concat(sampled_parts, axis=1).reset_index()
    else:
        # Without time sampling, continue with a copy of the unsampled rows so
        # the steps below do not fail on an undefined X_resampled.
        X_resampled = X.copy()

    # Forward fill again after resampling
    X_resampled['aki_stage'] = X_resampled.groupby('icustay_id')['aki_stage'].ffill(limit=RESAMPLE_LIMIT).fillna(0)

    # Ensure binary values (convert any positive number to 1)
    X_resampled['aki_stage'] = (X_resampled['aki_stage'] > 0).astype(int)

    # Shifting labels: each row receives the label found HOURS_AHEAD later in
    # the same stay; rows without such a future step are dropped.
    # The interval is expected in the form "<integer>H".
    shift_steps = HOURS_AHEAD // int(sampling_interval[:-1])
    X_resampled['aki_stage'] = X_resampled.groupby('icustay_id')['aki_stage'].shift(-shift_steps)
    X_resampled = X_resampled.dropna(subset=['aki_stage'])

    # Merging not time-dependent data (default inner joins: stays missing
    # from any of these tables are dropped)
    X_resampled = X_resampled.merge(dataset_detail, on='icustay_id')
    X_resampled = X_resampled.merge(dataset_heightweight, on='icustay_id')
    X_resampled = X_resampled.merge(dataset_inr_max, on='icustay_id')

    # If no imputation method selected or only impute each id, for the remaining nan impute directly with FILL_VALUE
    X_resampled = X_resampled.fillna(FILL_VALUE)

    # Save preprocessed data
    X_resampled.to_csv(f'../data/preprocessed/preprocessed_data_extended_{sampling_interval}.csv', index=False)




  X_resampled = X.set_index('charttime').groupby('icustay_id').resample(SAMPLING_INTERVAL)


In [41]:
# Resampling
if TIME_SAMPLING:

    # Set index and group by 'icustay_id' before resampling. The lazy
    # resampler gets its own name: X is rebound only once the aggregation has
    # finished, so an interrupted or failed run leaves X intact.
    resampler = X.set_index('charttime').groupby('icustay_id').resample(SAMPLING_INTERVAL)

    # Resample and aggregate features: mean for the numeric columns, max for
    # the discrete columns and for the label.
    X_numeric = resampler[numeric_feat].mean()
    X_label = resampler['aki_stage'].max()
    sampled_parts = [X_numeric]
    if MAX_FEATURE_SET:
        X_discrete = resampler[discrete_feat].max()
        sampled_parts.append(X_discrete)
    sampled_parts.append(X_label)

    # The parts are chosen explicitly instead of through a bare `except`, so a
    # failing concat surfaces rather than silently dropping the discrete
    # features.
    print("Merging sampled features")
    X = pd.concat(sampled_parts, axis=1).reset_index()
        




  X = X.set_index('charttime').groupby('icustay_id').resample(SAMPLING_INTERVAL)


KeyboardInterrupt: 

In [None]:
# Resampling
if TIME_SAMPLING:

    # Set index and group by 'icustay_id' before resampling. The lazy
    # resampler gets its own name: X is rebound only once the aggregation has
    # finished, so an interrupted or failed run leaves X intact.
    resampler = X.set_index('charttime').groupby('icustay_id').resample(SAMPLING_INTERVAL)

    # Resample and aggregate features: mean for the numeric columns, max for
    # the discrete columns and for the label.
    X_numeric = resampler[numeric_feat].mean()
    X_label = resampler['aki_stage'].max()
    sampled_parts = [X_numeric]
    if MAX_FEATURE_SET:
        X_discrete = resampler[discrete_feat].max().fillna(FILL_VALUE).astype(np.int64)
        sampled_parts.append(X_discrete)
    sampled_parts.append(X_label)

    # The parts are chosen explicitly instead of through a bare `except`, so a
    # failing concat surfaces rather than silently dropping the discrete
    # features.
    print("Merging sampled features")
    X = pd.concat(sampled_parts, axis=1).reset_index()

# Forward fill again after resampling
X['aki_stage'] = X.groupby('icustay_id')['aki_stage'].ffill(limit=RESAMPLE_LIMIT).fillna(0)

# Ensure binary values (convert any positive number to 1)
X['aki_stage'] = (X['aki_stage'] > 0).astype(int)

# Shifting labels: each row receives the label found HOURS_AHEAD later in the
# same stay; rows without such a future step are dropped.
# The interval is expected in the form "<integer>H".
shift_steps = HOURS_AHEAD // int(SAMPLING_INTERVAL[:-1])
X['aki_stage'] = X.groupby('icustay_id')['aki_stage'].shift(-shift_steps)
X = X.dropna(subset=['aki_stage'])

# Merging not time-dependent data
dataset_detail = dataset_detail[dataset_detail['icustay_id'].isin(X['icustay_id'].unique())].sort_values(by=['icustay_id'])
# Encoding and identifier removal tolerate columns that are already gone
# (e.g. when dataset_detail was encoded earlier in the session), so this cell
# does not raise KeyError on a repeated run.
categorical_cols = [col for col in ('gender', 'ethnicity_grouped') if col in dataset_detail.columns]
if categorical_cols:
    dataset_detail = pd.get_dummies(dataset_detail, columns=categorical_cols)
dataset_detail = dataset_detail.drop(columns=['subject_id', 'hadm_id'], errors='ignore')
# Default inner joins: stays missing from any of these tables are dropped.
X = X.merge(dataset_detail, on='icustay_id')
X = X.merge(dataset_heightweight, on='icustay_id')
X = X.merge(dataset_inr_max, on='icustay_id')

# If no imputation method selected or only impute each id, for the remaining nan impute directly with FILL_VALUE
X = X.fillna(FILL_VALUE)


  X = X.set_index('charttime').groupby('icustay_id').resample(SAMPLING_INTERVAL)


Merging sampled features


In [15]:

# Save preprocessed data
# Writes the single-interval result held in X, without the index column.
# NOTE(review): the "_small" suffix suggests this file is meant for a
# truncated X — confirm before overwriting it with a full run.
X.to_csv('../data/preprocessed/preprocessed_data_extended_small.csv', index=False)