In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Identifier columns shared by the tables; the cells below use them as
# index / group-by keys.
id_vars = [
    'subject_id',
    'hadm_id',
    'stay_id',
]

In [3]:

def read_local_data(data_dir):
    """Load the four CSV tables of the dataset from a local directory.

    Parameters
    ----------
    data_dir : str or os.PathLike
        Directory containing ``static_vars.csv``, ``dynamic_vars.csv``,
        ``outcome_vars.csv`` and ``input_vars.csv``. A trailing path
        separator is optional.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(static_vars, dynamic_vars, outcome_vars, input_vars)``, in that
        order.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when one of the files is missing.
    """
    table_names = ('static_vars', 'dynamic_vars', 'outcome_vars', 'input_vars')

    # os.path.join adds the separator only when it is missing, so both
    # '/data/external' and '/data/external/' resolve to the same files
    # (plain string concatenation silently built '/data/externalstatic_vars.csv').
    return tuple(
        pd.read_csv(os.path.join(data_dir, name + '.csv'))
        for name in table_names
    )


In [4]:
# Read the four raw tables from the local data directory and take a first
# look at the dynamic (time-stamped) variables.
(static_vars, dynamic_vars, outcome_vars, input_vars) = read_local_data(
    '/home/joe/mimic_understander/data/external/'
)

print(dynamic_vars.head())
    

   subject_id   hadm_id   stay_id               intime              outtime  \
0    13505226  21820338  37831702  2142-09-15 00:31:52  2142-09-16 17:49:14   
1    14025587  23594805  37282379  2136-12-24 14:14:00  2136-12-26 19:10:12   
2    14156778  21837109  31333611  2164-06-03 09:08:16  2164-06-04 15:28:18   
3    14873487  23630661  32995267  2129-02-01 22:03:45  2129-02-05 23:44:25   
4    15776719  20137532  37269622  2143-04-13 01:16:00  2143-04-17 11:42:38   

             charttime  itemid                         label value valueuom  
0  2142-09-16 10:00:00  224329             PCA lockout (min)     6      min  
1  2136-12-24 17:00:00  228099  18 Gauge placed in the field     0      NaN  
2  2164-06-03 19:23:00  220645                Sodium (serum)   137    mEq/L  
3  2129-02-02 05:05:00  220644                           ALT    19     IU/L  
4  2143-04-14 20:00:00  227367   18 Gauge Dressing Occlusive     1      NaN  


In [17]:
# Preview the first five rows of the outcome table.
outcome_vars.head(n=5)

Unnamed: 0,subject_id,hadm_id,stay_id,mort_icu,los,hospital_expire_flag,HospMort30day
0,10122297,25825366,36349608,0,1.979468,0,0
1,17168310,21560534,33376903,0,1.026331,0,0
2,15703353,29272306,35111434,0,9.013519,0,0
3,10308232,21297383,30153687,0,4.980833,0,0
4,13307171,21128752,38043905,0,0.737431,0,0


In [6]:
def preprocess_static_vars(static_vars_df: pd.DataFrame) -> pd.DataFrame:
    """Return a one-hot encoded version of the static variables.

    The input frame is copied before encoding, so the caller's frame is left
    untouched. Encoding is delegated to ``pandas.get_dummies`` with its
    default settings.
    """
    return pd.get_dummies(static_vars_df.copy())

In [15]:
# Display the outcome table; the recorded output below is the notebook's
# abbreviated rendering (first and last rows only).
outcome_vars

Unnamed: 0,subject_id,hadm_id,stay_id,mort_icu,los,hospital_expire_flag,HospMort30day
0,10122297,25825366,36349608,0,1.979468,0,0
1,17168310,21560534,33376903,0,1.026331,0,0
2,15703353,29272306,35111434,0,9.013519,0,0
3,10308232,21297383,30153687,0,4.980833,0,0
4,13307171,21128752,38043905,0,0.737431,0,0
...,...,...,...,...,...,...,...
55121,10496572,29908222,35485159,0,0.973993,0,0
55122,19739872,23023377,35695410,0,4.547465,0,0
55123,18043783,27439603,35896715,0,0.611481,0,0
55124,11184695,24888536,38194069,0,1.443877,0,0


In [12]:
# One-hot encode the static variables through the shared helper rather than
# repeating its body here, so this cell and preprocess_static_vars cannot
# drift apart.
static_vars_clean = preprocess_static_vars(static_vars)

In [14]:
# Per column: does the encoded static table contain any missing value?
static_vars_clean.isna().any()

subject_id                                 False
hadm_id                                    False
stay_id                                    False
age                                        False
ethnicity_AMERICAN INDIAN/ALASKA NATIVE    False
ethnicity_ASIAN                            False
ethnicity_BLACK/AFRICAN AMERICAN           False
ethnicity_HISPANIC/LATINO                  False
ethnicity_OTHER                            False
ethnicity_UNABLE TO OBTAIN                 False
ethnicity_UNKNOWN                          False
ethnicity_WHITE                            False
dtype: bool

In [6]:
# Preview the first five rows of the outcome table.
outcome_vars.head(n=5)

Unnamed: 0,subject_id,hadm_id,stay_id,hospital_expire_flag,HospMort30day
0,19586042,23279761,32441505,0,0
1,18320677,27591411,38358287,0,0
2,12441061,25709725,34566245,0,0
3,14825995,26548609,32257177,0,0
4,16723797,28913496,31146604,0,0


In [16]:

def get_regular_timeseries(
    timestamp_timeseries: pd.DataFrame,
    top_k: int = 5,
    freq=pd.Timedelta(hours=1),
    max_input_rows=1000000,
    max_pivot_rows=1000,
    id_cols=('subject_id', 'hadm_id', 'stay_id'),
) -> pd.DataFrame:
    """Convert a long, time-stamped event table into a regular time series.

    The input has one row per recorded value. The result has one row per
    identifier combination and time bin, and one column per selected label,
    holding the mean of the values recorded in that bin (NaN when nothing
    was recorded).

    Parameters
    ----------
    timestamp_timeseries : pandas.DataFrame
        Long-format table with the columns in ``id_cols`` plus ``intime``,
        ``charttime``, ``label`` and ``value``. Other columns are ignored.
    top_k : int, default 5
        Number of most frequent labels to keep. The ranking is computed on
        the whole input, not on the truncated slice.
    freq : pandas.Timedelta or str, default one hour
        Width of the resampling bins (must be a fixed duration).
    max_input_rows : int or None, default 1000000
        Only the first ``max_input_rows`` input rows are processed. The
        default keeps the limit this function always applied; pass ``None``
        to process every row.
    max_pivot_rows : int or None, default 1000
        Only the first ``max_pivot_rows`` rows of the pivoted table are
        resampled. The default keeps the limit this function always applied;
        pass ``None`` to keep every row.
    id_cols : sequence of str
        Columns identifying one series; used as group-by keys.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_cols + ['time_in'] + <selected labels>``, where
        ``time_in`` is the left edge of each bin expressed as
        ``charttime - intime``. Bins start at the first observation of each
        series.

    Raises
    ------
    ValueError
        If two rows share the same identifiers, ``time_in`` and ``label``
        (the pivot cannot place both values), or if a value cannot be
        converted to a number.
    """
    id_cols = list(id_cols)

    # Rank labels on the complete input so the feature set does not depend on
    # the row limit applied below.
    label_counts = timestamp_timeseries['label'].value_counts()
    top_labels = list(label_counts.index[:top_k])

    events = timestamp_timeseries
    if max_input_rows is not None:
        events = events.head(max_input_rows)

    # Explicit copy: a new column is added next, and writing into a slice of
    # the caller's frame would trigger SettingWithCopyWarning.
    wanted_columns = id_cols + ['intime', 'charttime', 'label', 'value']
    events = events.loc[events['label'].isin(top_labels), wanted_columns].copy()

    # Time elapsed since admission (charttime - intime), as a Timedelta.
    events['time_in'] = (
        pd.to_datetime(events['charttime']) - pd.to_datetime(events['intime'])
    )

    # Long -> wide: one column per label, one row per (ids, time_in).
    wide = (
        events
        .set_index(id_cols + ['time_in', 'label'])['value']
        .unstack('label')
    )
    if max_pivot_rows is not None:
        wide = wide.head(max_pivot_rows)
    wide = wide.apply(pd.to_numeric)

    # Selecting the feature columns before resampling keeps the id columns
    # out of the aggregation, so nothing has to be dropped afterwards.
    feature_cols = list(wide.columns)
    regular = (
        wide
        .reset_index(level=id_cols)
        .groupby(id_cols)[feature_cols]
        .resample(freq)
        .mean()
        .reset_index()
    )

    return regular



In [17]:
# Build the regularly sampled table from the time-stamped dynamic variables.
dynamic_regular = get_regular_timeseries(timestamp_timeseries=dynamic_vars)

In [24]:
# Display the regularly sampled table; the recorded output below is the
# notebook's abbreviated rendering (first and last rows only).
dynamic_regular

label,subject_id,hadm_id,stay_id,time_in,Heart Rate,Non Invasive Blood Pressure diastolic,Non Invasive Blood Pressure mean,O2 saturation pulseoxymetry,Respiratory Rate
0,10001725,25563031,37542711,0 days 15:07:38,,,,100.0,
1,10001725,25563031,37542711,0 days 16:07:38,,,,,
2,10001725,25563031,37542711,0 days 17:07:38,,,,,
3,10001725,25563031,37542711,0 days 18:07:38,,,,,
4,10001725,25563031,37542711,0 days 19:07:38,,,,,
...,...,...,...,...,...,...,...,...,...
9951,10361930,22828898,32607530,3 days 10:18:01,,,,,
9952,10361930,22828898,32607530,3 days 11:18:01,,,,,
9953,10361930,22828898,32607530,3 days 12:18:01,,55.0,,,
9954,10362330,25416268,33355839,0 days 08:29:12,,54.0,,,


In [36]:
# Names of the five most frequently recorded labels, most frequent first.
top_k_feats = dynamic_vars['label'].value_counts().index[:5].tolist()

In [51]:

# Step 1: forward-fill missing values within each (subject, admission, stay)
# group.
dynamic_regular_df = (
    dynamic_regular
    .set_index(id_vars)
    .groupby(id_vars)
    .ffill()
)

# Step 2: fill whatever is still missing with the column-wide median.
# Per-group medians (dynamic_regular_df.groupby(id_vars).median()) were
# considered as fill values but are deliberately not used: personal-level
# imputing might leak information.
dynamic_regular_df[top_k_feats].fillna(dynamic_regular_df.median())

Unnamed: 0_level_0,Unnamed: 1_level_0,label,Heart Rate,Respiratory Rate,O2 saturation pulseoxymetry,Non Invasive Blood Pressure mean,Non Invasive Blood Pressure diastolic
subject_id,hadm_id,stay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10001725,25563031,37542711,82.0,19.0,100.0,74.0,61.0
10001725,25563031,37542711,82.0,19.0,100.0,74.0,61.0
10001725,25563031,37542711,82.0,19.0,100.0,74.0,61.0
10001725,25563031,37542711,82.0,19.0,100.0,74.0,61.0
10001725,25563031,37542711,82.0,19.0,100.0,74.0,61.0
...,...,...,...,...,...,...,...
10361930,22828898,32607530,82.0,25.0,98.0,74.0,61.0
10361930,22828898,32607530,82.0,25.0,98.0,74.0,61.0
10361930,22828898,32607530,82.0,25.0,98.0,74.0,55.0
10362330,25416268,33355839,82.0,19.0,98.0,74.0,54.0


In [46]:
# Median of every column per identifier group (the identifiers are index
# levels of dynamic_regular_df at this point).
dynamic_regular_df.groupby(level=id_vars).median()

Unnamed: 0_level_0,Unnamed: 1_level_0,label,Heart Rate,Non Invasive Blood Pressure diastolic,Non Invasive Blood Pressure mean,O2 saturation pulseoxymetry,Respiratory Rate
subject_id,hadm_id,stay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10001725,25563031,37542711,,58.0,,100.0,
10002155,23822395,34599502,,54.0,,94.0,
10002428,20321825,37206936,100.0,,,,
10002428,28662225,39157783,,,,,27.0
10003019,22774359,35012584,81.0,,,,
...,...,...,...,...,...,...,...
10360766,27554699,34464986,110.0,,87.0,,
10361825,27526421,30953288,,,65.0,,35.0
10361930,22828898,32607530,,55.0,,,25.0
10362330,25416268,33355839,,54.0,,,


In [23]:
# NOTE(review): this cell looks like leftover scratch work. In the visible
# notebook `temp` is only assigned inside get_regular_timeseries, so it is
# presumably undefined at notebook scope after a fresh run, and the recorded
# output (a type repr) does not match a fillna() result. fillna() is also
# called without a fill value or method -- confirm whether the cell can be
# removed.
temp.fillna()

pandas.core.frame.DataFrame

In [64]:
# Impute the regular time series for the selected features and show the result.
dynamic_regular_imputed = impute_dynamic_data(
    dynamic_regular_df=dynamic_regular,
    feature_names=top_k_feats,
)
dynamic_regular_imputed

Unnamed: 0_level_0,Unnamed: 1_level_0,label,Heart Rate,Respiratory Rate,O2 saturation pulseoxymetry,Non Invasive Blood Pressure mean,Non Invasive Blood Pressure diastolic
subject_id,hadm_id,stay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10001725,25563031,37542711,82.0,19.0,100.0,74.0,61.0
10001725,25563031,37542711,82.0,19.0,100.0,74.0,61.0
10001725,25563031,37542711,82.0,19.0,100.0,74.0,61.0
10001725,25563031,37542711,82.0,19.0,100.0,74.0,61.0
10001725,25563031,37542711,82.0,19.0,100.0,74.0,61.0
...,...,...,...,...,...,...,...
10361930,22828898,32607530,82.0,25.0,98.0,74.0,61.0
10361930,22828898,32607530,82.0,25.0,98.0,74.0,61.0
10361930,22828898,32607530,82.0,25.0,98.0,74.0,55.0
10362330,25416268,33355839,82.0,19.0,98.0,74.0,54.0


In [63]:
def impute_dynamic_data(dynamic_regular_df, feature_names,
                        id_cols=('subject_id', 'hadm_id', 'stay_id')):
    """Fill missing feature values in a regular time series.

    Imputation happens in two steps:

    1. Within each group defined by ``id_cols``, missing values are
       forward-filled from the previous row of the same group.
    2. Values that are still missing (for example at the start of a group)
       are replaced by the median of that column, computed over all groups
       after step 1.

    Per-group medians are deliberately not used for step 2: personal-level
    imputing might leak information.

    Parameters
    ----------
    dynamic_regular_df : pandas.DataFrame
        Table holding the ``id_cols`` and ``feature_names`` columns, with the
        rows of each group already in time order. It is not modified.
    feature_names : sequence of str
        Columns to impute and return.
    id_cols : sequence of str
        Columns identifying one series.

    Returns
    -------
    pandas.DataFrame
        The ``feature_names`` columns, indexed by ``id_cols``, in the input
        row order. Other columns (such as ``time_in``) are not part of the
        result. A column with no observed value at all stays NaN.
    """
    id_cols = list(id_cols)
    feature_names = list(feature_names)

    # set_index returns a new frame, so the caller's data stays untouched.
    indexed = dynamic_regular_df.set_index(id_cols)

    # Restricting the fill to the feature columns keeps non-numeric columns
    # (e.g. the timedelta time_in) out of the median below.
    forward_filled = indexed.groupby(level=id_cols)[feature_names].ffill()

    return forward_filled.fillna(forward_filled.median())