In [1]:
import numpy as np
import pandas as pd
import sys
import os
import pickle 

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import label_binarize
from sklearn.ensemble import RandomForestClassifier
import scipy.stats as ss

In [2]:
sys.path.append('../utils')
from simple_impute import simple_imputer

# Task Specifics

In [4]:
INTERVENTION = 'vent'
RANDOM = 0
MAX_LEN = 240
SLICE_SIZE = 6
GAP_TIME = 6
PREDICTION_WINDOW = 4
OUTCOME_TYPE = 'all'
NUM_CLASSES = 4

In [5]:
CHUNK_KEY = {'ONSET': 0, 'CONTROL': 1, 'ON_INTERVENTION': 2, 'WEAN': 3}

# Load Data

In [6]:
DATAFILE = 'D:/data/MIMIC_Extract/samples.h5'

In [8]:
X = pd.read_hdf(DATAFILE,'vitalslabs')
Y = pd.read_hdf(DATAFILE,'interventions')
static = pd.read_hdf(DATAFILE,'patients')

# save data by h5py

In [9]:
static.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gender,ethnicity,age,insurance,admittime,diagnosis_at_admission,dischtime,discharge_location,fullcode_first,dnr_first,...,outtime,los_icu,admission_type,first_careunit,mort_icu,mort_hosp,hospital_expire_flag,hospstay_seq,readmission_30,max_hours
subject_id,hadm_id,icustay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3,145834,211552,M,WHITE,76.526792,Medicare,2101-10-20 19:08:00,HYPOTENSION,2101-10-31 13:58:00,SNF,1.0,0.0,...,2101-10-26 20:43:09,6.06456,EMERGENCY,MICU,0,0,0,1,0,145
4,185777,294638,F,WHITE,47.845047,Private,2191-03-16 00:28:00,"FEVER,DEHYDRATION,FAILURE TO THRIVE",2191-03-23 18:41:00,HOME WITH HOME IV PROVIDR,1.0,0.0,...,2191-03-17 16:46:31,1.678472,EMERGENCY,MICU,0,0,0,1,0,40
6,107064,228232,F,WHITE,65.942297,Medicare,2175-05-30 07:15:00,CHRONIC RENAL FAILURE/SDA,2175-06-15 16:00:00,HOME HEALTH CARE,1.0,0.0,...,2175-06-03 13:39:54,3.672917,ELECTIVE,SICU,0,0,0,1,0,88
9,150750,220597,M,UNKNOWN/NOT SPECIFIED,41.790228,Medicaid,2149-11-09 13:06:00,HEMORRHAGIC CVA,2149-11-14 10:15:00,DEAD/EXPIRED,1.0,0.0,...,2149-11-14 20:52:14,5.323056,EMERGENCY,MICU,1,1,1,1,0,127
11,194540,229441,F,WHITE,50.148295,Private,2178-04-16 06:18:00,BRAIN MASS,2178-05-11 19:00:00,HOME HEALTH CARE,1.0,0.0,...,2178-04-17 20:21:05,1.58441,EMERGENCY,SICU,0,0,0,1,0,38


In [10]:
X.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,albumin,albumin,albumin,albumin ascites,albumin ascites,albumin ascites,albumin pleural,...,white blood cell count,white blood cell count urine,white blood cell count urine,white blood cell count urine,ph,ph,ph,ph urine,ph urine,ph urine
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,count,mean,std,count,mean,std,count,mean,std,count,...,std,count,mean,std,count,mean,std,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
3,145834,211552,0,2.0,25.0,0.0,2.0,1.8,0.0,0.0,,,0.0,...,4.012837,0.0,,,9.0,7.4,0.147733,1.0,5.0,
3,145834,211552,1,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,2,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,3.0,7.26,0.0,0.0,,
3,145834,211552,3,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,4,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,


In [11]:
Y.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,vent,vaso,adenosine,dobutamine,dopamine,epinephrine,isuprel,milrinone,norepinephrine,phenylephrine,vasopressin,colloid_bolus,crystalloid_bolus,nivdurations
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
3,145834,211552,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,145834,211552,1,1,1,0,0,1,0,0,0,0,1,0,0,0,0
3,145834,211552,2,1,1,0,0,1,0,0,0,0,1,0,0,0,0
3,145834,211552,3,1,1,0,0,0,0,0,0,0,1,0,0,0,0
3,145834,211552,4,1,1,0,0,0,0,0,0,1,1,0,0,0,0


# Preprocessing Data

## Train-Test Split, Stratified

In [12]:
train_ids, test_ids = train_test_split(static.reset_index(), test_size=0.2, 
                                       random_state=RANDOM, stratify=static['mort_hosp'])
split_train_ids, val_ids = train_test_split(train_ids, test_size=0.125, 
                                            random_state=RANDOM, stratify=train_ids['mort_hosp'])

In [15]:
train_ids

Unnamed: 0,subject_id,hadm_id,icustay_id,gender,ethnicity,age,insurance,admittime,diagnosis_at_admission,dischtime,...,outtime,los_icu,admission_type,first_careunit,mort_icu,mort_hosp,hospital_expire_flag,hospstay_seq,readmission_30,max_hours
403,618,181546,261361,M,WHITE,77.569199,Medicare,2117-03-13 22:10:00,HIP FRACTURE,2117-03-26 14:55:00,...,2117-03-19 12:43:23,2.000903,EMERGENCY,CSRU,0,0,0,1,0,48
602,937,148592,228181,M,UNKNOWN/NOT SPECIFIED,75.324795,Medicare,2163-01-20 18:39:00,INTRACRANIAL HEMORRHAGE,2163-01-24 08:00:00,...,2163-01-26 00:45:43,5.253021,EMERGENCY,MICU,0,0,1,1,0,126
377,583,193132,284423,M,WHITE,63.917917,Private,2129-05-30 18:59:00,CORONARY ARTERY DISEASE,2129-06-07 14:30:00,...,2129-06-03 11:27:53,0.969954,EMERGENCY,CSRU,0,0,0,1,0,23
891,1380,146215,266829,M,WHITE,63.483889,Private,2168-03-14 23:34:00,SEPSIS,2168-04-25 16:09:00,...,2168-03-18 18:01:37,3.768530,EMERGENCY,MICU,0,0,0,1,0,90
533,823,158797,221472,F,UNKNOWN/NOT SPECIFIED,37.632179,Private,2134-08-29 20:26:00,S/P MOTOR VEHICLE ACCIDENT,2134-08-31 16:15:00,...,2134-08-30 20:39:26,1.008252,EMERGENCY,TSICU,0,0,0,1,0,24
365,561,105399,233424,M,UNKNOWN/NOT SPECIFIED,64.022987,Medicare,2113-07-25 21:11:00,INTRACRANIAL HEMORRAHGE;HEAD BLEED;TELEMETRY,2113-07-28 22:05:00,...,2113-07-28 23:12:27,3.083183,EMERGENCY,TSICU,1,1,1,1,0,73
421,642,130310,252990,F,WHITE,57.482259,Private,2116-04-07 00:23:00,"TE FISTULA,ASPIRATION",2116-04-15 18:30:00,...,2116-04-11 15:55:32,2.728171,EMERGENCY,CSRU,0,0,0,1,0,65
795,1231,193451,273059,M,WHITE,23.786689,Medicaid,2153-11-25 21:32:00,SPLENIC RUPTURE,2153-11-30 16:40:00,...,2153-11-27 20:58:14,1.975741,EMERGENCY,TSICU,0,0,0,1,0,47
698,1092,196723,249889,M,BLACK/AFRICAN AMERICAN,80.910348,Medicare,2155-09-10 17:53:00,RULE-OUT MYOCARDIAL INFARCTION;TELEMETRY;?CVA,2155-09-20 11:05:00,...,2155-09-19 14:54:47,8.763947,EMERGENCY,MICU,0,1,1,1,0,210
330,498,127076,227493,M,UNKNOWN/NOT SPECIFIED,59.082795,Medicaid,2151-04-23 18:36:00,CHEST PAIN\CATH,2151-05-01 11:30:00,...,2151-04-28 14:34:48,2.089329,EMERGENCY,CSRU,0,0,0,1,0,50


In [16]:
test_ids

Unnamed: 0,subject_id,hadm_id,icustay_id,gender,ethnicity,age,insurance,admittime,diagnosis_at_admission,dischtime,...,outtime,los_icu,admission_type,first_careunit,mort_icu,mort_hosp,hospital_expire_flag,hospstay_seq,readmission_30,max_hours
90,142,131357,269111,F,WHITE,48.090299,Private,2143-04-01 07:15:00,MR\MITRAL VALVE REPLACEMENT MINIMALLY INVASIVE...,2143-04-05 18:40:00,...,2143-04-03 11:11:38,1.869456,ELECTIVE,CSRU,0,0,0,1,0,44
821,1268,183355,278756,M,WHITE,81.837515,Medicare,2110-05-07 08:00:00,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,2110-05-12 20:15:00,...,2110-05-08 18:44:50,1.283391,ELECTIVE,CSRU,0,0,0,1,0,30
695,1087,106134,285291,M,UNKNOWN/NOT SPECIFIED,52.978333,Private,2169-12-21 21:52:00,AMPUTATION OF 4TH FINGER LEFT HAND,2169-12-27 11:51:00,...,2169-12-23 17:01:03,1.796759,EMERGENCY,SICU,0,0,0,1,0,43
197,298,119446,225523,F,WHITE,72.896938,Medicare,2140-08-07 00:33:00,LOWER GI BLEED,2140-08-19 17:00:00,...,2140-08-09 06:46:53,2.258981,EMERGENCY,MICU,0,0,0,1,0,54
483,746,117382,225829,M,WHITE,74.483316,Medicare,2159-11-26 09:33:00,STROKE/TIA,2159-12-05 16:30:00,...,2159-11-30 16:52:14,4.267731,EMERGENCY,SICU,0,0,0,1,0,102
889,1377,192054,273715,M,UNKNOWN/NOT SPECIFIED,72.922407,Medicare,2146-12-19 20:41:00,CONGESTIVE HEART FAILURE,2147-01-08 13:30:00,...,2147-01-05 10:47:00,6.123623,EMERGENCY,CSRU,0,0,0,1,0,146
610,952,163476,229883,M,BLACK/AFRICAN AMERICAN,62.044208,Medicare,2180-03-02 03:35:00,RESPIRATORY DISTRESS,2180-03-10 14:36:00,...,2180-03-07 22:10:35,5.773368,EMERGENCY,MICU,0,0,0,1,0,138
749,1163,127531,239353,M,WHITE,56.296597,Private,2195-08-19 21:08:00,DYSPNEA,2195-08-24 14:40:00,...,2195-08-20 18:51:04,0.903947,EMERGENCY,MICU,0,0,0,1,0,21
658,1026,103873,215365,F,WHITE,86.472043,Medicare,2196-04-02 05:18:00,SEPSIS,2196-04-04 16:30:00,...,2196-04-04 16:57:16,2.484514,EMERGENCY,CCU,0,0,0,1,0,59
773,1197,176732,217880,F,BLACK/AFRICAN AMERICAN,68.672025,Medicare,2193-08-21 21:44:00,INFECTION;GASTROINTESTINAL BLEED,2193-08-29 11:10:00,...,2193-08-23 21:36:53,1.992859,EMERGENCY,MICU,0,0,0,1,0,47


## Imputation and Standardization of Time Series Features

In [17]:
X_clean = simple_imputer(X,train_ids['subject_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [21]:
X_clean

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,albumin,albumin,albumin,albumin ascites,albumin ascites,albumin ascites,albumin pleural,...,venous pvo2,weight,weight,weight,white blood cell count,white blood cell count,white blood cell count,white blood cell count urine,white blood cell count urine,white blood cell count urine
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,mask,mean,time_since_measured,mask,mean,time_since_measured,mask,mean,time_since_measured,mask,...,time_since_measured,mask,mean,time_since_measured,mask,mean,time_since_measured,mask,mean,time_since_measured
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
3,145834,211552,0,1.0,25.0,0.0,1.0,1.800000,0.0,0.0,,100.0,0.0,...,100.0,0.0,107.0,100.0,1.0,14.842857,0.0,0.0,35.000000,100.0
3,145834,211552,1,0.0,25.0,1.0,0.0,1.800000,1.0,0.0,,100.0,0.0,...,100.0,0.0,107.0,100.0,0.0,14.842857,1.0,0.0,35.000000,100.0
3,145834,211552,2,0.0,25.0,2.0,0.0,1.800000,2.0,0.0,,100.0,0.0,...,100.0,0.0,107.0,100.0,0.0,14.842857,2.0,0.0,35.000000,100.0
3,145834,211552,3,0.0,25.0,3.0,0.0,1.800000,3.0,0.0,,100.0,0.0,...,100.0,0.0,107.0,100.0,0.0,14.842857,3.0,0.0,35.000000,100.0
3,145834,211552,4,0.0,25.0,4.0,0.0,1.800000,4.0,0.0,,100.0,0.0,...,100.0,0.0,107.0,100.0,0.0,14.842857,4.0,0.0,35.000000,100.0
3,145834,211552,5,0.0,25.0,5.0,0.0,1.800000,5.0,0.0,,100.0,0.0,...,100.0,0.0,107.0,100.0,0.0,14.842857,5.0,0.0,35.000000,100.0
3,145834,211552,6,0.0,25.0,6.0,0.0,1.800000,6.0,0.0,,100.0,0.0,...,100.0,0.0,107.0,100.0,0.0,14.842857,6.0,0.0,35.000000,100.0
3,145834,211552,7,0.0,25.0,7.0,0.0,1.800000,7.0,0.0,,100.0,0.0,...,100.0,0.0,107.0,100.0,1.0,24.400000,0.0,0.0,35.000000,100.0
3,145834,211552,8,0.0,25.0,8.0,0.0,1.800000,8.0,0.0,,100.0,0.0,...,100.0,0.0,107.0,100.0,0.0,24.400000,1.0,1.0,35.000000,0.0
3,145834,211552,9,0.0,25.0,9.0,0.0,1.800000,9.0,0.0,,100.0,0.0,...,100.0,0.0,107.0,100.0,0.0,24.400000,2.0,0.0,35.000000,1.0


In [22]:
def minmax(x):# normalize
    mins = x.min()
    maxes = x.max()
    x_std = (x - mins) / (maxes - mins)
    return x_std

In [23]:
def std_time_since_measurement(x):
    idx = pd.IndexSlice
    x = np.where(x==100, 0, x)
    means = x.mean()
    stds = x.std()
    x_std = (x - means)/stds
    return x_std

In [24]:
idx = pd.IndexSlice
X_std = X_clean.copy()
X_std.loc[:,idx[:,'mean']] = X_std.loc[:,idx[:,'mean']].apply(lambda x: minmax(x))
X_std.loc[:,idx[:,'time_since_measured']] = X_std.loc[:,idx[:,'time_since_measured']].apply(lambda x: std_time_since_measurement(x))

  


In [25]:
X_std

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,albumin,albumin,albumin,albumin ascites,albumin ascites,albumin ascites,albumin pleural,...,venous pvo2,weight,weight,weight,white blood cell count,white blood cell count,white blood cell count,white blood cell count urine,white blood cell count urine,white blood cell count urine
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,mask,mean,time_since_measured,mask,mean,time_since_measured,mask,mean,time_since_measured,mask,...,time_since_measured,mask,mean,time_since_measured,mask,mean,time_since_measured,mask,mean,time_since_measured
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
3,145834,211552,0,1.0,0.019704,-1.014724,1.0,0.000000,-1.191095,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,1.0,0.398095,-1.333207,0.0,0.046154,-1.544729
3,145834,211552,1,0.0,0.019704,-1.000048,0.0,0.000000,-1.178426,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.398095,-1.203332,0.0,0.046154,-1.544729
3,145834,211552,2,0.0,0.019704,-0.985372,0.0,0.000000,-1.165758,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.398095,-1.073457,0.0,0.046154,-1.544729
3,145834,211552,3,0.0,0.019704,-0.970696,0.0,0.000000,-1.153089,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.398095,-0.943582,0.0,0.046154,-1.544729
3,145834,211552,4,0.0,0.019704,-0.956020,0.0,0.000000,-1.140421,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.398095,-0.813708,0.0,0.046154,-1.544729
3,145834,211552,5,0.0,0.019704,-0.941344,0.0,0.000000,-1.127752,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.398095,-0.683833,0.0,0.046154,-1.544729
3,145834,211552,6,0.0,0.019704,-0.926668,0.0,0.000000,-1.115083,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.398095,-0.553958,0.0,0.046154,-1.544729
3,145834,211552,7,0.0,0.019704,-0.911992,0.0,0.000000,-1.102415,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,1.0,0.716667,-1.333207,0.0,0.046154,-1.544729
3,145834,211552,8,0.0,0.019704,-0.897316,0.0,0.000000,-1.089746,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.716667,-1.203332,1.0,0.046154,-1.544729
3,145834,211552,9,0.0,0.019704,-0.882640,0.0,0.000000,-1.077077,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.716667,-1.073457,0.0,0.046154,-1.540717


In [26]:
X_std.columns = X_std.columns.droplevel(-1)

In [27]:
del X

In [28]:
X_std

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,albumin,albumin,albumin,albumin ascites,albumin ascites,albumin ascites,albumin pleural,...,venous pvo2,weight,weight,weight,white blood cell count,white blood cell count,white blood cell count,white blood cell count urine,white blood cell count urine,white blood cell count urine
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
3,145834,211552,0,1.0,0.019704,-1.014724,1.0,0.000000,-1.191095,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,1.0,0.398095,-1.333207,0.0,0.046154,-1.544729
3,145834,211552,1,0.0,0.019704,-1.000048,0.0,0.000000,-1.178426,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.398095,-1.203332,0.0,0.046154,-1.544729
3,145834,211552,2,0.0,0.019704,-0.985372,0.0,0.000000,-1.165758,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.398095,-1.073457,0.0,0.046154,-1.544729
3,145834,211552,3,0.0,0.019704,-0.970696,0.0,0.000000,-1.153089,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.398095,-0.943582,0.0,0.046154,-1.544729
3,145834,211552,4,0.0,0.019704,-0.956020,0.0,0.000000,-1.140421,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.398095,-0.813708,0.0,0.046154,-1.544729
3,145834,211552,5,0.0,0.019704,-0.941344,0.0,0.000000,-1.127752,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.398095,-0.683833,0.0,0.046154,-1.544729
3,145834,211552,6,0.0,0.019704,-0.926668,0.0,0.000000,-1.115083,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.398095,-0.553958,0.0,0.046154,-1.544729
3,145834,211552,7,0.0,0.019704,-0.911992,0.0,0.000000,-1.102415,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,1.0,0.716667,-1.333207,0.0,0.046154,-1.544729
3,145834,211552,8,0.0,0.019704,-0.897316,0.0,0.000000,-1.089746,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.716667,-1.203332,1.0,0.046154,-1.544729
3,145834,211552,9,0.0,0.019704,-0.882640,0.0,0.000000,-1.077077,0.0,,,0.0,...,-1.462355,0.0,0.951872,-1.069032,0.0,0.716667,-1.073457,0.0,0.046154,-1.540717


## Categorization of Static Features

In [29]:
def categorize_age(age):
    if age > 10 and age <= 30: 
        cat = 1
    elif age > 30 and age <= 50:
        cat = 2
    elif age > 50 and age <= 70:
        cat = 3
    else: 
        cat = 4
    return cat

def categorize_ethnicity(ethnicity):
    if 'AMERICAN INDIAN' in ethnicity:
        ethnicity = 'AMERICAN INDIAN'
    elif 'ASIAN' in ethnicity:
        ethnicity = 'ASIAN'
    elif 'WHITE' in ethnicity:
        ethnicity = 'WHITE'
    elif 'HISPANIC' in ethnicity:
        ethnicity = 'HISPANIC/LATINO'
    elif 'BLACK' in ethnicity:
        ethnicity = 'BLACK'
    else: 
        ethnicity = 'OTHER'
    return ethnicity

In [30]:
# use gender, first_careunit, age and ethnicity for prediction
static_to_keep = static[['gender', 'age', 'ethnicity', 'first_careunit', 'intime']]

In [31]:
static_to_keep

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gender,age,ethnicity,first_careunit,intime
subject_id,hadm_id,icustay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,145834,211552,M,76.526792,WHITE,MICU,2101-10-20 19:10:11
4,185777,294638,F,47.845047,WHITE,MICU,2191-03-16 00:29:31
6,107064,228232,F,65.942297,WHITE,SICU,2175-05-30 21:30:54
9,150750,220597,M,41.790228,UNKNOWN/NOT SPECIFIED,MICU,2149-11-09 13:07:02
11,194540,229441,F,50.148295,WHITE,SICU,2178-04-16 06:19:32
12,112213,232669,M,72.374177,WHITE,SICU,2104-08-08 02:08:17
13,143045,263738,F,39.866118,WHITE,CCU,2167-01-08 18:44:25
17,194023,277042,F,47.455336,WHITE,CSRU,2134-12-27 16:21:48
18,188822,298129,M,50.841559,WHITE,CCU,2167-10-02 11:20:39
19,109235,273430,M,300.002970,WHITE,TSICU,2108-08-05 16:26:09


In [32]:
static_to_keep.loc[:, 'intime'] = static_to_keep['intime'].astype('datetime64').apply(lambda x : x.hour)
static_to_keep

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gender,age,ethnicity,first_careunit,intime
subject_id,hadm_id,icustay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,145834,211552,M,76.526792,WHITE,MICU,19
4,185777,294638,F,47.845047,WHITE,MICU,0
6,107064,228232,F,65.942297,WHITE,SICU,21
9,150750,220597,M,41.790228,UNKNOWN/NOT SPECIFIED,MICU,13
11,194540,229441,F,50.148295,WHITE,SICU,6
12,112213,232669,M,72.374177,WHITE,SICU,2
13,143045,263738,F,39.866118,WHITE,CCU,18
17,194023,277042,F,47.455336,WHITE,CSRU,16
18,188822,298129,M,50.841559,WHITE,CCU,11
19,109235,273430,M,300.002970,WHITE,TSICU,16


In [33]:
static_to_keep.loc[:, 'age'] = static_to_keep['age'].apply(categorize_age)
static_to_keep.loc[:, 'ethnicity'] = static_to_keep['ethnicity'].apply(categorize_ethnicity)
static_to_keep = pd.get_dummies(static_to_keep, columns = ['gender', 'age', 'ethnicity', 'first_careunit'])
static_to_keep

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,intime,gender_F,gender_M,age_1,age_2,age_3,age_4,ethnicity_AMERICAN INDIAN,ethnicity_ASIAN,ethnicity_BLACK,ethnicity_HISPANIC/LATINO,ethnicity_OTHER,ethnicity_WHITE,first_careunit_CCU,first_careunit_CSRU,first_careunit_MICU,first_careunit_SICU,first_careunit_TSICU
subject_id,hadm_id,icustay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3,145834,211552,19,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0
4,185777,294638,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0
6,107064,228232,21,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0
9,150750,220597,13,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
11,194540,229441,6,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0
12,112213,232669,2,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0
13,143045,263738,18,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0
17,194023,277042,16,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0
18,188822,298129,11,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0
19,109235,273430,16,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1


In [17]:
# # use gender, first_careunit, age and ethnicity for prediction
# static_to_keep = static[['gender', 'age', 'ethnicity', 'first_careunit', 'intime']]
# static_to_keep.loc[:, 'intime'] = static_to_keep['intime'].astype('datetime64').apply(lambda x : x.hour)
# static_to_keep.loc[:, 'age'] = static_to_keep['age'].apply(categorize_age)
# static_to_keep.loc[:, 'ethnicity'] = static_to_keep['ethnicity'].apply(categorize_ethnicity)
# static_to_keep = pd.get_dummies(static_to_keep, columns = ['gender', 'age', 'ethnicity', 'first_careunit'])

## Create Feature Matrix

In [34]:
X_merge = pd.merge(X_std.reset_index(), static_to_keep.reset_index(), on=['subject_id','icustay_id','hadm_id'])
X_merge

Unnamed: 0,subject_id,hadm_id,icustay_id,hours_in,alanine aminotransferase,alanine aminotransferase.1,alanine aminotransferase.2,albumin,albumin.1,albumin.2,...,ethnicity_ASIAN,ethnicity_BLACK,ethnicity_HISPANIC/LATINO,ethnicity_OTHER,ethnicity_WHITE,first_careunit_CCU,first_careunit_CSRU,first_careunit_MICU,first_careunit_SICU,first_careunit_TSICU
0,3,145834,211552,0,1.0,0.019704,-1.014724,1.0,0.000000,-1.191095,...,0,0,0,0,1,0,0,1,0,0
1,3,145834,211552,1,0.0,0.019704,-1.000048,0.0,0.000000,-1.178426,...,0,0,0,0,1,0,0,1,0,0
2,3,145834,211552,2,0.0,0.019704,-0.985372,0.0,0.000000,-1.165758,...,0,0,0,0,1,0,0,1,0,0
3,3,145834,211552,3,0.0,0.019704,-0.970696,0.0,0.000000,-1.153089,...,0,0,0,0,1,0,0,1,0,0
4,3,145834,211552,4,0.0,0.019704,-0.956020,0.0,0.000000,-1.140421,...,0,0,0,0,1,0,0,1,0,0
5,3,145834,211552,5,0.0,0.019704,-0.941344,0.0,0.000000,-1.127752,...,0,0,0,0,1,0,0,1,0,0
6,3,145834,211552,6,0.0,0.019704,-0.926668,0.0,0.000000,-1.115083,...,0,0,0,0,1,0,0,1,0,0
7,3,145834,211552,7,0.0,0.019704,-0.911992,0.0,0.000000,-1.102415,...,0,0,0,0,1,0,0,1,0,0
8,3,145834,211552,8,0.0,0.019704,-0.897316,0.0,0.000000,-1.089746,...,0,0,0,0,1,0,0,1,0,0
9,3,145834,211552,9,0.0,0.019704,-0.882640,0.0,0.000000,-1.077077,...,0,0,0,0,1,0,0,1,0,0


In [35]:
abs_time = (X_merge['intime'] + X_merge['hours_in'])%24

In [36]:
X_merge.insert(4, 'absolute_time', abs_time)

In [37]:
X_merge

Unnamed: 0,subject_id,hadm_id,icustay_id,hours_in,absolute_time,alanine aminotransferase,alanine aminotransferase.1,alanine aminotransferase.2,albumin,albumin.1,...,ethnicity_ASIAN,ethnicity_BLACK,ethnicity_HISPANIC/LATINO,ethnicity_OTHER,ethnicity_WHITE,first_careunit_CCU,first_careunit_CSRU,first_careunit_MICU,first_careunit_SICU,first_careunit_TSICU
0,3,145834,211552,0,19,1.0,0.019704,-1.014724,1.0,0.000000,...,0,0,0,0,1,0,0,1,0,0
1,3,145834,211552,1,20,0.0,0.019704,-1.000048,0.0,0.000000,...,0,0,0,0,1,0,0,1,0,0
2,3,145834,211552,2,21,0.0,0.019704,-0.985372,0.0,0.000000,...,0,0,0,0,1,0,0,1,0,0
3,3,145834,211552,3,22,0.0,0.019704,-0.970696,0.0,0.000000,...,0,0,0,0,1,0,0,1,0,0
4,3,145834,211552,4,23,0.0,0.019704,-0.956020,0.0,0.000000,...,0,0,0,0,1,0,0,1,0,0
5,3,145834,211552,5,0,0.0,0.019704,-0.941344,0.0,0.000000,...,0,0,0,0,1,0,0,1,0,0
6,3,145834,211552,6,1,0.0,0.019704,-0.926668,0.0,0.000000,...,0,0,0,0,1,0,0,1,0,0
7,3,145834,211552,7,2,0.0,0.019704,-0.911992,0.0,0.000000,...,0,0,0,0,1,0,0,1,0,0
8,3,145834,211552,8,3,0.0,0.019704,-0.897316,0.0,0.000000,...,0,0,0,0,1,0,0,1,0,0
9,3,145834,211552,9,4,0.0,0.019704,-0.882640,0.0,0.000000,...,0,0,0,0,1,0,0,1,0,0


In [38]:
X_merge.drop('intime', axis=1, inplace=True)

In [40]:
X_merge = X_merge.set_index(['subject_id','icustay_id','hadm_id','hours_in'])

In [41]:
X_merge

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,absolute_time,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,albumin,albumin,albumin,albumin ascites,albumin ascites,albumin ascites,...,ethnicity_ASIAN,ethnicity_BLACK,ethnicity_HISPANIC/LATINO,ethnicity_OTHER,ethnicity_WHITE,first_careunit_CCU,first_careunit_CSRU,first_careunit_MICU,first_careunit_SICU,first_careunit_TSICU
subject_id,icustay_id,hadm_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
3,211552,145834,0,19,1.0,0.019704,-1.014724,1.0,0.000000,-1.191095,0.0,,,...,0,0,0,0,1,0,0,1,0,0
3,211552,145834,1,20,0.0,0.019704,-1.000048,0.0,0.000000,-1.178426,0.0,,,...,0,0,0,0,1,0,0,1,0,0
3,211552,145834,2,21,0.0,0.019704,-0.985372,0.0,0.000000,-1.165758,0.0,,,...,0,0,0,0,1,0,0,1,0,0
3,211552,145834,3,22,0.0,0.019704,-0.970696,0.0,0.000000,-1.153089,0.0,,,...,0,0,0,0,1,0,0,1,0,0
3,211552,145834,4,23,0.0,0.019704,-0.956020,0.0,0.000000,-1.140421,0.0,,,...,0,0,0,0,1,0,0,1,0,0
3,211552,145834,5,0,0.0,0.019704,-0.941344,0.0,0.000000,-1.127752,0.0,,,...,0,0,0,0,1,0,0,1,0,0
3,211552,145834,6,1,0.0,0.019704,-0.926668,0.0,0.000000,-1.115083,0.0,,,...,0,0,0,0,1,0,0,1,0,0
3,211552,145834,7,2,0.0,0.019704,-0.911992,0.0,0.000000,-1.102415,0.0,,,...,0,0,0,0,1,0,0,1,0,0
3,211552,145834,8,3,0.0,0.019704,-0.897316,0.0,0.000000,-1.089746,0.0,,,...,0,0,0,0,1,0,0,1,0,0
3,211552,145834,9,4,0.0,0.019704,-0.882640,0.0,0.000000,-1.077077,0.0,,,...,0,0,0,0,1,0,0,1,0,0


In [18]:
# # merge time series and static data
# # X_merge = pd.merge(X_std.reset_index(), static_to_keep.reset_index(), on=['subject_id','icustay_id','hadm_id'])
# # add absolute time feature
# abs_time = (X_merge['intime'] + X_merge['hours_in'])%24
# X_merge.insert(4, 'absolute_time', abs_time)
# X_merge.drop('intime', axis=1, inplace=True)
# X_merge = X_merge.set_index(['subject_id','icustay_id','hadm_id','hours_in'])

In [42]:
del X_std, X_clean

## Make Tensors

In [55]:
def create_x_matrix(x):
    zeros = np.zeros((MAX_LEN, x.shape[1]-4))
    x = x.values
    x = x[:(MAX_LEN), 4:]
    zeros[0:x.shape[0], :] = x
    return zeros

def create_y_matrix(y):
    zeros = np.zeros((MAX_LEN, y.shape[1]-4))
    y = y.values
    y = y[:,4:]
    y = y[:MAX_LEN, :]
    zeros[:y.shape[0], :] = y
    return zeros

In [58]:
Y.loc[idx[:44],:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,vent,vaso,adenosine,dobutamine,dopamine,epinephrine,isuprel,milrinone,norepinephrine,phenylephrine,vasopressin,colloid_bolus,crystalloid_bolus,nivdurations
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
3,145834,211552,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,145834,211552,1,1,1,0,0,1,0,0,0,0,1,0,0,0,0
3,145834,211552,2,1,1,0,0,1,0,0,0,0,1,0,0,0,0
3,145834,211552,3,1,1,0,0,0,0,0,0,0,1,0,0,0,0
3,145834,211552,4,1,1,0,0,0,0,0,0,1,1,0,0,0,0
3,145834,211552,5,1,1,0,0,0,0,0,0,1,1,0,0,0,0
3,145834,211552,6,1,1,0,0,0,0,0,0,1,1,0,0,0,0
3,145834,211552,7,1,1,0,0,0,0,0,0,1,1,0,0,0,0
3,145834,211552,8,1,1,0,0,0,0,0,0,1,1,0,0,0,0
3,145834,211552,9,1,1,0,0,0,0,0,0,1,1,0,0,0,0


In [60]:
x = np.array(list(X_merge.reset_index().groupby('subject_id').apply(create_x_matrix)))
y = np.array(list(Y.loc[idx[:44],:].reset_index().groupby('subject_id').apply(create_y_matrix)))[:,:,0]

In [61]:
x.shape

(30, 240, 330)

In [62]:
y.shape

(30, 240)

In [63]:
y

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [64]:
lengths = np.array(list(X_merge.reset_index().groupby('subject_id').apply(lambda x: x.shape[0])))

In [65]:
lengths

array([146,  41,  89, 128,  39, 184,  88,  50,  31,  32,  26, 142,  28,
        31,  13,  86,  52,  27,  45, 191, 185,  38,  18, 130,  27,  28,
        82,  46, 112, 100])

In [67]:
keys = pd.Series(X_merge.reset_index()['subject_id'].unique())

In [68]:
keys

0      3
1      4
2      6
3      9
4     11
5     12
6     13
7     17
8     18
9     19
10    20
11    21
12    22
13    23
14    24
15    25
16    26
17    28
18    30
19    31
20    32
21    33
22    34
23    35
24    36
25    37
26    41
27    42
28    43
29    44
dtype: int64

In [69]:
print("X tensor shape: ", x.shape)
print("Y tensor shape: ", y.shape)
print("lengths shape: ", lengths.shape)

X tensor shape:  (30, 240, 330)
Y tensor shape:  (30, 240)
lengths shape:  (30,)


## Stratified Sampling

In [70]:
np.where(keys.isin(train_ids['subject_id']))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 12, 14, 15, 16, 17, 18,
        20, 21, 23, 24, 25, 27, 28, 29], dtype=int64),)

In [71]:
np.where(keys.isin(test_ids['subject_id']))[0]

array([11, 13, 19, 22, 26], dtype=int64)

In [73]:
np.where(keys.isin(split_train_ids['subject_id']))

(array([ 0,  1,  4,  5,  6,  7,  8,  9, 10, 14, 15, 16, 17, 18, 21, 23, 25,
        27, 28, 29], dtype=int64),)

In [74]:
np.where(keys.isin(val_ids['subject_id']))[0]

array([ 2,  3, 12, 20, 24], dtype=int64)

In [75]:
train_indices = np.where(keys.isin(train_ids['subject_id']))[0]
test_indices = np.where(keys.isin(test_ids['subject_id']))[0]
train_static = train_ids
split_train_indices = np.where(keys.isin(split_train_ids['subject_id']))[0]
val_indices = np.where(keys.isin(val_ids['subject_id']))[0]

In [76]:
X_train = x[split_train_indices]
Y_train = y[split_train_indices]
X_test = x[test_indices]
Y_test = y[test_indices]
X_val = x[val_indices]
Y_val = y[val_indices]
lengths_train = lengths[split_train_indices]
lengths_val = lengths[val_indices]
lengths_test = lengths[test_indices]

In [27]:
print("Training size: ", X_train.shape[0])
print("Validation size: ", X_val.shape[0])
print("Test size: ", X_test.shape[0])

Training size:  24129
Validation size:  3448
Test size:  6895


In [99]:
set(np.diff(Y_train[0][1: 1 + 6]))

{0.0}

In [91]:
np.expand_dims(Y_train[0][1: 1 + 6], 1).shape

(6, 1)

In [92]:
np.concatenate((X_train[0][1: 1 + 6], np.expand_dims(Y_train[0][1: 1 + 6],1)), axis=1).shape

(6, 331)

## Make Windows

In [102]:
def make_3d_tensor_slices(X_tensor, Y_tensor, lengths):

    num_patients = X_tensor.shape[0]
    timesteps = X_tensor.shape[1]
    num_features = X_tensor.shape[2]
    # SLICE_SIZE 片大小 6
    X_tensor_new = np.zeros((lengths.sum(), SLICE_SIZE, num_features + 1))
    Y_tensor_new = np.zeros((lengths.sum()))

    current_row = 0
    
    for patient_index in range(num_patients):
        x_patient = X_tensor[patient_index]
        y_patient = Y_tensor[patient_index]
        length = lengths[patient_index]
# PREDICTION_WINDOW 预测窗口 4 
# GAP_TIME 4 间隔时间
# 
        for timestep in range(length - PREDICTION_WINDOW - GAP_TIME - SLICE_SIZE):
            x_window = x_patient[timestep:timestep+SLICE_SIZE]
            y_window = y_patient[timestep:timestep+SLICE_SIZE]
            x_window = np.concatenate((x_window, np.expand_dims(y_window,1)), axis=1)
            # 隔了 PREDICTION_WINDOW
            result_window = y_patient[timestep+SLICE_SIZE+GAP_TIME:timestep+SLICE_SIZE+GAP_TIME+PREDICTION_WINDOW]
            result_window_diff = set(np.diff(result_window))
            # 如果有1 意味着 有 从 0 -》 变到 1 
            #if 1 in result_window_diff: pdb.set_trace()
            gap_window = y_patient[timestep+SLICE_SIZE:timestep+SLICE_SIZE+GAP_TIME]
            gap_window_diff = set(np.diff(gap_window))

            #print result_window, result_window_diff

            if OUTCOME_TYPE == 'binary':
                if max(gap_window) == 1:
                    result = None
                elif max(result_window) == 1:
                    result = 1
                elif max(result_window) == 0:
                    result = 0
                if result != None:
                    X_tensor_new[current_row] = x_window
                    Y_tensor_new[current_row] = result
                    current_row += 1

            else: 
                if 1 in gap_window_diff or -1 in gap_window_diff:
                    result = None
                elif (len(result_window_diff) == 1) and (0 in result_window_diff) and (max(result_window) == 0):
                    result = CHUNK_KEY['CONTROL']
                elif (len(result_window_diff) == 1) and (0 in result_window_diff) and (max(result_window) == 1):
                    result = CHUNK_KEY['ON_INTERVENTION']
                elif 1 in result_window_diff: 
                    result = CHUNK_KEY['ONSET']
                elif -1 in result_window_diff:
                    result = CHUNK_KEY['WEAN']
                else:
                    result = None

                if result != None:
                    X_tensor_new[current_row] = x_window
                    Y_tensor_new[current_row] = result
                    current_row += 1

    X_tensor_new = X_tensor_new[:current_row,:,:]
    Y_tensor_new = Y_tensor_new[:current_row]

    return X_tensor_new, Y_tensor_new, current_row

In [105]:
x_train, y_train, a = make_3d_tensor_slices(X_train, Y_train, lengths_train)
x_val, y_val, b = make_3d_tensor_slices(X_val, Y_val, lengths_val)
x_test, y_test, c = make_3d_tensor_slices(X_test, Y_test, lengths_test)

In [107]:
a

933

In [106]:
x_train.shape

(933, 6, 331)

In [93]:
y_train.shape

(933,)

In [110]:
y_train_classes = label_binarize(y_train, classes=range(NUM_CLASSES))
y_val_classes = label_binarize(y_val, classes=range(NUM_CLASSES))
y_test_classes = label_binarize(y_test, classes=range(NUM_CLASSES))

In [119]:
y_train_classes

array([[0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       ...,
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0]])

In [112]:
y_train_classes.shape

(933, 4)

In [113]:
del X_train, Y_train, X_test, Y_test, X_val, Y_val

In [114]:
print('shape of x_train: ', x_train.shape)
print('shape of x_val: ', x_val.shape)
print('shape of x_test: ', x_test.shape)

shape of x_train:  (933, 6, 331)
shape of x_val:  (370, 6, 331)
shape of x_test:  (368, 6, 331)


(933,)

# Random Forest and Logistic Regression

## Prepare data

In [33]:
static_col = 17 #static_to_keep.shape[1] - 1
time_series_col = 124 #X_merge.shape[1] - static_col _shape_repr(array.shape)

In [34]:
def remove_duplicate_static(x):
    x_static = x[:,0,time_series_col:x.shape[2]-1]
    x_timeseries = np.reshape(x[:,:,:time_series_col],(x.shape[0], -1))
    x_int = x[:,:,-1]
    x_concat = np.concatenate((x_static, x_timeseries, x_int), axis=1)
    return x_concat

In [35]:
# concatenate hourly features
x_train_concat = remove_duplicate_static(x_train)
x_val_concat = remove_duplicate_static(x_val)
x_test_concat = remove_duplicate_static(x_test)

In [36]:
print(x_train_concat.shape)
print(x_val_concat.shape)
print(x_test_concat.shape)

(1107493, 767)
(161025, 767)
(314548, 767)


## Hyperparameter Generation

In [48]:
class DictDist():
    def __init__(self, dict_of_rvs): self.dict_of_rvs = dict_of_rvs
    def rvs(self, n):
        a = {k: v.rvs(n) for k, v in self.dict_of_rvs.items()}
        out = []
        for i in range(n): out.append({k: vs[i] for k, vs in a.items()})
        return out
    
class Choice():
    def __init__(self, options): self.options = options
    def rvs(self, n): return [self.options[i] for i in ss.randint(0, len(self.options)).rvs(n)]

In [49]:
N = 10
np.random.seed(RANDOM)
LR_dist = DictDist({
    'C': Choice(np.geomspace(1e-3, 1e3, 10000)),
    'penalty': Choice(['l2']),
    'solver': Choice(['sag']),
    'max_iter': Choice([100, 200]),
    'class_weight': Choice(['balanced']),
    'multi_class': Choice(['multinomial']),
    'random_state': Choice([RANDOM])
})
LR_hyperparams_list = LR_dist.rvs(N)
        
RF_dist = DictDist({
    'n_estimators': ss.randint(50, 200),
    'max_depth': ss.randint(2, 10),
    'min_samples_split': ss.randint(2, 75),
    'min_samples_leaf': ss.randint(1, 50),
    'class_weight': Choice(['balanced']),
    'random_state': Choice([RANDOM])
})
RF_hyperparams_list = RF_dist.rvs(N)

## Fit model

In [56]:
def run_basic(model, hyperparams_list, X_train, X_val, X_test):
    best_s, best_hyperparams = -np.Inf, None
    for i, hyperparams in enumerate(hyperparams_list):
        M = model(**hyperparams)
        M.fit(X_train, y_train)
        s = roc_auc_score(y_val_classes, M.predict_proba(X_val),average='macro')
        if s > best_s:
            best_s, best_hyperparams = s, hyperparams

    return run_only_final(model, best_hyperparams, X_train, X_val, X_test)

def run_only_final(model, best_hyperparams, X_train, X_val, X_test):
    best_M = model(**best_hyperparams)
    best_M.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
    y_pred  = best_M.predict_proba(X_test)
    idx = np.argmax(y_pred, axis=-1)
    y_pred_label = np.zeros(y_pred.shape)
    y_pred_label[np.arange(y_pred_label.shape[0]), idx] = 1
    auc   = roc_auc_score(y_test_classes, y_pred, average=None)
    aucmacro = roc_auc_score(y_test_classes, y_pred, average='macro')
    accuracy =  accuracy_score(y_test_classes, y_pred_label)
    f1 = f1_score(y_test_classes, y_pred_label, average='macro')
    auprc = average_precision_score(y_test_classes, y_pred_label, average='macro')
    return auc, aucmacro, accuracy, f1, auprc

In [57]:
results = {}
for model_name, model, hyperparams_list in [('RF', RandomForestClassifier, RF_hyperparams_list), 
                                            ('LR', LogisticRegression, LR_hyperparams_list)]:
    if model_name not in results: results[model_name] = {}

    results[model_name] = run_basic(
        model, hyperparams_list, x_train_concat, x_val_concat, x_test_concat)
    print("Final results for model %s, (AUC, Macro_AUC, Accuracy, F1 Macro, AUPRC Macro)" % (model_name))
    print(results[model_name])

Final results for model RF, (AUC, Macro_AUC, Accuracy, F1 Macro, AUPRC Macro)
(array([0.8707733 , 0.98981498, 0.9852656 , 0.93999635]), 0.9464625569824558, 0.7965493342828439, 0.4810231595666129, 0.42696393873684396)




Final results for model LR, (AUC, Macro_AUC, Accuracy, F1 Macro, AUPRC Macro)
(array([0.71889636, 0.98294004, 0.98429292, 0.93236335]), 0.9046231696007481, 0.7849231277897173, 0.4772524408862778, 0.43060796540254614)


# CNN

In [41]:
import tensorflow as tf
from tensorflow import set_random_seed

import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten, Reshape, RepeatVector, Lambda
from keras.layers import Input, Conv2D, Conv1D, Conv3D, MaxPooling2D, MaxPooling1D
from keras.layers import Concatenate
from keras import backend as K
from keras.callbacks import EarlyStopping

import random as rn

Using TensorFlow backend.


In [53]:
BATCH_SIZE = 128
EPOCHS = 12
DROPOUT = 0.5

In [73]:
from sklearn.utils import class_weight
class_weight = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
#class_weight = [1,1,1,1]
class_weight = dict(zip(range(len(class_weight)), class_weight))

In [74]:
sess = tf.Session(graph=tf.get_default_graph())
K.set_session(sess)

np.random.seed(RANDOM)
set_random_seed(RANDOM)
rn.seed(RANDOM)

input_shape = (x_train.shape[1], x_train.shape[2])
inputs = Input(shape=input_shape)
model = Conv1D(64, kernel_size=3,
                 strides=1,
                 activation='relu',
                 input_shape=input_shape,
                 padding='same',
                 name='conv2')(inputs)

model = (MaxPooling1D(pool_size=3, strides=1))(model)

model2 = Conv1D(64, kernel_size=4,
                 strides=1,
                 activation='relu',
                 input_shape=input_shape,
                 padding='same',
                 name='conv3')(inputs)

model2 = MaxPooling1D(pool_size=3, strides=1)(model2)

model3 = Conv1D(64, kernel_size=5,
                 strides=1,
                 activation='relu',
                 input_shape=input_shape,
                 padding='same',
                 name='conv4')(inputs)

model3 = MaxPooling1D(pool_size=3, strides=1)(model3)

models = [model, model2, model3]

full_model = keras.layers.concatenate(models)
full_model = Flatten()(full_model)
full_model = Dense(128, activation='relu')(full_model)
full_model = Dropout(DROPOUT)(full_model)
full_model = Dense(NUM_CLASSES, activation='softmax')(full_model)

full_model = keras.models.Model(input=inputs, outputs=full_model)

full_model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(lr=.0005),
              metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=2)

full_model.fit(x_train, y_train_classes,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          verbose=1,
          class_weight=class_weight,
          callbacks=[early_stopping],
          validation_data=(x_val, y_val_classes))



Train on 1107493 samples, validate on 161025 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12


<keras.callbacks.History at 0x7fa94e9f2ac8>

In [75]:
test_preds_cnn = full_model.predict(x_test, batch_size=BATCH_SIZE)
idx = np.argmax(test_preds_cnn, axis=-1)
test_preds_cnn_label = np.zeros(test_preds_cnn.shape)
test_preds_cnn_label[np.arange(test_preds_cnn_label.shape[0]), idx] = 1
print("AUC:")
print(roc_auc_score(y_test_classes, test_preds_cnn, average=None))
print("AUC Macro:")
print(roc_auc_score(y_test_classes, test_preds_cnn, average='macro'))
print("Accuracy: ")
print(accuracy_score(y_test_classes, test_preds_cnn_label))
print("F1 Macro:")
print(f1_score(y_test_classes, test_preds_cnn_label, average='macro'))
print("AUPRC Macro: ")
print(average_precision_score(y_test_classes, test_preds_cnn_label, average='macro'))

AUC:
[0.72176515 0.98400569 0.98589619 0.93864793]
AUC Macro:
0.9075787404731244
Accuracy: 
0.6180106056945204
F1 Macro:
0.4442057432421319
AUPRC Macro: 
0.4239125752031193


# LSTM

In [115]:
import functools
from sklearn.utils import class_weight
class_weight = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
#class_weight = [1,1,1,1]

In [117]:
np.unique(y_train)

array([0., 1., 2., 3.])

In [116]:
class_weight

array([19.4375    ,  0.3143531 ,  1.4578125 , 12.27631579])

In [43]:
BATCH_SIZE = 128
EPOCHS = 12
KEEP_PROB = 0.8
REGULARIZATION = 0.001
NUM_HIDDEN = [512, 512]

In [46]:
def lazy_property(function):
    attribute = '_' + function.__name__

    @property
    @functools.wraps(function)
    def wrapper(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return wrapper


class VariableSequenceLabelling:

    def __init__(self, data, target, dropout_prob, reg, num_hidden, class_weights):
        self.data = data
        self.target = target
        self.dropout_prob = dropout_prob
        self.reg = reg
        self._num_hidden = num_hidden
        self._num_layers = len(num_hidden)
        self.num_classes = len(class_weights)
        self.attn_length = 0
        self.class_weights = class_weights
        self.prediction
        self.error
        self.optimize

    @lazy_property
    def make_rnn_cell(self,
                      attn_length=0,
                      base_cell=tf.nn.rnn_cell.BasicLSTMCell,
                      state_is_tuple=True):

        attn_length = self.attn_length
        input_dropout = self.dropout_prob
        output_dropout = self.dropout_prob

        cells = []
        for num_units in self._num_hidden:
            cell = base_cell(num_units, state_is_tuple=state_is_tuple)
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=input_dropout, output_keep_prob=output_dropout,
                                                seed=RANDOM)
            cells.append(cell)

        cell = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=state_is_tuple)

        return cell


    # predictor for slices
    @lazy_property
    def prediction(self):

        cell = self.make_rnn_cell

        # Recurrent network.
        output, final_state = tf.nn.dynamic_rnn(cell,
            self.data,
            dtype=tf.float32
        )

        with tf.variable_scope("model") as scope:
            tf.get_variable_scope().reuse_variables()

            # final weights
            num_classes = self.num_classes
            weight, bias = self._weight_and_bias(self._num_hidden[-1], num_classes)
    
            # flatten + sigmoid
            if self.attn_length > 0: 
                logits = tf.matmul(final_state[0][-1][-1], weight) + bias
            else: 
                logits = tf.matmul(final_state[-1][-1], weight) + bias

            prediction = tf.nn.softmax(logits)
            
            return logits, prediction

        
    @lazy_property
    def cross_ent(self):
        predictions = self.prediction[0]
        # tf.squeeze 删除所有为1 的维度
        real = tf.cast(tf.squeeze(self.target), tf.int32)
        # tf.gather 从params的axis维根据indices的参数值获取切片
        weights = tf.gather(self.class_weights, real)

        xent = tf.losses.sparse_softmax_cross_entropy(labels=real, logits=predictions, weights=weights)
        loss = tf.reduce_mean(xent) #shape 1
        ce = loss
        l2 = self.reg * sum(tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
        ce += l2
        return ce

    @lazy_property
    def optimize(self):
        learning_rate = 0.0003
        optimizer = tf.train.AdamOptimizer(learning_rate)
        return optimizer.minimize(self.cross_ent)

    @lazy_property
    def error(self):
        prediction = tf.argmax(self.prediction[1], 1)
        real = tf.cast(self.target, tf.int32)
        prediction = tf.cast(prediction, tf.int32)
        mistakes = tf.not_equal(real, prediction)
        mistakes = tf.cast(mistakes, tf.float32)
        mistakes = tf.reduce_sum(mistakes, reduction_indices=0)
        total = 128
        mistakes = tf.divide(mistakes, tf.to_float(total))
        return mistakes

    @staticmethod
    def _weight_and_bias(in_size, out_size):
        weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
        bias = tf.constant(0.1, shape=[out_size])
        return tf.Variable(weight), tf.Variable(bias)

    #这个方法是添加变量到直方图中，但是不配合其他的方法，根本就显示不出来它的意义！
    @lazy_property
    def summaries(self):
        tf.summary.scalar('loss', tf.reduce_mean(self.cross_ent))
        tf.summary.scalar('error', self.error)
        merged = tf.summary.merge_all()
        return merged

In [47]:
tf.reset_default_graph()
set_random_seed(RANDOM)

config = tf.ConfigProto(allow_soft_placement = True)
# if attn_length > 0:
#     # weights file initialized
#     weight_file = 'weights.txt'
#     with open(weight_file, 'a') as the_file:
#         pass

with tf.Session(config = config) as sess, tf.device('/cpu:0'):
    _, length, num_features = x_train.shape
    num_data_cols = num_features

    # placeholders
    data = tf.placeholder(tf.float32, [None, length, num_data_cols])
    target = tf.placeholder(tf.float32, [None])
    dropout_prob = tf.placeholder(tf.float32)
    reg = tf.placeholder(tf.float32)

    # initialization
    model = VariableSequenceLabelling(data, target, dropout_prob, reg, num_hidden=NUM_HIDDEN, class_weights=class_weight)
    sess.run(tf.global_variables_initializer())

    
    batch_size = BATCH_SIZE
    dp = KEEP_PROB
    rp = REGULARIZATION
    train_samples = x_train.shape[0]
    indices = list(range(train_samples))
    num_classes = NUM_CLASSES
    
    # for storing results
    test_data = x_test
    val_data = x_val

    val_aucs = []
    test_aucs = []
    val_aucs_macro = []
    test_aucs_macro = []
    test_accuracys = []
    test_f1s = []
    test_auprcs = []
    
    epoch = -1

    while (epoch < 3 or max(np.diff(early_stop[-3:])) > 0):
        epoch += 1
        np.random.seed(RANDOM)
        np.random.shuffle(indices)

        num_batches = train_samples//batch_size
        for batch_index in range(num_batches):

            sample_indices = indices[batch_index*batch_size:batch_index*batch_size+batch_size]
            batch_data = x_train[sample_indices, :, :num_data_cols]
            batch_target = y_train[sample_indices]
            _, loss = sess.run([model.optimize, model.cross_ent], {data: batch_data, target: batch_target, dropout_prob: dp, reg: rp})

            # write train accuracy to log files every 10 batches
            #if batch_index % 2000 == 0:
            #    loss, prediction, error = sess.run([model.cross_ent, model.prediction, model.error], {data: batch_data, target: batch_target, dropout_prob: dp, reg: rp})
            #    #train_writer.add_summary(summaries, global_step=epoch*batch_index)
            #    print('Epoch {:2d} Batch {:2d}'.format(epoch+1, batch_index))
            #    print('Loss = ', np.mean(loss))
            #    print('Error = ', error)

        cur_val_preds = sess.run(model.prediction, {data: x_val, target: y_val, dropout_prob: 1, reg: rp}) 
        val_preds = cur_val_preds[1]
        
        cur_test_preds = sess.run(model.prediction, {data: x_test, target: y_test, dropout_prob: 1, reg: rp}) 
        test_preds = cur_test_preds[1]
        
        val_auc_macro = roc_auc_score(y_val_classes, val_preds, average='macro')
        test_auc_macro = roc_auc_score(y_test_classes, test_preds, average='macro')
        val_aucs_macro.append(val_auc_macro)
        test_aucs_macro.append(test_auc_macro)

        val_auc = roc_auc_score(y_val_classes, val_preds, average=None)
        test_auc = roc_auc_score(y_test_classes, test_preds, average=None)
        val_aucs.append(val_auc)
        test_aucs.append(test_auc)
        
        
        idx = np.argmax(cur_test_preds, axis=-1)
        test_preds_label = np.zeros(test_preds.shape)
        test_preds_label[np.arange(test_preds_label.shape[0]), idx] = 1
        test_accuracy = accuracy_score(y_test_classes, test_preds_label)
        test_accuracys.append(test_accuracy)

        test_f1 = f1_score(y_test_classes, test_preds_label, average='macro')
        test_f1s.append(test_f1)

        test_auprc = average_precision_score(y_test_classes, test_preds_label, average='macro')
        test_auprcs.append(test_auprc)

        
        if isinstance(val_aucs_macro[-1], dict):
            early_stop = [val_auc_macro for val_auc_macro in val_aucs_macro]
        else: 
            early_stop = val_aucs_macro


    if isinstance(val_aucs_macro[-1], dict):
        best_epoch = np.argmax(np.array([val_auc_macro for val_auc_macro in val_aucs_macro]))
    else: 
        best_epoch = np.argmax(val_aucs_macro)

    best_val_auc = val_aucs[best_epoch]
    best_test_auc = test_aucs[best_epoch]
    best_test_auc_macro = test_aucs_macro[best_epoch]
    best_test_accuracy = test_accuracys[best_epoch]
    best_test_f1 = test_f1s[best_epoch]
    best_test_auprc = test_auprcs[best_epoch]
    
    print("AUC:")
    print(best_test_auc)
    print("AUC Macro:")
    print(best_test_auc_macro)
    print("Accuracy: ")
    print(best_test_accuracy)
    print("F1 Macro:")
    print(best_test_f1)
    print("AUPRC Macro: ")
    print(best_test_auprc)

AUC:
[0.70118422 0.98406029 0.98295723 0.93134933]
AUC Macro:
0.8998877686897263
Accuracy: 
0.8432226560016277
F1 Macro:
0.5010009037632867
AUPRC Macro: 
0.4440893566798846
