PhysioNet/Computing in Cardiology Challenge 2019: Early Prediction of Sepsis from Clinical Data

- https://physionet.org/challenge/2019/

In [1]:
import os

import numpy as np
import pandas as pd
from IPython.display import SVG
from keras.layers import LSTM
from keras.layers import Dense
from keras.models import Sequential
from keras.utils.vis_utils import model_to_dot
from sklearn.model_selection import train_test_split

# Raise pandas' display limits so the wide patient tables are printed in full
# instead of being truncated with ellipses.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

Using TensorFlow backend.


In [2]:
# Load a single patient's pipe-separated record to inspect the schema.
sample_path = '../../../datasets/training_setA/training/p000001.psv'
pat_sample_df = pd.read_csv(sample_path, sep='|')
# Only a cell's last expression is rendered, so this summary is computed but not shown.
pat_sample_df.describe()
# 40 variables plus SepsisLabel. SepsisLabel is 1 when t >= t_sepsis - 6 and
# 0 when t < t_sepsis - 6; for non-sepsis patients it is always 0.
pat_sample_df.columns

Index(['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC', 'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS', 'SepsisLabel'], dtype='object')

- Vital signs (columns 1-8)
    - HR	Heart rate (beats per minute)
    - O2Sat	Pulse oximetry (%)
    - Temp	Temperature (Deg C)
    - SBP	Systolic BP (mm Hg)
    - MAP	Mean arterial pressure (mm Hg)
    - DBP	Diastolic BP (mm Hg)
    - Resp	Respiration rate (breaths per minute)
    - EtCO2	End tidal carbon dioxide (mm Hg)
- Laboratory values (columns 9-34)
    - BaseExcess	Measure of excess bicarbonate (mmol/L)
    - HCO3	Bicarbonate (mmol/L)
    - FiO2	Fraction of inspired oxygen (%)
    - pH	N/A
    - PaCO2	Partial pressure of carbon dioxide from arterial blood (mm Hg)
    - SaO2	Oxygen saturation from arterial blood (%)
    - AST	Aspartate transaminase (IU/L)
    - BUN	Blood urea nitrogen (mg/dL)
    - Alkalinephos	Alkaline phosphatase (IU/L)
    - Calcium	(mg/dL)
    - Chloride	(mmol/L)
    - Creatinine	(mg/dL)
    - Bilirubin_direct	Bilirubin direct (mg/dL)
    - Glucose	Serum glucose (mg/dL)
    - Lactate	Lactic acid (mg/dL)
    - Magnesium	(mmol/dL)
    - Phosphate	(mg/dL)
    - Potassium	(mmol/L)
    - Bilirubin_total	Total bilirubin (mg/dL)
    - TroponinI	Troponin I (ng/mL)
    - Hct	Hematocrit (%)
    - Hgb	Hemoglobin (g/dL)
    - PTT	Partial thromboplastin time (seconds)
    - WBC	Leukocyte count (count*10^3/µL)
    - Fibrinogen	(mg/dL)
    - Platelets	(count*10^3/µL)
- Demographics (columns 35-40)
    - Age	Years (100 for patients 90 or above)
    - Gender	Female (0) or Male (1)
    - Unit1	Administrative identifier for ICU unit (MICU)
    - Unit2	Administrative identifier for ICU unit (SICU)
    - HospAdmTime	Hours between hospital admit and ICU admit
    - ICULOS	ICU length-of-stay (hours since ICU admit)

In [3]:
# check missing values
# get columns whose average per-patient missing fraction is over 90%

In [5]:
# Missingness profile: for every patient file, record the fraction of missing
# values per column, and track the extremes of the per-file column count.
ROOT_DIR = '../../../datasets/training_setA/training/'
total_pats = []  # one dict per patient: {column: fraction of rows that are NaN, 'pat': file name}
# NOTE: despite their names, max_row / min_row hold the largest / smallest
# pat_df.shape[1] (number of COLUMNS) seen, i.e. they check that every file
# shares the same schema.
max_row = 0
min_row = float('inf')  # no magic upper bound; the first file always lowers it
for file in os.listdir(ROOT_DIR):
    # The directory can also contain non-data entries (.ipynb_checkpoints,
    # an index.html left by a download, ...); only patient files are parsed.
    if not file.endswith('.psv'):
        continue
    pat_df = pd.read_csv(os.path.join(ROOT_DIR, file), sep='|')
    # Mean of the boolean NaN mask == (number of NaNs) / (number of rows).
    pat_dict = pat_df.isnull().mean().to_dict()
    pat_dict['pat'] = file
    total_pats.append(pat_dict)
    n_columns = pat_df.shape[1]
    max_row = max(max_row, n_columns)
    min_row = min(min_row, n_columns)

In [8]:
# Largest pat_df.shape[1] (column count) seen across the scanned patient files.
max_row

41

In [9]:
# Smallest pat_df.shape[1] (column count) seen across the scanned patient files.
min_row

41

In [6]:
# One row per patient file; each variable column holds that patient's
# fraction of missing values, and 'pat' holds the file name.
total_df = pd.DataFrame(total_pats)
# Distribution of the per-patient missing fractions, per variable.
total_df.describe()

Unnamed: 0,AST,Age,Alkalinephos,BUN,BaseExcess,Bilirubin_direct,Bilirubin_total,Calcium,Chloride,Creatinine,DBP,EtCO2,FiO2,Fibrinogen,Gender,Glucose,HCO3,HR,Hct,Hgb,HospAdmTime,ICULOS,Lactate,MAP,Magnesium,O2Sat,PTT,PaCO2,Phosphate,Platelets,Potassium,Resp,SBP,SaO2,SepsisLabel,Temp,TroponinI,Unit1,Unit2,WBC,pH
count,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0,20336.0
mean,0.985239,0.0,0.985668,0.917105,0.897089,0.998583,0.987899,0.951119,0.91536,0.932715,0.505426,1.0,0.868923,0.99264,0.0,0.874364,0.918345,0.082537,0.879188,0.909895,4.9e-05,0.0,0.966439,0.108744,0.922877,0.127073,0.951533,0.913793,0.950586,0.933546,0.890158,0.103864,0.154459,0.949375,0.0,0.660033,0.998755,0.468234,0.468234,0.923271,0.885928
std,0.032535,0.0,0.032399,0.052101,0.127139,0.011021,0.025349,0.038369,0.05799,0.034599,0.42463,0.0,0.145305,0.028868,0.0,0.116203,0.052263,0.102391,0.08411,0.062169,0.007012,0.0,0.069675,0.125947,0.053689,0.154388,0.044905,0.104824,0.0384,0.037424,0.075777,0.139911,0.195842,0.093047,0.0,0.223556,0.009394,0.499002,0.499002,0.050486,0.135067
min,0.6,0.0,0.6,0.375,0.0,0.744681,0.642857,0.55,0.375,0.55,0.0,1.0,0.0,0.333333,0.0,0.041667,0.375,0.0,0.166667,0.25,0.0,0.0,0.083333,0.0,0.375,0.0,0.5,0.111111,0.55,0.558824,0.25,0.0,0.0,0.083333,0.0,0.0,0.761905,0.0,0.0,0.333333,0.0
25%,0.98,0.0,0.980769,0.896552,0.833333,1.0,0.981132,0.932203,0.894737,0.918367,0.075,1.0,0.75,1.0,0.0,0.846154,0.897436,0.019231,0.840909,0.885714,0.0,0.0,0.959184,0.023256,0.9,0.025641,0.931818,0.857143,0.932203,0.92,0.857143,0.021978,0.028571,0.932203,0.0,0.65,1.0,0.0,0.0,0.904762,0.808511
50%,1.0,0.0,1.0,0.930233,0.944444,1.0,1.0,0.953488,0.931034,0.941176,0.388889,1.0,0.914894,1.0,0.0,0.914894,0.931818,0.04878,0.9,0.927273,0.0,0.0,1.0,0.066667,0.933333,0.071429,0.958333,0.952381,0.953488,0.942857,0.913043,0.055556,0.084746,1.0,0.0,0.742857,1.0,0.0,0.0,0.9375,0.93617
75%,1.0,0.0,1.0,0.95,1.0,1.0,1.0,0.97619,0.95122,0.954545,1.0,1.0,1.0,1.0,0.0,0.946969,0.95122,0.111111,0.941272,0.95122,0.0,0.0,1.0,0.15,0.954545,0.166667,0.980769,1.0,0.97561,0.956057,0.944444,0.130435,0.2,1.0,0.0,0.782609,1.0,1.0,1.0,0.954545,1.0
max,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# pat_df is whichever patient file the scan loop read last; peek at its first rows.
pat_df.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,FiO2,pH,PaCO2,SaO2,AST,BUN,Alkalinephos,Calcium,Chloride,Creatinine,Bilirubin_direct,Glucose,Lactate,Magnesium,Phosphate,Potassium,Bilirubin_total,TroponinI,Hct,Hgb,PTT,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,79.0,100.0,36.2,122.5,77.5,55.0,8.0,,1.0,,,7.36,43.0,,,,,,,,,195.0,,,,4.1,,,32.4,,30.3,,,171.0,70.56,1,0,1,-3.8,2,0
1,76.5,100.0,36.15,118.5,75.0,54.5,19.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,70.56,1,0,1,-3.8,3,0
2,82.0,96.0,36.4,103.0,68.0,53.0,24.0,,-6.0,,0.45,7.33,35.0,,,,,,,,,,,,,,,,,,,,,,70.56,1,0,1,-3.8,4,0
3,91.5,97.0,36.9,93.5,62.0,48.5,26.0,,-7.0,,,7.32,35.0,97.0,,,,,,,,178.0,,,,,,,37.0,12.2,,,,,70.56,1,0,1,-3.8,5,0
4,90.5,98.5,37.25,106.0,71.0,56.5,25.0,,-6.0,,,7.34,33.0,98.0,,,,,,,,,,,,,,,,,,,,,70.56,1,0,1,-3.8,6,0


In [10]:
# Select the variables that are too sparse to keep: those whose missing
# fraction, averaged over all patients, exceeds MISSING_THRESHOLD.
MISSING_THRESHOLD = .9
# numeric_only=True: total_df also carries the string column 'pat'. Older
# pandas silently skipped it, pandas >= 2.0 raises TypeError when averaging it.
removal = (total_df.mean(numeric_only=True) > MISSING_THRESHOLD).to_dict()
drop_cols = [column for column, too_sparse in removal.items() if too_sparse]
drop_cols

['AST',
 'Alkalinephos',
 'BUN',
 'Bilirubin_direct',
 'Bilirubin_total',
 'Calcium',
 'Chloride',
 'Creatinine',
 'EtCO2',
 'Fibrinogen',
 'HCO3',
 'Hgb',
 'Lactate',
 'Magnesium',
 'PTT',
 'PaCO2',
 'Phosphate',
 'Platelets',
 'SaO2',
 'TroponinI',
 'WBC']

In [11]:
# Also drop the two ICU-unit identifier columns.
# Written so the cell can be re-run safely: the names are added to drop_cols
# only once, and columns already removed from pat_df are skipped instead of
# raising KeyError.
for unit_col in ('Unit1', 'Unit2'):
    if unit_col not in drop_cols:
        drop_cols.append(unit_col)
pat_df = pat_df.drop(columns=drop_cols, errors='ignore')
pat_df.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,BaseExcess,FiO2,pH,Glucose,Potassium,Hct,Age,Gender,HospAdmTime,ICULOS,SepsisLabel
0,79.0,100.0,36.2,122.5,77.5,55.0,8.0,1.0,,7.36,195.0,4.1,32.4,70.56,1,-3.8,2,0
1,76.5,100.0,36.15,118.5,75.0,54.5,19.0,,,,,,,70.56,1,-3.8,3,0
2,82.0,96.0,36.4,103.0,68.0,53.0,24.0,-6.0,0.45,7.33,,,,70.56,1,-3.8,4,0
3,91.5,97.0,36.9,93.5,62.0,48.5,26.0,-7.0,,7.32,178.0,,37.0,70.56,1,-3.8,5,0
4,90.5,98.5,37.25,106.0,71.0,56.5,25.0,-6.0,,7.34,,,,70.56,1,-3.8,6,0


In [12]:
# Build one long table holding every patient's rows (sparse columns removed),
# tagged with the file each row came from.
ROOT_DIR = '../../../datasets/training_setA/training/'
pat_frames = []
for file in os.listdir(ROOT_DIR):
    # Skip non-data directory entries (.ipynb_checkpoints, index.html, ...).
    if not file.endswith('.psv'):
        continue
    pat_df = pd.read_csv(os.path.join(ROOT_DIR, file), sep='|')
    pat_df = pat_df.drop(columns=drop_cols)
    pat_df['patient'] = file
    pat_frames.append(pat_df)
# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# previously accumulated rows on every iteration (quadratic in the file count).
total_pats_df = pd.concat(pat_frames, ignore_index=True) if pat_frames else pd.DataFrame()

In [13]:
# First rows of the combined table; 'patient' is the source file name.
total_pats_df.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,BaseExcess,FiO2,pH,Glucose,Potassium,Hct,Age,Gender,HospAdmTime,ICULOS,SepsisLabel,patient
0,,,,,,,,,,,,,,59.59,1,-1.8,1,0,p008854.psv
1,85.0,97.0,,130.0,90.0,,22.0,,,,,,,59.59,1,-1.8,2,0,p008854.psv
2,96.0,96.0,,124.0,84.67,,23.0,,,,142.0,3.9,38.6,59.59,1,-1.8,3,0,p008854.psv
3,95.0,95.0,,124.0,82.0,,23.0,,,,,,,59.59,1,-1.8,4,0,p008854.psv
4,94.0,95.0,,116.0,78.0,,20.0,,,,,,,59.59,1,-1.8,5,0,p008854.psv


In [None]:
# (total number of rows over all patients, number of retained columns)
total_pats_df.shape

In [None]:
# Replace every NaN in the combined table with 0.
# NOTE(review): 0 is not a plausible reading for vitals such as HR or Temp, so
# the model cannot tell "missing" from "measured zero" - confirm that
# zero-imputation (rather than e.g. per-patient forward-fill) is intended.
total_pats_df = total_pats_df.fillna(0)
total_pats_df.head()

In [None]:
# Hold out 20% of the PATIENTS (not 20% of the rows) for testing.
# A row-level split would scatter each patient's time series across both
# sets, so the test rows would come from patients already seen in training.
# random_state pins the split so results are reproducible between runs.
SPLIT_SEED = 42
train_patients, test_patients = train_test_split(
    total_pats_df['patient'].unique(), test_size=0.2, random_state=SPLIT_SEED)
is_train_row = total_pats_df['patient'].isin(train_patients)
# .copy() gives independent frames, so later edits to train/test neither touch
# total_pats_df nor raise SettingWithCopyWarning.
train = total_pats_df[is_train_row].copy()
test = total_pats_df[~is_train_row].copy()

In [None]:
# (rows, columns) of the training partition.
train.shape

In [None]:
# (rows, columns) of the held-out partition.
test.shape

In [None]:
# LSTM classifier over a fixed-length window of consecutive patient rows.
N_STEPS = 17     # rows (time steps) per input window
N_FEATURES = 17  # columns left after removing the sparse columns, Unit1/Unit2,
                 # the SepsisLabel target and the patient id
model = Sequential()
# input_shape is (n_steps, n_features). This is a multivariate series: every
# time step carries N_FEATURES values, not one.
model.add(LSTM(100, activation='relu', input_shape=(N_STEPS, N_FEATURES)))
# SepsisLabel is binary (0/1), so the head outputs a probability and is
# trained with cross-entropy instead of regressing the label with MSE.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
# Draw the layer graph, including tensor shapes, as an inline SVG.
# The graph is laid out by the Graphviz `dot` program (prog='dot').
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))

In [None]:
# Separate the target from the predictors.
# Guarded so the cell can be re-run: once the label has been split off, `y`
# and `train` are left untouched instead of raising KeyError.
if 'SepsisLabel' in train.columns:
    y = train['SepsisLabel']
    # Rebind instead of dropping in place: `train` was derived from
    # total_pats_df, and in-place edits on such a frame can raise
    # SettingWithCopyWarning.
    train = train.drop(columns='SepsisLabel')

In [None]:
def _make_windows(features, labels, n_steps, group_col='patient', time_col='ICULOS'):
    """Cut each patient's rows into overlapping look-back windows for the LSTM.

    Parameters
    ----------
    features : DataFrame with one row per time step. Must contain `group_col`
        and `time_col`; every column except `group_col` is used as an input.
    labels : Series aligned with `features` by index (one label per row).
    n_steps : number of consecutive rows in each window.
    group_col : column identifying which patient a row belongs to.
    time_col : column used to order a patient's rows in time.

    Returns
    -------
    (X, y) : X is a float32 array of shape (n_windows, n_steps, n_features);
        y is a float32 array with the label of the LAST row of each window.
        Patients with fewer than `n_steps` rows contribute no windows.
    """
    feature_cols = [col for col in features.columns if col != group_col]
    windows, targets = [], []
    for _, pat_rows in features.groupby(group_col, sort=False):
        # Rows may have been shuffled upstream; restore chronological order.
        pat_rows = pat_rows.sort_values(time_col)
        values = pat_rows[feature_cols].values.astype('float32')
        pat_labels = labels.loc[pat_rows.index].values.astype('float32')
        for end in range(n_steps, len(values) + 1):
            windows.append(values[end - n_steps:end])
            targets.append(pat_labels[end - 1])
    X = np.asarray(windows, dtype='float32').reshape(-1, n_steps, len(feature_cols))
    return X, np.asarray(targets, dtype='float32')


# The LSTM expects a 3-D float tensor (samples, n_steps, n_features), while
# `train` is a 2-D table that still carries the string 'patient' column.
# Reshape it into per-patient windows matching the model's declared input.
_, n_steps, n_features = model.input_shape
X_train, y_train = _make_windows(train, y, n_steps)
if X_train.shape[2] != n_features:
    raise ValueError(
        'model expects %d features per time step but the training table provides %d'
        % (n_features, X_train.shape[2]))
model.fit(X_train, y_train, epochs=1000, verbose=0)