### Feature engineering

This notebook takes the raw clinical data and builds the corresponding features. There is a mix of continuous and categorical variables in the clinical data, and some contain more missing values than others.

The general strategy is to window the data into 10 hour blocks, with a one hour prediction of sepsis/no sepsis. For each window, the following variables are retained as time series:
- HR
- MAP
- O2Sat
- SBP
- Resp

The remainder of the variables are summarized as a single value, the median of the ten values in that window. This is a strategy to deal with the fact that there may be > 90% missing data for some variables. It is not necessary to run this notebook.

In [1]:
import pandas as pd
import numpy as np
import pdb
import os
import shutil



In [2]:
#start from an empty 'feats' output directory; any failure is printed, not raised
try:
    feats_already_there = os.path.exists('feats')
    if feats_already_there:
        #remove the directory together with everything inside it
        shutil.rmtree('feats')
    os.makedirs('feats')
except Exception as err:
    print(err)

In [None]:
#install (or upgrade, -U) PyDrive2 quietly (-q); the pydrive2 imports in the next cell need it
!pip install -U -q PyDrive2

In [None]:
#authenticate the Colab user and build a PyDrive2 client bound to the name `drive`
#NOTE(review): relies on google.colab, so this cell presumably only runs inside Colab - confirm
from pydrive2.auth import GoogleAuth

from pydrive2.drive import GoogleDrive

from google.colab import auth

from oauth2client.client import GoogleCredentials

#interactive sign-in for the current Colab user
auth.authenticate_user()

gauth = GoogleAuth()

#reuse the application default credentials for the PyDrive2 auth object
gauth.credentials = GoogleCredentials.get_application_default()

#client used by the download cell
drive = GoogleDrive(gauth)

In [None]:
#source file on Google Drive:
#https://drive.google.com/file/d/14zYhaPPm0xSNTCvd0zluht_GM_3uwWF6/view?usp=sharing
#reference the Drive file by its id and write its content to the local file combined1.pkl
fileDownloaded = drive.CreateFile({'id':'14zYhaPPm0xSNTCvd0zluht_GM_3uwWF6'})
fileDownloaded.GetContentFile('combined1.pkl')

In [3]:
#load the combined clinical data from the local pickle file
#NOTE(review): unpickling can execute arbitrary code - only load combined1.pkl from a trusted source
df = pd.read_pickle("combined1.pkl")

In [6]:
#preview the first rows of the loaded data
df.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,patient
0,,,,,,,,,,,...,,,83.14,0,,,-0.03,1,0,p000001
1,97.0,95.0,,98.0,75.33,,19.0,,,,...,,,83.14,0,,,-0.03,2,0,p000001
2,89.0,99.0,,122.0,86.0,,22.0,,,,...,,,83.14,0,,,-0.03,3,0,p000001
3,90.0,95.0,,,,,30.0,,24.0,,...,,,83.14,0,,,-0.03,4,0,p000001
4,103.0,88.5,,122.0,91.33,,24.5,,,,...,,,83.14,0,,,-0.03,5,0,p000001


In [7]:
#get the fraction of missing values for each column (0.0 = complete, 1.0 = entirely missing)
print('Percentage Missing:')
print(df.isna().mean())

#columns to drop
#drop Unit2 because Unit1 and Unit2 are mutually exclusive
#drop ICULOS as it's basically just an index
cols_to_drop = ['Unit2', 'ICULOS']
#errors='ignore' keeps this cell re-runnable: df is overwritten here, so on a second
#run the columns are already gone and a plain drop would raise a KeyError
df = df.drop(columns=cols_to_drop, errors='ignore')

#continuous vital signs with roughly 15% or less missing data (Resp is the highest
#at about 15.4%). these will be retained as time series
cols_cont = ['HR', 'MAP', 'O2Sat', 'SBP', 'Resp']

#all remaining feature columns: per-patient/categorical values (Unit1, Gender,
#HospAdmTime, Age) plus measurements with heavier missingness. each one is
#summarized as a single value per window instead of being kept as a time series
cols_to_bin = ['Unit1', 'Gender', 'HospAdmTime', 'Age', 'DBP', 'Temp', 'Glucose', 'Potassium', 'Hct', 'FiO2', 'Hgb', 'pH', 'BUN', 'WBC', 'Magnesium', 'Creatinine', 'Platelets', 'Calcium', 'PaCO2', 'BaseExcess', 'Chloride', 'HCO3', 'Phosphate', 'EtCO2', 'SaO2', 'PTT', 'Lactate', 'AST', 'Alkalinephos', 'Bilirubin_total', 'TroponinI', 'Fibrinogen', 'Bilirubin_direct']

Percentage Missing:
HR                  0.098857
O2Sat               0.130635
Temp                0.661465
SBP                 0.145699
MAP                 0.124523
DBP                 0.313383
Resp                0.153566
EtCO2               0.962979
BaseExcess          0.945823
HCO3                0.958127
FiO2                0.916660
pH                  0.930716
PaCO2               0.944377
SaO2                0.965488
AST                 0.983772
BUN                 0.931373
Alkalinephos        0.983927
Calcium             0.941151
Chloride            0.954634
Creatinine          0.939058
Bilirubin_direct    0.998071
Glucose             0.828941
Lactate             0.973322
Magnesium           0.936896
Phosphate           0.959864
Potassium           0.906892
Bilirubin_total     0.985090
TroponinI           0.990484
Hct                 0.911472
Hgb                 0.926161
PTT                 0.970548
WBC                 0.935932
Fibrinogen          0.993391
Platelets           0.9

Calculate the mean/std for standardization for each variable. Leave out a random 6000 patients as the test set. In other words, don't include those 6000 held-out patients when calculating the mean/std scaling parameters.

In [8]:
#number of randomly chosen patients held out as the test set
N_TEST_PATIENTS = 6000
#fixed seed so the split (and the scaling parameters derived from it) is the same
#on every run of the notebook
SPLIT_SEED = 0

patients_all = df['patient'].unique()
if len(patients_all) <= N_TEST_PATIENTS:
    #without this check the slice below is silently empty and every mean/std becomes NaN
    raise ValueError('need more than %i patients to hold out a test set, found %i'
                     % (N_TEST_PATIENTS, len(patients_all)))

#shuffle a copy of the patient ids with a private generator (global numpy random
#state is left alone); everything except the last N_TEST_PATIENTS ids is training data
split_rng = np.random.default_rng(SPLIT_SEED)
patients_training_data = split_rng.permutation(patients_all)[:-N_TEST_PATIENTS]

#mean/std of every numeric column, computed on training patients only so that
#no test-set statistics leak into the scaling parameters
df_mean_std = df[df['patient'].isin(patients_training_data)].describe().loc[['mean', 'std']]
df_mean_std.to_pickle('mean_std_scaling.pkl')

In [9]:
#count the rows of training patients whose SepsisLabel equals 1
print('Number of positive training examples:')
is_training_row = df['patient'].isin(patients_training_data)
is_positive = df.loc[is_training_row, 'SepsisLabel'] == 1
int(is_positive.sum())

Number of positive training examples:


47270

Loop through each subject and grab a window of 10 hours, with an output label associated with the 11th hour (ie predict one hour ahead).

In [11]:
#build sliding-window features for every patient and save them in chunks under feats/
win_len = 10      #number of consecutive rows (hours) in each input window
pred_len = 1      #number of rows right after the window that define the label
save_every = 500  #flush to disk when the numeric part of the patient id is a multiple of this

#scaling parameters looked up once instead of inside every loop iteration
scale_mean = df_mean_std.loc['mean']
scale_std = df_mean_std.loc['std']


def save_windows(frames, chunk_id):
    """Split accumulated windows into train/test parts and pickle both.

    frames: non-empty list of per-patient window dataframes; each must contain
        a 'patient' column
    chunk_id: integer used in the output names feats/train_<id>.pkl and
        feats/test_<id>.pkl
    Returns the (train, test) dataframes that were written, both without the
    'patient' column.
    """
    windowed_df = pd.concat(frames).reset_index(drop=True)
    is_train = windowed_df['patient'].isin(patients_training_data)
    train_part = windowed_df[is_train].drop('patient', axis=1)
    test_part = windowed_df[~is_train].drop('patient', axis=1)
    train_part.to_pickle('feats/train_%i.pkl' % chunk_id)
    test_part.to_pickle('feats/test_%i.pkl' % chunk_id)
    return train_part, test_part


save_count = 0
windowed_df_list = []
#loop through each patient at a time
for patient, group in df.groupby('patient'):
    group = group.reset_index(drop=True)

    #fill gaps in the time-series columns: backfill first, then forward-fill what is left
    #NOTE(review): bfill copies later measurements into earlier rows, so a window can
    #contain values observed after its last hour - confirm this look-ahead is intended
    filled = group[cols_cont].bfill().ffill()

    #standardize the continuous data with the training-set mean/std
    group[cols_cont] = (filled - scale_mean[cols_cont]) / scale_std[cols_cont]

    #generate windows of win_len rows, predicting pred_len rows into the future;
    #the range is empty when the patient has fewer than win_len+pred_len rows
    windowed_data = []
    for start in range(len(group) - win_len - pred_len + 1):
        stop = start + win_len
        window = group.iloc[start:stop]

        #get all the continuous variables into one (win_len x len(cols_cont)) array
        X_cont = window[cols_cont].values

        #if any of the continuous variables is nan (in other words, there wasn't even a
        #single value to backfill/forwardfill) then just skip this window
        if np.isnan(X_cont).any():
            continue

        #summarize each of the remaining variables by its median over the window
        #(nan when the variable has no value at all inside the window)
        record = {}
        for col_to_bin in cols_to_bin:
            value = window[col_to_bin].median()
            #Gender and Unit1 are kept on their original scale
            if col_to_bin not in ['Gender', 'Unit1']:
                value = (value - scale_mean[col_to_bin]) / scale_std[col_to_bin]
            record[col_to_bin] = value

        #label is 1 if any row in the prediction horizon has a non-zero SepsisLabel
        future_rows = group.iloc[stop:stop + pred_len]
        record['X_cont'] = X_cont
        record['label'] = int(any(future_rows['SepsisLabel']))
        record['patient'] = patient
        windowed_data.append(record)

    #keep only patients that produced at least one window: an empty frame has no
    #'patient' column, and a chunk made only of empty frames used to fail with a
    #KeyError when it was split into train/test
    if windowed_data:
        windowed_df_list.append(pd.DataFrame(windowed_data))

    #periodically save, triggered by the last five digits of the patient id
    if (int(patient[-5:]) % save_every) == 0:
        print('patient %i' % int(patient[-5:]))
        if windowed_df_list:
            train, test = save_windows(windowed_df_list, save_count)
            save_count = save_count+1
        windowed_df_list = []

#save any remaining data
if len(windowed_df_list) > 0:
    train, test = save_windows(windowed_df_list, save_count)

patient 500
patient 1000
patient 1500
patient 2000
patient 2500
patient 3000
patient 3500
patient 4000
patient 4500
patient 5000
patient 5500
patient 6000
patient 6500
patient 7000
patient 8000
patient 8500
patient 9000
patient 9500
patient 10000
patient 10500
patient 11000
patient 11500
patient 12000
patient 12500
patient 13000
patient 13500
patient 14000
patient 14500
patient 15000
patient 15500
patient 16000
patient 16500
patient 17000
patient 17500
patient 18000
patient 18500
patient 19000
patient 19500
patient 20000
patient 20500
patient 500
patient 1000
patient 1500
patient 2000
patient 2500
patient 3000
patient 3500
patient 4000
patient 4500
patient 5000
patient 5500
patient 6000
patient 6500
patient 7000
patient 7500
patient 8000
patient 8500
patient 9000
patient 9500
patient 10000
patient 10500
patient 11000
patient 11500
patient 12000
patient 12500
patient 13000
patient 13500
patient 14000
patient 14500
patient 15000
patient 15500
patient 16000
patient 16500
patient 17000
pat