In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.metrics import precision_score, accuracy_score, recall_score, \
average_precision_score, precision_recall_curve, confusion_matrix
import seaborn as sns
from subprocess import call
from IPython.display import Image
import warnings
warnings.filterwarnings('ignore')

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
from pandas.plotting import scatter_matrix

In [3]:
original_data = pd.read_csv('data.psv', sep ='|')
missing_data = original_data.isnull().sum()
missing_percent = (missing_data/original_data.shape[0])*100
refined_columns = list(missing_percent[missing_percent < 92].index)
sepsis_data = original_data[refined_columns]

In [4]:
refined_columns

['HR',
 'O2Sat',
 'Temp',
 'SBP',
 'MAP',
 'DBP',
 'Resp',
 'BaseExcess',
 'HCO3',
 'FiO2',
 'pH',
 'PaCO2',
 'BUN',
 'Chloride',
 'Glucose',
 'Potassium',
 'Hct',
 'Hgb',
 'Age',
 'Gender',
 'Unit1',
 'Unit2',
 'HospAdmTime',
 'ICULOS',
 'SepsisLabel']

In [5]:
refined_percent = list(missing_percent[missing_percent < 92])
refined_columns = list(missing_percent[missing_percent < 92].index)
for i in range(len(refined_columns)):
    print(refined_columns[i],refined_percent[i])

HR 7.74329727991248
O2Sat 12.034000460903016
Temp 66.22543222319862
SBP 15.214990997196596
MAP 10.234072888519716
DBP 48.137774547517324
Resp 9.777854876328579
BaseExcess 89.57751401069194
HCO3 91.94913859252375
FiO2 85.81203767249043
pH 88.53491087098007
PaCO2 91.23537202723885
BUN 91.84024392405736
Chloride 91.67538246086757
Glucose 87.76859629707478
Potassium 89.1385165507234
Hct 88.22316822706817
Hgb 91.16408400590564
Age 0.0
Gender 0.0
Unit1 48.86002476720599
Unit2 48.86002476720599
HospAdmTime 0.0010129736601524018
ICULOS 0.0
SepsisLabel 0.0


In [6]:
sepsis_data

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,BaseExcess,HCO3,FiO2,...,Potassium,Hct,Hgb,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,,,,,,,,,,,...,,,,83.14,0,,,-0.03,1,0
1,97.0,95.0,,98.0,75.33,,19.0,,,,...,,,,83.14,0,,,-0.03,2,0
2,89.0,99.0,,122.0,86.00,,22.0,,,,...,,,,83.14,0,,,-0.03,3,0
3,90.0,95.0,,,,,30.0,24.0,,,...,,,,83.14,0,,,-0.03,4,0
4,103.0,88.5,,122.0,91.33,,24.5,,,0.28,...,,,,83.14,0,,,-0.03,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
789749,97.0,100.0,,,72.00,,18.0,,,,...,,,,69.80,0,,,-10.58,38,0
789750,110.0,100.0,37.0,,91.00,,17.0,,,0.50,...,,,,69.80,0,,,-10.58,39,0
789751,114.0,100.0,,,89.00,,21.0,,,,...,,,,69.80,0,,,-10.58,40,0
789752,130.0,99.0,,,,,21.0,,,,...,,,,69.80,0,,,-10.58,41,0


In [7]:
sepsis_data.describe()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,BaseExcess,HCO3,FiO2,...,Potassium,Hct,Hgb,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
count,728601.0,694715.0,266736.0,669593.0,708930.0,409584.0,712533.0,82312.0,63582.0,112050.0,...,85779.0,93008.0,69782.0,789754.0,789754.0,403880.0,403880.0,789746.0,789754.0,789754.0
mean,84.974555,97.265593,37.026488,120.959733,78.764153,59.978715,18.771166,-0.649428,24.093049,0.526259,...,4.161729,30.675707,10.582367,63.031303,0.57762,0.507084,0.492916,-52.055149,27.16623,0.021661
std,16.934077,2.90917,0.780226,21.522164,15.043296,12.569439,5.393348,4.285079,4.396253,0.185849,...,0.632784,4.874916,1.746029,16.121992,0.493939,0.49995,0.49995,155.905646,28.10514,0.145575
min,20.0,20.0,20.9,22.0,20.0,20.0,1.0,-32.0,0.0,0.0,...,1.0,5.5,2.2,18.11,0.0,0.0,0.0,-3710.66,1.0,0.0
25%,73.0,96.0,36.56,105.0,68.0,51.0,15.0,-3.0,22.0,0.4,...,3.8,27.4,9.4,52.75,0.0,0.0,0.0,-38.13,11.0,0.0
50%,84.0,98.0,37.06,118.5,77.0,58.5,18.0,0.0,24.0,0.5,...,4.1,30.2,10.4,65.28,1.0,1.0,0.0,-2.61,21.0,0.0
75%,96.0,99.5,37.55,134.0,87.33,67.0,22.0,1.0,27.0,0.55,...,4.5,33.5,11.6,75.9,1.0,1.0,1.0,-0.02,35.0,0.0
max,280.0,100.0,42.22,281.0,300.0,298.0,69.0,100.0,55.0,10.0,...,27.5,71.7,32.0,89.0,1.0,1.0,1.0,23.99,336.0,1.0


# Feature Engineering - Building newer features

## 1. Age
Three categories -

Child - Age less than 10 year

Adult - Age more than 10 year and less than 60 years

Senior - Age more than 60


In [8]:
def fe_new_age(data):
    data.loc[data['Age'] >=60, 'new_age'] = 'old'
    data.loc[data['Age'] <10, 'new_age'] = 'infant'
    data.loc[(data['Age'] >=10) & (data['Age'] <60), 
            'new_age'] = 'adult'
    return data

In [9]:
sepsis_data = fe_new_age(sepsis_data)

## 2. Heart Rate

The new feature designed for heart rate takes into account both Age and Heart Rate in a patient. It has three categories - normal, abnormal, missing

The 'normal' HR for a child (Age < 10) is in the range of 70 to 110 beats per minute.

The 'normal' HR for a adult and senior (Age 10+) is in the range of 60 to 100

Any other values recorded is marked as 'abnormal'.

The value 'missing' is filled in place of null/nan values

In [10]:
def fe_new_hr(data):
    data.loc[(data['HR'] >= 70) & (data['HR'] < 110 ) & (data['Age'] < 10), 'new_hr'] = 'normal'
    data.loc[(data['HR'] > 60) & (data['HR'] < 100) & data['Age'] >= 10, 'new_hr'] = 'normal'
    data.loc[((data['HR'] < 70) | (data['Age'] >= 110)) & (data['Age']<10), 'new_hr'] = 'abnormal'
    data.loc[(data['HR'] >= 100) & (data['Age'] >= 10), 'new_hr'] = 'abnormal'
    data['new_hr'].fillna('Missing', inplace=True)
    return data

In [11]:
sepsis_data = fe_new_hr(sepsis_data)

## 3. O2Sat
The blood oxygen level measured with an oximeter is called your oxygen saturation level. This is a percentage of how much oxygen your blood is carrying compared to the maximum it is capable of carrying.

The new feature designed for pulse oximetry takes into three catogories

'Normal' is found to be between 95% - 100% in healthy children and adults alike

'Abnormal' is for anything otherwise

'Missing' is a null or nan case is observed

In [12]:
def fe_new_o2sat(data):
    data.loc[(data['O2Sat'] >= 95) & (data['O2Sat'] < 100), 'new_o2sat'] = 'normal'
    data.loc[(data['O2Sat'] < 95) & (data['O2Sat'] >= 0), 'new_o2sat'] = 'abnormal'
    data['new_o2sat'].fillna('missing', inplace=True)
    return data

In [13]:
sepsis_data = fe_new_o2sat(sepsis_data)

## 4. Temperature

The new feature designed for temperature takes into three categories:

Body temperature for any healthy person (child, adult and senior alike) is 'normal' when found between 36 Deg C to 38 Dec C.

Anything above or below this range is labeled as 'abnormal'

'Missing' is a null or nan case is observed

In [14]:
def fe_new_temp(data):
    data.loc[(data['Temp'] >= 36) & (data['Temp'] < 38),'new_temp'] = 'normal'
    data.loc[(data['Temp'] < 36) | (data['Temp'] >= 38),'new_temp'] = 'abnormal'
    data['new_temp'].fillna('Missing', inplace=True)
    return data

In [15]:
sepsis_data = fe_new_temp(sepsis_data)

## 5. Blood Pressure

We will be combining two forms of Blood Pressure here - Systolic blood pressure (SBP) and Diastolic Blood Pressure(DBP) in the dataset.

SBP - When your heart beats, it squeezes and pushes blood through your arteries to the rest of your body. This force creates pressure on those blood vessels, and that's your systolic blood pressure

DBP - The diastolic reading, or the bottom number, is the pressure in the arteries when the heart rests between beats. This is the time when the heart fills with blood and gets oxygen.

The new feature will compare the two BP and according to the below table categorize into four categories - low, normal, elevated and high, and missing



In [16]:
def fe_new_bp(data):
    data.loc[(data['SBP'] < 90) & (data['DBP'] < 60), 'new_bp'] = 'low'
    data.loc[(data['SBP'].between(90,120, inclusive=True)) & (data['DBP'].between(60,80, inclusive=True)), 'new_bp'] = 'normal'
    data.loc[(data['SBP'].between(120,140, inclusive=True)) & (data['DBP'].between(80,90, inclusive=True)),'new_bp'] = 'elevated'
    data.loc[(data['SBP'] > 140 ) & (data['DBP'] > 90 ), 'new_bp'] = 'high'
    data['new_bp'].fillna('Missing', inplace=True)
    return data

In [17]:
sepsis_data = fe_new_bp(sepsis_data)

## 6. Respiration Rate

The new feature designed will have 3 categories - normal, abnormal and missing. The normal respiratory rate for different age groups are as shown below:

For healthy adults (Age > 18) is between 12 and 20 breaths per minute.
Normal respiratory rates for children in breaths per minute are as follows:

birth to 1 year: 30 to 60

1 to 3 years: 24 to 40

3 to 6 years: 22 to 34

6 to 12 years: 18 to 30

12 to 18 years: 12 to 16

Any other range for respiratory rates are labeled as 'abnormal' and the missing values are labeled as 'missing'

In [18]:
def fe_new_resp(data):
    data.loc[(data['Resp'].between(30, 60)) & (data['Age'] < 1), 'new_resp'] = 'normal'
    data.loc[(data['Resp'].between(24, 40)) & (data['Age'].between(1, 3)), 'new_resp'] = 'normal'
    data.loc[(data['Resp'].between(22, 34)) & (data['Age'].between(3, 6)), 'new_resp'] = 'normal'
    data.loc[(data['Resp'].between(18, 30)) & (data['Age'].between(6, 12)), 'new_resp'] = 'normal'
    data.loc[(data['Resp'].between(12, 16)) & (data['Age'].between(12, 18)), 'new_resp'] = 'normal'  
    data.loc[(data['Resp'].between(12, 20)) & (data['Age'] > 18), 'new_resp'] = 'normal'  
    
    data.loc[((data['Resp'] < 30) | (data['Resp'] > 60)) & (data['Age'] <1) ,'new_resp'] = 'abnormal'   
    data.loc[((data['Resp'] < 24) | (data['Resp'] > 40)) & (data['Age'].between(1, 3)) ,'new_resp'] = 'abnormal'   
    data.loc[((data['Resp'] < 22) | (data['Resp'] > 34)) & (data['Age'].between(3, 6)) ,'new_resp'] = 'abnormal'
    data.loc[((data['Resp'] < 18) | (data['Resp'] > 30)) & (data['Age'].between(6, 12)) ,'new_resp'] = 'abnormal'
    data.loc[((data['Resp'] < 12) | (data['Resp'] > 16)) & (data['Age'].between(12, 18)) ,'new_resp'] = 'abnormal'
    data.loc[((data['Resp'] < 12) | (data['Resp'] > 20)) & (data['Age'] > 18) ,'new_resp'] = 'abnormal'
    
    
    data['new_resp'].fillna('missing', inplace = True)
                                                            
    return data


In [19]:
sepsis_data = fe_new_resp(sepsis_data)

## 7. Mean Arterial Pressure

https://emtprep.com/free-training/post/map-understanding-mean-arterial-pressure

MAP is the measurement that explains the average blood pressure in a person's blood vessels during a single cardiac cycle. Mean arterial pressure is significant because it measures the pressure necessary for adequate perfusion of the organs of the body.

The normal MAP range is between 70 and 100 mmHg.

High MAP can cause stress on the heart because it has to work harder than normal to push against the elevated pressure in the vessels.
When the MAP gets below 60, vital organs in the body do not get the nourishment they need for survival
MAP is directly affected by factors such as:

• Amount of blood pumped out of the heart per minute (cardiac output)

• Heart rate (beats per minute)

• Blood pressure

• Resistance to blood flow in the vessels

A change in any of these factors will alter the mean arterial pressure and cause negative effects on the body

In [20]:
def fe_new_map(data):
    data.loc[(data['MAP'] >= 70) & (data['MAP'] < 100),'new_map'] = 'normal'
    data.loc[(data['MAP'] < 70) | (data['MAP'] >= 100),'new_map'] = 'abnormal'
    data['new_map'].fillna('Missing', inplace=True)
    return data

In [21]:
sepsis_data = fe_new_map(sepsis_data)

## 8. Fraction of inspired oxygen

The percentage of individual gases in air (oxygen, nitrogen, etc.) doesn't change with altitude, but the atmospheric (or barometric) pressure does. FIO2, the fraction of inspired oxygen in the air, is thus 21% (or .21) throughout the breathable atmosphere.

In [22]:
def fe_new_fio2(data):
    data.loc[(data['FiO2'] < 0.8 ) ,'new_fio2'] = 'normal'
    data.loc[(data['FiO2'] >= 0.8 ),'new_fio2'] = 'abnormal'
    data['new_fio2'].fillna('Missing', inplace=True)
    return data

In [23]:
sepsis_data = fe_new_fio2(sepsis_data)

## Feature Selection - Selecting relevant features for prediction

In [24]:
columns_new = ['Gender', 'new_age', 'new_o2sat', 'new_temp', 'new_bp', 'new_resp', 'new_map', 'new_fio2', 'new_hr', 'HospAdmTime', 'ICULOS']

In [25]:
target_col = ['SepsisLabel']

test_cols = columns_new + target_col

In [26]:
all_data_train = sepsis_data[test_cols]

In [27]:
all_data_train.head()

Unnamed: 0,Gender,new_age,new_o2sat,new_temp,new_bp,new_resp,new_map,new_fio2,new_hr,HospAdmTime,ICULOS,SepsisLabel
0,0,old,missing,Missing,Missing,missing,Missing,Missing,Missing,-0.03,1,0
1,0,old,normal,Missing,Missing,normal,normal,Missing,Missing,-0.03,2,0
2,0,old,normal,Missing,Missing,abnormal,normal,Missing,Missing,-0.03,3,0
3,0,old,normal,Missing,Missing,abnormal,Missing,Missing,Missing,-0.03,4,0
4,0,old,abnormal,Missing,Missing,abnormal,normal,normal,abnormal,-0.03,5,0
