## Background
The objective of this project is to build a model that predicts a patient's risk of death and their length of stay in an ICU from their biological information.

In [10]:
from __future__ import print_function
import numpy as np
import pandas as pd
import os
import glob
import math

from sklearn import svm
from sklearn.decomposition import PCA

In [15]:
# import data
# Parameters handled as per-patient descriptors rather than time series: the loader
# pivots them as-is instead of summarising them, and the normalizer leaves them unscaled.
static_variables = ['RecordID', 'Age', 'Gender', 'Height', 'ICUType']
def _summarise_record(df_record, static_params):
    """Collapse one long-format patient record into a single wide row.

    Rows whose ``Parameter`` is in ``static_params`` are pivoted as-is; every
    other parameter is reduced to ``<name>_min``, ``<name>_max`` and
    ``<name>_mean`` columns.
    """
    is_static = df_record['Parameter'].isin(static_params)
    df_static = df_record[is_static].drop('Time', axis=1)
    df_temporal = df_record[~is_static]

    # Column order matches the original layout: static, then all mins, maxes, means.
    pieces = [pd.pivot_table(df_static, values="Value", columns="Parameter")]
    for stat in ('min', 'max', 'mean'):
        df_stat = df_temporal.groupby('Parameter').aggregate({'Value': stat})
        df_stat = df_stat.reset_index(level=0)
        df_stat['Parameter'] = df_stat['Parameter'] + '_' + stat
        pieces.append(pd.pivot_table(df_stat, values="Value", columns="Parameter"))

    return pd.concat(pieces, axis=1, sort=False)


def get_data_from_files(path, static_params=None):
    """Build one summary row per ``*.txt`` record found directly under ``path``.

    Each file is read with ``pd.read_csv`` and must provide ``Time``,
    ``Parameter`` and ``Value`` columns.

    Parameters
    ----------
    path : str
        Directory containing the record files.
    static_params : list of str, optional
        Parameter names to keep as-is instead of summarising. Defaults to the
        module-level ``static_variables``.

    Returns
    -------
    pandas.DataFrame
        One row per file, in sorted filename order; empty if no file matched.
        Parameters missing from a given file are left as NaN in its row.
    """
    if static_params is None:
        static_params = static_variables

    # Collect the rows and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole frame on every iteration.
    rows = []
    # glob returns files in arbitrary order; sorting makes the row order reproducible.
    for f in sorted(glob.glob(os.path.join(path, "*.txt"))):
        rows.append(_summarise_record(pd.read_csv(f), static_params))
        if len(rows) % 500 == 0:
            print("Imported {}".format(len(rows)))

    if not rows:
        # pd.concat raises on an empty list; keep returning an empty frame instead.
        return pd.DataFrame()
    return pd.concat(rows, sort=False)

def get_results(path="results.txt"):
    """Load the per-patient outcomes table from a CSV file.

    Parameters
    ----------
    path : str, optional
        Location of the outcomes file; defaults to ``results.txt`` in the
        current working directory.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv``.
    """
    return pd.read_csv(path)

In [16]:
# Assemble one summary row per patient file found under ./data.
df_all = get_data_from_files("./data")

Imported 500
Imported 1000
Imported 1500
Imported 2000
Imported 2500
Imported 3000
Imported 3500
Imported 4000


In [19]:
# Sanity-check the size of the assembled table as (rows, columns).
print(df_all.shape)
# NOTE(review): the bare string below is an expression, not a comment, so the
# notebook echoes it as this cell's output.
""" May need to replace negative values in height and weight with -1 """

(4000, 116)


' May need to replace negative values in height and weight with -1 '

In [20]:
# Show the outcomes table loaded from results.txt.
print(get_results())

      RecordID  SAPS-I  SOFA  Length_of_stay  Survival  In-hospital_death
0       132539       6     1               5        -1                  0
1       132540      16     8               8        -1                  0
2       132541      21    11              19        -1                  0
3       132543       7     1               9       575                  0
4       132545      17     2               4       918                  0
5       132547      14    11               6      1637                  0
6       132548      14     4               9        -1                  0
7       132551      19     8               6         5                  1
8       132554      11     0              17        38                  0
9       132555      14     6               8        -1                  0
10      132556      15     2              13        -1                  0
11      132567      13     7               7        -1                  0
12      132568       7     2          

In [21]:
### normalizer
def norm(df, exclude=None):
    """Return a standardised copy of ``df``.

    Every column not listed in ``exclude`` is shifted to zero mean and scaled
    to unit population standard deviation (ddof=0); missing values are ignored
    when computing both statistics. A column with zero variance is only
    centred, never divided.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to normalise. It is not modified.
    exclude : collection of str, optional
        Column names to leave untouched. Defaults to the module-level
        ``static_variables``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    if exclude is None:
        exclude = static_variables

    # Work on a copy so the caller's frame keeps its raw values and the calling
    # cell can be re-run safely.
    out = df.copy()
    for col in out.columns:
        if col in exclude:
            continue
        centred = out[col] - out[col].mean()
        std = centred.std(ddof=0)
        out[col] = centred / std if std != 0 else centred
    return out


### k-fold split function - contiguous folds, no shuffling
def kfold_split(df, k=4):
    """Split ``df`` row-wise into ``k`` consecutive chunks.

    Rows keep their existing order. When ``k`` does not divide the number of
    rows, the first ``n % k`` chunks receive one extra row so that no row is
    dropped; when it does divide evenly, all chunks have equal size.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Object to split; must support ``.shape`` and ``.iloc``.
    k : int, optional
        Number of chunks, at least 1.

    Returns
    -------
    list
        ``k`` slices of ``df`` that together cover every row exactly once.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {}".format(k))

    base, extra = divmod(df.shape[0], k)
    chunks = []
    start = 0
    for i in range(k):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(df.iloc[start:stop])
        start = stop
    return chunks



In [22]:
# Standardise the non-static feature columns.
df_norm = norm(df_all)
# Drop sparse columns: keep only those with at least 2800 non-missing values
# (70% of the 4000 rows reported above).
df_norm.dropna(thresh=2800, inplace=True, axis=1)
# print(df_norm.shape)


In [23]:
# astype returns a new Series, so the cast must be assigned back to take effect.
df_norm['RecordID'] = df_norm['RecordID'].astype(int)
# Use the integer patient id as the row index.
df_norm.set_index('RecordID', inplace=True)

In [24]:
# Confirm the shape after dropping sparse columns and moving RecordID into the index.
print(df_norm.shape)

(4000, 67)


## Binary Classification - Patient Mortality
We first feed the predictors to a support vector classifier to predict whether a patient dies.

In [25]:
# Features: one row per patient, ordered by RecordID.
X = df_norm.sort_index()

# Labels: 1 when a Survival value is recorded (anything other than -1), else 0.
# Ordered by RecordID as well; this assumes both tables cover the same patients,
# since later cells pair X and y by position -- TODO confirm.
y = get_results()
y = y.set_index('RecordID')
y = y['Survival'].ne(-1).astype(int).sort_index()

In [26]:
# perform split
k = 4
X_folds = kfold_split(X, k)
y_folds = kfold_split(y, k)
VALIDATION_FOLD = 3

# Hold out one fold for validation. nan_to_num turns the frame into a plain
# array and replaces the remaining missing values with 0.
X_test = np.nan_to_num(X_folds[VALIDATION_FOLD])
y_test = y_folds[VALIDATION_FOLD]

# Stack the other folds with a single concat each: DataFrame.append was removed
# in pandas 2.0, and one concat keeps X_train's column order identical to X_test's.
train_folds = [i for i in range(k) if i != VALIDATION_FOLD]
X_train = np.nan_to_num(pd.concat([X_folds[i] for i in train_folds], axis=0))

# Labels stay a single-column frame named "survives", as before.
# NOTE(review): the label is 1 when Survival != -1, so the name "survives" may
# read as the opposite of what is encoded -- confirm before relying on it.
y_train = pd.concat([y_folds[i] for i in train_folds], axis=0).astype(int).to_frame("survives")
print(X_train.shape)
print(y_train.shape)

(3000, 67)
(3000, 1)


In [27]:
# Linear-kernel support vector classifier with C=1.0.
clf = svm.SVC(kernel='linear', C=1.0)
# y_train is a single-column table; flatten it to 1-D so scikit-learn does not
# have to convert it and emit a column-vector warning.
clf.fit(X_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [28]:
# Mean accuracy of the fitted classifier on the held-out validation fold.
clf.score(X_test, y_test)

0.734

Cross Validation Results:

**Validation Fold**: Accuracy

0: 0.713

1: 0.742

2: 0.718

3: 0.734