In [1]:
import pandas as pd
import numpy as np
import json
import os

from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier as RF

from baseline_prism_iii import prism_iii

from mimic3benchmark.readers import InHospitalMortalityReader
from mimic3models import common_utils
from mimic3models.in_hospital_mortality.utils import save_results
from mimic3models.metrics import print_metrics_binary

In [2]:
# Load the pre-split train / validation / test tables.
train_data = pd.read_csv('mimiciii_data/train_data.csv')
val_data = pd.read_csv('mimiciii_data/val_data.csv')
test_data = pd.read_csv('mimiciii_data/test_data.csv')

# Columns that must be numeric. `errors='coerce'` turns every value that cannot
# be parsed as a number into NaN instead of raising.
columns = [
    'PID', 'age',
    'SBPmin', 'SBPmax', 'Tempmin', 'Tempmax', 'Respmin', 'Respmax',
    'ABEmin', 'ABEmax', 'Lacmin', 'Lacmax',
    'pCO2', 'pO2', 'K', 'HCO3', 'sO2', 'PC', 'Glu', 'SBC',
    'M_label',
]
for frame in (train_data, val_data, test_data):
    for col in columns:
        frame[col] = pd.to_numeric(frame[col], errors='coerce')

In [3]:
# Column-wise (axis=0) mean imputation; the column means are learned from the
# training split only.
# NOTE(review): the fit sees every column of `train_data`, not only the model
# features -- confirm that this is intended.
# NOTE(review): `Imputer` is deprecated in newer scikit-learn releases in
# favour of `sklearn.impute.SimpleImputer` -- confirm against the pinned version.
imputer = Imputer(
    missing_values=np.nan,
    strategy='mean',
    axis=0,
    verbose=0,
    copy=True,
)
imputer.fit(train_data)



Imputer(axis=0, copy=True, missing_values=nan, strategy='mean', verbose=0)

In [4]:
# Replace NaNs with the column means learned by `imputer`. Each split is
# rebound from its previous value to a float32 NumPy array.
train_data, val_data, test_data = [
    np.array(imputer.transform(split), dtype=np.float32)
    for split in (train_data, val_data, test_data)
]

In [5]:
# Positional layout of the imputed arrays as consumed below.
# NOTE(review): presumably column 0 is an index column stored in the CSV and
# column 1 is PID -- verify against the CSV header.
name_col = 1                  # identifier handed to save_results
feature_cols = slice(2, 21)   # 19 model input columns
label_col = 21                # prediction target

train_x, train_y = train_data[:, feature_cols], train_data[:, label_col]
val_x, val_y = val_data[:, feature_cols], val_data[:, label_col]
test_x, test_y = test_data[:, feature_cols], test_data[:, label_col]
test_names = test_data[:, name_col]

# Logistic regression (LR)

In [6]:
# L1-regularised logistic regression baseline.
penalty = 'l1'
C = 1.0
# The solver is named explicitly: the L1 penalty is only supported by some
# solvers, and the library default is not stable across scikit-learn releases
# (the recorded run relied on the implicit default, shown as solver='warn').
# 'liblinear' is what that default resolved to, so the fitted model is the same.
logreg = LR(penalty=penalty, C=C, solver='liblinear', random_state=42)
logreg.fit(train_x, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [7]:
# Metric files for the logistic-regression model are written to
# results/LRresults; make sure that directory exists.
output_dir = 'results'
result_dir = os.path.join(output_dir, 'LRresults')
common_utils.create_directory(result_dir)

In [8]:
# Train and validation splits: open the JSON file, let `print_metrics_binary`
# print its report and return the metrics, then dump them into the file.
for json_name, split_x, split_y in (('train.json', train_x, train_y),
                                    ('val.json', val_x, val_y)):
    with open(os.path.join(result_dir, json_name), 'w') as res_file:
        ret = print_metrics_binary(split_y, logreg.predict_proba(split_x))
        # Convert every metric to a built-in float before serialising.
        ret = {name: float(value) for name, value in ret.items()}
        json.dump(ret, res_file)

# Test split: keep column 1 of predict_proba (the second class in `classes_`),
# because the same vector is also written to the predictions CSV below.
prediction = logreg.predict_proba(test_x)[:, 1]

with open(os.path.join(result_dir, 'test.json'), 'w') as res_file:
    ret = print_metrics_binary(test_y, prediction)
    ret = {name: float(value) for name, value in ret.items()}
    json.dump(ret, res_file)

# Per-sample test predictions next to their identifiers and true labels.
save_results(test_names, prediction, test_y,
             os.path.join(output_dir, 'predictions/LRtest.csv'))


confusion matrix:
[[12621    73]
 [ 1814   173]]
accuracy = 0.8714665174484253
precision class 0 = 0.8743332028388977
precision class 1 = 0.7032520174980164
recall class 0 = 0.9942492246627808
recall class 1 = 0.0870659276843071
AUC of ROC = 0.7277958613768762
AUC of PRC = 0.36658896429187604
min(+P, Se) = 0.37544036235530953
confusion matrix:
[[2780    6]
 [ 421   15]]
accuracy = 0.8674736022949219
precision class 0 = 0.8684785962104797
precision class 1 = 0.7142857313156128
recall class 0 = 0.9978463649749756
recall class 1 = 0.03440367057919502
AUC of ROC = 0.6850907552177664
AUC of PRC = 0.30120704766947065
min(+P, Se) = 0.30045871559633025
confusion matrix:
[[2843   19]
 [ 329   45]]
accuracy = 0.8924598097801208
precision class 0 = 0.8962799310684204
precision class 1 = 0.703125
recall class 0 = 0.9933612942695618
recall class 1 = 0.12032085657119751
AUC of ROC = 0.7528045904849456
AUC of PRC = 0.38022362263987425
min(+P, Se) = 0.376


# Random forest (RF)

In [9]:
# Random-forest baseline: 100 trees, fixed seed for repeatable fits.
# In the recorded run the training metrics are near-perfect (AUC of ROC = 1.0)
# while the other splits score far lower, so the training report of this model
# says little about how well it generalises.
rf = RF(
    n_estimators=100,
    random_state=0,
)
rf.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [10]:
# Metric files for the random-forest model are written to results/RFresults;
# make sure that directory exists.
output_dir = 'results'
result_dir = os.path.join(output_dir, 'RFresults')
common_utils.create_directory(result_dir)

In [11]:
# Train and validation splits: open the JSON file, let `print_metrics_binary`
# print its report and return the metrics, then dump them into the file.
for json_name, split_x, split_y in (('train.json', train_x, train_y),
                                    ('val.json', val_x, val_y)):
    with open(os.path.join(result_dir, json_name), 'w') as res_file:
        ret = print_metrics_binary(split_y, rf.predict_proba(split_x))
        # Convert every metric to a built-in float before serialising.
        ret = {name: float(value) for name, value in ret.items()}
        json.dump(ret, res_file)

# Test split: keep column 1 of predict_proba (the second class in `classes_`),
# because the same vector is also written to the predictions CSV below.
prediction = rf.predict_proba(test_x)[:, 1]

with open(os.path.join(result_dir, 'test.json'), 'w') as res_file:
    ret = print_metrics_binary(test_y, prediction)
    ret = {name: float(value) for name, value in ret.items()}
    json.dump(ret, res_file)

# Per-sample test predictions next to their identifiers and true labels.
save_results(test_names, prediction, test_y,
             os.path.join(output_dir, 'predictions/RFtest.csv'))



confusion matrix:
[[12694     0]
 [    2  1985]]
accuracy = 0.9998637437820435
precision class 0 = 0.9998424649238586
precision class 1 = 1.0
recall class 0 = 1.0
recall class 1 = 0.998993456363678
AUC of ROC = 1.0
AUC of PRC = 1.0
min(+P, Se) = 1.0
confusion matrix:
[[2779    7]
 [ 408   28]]
accuracy = 0.8711979985237122
precision class 0 = 0.8719798922538757
precision class 1 = 0.800000011920929
recall class 0 = 0.9974874258041382
recall class 1 = 0.06422018259763718
AUC of ROC = 0.7281249794187188
AUC of PRC = 0.3700375986987712
min(+P, Se) = 0.3632286995515695
confusion matrix:
[[2846   16]
 [ 317   57]]
accuracy = 0.8970952033996582
precision class 0 = 0.8997786641120911
precision class 1 = 0.7808219194412231
recall class 0 = 0.9944095015525818
recall class 1 = 0.15240642428398132
AUC of ROC = 0.7856992044006472
AUC of PRC = 0.4173718699760758
min(+P, Se) = 0.4005235602094241


# PRISM III baseline (prism_iii)

In [12]:
# Baseline scores come straight from `prism_iii`; nothing is fitted in this
# cell and only the test split is evaluated.
# NOTE(review): assumes `prism_iii` returns one score per row of `test_x` in a
# form `print_metrics_binary` accepts -- confirm against baseline_prism_iii.
prediction = prism_iii(test_x)

output_dir = 'results'
result_dir = os.path.join(output_dir, 'Prismiiiresults')
common_utils.create_directory(result_dir)

with open(os.path.join(result_dir, 'test.json'), 'w') as res_file:
    ret = print_metrics_binary(test_y, prediction)
    # Convert every metric to a built-in float before serialising.
    ret = {name: float(value) for name, value in ret.items()}
    json.dump(ret, res_file)

# Per-sample test predictions next to their identifiers and true labels.
save_results(test_names, prediction, test_y,
             os.path.join(output_dir, 'predictions/Prismiiitest.csv'))