# Logistic Regression Baseline
### Predicts MIMIC-III ICU patient mortality given the first 24 hours

In [1]:
from sklearn.linear_model import LogisticRegressionCV, Lasso, LassoCV, LogisticRegression
# from statsmodels.discrete.discrete_model import Logit
from sklearn import cross_validation
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, roc_curve, auc, brier_score_loss
from sklearn.model_selection import StratifiedKFold

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from random import shuffle

# fix random seed for reproducibility
np.random.seed(7)

%matplotlib inline



In [2]:
import sys
sys.path.append('/usr/local/lib/python2.7/dist-packages/')

import dbconfig as cfg
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://{}:{}@{}:3306/mimic'.format(cfg.mysql['user'], cfg.mysql['password'],
                                                                cfg.mysql['host']), echo=False)

### Load data

In [3]:
mimic_df = pd.read_pickle('/home/andrea/data/mimic_nontimeseries_normalized')
# mimic_df = pd.get_dummies(mimic_df, columns=['GENDER', 'ADMISSION_LOCATION', 'ADMISSION_TYPE'])
label_col = mimic_df['HOSPITAL_EXPIRE_FLAG']
del mimic_df['HOSPITAL_EXPIRE_FLAG']
del mimic_df['AGE']
del mimic_df['GENDER']
del mimic_df['ADMISSION_LOCATION']
del mimic_df['ADMISSION_TYPE']
del mimic_df['glasgow_score']
del mimic_df['riker_sas']
del mimic_df['eye_open']
mimic_df['HOSPITAL_EXPIRE_FLAG'] = label_col
mimic_df.head()

Unnamed: 0_level_0,lab_hemoglobin,lab_monocyte,lactate_dh,lab_eosinophil,lab_glucose,lab_ck,lab_basophils,troponin_t,sodium_whole_blood,art_dia,resp_pattern,bp_dia,chart_temp,art_mean,bp_sys,art_sys,cvp,HOSPITAL_EXPIRE_FLAG
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3,0.0769231,0.0641026,-0.000482413,0,0.0322581,0.000836092,0,-0.000334448,0.781145,0.000171725,0,0,0.889908,0,0,0.0260664,0.22,0
4,0.0,0.025641,0.00406605,0,-0.0752688,-1.12479e-05,0,-0.000334448,-0.0213244,0.000171725,0,0,0.899083,0,0,0.0260664,0.0,0
6,0.0692308,0.0128205,-0.000482413,0,-0.0752688,-1.12479e-05,0,-0.000334448,0.736251,0.000171725,0,0,0.889908,0,0,0.0260664,0.24,0
9,0.107692,0.0,-0.000482413,0,-0.0752688,-1.12479e-05,0,-0.000334448,0.753086,0.000171725,0,0,0.899083,0,0,0.0260664,0.0,1
11,0.0,0.0,-0.000482413,0,-0.0752688,-1.12479e-05,0,-0.000334448,-0.0213244,0.000171725,0,0,0.899083,0,0,0.0260664,0.0,0


### Shuffle and split into train/test datasets

In [4]:
features = mimic_df.columns[:-1]

total_input = mimic_df[features].as_matrix()
total_labels = mimic_df['HOSPITAL_EXPIRE_FLAG'].values

def shuffle_split(l, train=0.6, test=0.2, val=0.2):
    shuffle(l)
    if val == 0:
        train = 0.7
    train_index = int(len(l)*train)
    test_index = train_index + int(len(l)*test)
    if val == 0:
        return l[:train_index], l[train_index:]
    return l[:train_index], l[train_index:test_index], l[test_index:]


patient_list = mimic_df.index.values
train, test = shuffle_split(patient_list, val=0)

train_df = mimic_df.ix[train]
test_df = mimic_df.ix[test]

features = train_df.columns[:-1]

X_train = train_df[features]
X_test = test_df[features]

y_train = train_df['HOSPITAL_EXPIRE_FLAG'].values
y_test = test_df['HOSPITAL_EXPIRE_FLAG'].values

print('{} observations in the training data'.format(len(train_df.index.values)))
print('{} observations in the test data'.format(len(test_df.index.values)))

26983 observations in the training data
11565 observations in the test data


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [5]:
# for i in y_test:
#     print(y_test[i])


# l = Logit(np.array(X_train),np.array(y_train))
# r = l.fit()

### Hyperparameter selection

In [9]:
lasso = LassoCV()
lasso.fit(X_train,y_train)
alpha = lasso.alpha_
alpha

0.00012720723659849934

### Train/test model

In [10]:
k = 10
kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=7)
clf = Lasso(alpha=alpha, random_state=7)

In [12]:
pred_vals = []
true_labels = []
run_ids = []

fpr_vals = []
tpr_vals = []
thresholds = []
auc_vals = []

count = 1
for train, test in kfold.split(total_input, total_labels):
    pred = clf.fit(total_input[train], total_labels[train]).predict(total_input[test])
    
    pred_vals.append(pred)
    true_labels.append(total_labels[test])
    run_ids.append(['alpha={}, fold={}'.format(alpha,count)]*len(total_input[test]))
    
    count += 1

In [13]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [15]:
results_df = pd.DataFrame()
results_df['pred_val'] = flatten(pred_vals)
results_df['true_label'] = flatten(true_labels)
results_df['run_ids'] = flatten(run_ids)
results_df.head()

Unnamed: 0,pred_val,true_label,run_ids
0,0.064832,0,"alpha=0.00012720723659849934, fold=1"
1,0.11416,0,"alpha=0.00012720723659849934, fold=1"
2,0.104709,0,"alpha=0.00012720723659849934, fold=1"
3,0.114625,0,"alpha=0.00012720723659849934, fold=1"
4,0.08836,0,"alpha=0.00012720723659849934, fold=1"


### Write results to RDS


In [16]:
results_df.to_sql(name='lasso_cv_results', con=engine, index=False, if_exists='replace')

### Evaluate results

In [17]:
results_df = pd.read_sql_table('lasso_cv_results', con=engine)
results_df.head()

Unnamed: 0,pred_val,true_label,run_ids
0,0.064832,0,"alpha=0.00012720723659849934, fold=1"
1,0.11416,0,"alpha=0.00012720723659849934, fold=1"
2,0.104709,0,"alpha=0.00012720723659849934, fold=1"
3,0.114625,0,"alpha=0.00012720723659849934, fold=1"
4,0.08836,0,"alpha=0.00012720723659849934, fold=1"


In [23]:
from collections import defaultdict
n_folds = set(results_df.run_ids.values)

auc_vals = []
brier_scores = []
errors = []

for fold in n_folds:
    temp_df = results_df.loc[results_df['run_ids']==fold]
#     pred_vals = temp_df.pred_val.values
    pred_vals = temp_df['pred_val'].values
    true_vals = temp_df.true_label.values
    fpr, tpr, threshold = roc_curve(true_vals, pred_vals)
    auc_vals.append(auc(fpr,tpr))
    try:
        brier_scores.append(brier_score_loss(true_vals, pred_vals))
    except:
#         print(pred_vals)
        errors.append(pred_vals)

print('# of vals skipped: {}'.format(len(errors)))
print(np.mean(auc_vals),np.mean(brier_scores))

# of vals skipped: 1
0.59399901883 0.098007481562
