In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
import pandas as pd 
import numpy as np 
import seaborn 
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
import os
from collections import Counter

In [2]:
box_file_dir = os.path.join(os.getcwd(), "..", "..", "Box")
file_path_csv = os.path.join(box_file_dir, "covid_pts_enc_level_labs_dx_2021-02-02_deid.csv")
df = pd.read_csv(file_path_csv, index_col=False)

In [3]:
df.head()

Unnamed: 0,admit_dt_tm,discharge_dt_tm,facility,encounter_type_display,age_at_admit,gender,zip_cust_table,ethnicity_display,race_display,race_eth_comb,...,diabetes_com_flag,hypertension_com_flag,CAD_com_flag,heartfailure_com_flag,CKD_com_flag,cancer_com_flag,qSOFA_score,deid_empi_encounter,deid_mrn_encounter,deid_fin_encounter
0,2020-08-25 20:51:00,2020-09-01 14:30:00,Shoal Creek,IP Private/Semi Private room,49,Male,Unknown,Hispanic or Latino,White,White Hispanic,...,False,False,False,False,False,False,,4dMaH,61003,10650
1,2020-03-24 00:07:00,2020-03-26 18:30:00,Medical Center,IP ICU,44,Male,78130,Hispanic or Latino,White,White Hispanic,...,False,False,False,False,False,False,3.0,8wQnr,84969,27818
2,2020-03-25 13:25:00,2020-03-30 03:48:00,Medical Center,IP Private/Semi Private room,80,Female,78758,Not Hispanic or Latino,Black or African American,Other race Non Hispanic,...,True,False,True,True,True,False,1.0,6YGuu,69112,34412
3,2020-03-25 13:36:00,2020-03-27 13:15:00,Medical Center,IP Private/Semi Private room,56,Male,78746,Not Hispanic or Latino,White,White Non Hispanic,...,False,False,False,False,False,False,1.0,P1Tjg,84820,37981
4,2020-03-25 13:43:00,2020-03-27 12:44:00,Medical Center,IP Private/Semi Private room,46,Female,78757,Not Hispanic or Latino,White,White Non Hispanic,...,False,False,False,False,False,False,0.0,iouKO,76686,41152


In [4]:
icu_df = df[~df['icu_hours_summed'].isnull()]

In [5]:
icu_df = icu_df[~icu_df['qSOFA_score'].isnull()]

In [6]:
new_df = df[~df['qSOFA_score'].isnull()]

In [7]:
Counter(new_df['qSOFA_score'])

Counter({3.0: 16, 1.0: 731, 0.0: 709, 2.0: 146})

In [8]:
Counter(new_df['discharge_disposition_display'])

Counter({'Expired': 175,
         'Home': 952,
         'Skilled Nursing Care': 139,
         'Long Term Care Hospital': 57,
         'Home Care with Home Health': 82,
         'Rehab Care': 30,
         'Another Hospital': 20,
         'Home Hospice Care': 12,
         'Hospice Facility': 8,
         'Against Medical Advice': 16,
         'Still a patient': 62,
         nan: 23,
         'Psychiatric Hospital': 16,
         'Intermediate Care': 5,
         'Court/Law Enforcement': 5})

In [9]:
new_df['mortality'] = new_df.discharge_disposition_display.apply(lambda x: 1 if x == 'Expired' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [10]:
icu_df['mortality'] = icu_df.discharge_disposition_display.apply(lambda x: 1 if x == 'Expired' else 0)

In [11]:
Counter(new_df['mortality'])

Counter({1: 175, 0: 1427})

In [12]:
Counter(icu_df['mortality'])

Counter({1: 161, 0: 555})

In [13]:
new_df = new_df.sample(frac=1)

### Modeling On ICU Patients

In [14]:
data = icu_df[['qSOFA_score', 'mortality']]
data2 = icu_df[['age_at_admit', 'mortality']]
data3 = icu_df[['qSOFA_score','age_at_admit', 'mortality']]

#### qSOFA only

In [15]:
train, test = train_test_split(data, train_size = 0.9)

In [16]:
X, y = train['qSOFA_score'], train['mortality']

In [17]:
model = LogisticRegressionCV()

In [18]:
model.fit(X.values.reshape(-1,1),y)



LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='warn', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [19]:
# train accuracy 
model.score(X.values.reshape(-1,1),y)

0.7748447204968945

In [20]:
X_test, y_test = test['qSOFA_score'], test['mortality']

In [21]:
preds = model.predict(X_test.values.reshape(-1,1))

In [22]:
accuracy_score(preds, y_test)

0.7777777777777778

In [23]:
# auc is awful 
roc_auc_score(y_test, preds)

0.5

In [24]:
Counter(y_test)

Counter({1: 16, 0: 56})

In [25]:
train_preds = model.predict(X.values.reshape(-1,1))
cm = confusion_matrix(train_preds, y)

In [26]:
cm


array([[499, 145],
       [  0,   0]])

In [27]:
# qSOFA is not good with our dataset..........

#### Age

In [28]:
train, test = train_test_split(data2, train_size = 0.9)
X, y = train['age_at_admit'], train['mortality']
model = LogisticRegressionCV()
model.fit(X.values.reshape(-1,1),y)




LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='warn', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [29]:
model.score(X.values.reshape(-1,1),y)

0.7748447204968945

In [30]:
X_test, y_test = test['age_at_admit'], test['mortality']
preds = model.predict(X_test.values.reshape(-1,1))
accuracy_score(preds, y_test)

0.7638888888888888

In [31]:
roc_auc_score(y_test, preds)

0.5277777777777778

In [32]:
Counter(y_test)

Counter({0: 54, 1: 18})

In [33]:
train_preds = model.predict(X.values.reshape(-1,1))
cm = confusion_matrix(train_preds, y)

In [34]:
cm

array([[487, 131],
       [ 14,  12]])

#### Age and qSOFA

In [35]:
train, test = train_test_split(data3, train_size = 0.9)
X, y = train[['qSOFA_score','age_at_admit']], train['mortality']
model = LogisticRegressionCV()
model.fit(X,y)



LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='warn', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [36]:
model.score(X,y)

X_test, y_test = test[['qSOFA_score','age_at_admit']], test['mortality']
preds = model.predict(X_test)
accuracy_score(preds, y_test)

0.75

In [37]:
roc_auc_score(y_test, preds)

0.5431976166832174

In [38]:
Counter(y_test)

Counter({0: 53, 1: 19})

In [40]:
train_preds = model.predict(X.values)
cm = confusion_matrix(train_preds, y)

cm

array([[489, 131],
       [ 13,  11]])

In [41]:
# auc is low on all. this is probably because the calculated qSOFA is recorded when a patient was admitted to the 
# hospital 