Initially, we'll just look at all single predictors of 30-day readmission.

In [1]:
import pandas as pd
import numpy as np
import sklearn as skl
from sklearn import preprocessing as pp
from sklearn.preprocessing import Imputer
import statsmodels.discrete.discrete_model as sm
import math
import warnings 
warnings.filterwarnings("ignore") # Ignore annoying warnings

dataDir = './Data/'
mungedFileName = dataDir + 'mungedData.pkl'

cdf = pd.read_pickle(mungedFileName)

There are a few different variables in this dataset that include readmission data. READMISSION is depricated, but still has data for older records. READMISSION1 is the current variable. It makes sense to try to merge these to be our regression target. No records have data for both. Of the 19533 records, 4537 don't have readmission data recorded either way. To start we'll just assume that these patients were not readmitted.

In [2]:
# y is True if any readmission variable is hot. It's NaN if all variables are null.
y = cdf[['READMISSION1-Yes','READMISSION-Yes']].any(1)
y[cdf[['READMISSION1-Yes','READMISSION-Yes']].isnull().all(1)] = np.nan
nanIdx = np.nonzero(cdf[['READMISSION1-Yes','READMISSION-Yes']].isnull().all(1))

# Would like to drop rows with NaN y data

# For now we'll impute those missing values...
imp = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=False)
imp.fit(y.reshape(-1,1))
y = imp.transform(y.reshape(-1,1))

In [3]:
np.unique(y)

array([ 0.        ,  0.02894105,  1.        ])

Now we need to make a predictor array to use for logistic regression, dropping our source data columns in the process. 

In [4]:
cdf.drop(['READMISSION-Yes','READMISSION1-Yes'],1)

Unnamed: 0,ADMQTR-1,ADMQTR-2,ADMQTR-3,ADMQTR-4,ADMSYR-2007.0,ADMSYR-2008.0,ADMSYR-2009.0,ADMYR-2007,ADMYR-2008,ADMYR-2009,...,WOUND_CLOSURE-No layers of incision are surgically closed,WOUND_CLOSURE-Only deep layers closed; superficial left open,YRDEATH-2008.0,YRDEATH-2009.0,YRDEATH-2010.0,YRDEATH-2011.0,YRDEATH-2012.0,YRDEATH-2013.0,YRDEATH-2014.0,YRDEATH-2015.0
0,0,0,0,1,0.0,1.0,0.0,0,1,0,...,,,,,,,,,,
1,0,0,1,0,0.0,1.0,0.0,0,1,0,...,,,,,,,,,,
2,0,0,1,0,0.0,1.0,0.0,0,1,0,...,,,,,,,,,,
3,0,0,1,0,0.0,1.0,0.0,0,1,0,...,,,,,,,,,,
4,0,0,0,1,0.0,1.0,0.0,0,1,0,...,,,,,,,,,,
5,0,0,1,0,0.0,1.0,0.0,0,1,0,...,,,,,,,,,,
6,0,0,1,0,0.0,1.0,0.0,0,1,0,...,,,,,,,,,,
7,0,0,1,0,0.0,1.0,0.0,0,1,0,...,,,,,,,,,,
8,0,0,1,0,0.0,1.0,0.0,0,1,0,...,,,,,,,,,,
9,0,0,1,0,0.0,1.0,0.0,0,1,0,...,,,,,,,,,,


To start, let's just cycle through each variable independently, printing out variables that have significant (before multiple comparisons corrections) predictive value. We also need to impute missing data (for the numeric columns, categoricals should be fine).

In [5]:
fitResults = pd.DataFrame(np.nan, index=[], columns=['colName','pValue','coeff'])

for colName in cdf.columns:

    # colName = 'AGE'
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=False)
    imp.fit(cdf[colName].reshape(-1,1))
    X = imp.transform(cdf[colName].reshape(-1,1))
    X = np.asarray(X).reshape(-1,1)
    X = np.concatenate((X, np.ones([X.shape[0],1])),axis=1) # Add an intercept term

    try:
        logit = sm.Logit(y, X)
        smresult = logit.fit(disp=False)
        # print(smresult.summary())
        # if smresult.pvalues[0] < 0.05:
        #    print('Var: ' + colName + '  logP = %.2f    Coeff = %.2f' % (math.log10(smresult.pvalues[0]), \
        #                                                                   smresult.params[0]))
        fitResults = fitResults.append(pd.Series({'colName': colName, \
                                                  'pValue': smresult.pvalues[0], \
                                                  'coeff': smresult.params[0]}),ignore_index=True)
        # else:
        #    print('Var: ' + colName)
            
    except:
        print('*** ' + colName + ': Exception ***')


*** COMA-Yes: Exception ***
*** CONCPT10-11045.0: Exception ***
*** CONCPT5-11045.0: Exception ***
*** CONCPT5-27687.0: Exception ***
*** CONCPT6-11045.0: Exception ***
*** CONCPT7-11045.0: Exception ***
*** CONCPT8-11045.0: Exception ***
*** CONCPT9-11045.0: Exception ***
*** CONCURR5-DBRDMT SUBCUTANEOUS TISSUE EA ADDL 20 SQ CM: Exception ***
*** CONCURR5-GASTROCNEMIUS RECESSION: Exception ***
*** CONCURR6-DBRDMT SUBCUTANEOUS TISSUE EA ADDL 20 SQ CM: Exception ***
*** CONCURR7-DBRDMT SUBCUTANEOUS TISSUE EA ADDL 20 SQ CM: Exception ***
*** CONCURR8-DBRDMT SUBCUTANEOUS TISSUE EA ADDL 20 SQ CM: Exception ***
*** CONCURR9-DBRDMT SUBCUTANEOUS TISSUE EA ADDL 20 SQ CM: Exception ***
*** CONWRVU4-7.1: Exception ***
*** CONWRVU5-6.41: Exception ***
*** DCNSCOMA-4.0: Exception ***
*** PODIAG_OTHER10-T88.6XXA: Exception ***
*** READMRELICD101-S89.399A: Exception ***
*** READMUNRELSUSP2-Other (list ICD9 code): Exception ***
*** REOPOR1ICD101-T81.4XXA: Exception ***
*** TYPEINTOC-Unplanned Intubat

In [6]:
fitResults = fitResults.sort_values(by='pValue')
for index, row in fitResults.iterrows():
    print('\t %d \t %.4f \t %.2f \t %s' % (index, row['pValue'],row['coeff'], row['colName']))  

	 456 	 0.0000 	 3.92 	 REOPERATION1-Yes
	 258 	 0.0000 	 3.54 	 NWNDINFD-0
	 533 	 0.0000 	 -3.54 	 WNDINFD-Deep Incisional SSI
	 524 	 0.0000 	 11.55 	 UNPLANREADMISSION-Yes
	 126 	 0.0000 	 3.30 	 DEHIS-No Complication
	 234 	 0.0000 	 3.30 	 NDEHIS-0
	 225 	 0.0000 	 13.61 	 MORBPROB
	 455 	 0.0000 	 4.15 	 REOPERATION-Yes
	 507 	 0.0000 	 2.28 	 SUPINFEC-No Complication
	 254 	 0.0000 	 2.28 	 NSUPINFEC-0
	 493 	 0.0000 	 0.34 	 RETORPODAYS
	 247 	 0.0000 	 3.07 	 NOTHSYSEP-0
	 364 	 0.0000 	 3.07 	 OTHSYSEP-No Complication
	 249 	 0.0000 	 3.36 	 NPULEMBOL-0
	 391 	 0.0000 	 3.36 	 PULEMBOL-No Complication
	 135 	 0.0000 	 -1.22 	 DISCHDEST-Home
	 256 	 0.0000 	 2.36 	 NURNINFEC-1
	 525 	 0.0000 	 2.35 	 URNINFEC-No Complication
	 255 	 0.0000 	 -2.35 	 NURNINFEC-0
	 129 	 0.0000 	 -1.05 	 DIABETES-NO
	 376 	 0.0000 	 -0.10 	 PRHCT
	 128 	 0.0000 	 1.27 	 DIABETES-INSULIN
	 241 	 0.0000 	 3.37 	 NORGSPCSSI-0
	 268 	 0.0000 	 3.37 	 ORGSPCSSI-No Complication
	 370 	 0.0000 	 -1.04

In [7]:
cdf.shape

(19533, 569)

In [8]:
fitResults

Unnamed: 0,colName,pValue,coeff
456,REOPERATION1-Yes,1.919156e-152,3.919839e+00
258,NWNDINFD-0,1.364264e-57,3.542455e+00
533,WNDINFD-Deep Incisional SSI,1.364264e-57,-3.542455e+00
524,UNPLANREADMISSION-Yes,1.607998e-57,1.155216e+01
126,DEHIS-No Complication,4.141054e-44,3.303006e+00
234,NDEHIS-0,4.141054e-44,3.303006e+00
225,MORBPROB,4.890672e-39,1.360513e+01
455,REOPERATION-Yes,1.341180e-37,4.146434e+00
507,SUPINFEC-No Complication,3.281372e-37,2.280516e+00
254,NSUPINFEC-0,3.281372e-37,2.280516e+00
