In [1]:
import pandas as pd
import numpy as np
import statsmodels.discrete.discrete_model as sm
import statsmodels.stats.multitest as mt
from sklearn.preprocessing import Imputer
import warnings 
# Silence every warning for the rest of the session. Note that this also hides
# any warnings the model-fitting code further down might emit.
warnings.filterwarnings("ignore")

# Where the pre-processed ("munged") data set lives.
dataDir = './Data/'
mungedFileName = ''.join((dataDir, 'mungedData.pkl'))

# Load the munged frame. Unpickling executes code embedded in the file, so this
# must only ever be pointed at a pickle produced by a trusted source.
cdf = pd.read_pickle(mungedFileName)

In [2]:
# Outcome vector: y is 1.0 when any readmission indicator is set, 0.0 otherwise.
# Rows in which every readmission indicator is null have no usable outcome, so
# they are removed from both y and cdf. A positional boolean mask is used so the
# row selection does not depend on index labels being unique.
readmitCols = ['READMISSION1-Yes','READMISSION-Yes']
readmitFlags = cdf[readmitCols]
hasOutcome = readmitFlags.notnull().any(axis=1)
y = readmitFlags[hasOutcome].any(axis=1).astype(float).values
cdf = cdf[hasOutcome]

# Either drop or keep columns from cdf
dropOrKeep = True
if dropOrKeep:
    # Any column whose name contains one of these substrings is removed.
    # NOTE(review): names such as 'READMISSION2-Yes' and 'READMRELATED1-Yes' do
    # not match any entry here and show up in the selection output below --
    # confirm they are legitimate predictors and not outcome-derived.
    dropList = ['READMISSION-','READMISSION1-','REOPERATION-',\
                'REOPERATION1-','NWNDINFD-','WNDINFD-','DEHIS-','NDEHIS-',\
                'MORBPROB','NSUPINFEC-','SUPINFEC-','RETORPODAYS','OTHSYSEP',\
                'NOTHSYSEP-']
    colsToDrop = [colName for colName in cdf.columns
                  if any(dropItem in colName for dropItem in dropList)]
    cdf = cdf.drop(colsToDrop, axis=1)
    print('Dropped some variables: ')
    print(colsToDrop)
    # These are being dropped automatically by Imputer.
    # The positions refer to the column order *after* the name-based drop above,
    # so this list is only valid for the exact layout of the current pickle.
    colsToDrop = [ 4,5,6,16,17,18,137,191,192,193,194,195,212,213,214,236,237,238,\
                  239,284,285,316,317,350,351,411,412,413,414,505,506,507,514,517]
    cdf = cdf.drop(cdf.columns[colsToDrop], axis=1)

else:
    # Dropping these columns is super-cumbersome. Let's find a list to keep
    keepList = ['DISCHDEST-','URNINFEC-','DIABETES-','PRHCT', 'PRALBUM']
    colsToKeep = [colName for colName in cdf.columns
                  if any(keepItem in colName for keepItem in keepList)]
    cdf = cdf[colsToKeep]

# Impute missing data in cdf with the per-column mean.
colNames = cdf.columns
imp = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=True)
imp.fit(cdf)
imputedValues = imp.transform(cdf)
# Imputer discards columns that were entirely missing at fit time; those are the
# ones whose fitted statistic is NaN. Keep only the surviving names so the
# labels always line up with the transformed array, even if the positional
# list above is out of date.
keptColNames = colNames[~np.isnan(imp.statistics_)]
cdf = pd.DataFrame(imputedValues, columns=keptColNames)

Dropped some variables: 
['DEHIS-No Complication', 'DOTHSYSEP', 'MORBPROB', 'NDEHIS-0', 'NOTHSYSEP-0', 'NSUPINFEC-0', 'NWNDINFD-0', 'OTHSYSEP-No Complication', 'READMISSION-Yes', 'READMISSION1-Yes', 'REOPERATION-Yes', 'REOPERATION1-Yes', 'RETORPODAYS', 'SUPINFEC-No Complication', 'UNPLANNEDREADMISSION1-Yes', 'UNPLANREADMISSION-Yes', 'WNDINFD-Deep Incisional SSI']


In [3]:
def tryCols(y, X, inclCol, addThresh=0.05, dumpThresh=0.15):
    """Run one add-then-prune step of stepwise logistic-regression selection.

    Every column currently excluded is tried, one at a time, on top of the
    columns currently included. The candidate p-values are Holm-corrected and
    the best candidate is added when its corrected p-value is below
    ``addThresh``. The model is then re-fitted on the included columns and the
    term with the largest (uncorrected) p-value is removed when that p-value
    exceeds ``dumpThresh``.

    Parameters
    ----------
    y : array-like
        Outcome passed to ``sm.Logit`` as the endogenous variable.
    X : pandas.DataFrame
        Predictor table; must contain every label in ``inclCol.index``.
    inclCol : pandas.Series of bool
        Indexed by column name; True marks columns currently in the model.
        It is modified in place and also returned.
    addThresh : float, optional
        Corrected p-value below which the best candidate is added.
    dumpThresh : float, optional
        p-value above which the weakest included term is removed.

    Returns
    -------
    anyChanges : bool
        True when a term was added or removed in this step.
    inclCol : pandas.Series of bool
        The updated inclusion flags (same object that was passed in).
    smresult : fit result or None
        Fit of the model on the included columns, computed before any removal
        in this step. None when no column is included, so nothing can be fit.
    """
    anyChanges = False

    baseCols = inclCol[inclCol==True].index.values.tolist()
    candidateCols = inclCol[inclCol==False].index.values.tolist()

    # Fit one model per candidate column and record that column's p-value.
    fitResultList = []
    for colName in candidateCols:
        modelCols = baseCols + [colName]
        try:
            candidateFit = sm.Logit(y, X[modelCols]).fit(disp=False)
            pValue = candidateFit.pvalues[colName]
        except Exception:
            # Best effort: a candidate whose model cannot be fitted is simply
            # not considered in this step. Only Exception is caught so that a
            # keyboard interrupt still stops a long run.
            continue
        # A non-finite p-value cannot be ranked; treat it like a failed fit.
        if np.isfinite(pValue):
            fitResultList.append((pValue, colName))

    if fitResultList:
        # Correct the p-values for multiple comparisons
        rawPValues = [fitItem[0] for fitItem in fitResultList]
        correctedPValues = mt.multipletests(rawPValues, alpha=addThresh, method='holm')[1]
        fitResultList = [(correctedPValues[index], item[1])
                         for index, item in enumerate(fitResultList)]

        # If the most significant variable is below threshold, add it to the list
        bestFit = min(fitResultList)
        if bestFit[0] < addThresh:
            print('\t Adding: %s p = %.5f' % (bestFit[1], bestFit[0]))
            inclCol[bestFit[1]] = True
            anyChanges = True
        else:
            print('\t Not adding term. Best variable: %s p = %.5f' % (bestFit[1], bestFit[0]))
    else:
        print('\t No candidate variable could be evaluated.')

    # Ok, now re-fit the model with the existing terms
    modelCols = inclCol[inclCol==True].index.values.tolist()
    if not modelCols:
        # Nothing is included, so there is no model to fit or prune.
        return anyChanges, inclCol, None

    smresult = sm.Logit(y, X[modelCols]).fit(disp=False)
    maxP = smresult.pvalues.max()
    if maxP > dumpThresh:
        # idxmax yields the column *label*; a positional result would index
        # inclCol (which holds all columns) at the wrong place.
        colToDump = smresult.pvalues.idxmax()
        inclCol[colToDump] = False
        anyChanges = True
        print('\t Removing term: %s p = %.5f' % (colToDump, maxP))

    return anyChanges, inclCol, smresult

In [4]:
# Start the selection with no column included: one False flag per column of cdf.
inclCol = pd.Series(False, index=cdf.columns, dtype=bool)

In [5]:
# Loop while we're still adding or removing columns.
# Nothing in tryCols is randomised, so if the set of included terms at the start
# of a step has been seen before, the add/remove steps are cycling and the loop
# would never finish. Stop at that point instead of spinning forever.
seenTermSets = set()
anyChanges = True
while anyChanges:
    currentTerms = frozenset(inclCol[inclCol == True].index)
    if currentTerms in seenTermSets:
        print('\t Stopping: this set of terms was already visited (add/remove cycle).')
        break
    seenTermSets.add(currentTerms)
    anyChanges, inclCol, smresult = tryCols( y, cdf, inclCol)

# Only summarize when a fitted model is actually available.
if smresult is not None:
    print(smresult.summary())
else:
    print('No fitted model to summarize.')

	 Adding: ADMYR-2013 p = 0.00000
	 Adding: ADMYR-2014 p = 0.00000
	 Adding: AGE p = 0.00000
	 Adding: NURNINFEC-0 p = 0.00000
	 Adding: REOPOR2CPT1-27880.0 p = 0.00000
	 Removing term: ADMYR-2014 p = 0.69748
	 Adding: NPULEMBOL-0 p = 0.00000
	 Removing term: ADMYR-2013 p = 0.27340
	 Adding: OTHERCPT7-27606.0 p = 0.00000
	 Removing term: REOPOR2CPT1-27880.0 p = 0.92558
	 Adding: NORGSPCSSI-0 p = 0.00000
	 Adding: PRHCT p = 0.00000
	 Removing term: OTHERCPT7-27606.0 p = 0.63145
	 Adding: REOPERATION2-Yes p = 0.00000
	 Adding: WNDCLAS-3-Contaminated p = 0.00000
	 Adding: NOUPNEUMO-0 p = 0.00000
	 Adding: DIALYSIS-Yes p = 0.00000
	 Adding: ASACLAS-3-Severe Disturb p = 0.00006
	 Adding: READMISSION2-Yes p = 0.00013
	 Adding: NOTHDVT-0 p = 0.00026
	 Adding: DSSIPATOS-Yes p = 0.00472
	 Adding: ASACLAS-4-Life Threat p = 0.01072
	 Adding: READMSUSPREASON1-Organ/Space SSI p = 0.00841
	 Adding: OSSIPATOS-Yes p = 0.00000
	 Adding: READMRELATED1-Yes p = 0.00000
	 Adding: DORGSPCSSI p = 0.00000
	 Ad