In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp
import warnings 
warnings.filterwarnings("ignore") # Ignore annoying warnings

# Directory (relative to the working directory) that holds the pickled data files.
dataDir = './Data/'
# Path of the pickle that is loaded below as the source DataFrame.
compiledFileName = dataDir + 'compiledData.pkl'
# Path reserved for the cleaned ("munged") output pickle.
mungedFileName = dataDir + 'mungedData.pkl'

# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
df = pd.read_pickle(compiledFileName)

The strategy should be:
- Numeric items should be checked for numeracy and kept as numbers. If a column contains non-numeric values, we need to flag it so an appropriate replacement can be chosen. If a numeric column has only a few distinct values, we probably want to treat it as categorical instead.
- Items stored as text should be encoded as one-hot binary variables, and a LabelBinarizer should be created for each of them.

In [2]:
# Utility function to see if a column has only real numeric values.
# NaN's are ok.
def numeracyCheck( column):
    """Report whether every entry of `column` is a real number.

    Each element is tested with ``np.isreal``; NaN passes that test, so
    missing values do not count against the column. When offending entries
    exist, their count and distinct values are printed.

    Returns True for a clean column and False otherwise.
    """
    isNumeric = column.apply(np.isreal)
    offenders = column[~isNumeric]

    # Clean column: nothing to report.
    if len(offenders) == 0:
        return True

    print('\t Non-numeric values found at %d spots: ' % len(offenders))
    print(pd.unique(offenders))
    return False

# Utility function for encoding enumerated text as a collection of 
# one-hot binary vectors. This enumerates NaNs, so they don't show as
# missing data. We will reassign them as missing data.
def binarizeColumn( column):
    """One-hot encode a column of enumerated values.

    Every value is converted with ``str`` and fed to a ``LabelBinarizer``.
    Two post-processing steps follow:

    * If a class labelled ``'nan'`` was produced, its indicator column is
      removed and the rows where ``pd.isnull(column)`` is true are set to NaN
      in all remaining indicator columns, so they read as missing data.
    * If every remaining class is ``'Yes'`` or ``'No'``, the ``'No'`` column
      is removed and only the ``'Yes'`` indicator is returned.

    Parameters
    ----------
    column : pandas.Series
        The values to encode.

    Returns
    -------
    y : numpy.ndarray
        2-D indicator matrix, one column per entry left in ``le.classes_``.
    le : LabelBinarizer
        The fitted binarizer, with ``classes_`` trimmed to match ``y``.
    """
    le = pp.LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

    # Label binarizer will warn on NaNs. The fit has to run *inside* the
    # context manager, otherwise the filter is restored before the call.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        y = le.fit_transform(column.map(str))

    # LabelBinarizer collapses a two-class problem into a single column;
    # expand it back to one indicator column per class.
    if (y.shape[1] == 1) and (len(le.classes_) == 2):
        y = np.concatenate((1 - y, y), axis=1)
    print('\t Encoding %d enumerations: ' % len(le.classes_))

    # If there are any classes matching 'nan'
    if 'nan' in le.classes_:
        print('\t Found enumerated NaNs...')
        nanCols = [index for index, aClass in enumerate(le.classes_) if aClass == 'nan']
        y = np.delete(y, nanCols, axis=1).astype(float)   # Delete the column in y
        le.classes_ = np.delete(le.classes_, nanCols)     # Delete the nan class label
        # Boolean row mask (positional), so the Series index does not matter.
        nullRows = np.asarray(pd.isnull(column))
        y[nullRows, :] = np.nan                           # Set the nan rows to nan

    # Now check to see if our only remaining choices are 'Yes' and 'No'.
    # If so, we only have to return 'Yes'.
    matchList = ['Yes', 'No']
    if all(leClass in matchList for leClass in le.classes_):
        print('\t Binary response; returning only Yes')
        noCols = [index for index, aClass in enumerate(le.classes_) if aClass == 'No']
        y = np.delete(y, noCols, axis=1).astype(float)   # Delete the column in y
        le.classes_ = np.delete(le.classes_, noCols)     # Delete the No class label

    return y, le

# Force a column to be categorical
def forceCat( colName):
    """Return True when `colName` contains one of the substrings that mark a
    column as categorical regardless of its contents ('CPT', 'PODIAG', 'ICD').

    The match is a case-sensitive substring test.
    """
    for marker in ('CPT', 'PODIAG', 'ICD'):
        if marker in colName:
            return True
    return False

# Force a column to be continuous
def forceCont( colName):
    """Return True when `colName` contains a substring that marks a column as
    continuous regardless of its contents (currently only 'RBC').

    The match is a case-sensitive substring test.
    """
    continuousMarkers = ('RBC',)
    hits = [marker for marker in continuousMarkers if marker in colName]
    return len(hits) > 0

# Utility function to determine whether a column should be numeric or categorical.
def numOrCat( column, colName, numericThresh=0.5, uniqueThresh=10):
    """Decide whether a column should be treated as numeric or categorical.

    Name-based overrides are consulted first (``forceCat`` and then
    ``forceCont``). Otherwise the column is called categorical when either

    * the ratio (non-null, non-numeric entries) / (non-null numeric entries + 1)
      exceeds `numericThresh`, or
    * the number of distinct values is below `uniqueThresh`.

    Parameters
    ----------
    column : pandas.Series
        The values to inspect.
    colName : str
        Column name, used for the name-based overrides.
    numericThresh : float, optional
        Largest tolerated non-numeric-to-numeric ratio for a numeric column.
    uniqueThresh : int, optional
        Minimum number of distinct values for a numeric column.

    Returns
    -------
    bool
        True when the column should be numeric, False when categorical.
    """
    if forceCat( colName):
        print('\t Forced categorical based on name match.')
        return False
    elif forceCont(colName):
        print('\t Forced continuous based on name match.')
        return True

    # Evaluate each element-wise test once and reuse the masks.
    isReal = column.apply(np.isreal)
    notNull = ~column.isnull()
    nNonNumeric = (~isReal & notNull).sum()
    nNumeric = (isReal & notNull).sum()

    # NOTE: unique() reports NaN as a distinct value, so a column with missing
    # data gets one extra "unique" here.
    Nuniques = len(column.unique())

    # float() keeps this a true division under Python 2 as well; the +1
    # avoids dividing by zero when there are no numeric entries.
    nonNumericRatio = float(nNonNumeric) / (nNumeric + 1)

    if (nonNumericRatio > numericThresh) or (Nuniques < uniqueThresh):
        print('\t Column detected categorical.')
        return False
    else:
        print('\t Column detected numeric.')
        return True
    


Now we loop through all the columns in our existing DataFrame and make a clean version that has our appropriately cleaned and coded variables. Some of these variables are text fields (like procedure descriptions) that have many options. We'd need to do some feature extraction here to make this useful; for now we'll just leave these features out.

This code should identify any problems in the source DataFrame. I'll try to proactively address those by cleaning the data and giving a list of columns that should be forced categorical.

In [3]:
# Rows whose AGE is the string '90+' get the numeric value 90.0 instead.
df.loc[df['AGE'].eq('90+'), 'AGE'] = 90.0

In [4]:
# Make a clean DataFrame. It is created on the source frame's index and every
# column is assigned positionally, so numeric columns (taken from a Series)
# and one-hot columns (plain ndarrays) always line up row for row with df,
# whatever df's index looks like and whichever column happens to come first.
cdf = pd.DataFrame(index=df.index)

# Categorical columns whose enumeration count exceeds this are left out.
maxEnums = 20

for colName in df.columns:
    print('  ')
    print(colName)
    aColumn = df[colName]

    # A column with a single value carries no information.
    if len(aColumn.unique()) < 2:
        print('\t Skipping because there is only 1 value.')
        continue

    # If it looks numeric and isn't in our list to force categorical...
    if numOrCat(aColumn, colName):
        # If it's numeric
        if numeracyCheck(aColumn):
            # The column is a clean numeric column (but could have NaNs)
            nNaNs = aColumn.isnull().sum()
            print('\t Number NaN: %d' % nNaNs)

            # Add it to the clean DataFrame (by position, not by index label)
            cdf[colName] = aColumn.values
        else:
            # We need to clean it up; the column is left out of cdf for now.
            print('*** Clean This Up ***')
    else:
        # If it needs to be categorical
        y, le = binarizeColumn(aColumn)
        # NOTE(review): the +1 presumably accounts for a class that
        # binarizeColumn dropped (NaN or 'No'); when nothing was dropped this
        # over-counts by one -- confirm the intended threshold.
        nEnums = y.shape[1] + 1
        if (nEnums <= maxEnums):
            for binColN in range(y.shape[1]):
                print('\t Fract: %.3f \'%s\'' % (np.nanmean(y[:,binColN]),le.classes_[binColN]))
                subColName = colName + '-' + le.classes_[binColN]
                cdf[subColName] = y[:,binColN]
        else:
            print('\t Skipping due to %d enums.' % nEnums)
                

  
ADMQTR
	 Column detected categorical.
	 Encoding 4 enumerations: 
	 Fract: 0.261 '1'
	 Fract: 0.227 '2'
	 Fract: 0.248 '3'
	 Fract: 0.264 '4'
  
ADMSYR
	 Column detected categorical.
	 Encoding 4 enumerations: 
	 Found enumerated NaNs...
	 Fract: 0.001 '2007.0'
	 Fract: 0.381 '2008.0'
	 Fract: 0.619 '2009.0'
  
ADMYR
	 Column detected categorical.
	 Encoding 8 enumerations: 
	 Fract: 0.000 '2007'
	 Fract: 0.038 '2008'
	 Fract: 0.061 '2009'
	 Fract: 0.090 '2010'
	 Fract: 0.116 '2011'
	 Fract: 0.165 '2012'
	 Fract: 0.227 '2013'
	 Fract: 0.303 '2014'
  
AGE
	 Column detected numeric.
	 Number NaN: 0
  
AIRTRA
	 Column detected categorical.
	 Encoding 4 enumerations: 
	 Found enumerated NaNs...
	 Fract: 0.001 'Lip laceration or hematoma'
	 Fract: 0.998 'None'
	 Fract: 0.001 'Tooth chipped, loosened or lost'
  
ANESTHES
	 Column detected categorical.
	 Encoding 10 enumerations: 
	 Found enumerated NaNs...
	 Fract: 0.001 'Epidural'
	 Fract: 0.862 'General'
	 Fract: 0.001 'Local'
	 Fract: 

Ok, let's save this cleaned up data to disk. Note that there are still missing values for numeric variables.

In [5]:
# Write the cleaned DataFrame to the path held in mungedFileName.
cdf.to_pickle(mungedFileName) 

In [6]:
# cdf['intercept'] = 1.0
# X = cdf.copy()
# X.drop('READMISSION1-Yes',1)

# imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# imp.fit(X)
# X = imp.transform(X) 
# X.shape

# logistic = linear_model.LogisticRegression(C=1e5, verbose=True)
# logistic.fit(X, cdf['READMISSION1-Yes'])