In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from datetime import datetime
from itertools import combinations
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings('ignore')
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder, label_binarize, PolynomialFeatures

# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22. Fall back to
# SimpleImputer (same default: replace NaN with the column mean) under the old name so
# the cells that call `Imputer()` keep working on both old and new releases.
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    from sklearn.impute import SimpleImputer as Imputer


## Read in Data

In [18]:
# Load the three readmission extracts (the `_all`, `_ip` and `_er` files)
readmission_all, readmission_ip, readmission_er = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/readmission_all_full.csv',
        '../Data/readmission_ip_full.csv',
        '../Data/readmission_er_full.csv',
    )
)

# Load the matching diagnosis extracts, in the same all / ip / er order
readmission_diagnosis_all, readmission_diagnosis_ip, readmission_diagnosis_er = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/readmission_diagnosis_all_full.csv',
        '../Data/readmission_diagnosis_ip_full.csv',
        '../Data/readmission_diagnosis_er_full.csv',
    )
)

# Load the two crosswalk tables (ICD-9 and ICD-10 to CCS, going by the file names)
icd9ccs, icd10ccs = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/icd_crosswalk_icd9ccs.csv',
        '../Data/icd_crosswalk_icd10ccs.csv',
    )
)

## Pre-Processing

In [11]:
# Look at data types of variables: show the dtype pandas inferred for every column of
# readmission_ip, to see which columns were read as numbers and which as `object`
# (non-numeric values such as strings).
readmission_ip.dtypes

person_id                          object
Age_at_visit                        int64
patient_city                       object
patient_county                     object
patient_zipcode                     int64
hospital_poc                       object
Hospital_visit_date                object
discharge_date                     object
encounter_type                     object
payor_name                         object
last community care location       object
last community care visit date     object
ethnicity                          object
language                           object
uds ethnicity                      object
uds race                           object
uds homless                        object
LACE L score                        int64
LACE A score                        int64
LACE E score                        int64
Total LACE score                    int64
admission_count                     int64
encounter_prev_all                 object
Hospital_visit_prev_all           

### Select Features for Modeling

In [None]:
# Restrict readmission_ip to the columns used for modelling: 'person_id', the candidate
# predictors, and 'Future_Inpatient' (presumably the prediction target — confirm).
# `.copy()` makes the result an independent frame, so the column assignments in later
# cells write to it directly rather than to a subset of the original frame (pandas can
# flag that with SettingWithCopyWarning, which this notebook silences globally).
# NOTE(review): 'patient_zipcode', 'admission_count', 'interval_next', 'transfer_out_IP'
# and 'past_visits_total' are NOT kept here, although the numeric pre-processing cells
# below name them — keep the two column lists in sync.
readmission_ip = readmission_ip[[
    'person_id',
    'Age_at_visit',
    'patient_city',
    'patient_county',
    'hospital_poc',
    'payor_name',
    'last community care location',
    'ethnicity',
    'language',
    'uds ethnicity',
    'uds race',
    'uds homless',
    'LACE L score',
    'LACE A score',
    'LACE E score',
    'Total LACE score',
    'transfer_in_ER',
    'transfer_in_IP',
    'interval_prev',
    'interval_prev_all',
    'past_visits_ER',
    'past_visits_IP',
    'income',
    'education',
    'poverty',
    'ccs10_category_top',
    'Future_Inpatient',
]].copy()

### Numeric Variables

In [19]:
# Get list of numeric variables (columns stored as 64-bit integers or floats).
# The dtypes are spelled `np.int64` / `np.float64` so this cell does not depend on the
# bare `int64` / `float64` names, which only exist because `%pylab inline` star-imports
# numpy into the notebook namespace.
num = readmission_ip.select_dtypes(include=[np.int64, np.float64]).columns.values.tolist()
num

['Age_at_visit',
 'patient_zipcode',
 'LACE L score',
 'LACE A score',
 'LACE E score',
 'Total LACE score',
 'admission_count',
 'interval_prev_all',
 'transfer_in_ER',
 'transfer_in_IP',
 'interval_prev',
 'interval_next',
 'transfer_out_IP',
 'past_visits_total',
 'past_visits_ER',
 'past_visits_IP',
 'income',
 'education',
 'poverty',
 'ccs10_category_top']

In [30]:
# Turn the literal string 'NA' into a real missing value (np.nan) in each of these columns
for na_col in ('income', 'education', 'poverty'):
    readmission_ip[na_col] = readmission_ip[na_col].apply(
        lambda value: np.nan if value == 'NA' else value
    )

# Count the missing entries of every numeric variable
readmission_ip[num].isnull().sum()

Age_at_visit              0
patient_zipcode           0
LACE L score              0
LACE A score              0
LACE E score              0
Total LACE score          0
admission_count           0
interval_prev_all      4175
transfer_in_ER            0
transfer_in_IP            0
interval_prev         12871
interval_next         12871
transfer_out_IP           0
past_visits_total         0
past_visits_ER            0
past_visits_IP            0
income                  412
education               412
poverty                 412
ccs10_category_top     2536
dtype: int64

We will need to do some imputing for the variables with missing values. However, due to the large number of missing values for the time-based variables, it may introduce bias and noise if we impute them with the variable mean.

In [23]:
# Impute missing values with the variable mean (the default strategy of `Imputer()`).
# Only the candidate columns still present in readmission_ip are imputed, so the cell
# also runs after the feature-selection cell has narrowed the frame (that selection does
# not keep 'interval_next'). On the full frame every candidate is present and the result
# is the same as indexing the full list directly.
# NOTE(review): 'interval_next' presumably measures the time until the *next* visit; if
# so it is unknown at prediction time and leaks the outcome — confirm before using it.
# NOTE(review): as far as this section shows, the imputer is fit on all rows, i.e. before
# any train/test split.
impute_candidates = ['interval_prev', 'interval_prev_all', 'interval_next', 'income', 'education', 'poverty']
impute_cols = [col for col in impute_candidates if col in readmission_ip.columns]
imputer = Imputer()
X_num_wnull = imputer.fit_transform(readmission_ip[impute_cols])

In [27]:
# Store numeric variables without missing values as a 2-D array (one column per variable).
# The candidate list is filtered to the columns still present in readmission_ip, so the
# cell also runs after the feature-selection cell, which does not keep 'patient_zipcode',
# 'admission_count', 'transfer_out_IP' or 'past_visits_total'. On the full frame every
# candidate is present and the array is the same as before, in the same column order.
complete_candidates = [
    'Age_at_visit', 'patient_zipcode', 'LACE L score', 'LACE A score', 'LACE E score',
    'Total LACE score', 'admission_count', 'transfer_in_ER', 'transfer_in_IP',
    'transfer_out_IP', 'past_visits_ER', 'past_visits_IP', 'past_visits_total',
]
complete_cols = [col for col in complete_candidates if col in readmission_ip.columns]
X_num_wonull = np.array(readmission_ip[complete_cols])

In [29]:
# Join the two numeric blocks side by side: the complete columns first, then the imputed ones
X_num = np.hstack((X_num_wonull, X_num_wnull))

### Categorical Variables

In [None]:
# Names of every object-dtype column except 'Future_Inpatient'
cat = (
    readmission_ip
    .select_dtypes(include=[object])
    .columns
    .drop('Future_Inpatient')
    .tolist()
)