# Making Predictive Models in Healthcare

## Improting the dataset

In [1]:
import pandas as pd
pd.set_option('mode.chained_assignment',None)

HOME_PATH = r"~/git_repo/DataSkeptic-Projects/DataSets/NHAMCS_Data_Files/"

df_helper = pd.read_csv(
    HOME_PATH + 'ED_metadata.csv',
    header=0,
    dtype={'width': int, 'column_name': str, 'variable_type': str}
)

print(df_helper.head(n=5))

   width column_name  variable_type
0      2      VMONTH    CATEGORICAL
1      1       VDAYR    CATEGORICAL
2      4     ARRTIME  NONPREDICTIVE
3      4    WAITTIME     CONTINUOUS
4      4         LOV  NONPREDICTIVE


In [2]:
width = df_helper['width'].tolist()
col_names = df_helper['column_name'].tolist()
var_types = df_helper['variable_type'].tolist()

In [3]:
df_ed = pd.read_fwf(
    HOME_PATH + 'ED2013',
    widths=width,
    header=None,
    dtype='str'
)

df_ed.columns = col_names

In [4]:
print(df_ed.head(n=5))

  VMONTH VDAYR ARRTIME WAITTIME   LOV  AGE AGER AGEDAYS RESIDNCE SEX ...   \
0     01     3    0647     0033  0058  046    4     -07       01   2 ...    
1     01     3    1841     0109  0150  056    4     -07       01   2 ...    
2     01     3    1333     0084  0198  037    3     -07       01   2 ...    
3     01     3    1401     0159  0276  007    1     -07       01   1 ...    
4     01     4    1947     0114  0248  053    4     -07       01   1 ...    

  RX12V3C1 RX12V3C2 RX12V3C3 RX12V3C4 SETTYPE  YEAR   CSTRATM   CPSUM   PATWT  \
0      nan      nan      nan      nan       3  2013  20113201  100020  002945   
1      nan      nan      nan      nan       3  2013  20113201  100020  002945   
2      nan      nan      nan      nan       3  2013  20113201  100020  002945   
3      nan      nan      nan      nan       3  2013  20113201  100020  002945   
4      nan      nan      nan      nan       3  2013  20113201  100020  002945   

  EDWT  
0  nan  
1  nan  
2  nan  
3  nan  
4  na

In [5]:
print(df_ed.shape)

(24777, 579)


## Making the response variable

In [6]:
response_cols = ['ADMITHOS','TRANOTH','TRANPSYC','OBSHOS','OBSDIS']

df_ed.loc[:, response_cols] = df_ed.loc[:, response_cols].apply(pd.to_numeric)

df_ed['ADMITTEMP'] = df_ed[response_cols].sum(axis=1)
df_ed['ADMITFINAL'] = 0
df_ed.loc[df_ed['ADMITTEMP'] >= 1, 'ADMITFINAL'] = 1

df_ed.drop(response_cols, axis=1, inplace=True)
df_ed.drop('ADMITTEMP', axis=1, inplace=True)

## Splitting the data into train and test sets

In [7]:
def split_target(data, target_name):
    target = data[[target_name]]
    data.drop(target_name, axis=1, inplace=True)
    return (data, target)

X, y = split_target(df_ed, 'ADMITFINAL')

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1234
) 

In [9]:
print(y_train.groupby('ADMITFINAL').size())

ADMITFINAL
0    15996
1     2586
dtype: int64


## Preprocessing the predictor variables

In [10]:
print(X_train.groupby('VMONTH').size())

VMONTH
01    1757
02    1396
03    1409
04    1719
05    2032
06    1749
07    1696
08    1034
09    1240
10    1306
11    1693
12    1551
dtype: int64


In [11]:
def is_winter(vmonth):
    if vmonth in ['12','01','02','03']:
        return 1
    else:
        return 0

X_train.loc[:,'WINTER'] = df_ed.loc[:,'VMONTH'].apply(is_winter)
X_test.loc[:,'WINTER'] = df_ed.loc[:,'VMONTH'].apply(is_winter)

In [12]:
X_train.groupby('WINTER').size()

WINTER
0    12469
1     6113
dtype: int64

In [13]:
X_train.groupby('VDAYR').size()

VDAYR
1    2559
2    2972
3    2791
4    2632
5    2553
6    2569
7    2506
dtype: int64

In [14]:
def is_night(arrtime):
    arrtime_int = int(arrtime)
    if ((arrtime_int >= 0) & (arrtime_int < 800)):
        return 1
    elif ((arrtime_int >= 2000) & (arrtime_int < 2400)):
        return 1
    else:
        return 0

X_train.loc[:,'NIGHT'] = df_ed.loc[:,'ARRTIME'].apply(is_night)
X_test.loc[:,'NIGHT'] = df_ed.loc[:,'ARRTIME'].apply(is_night)

X_train.drop('ARRTIME', axis=1, inplace=True)
X_test.drop('ARRTIME', axis=1, inplace=True)

In [15]:
X_train.groupby('NIGHT').size()

NIGHT
0    12750
1     5832
dtype: int64

In [16]:
X_train.loc[:,'WAITTIME'] = X_train.loc[:,'WAITTIME'].apply(pd.to_numeric)
X_test.loc[:,'WAITTIME'] = X_test.loc[:,'WAITTIME'].apply(pd.to_numeric)

In [17]:
def mean_impute_values(data,col):
    temp_mean = data.loc[(data[col] != -7) & (data[col] != -9), col].mean()
    data.loc[(data[col] == -7) | (data[col] == -9), col] = temp_mean
    return data

X_train = mean_impute_values(X_train, 'WAITTIME')
X_test = mean_impute_values(X_test, 'WAITTIME')

In [18]:
X_train.drop('LOV', axis=1, inplace=True)
X_test.drop('LOV', axis=1, inplace=True)

In [19]:
X_train.loc[:,'AGE'] = X_train.loc[:,'AGE'].apply(pd.to_numeric)
X_test.loc[:,'AGE'] = X_test.loc[:,'AGE'].apply(pd.to_numeric)

X_train.drop('AGEDAYS', axis=1, inplace=True)
X_test.drop('AGEDAYS', axis=1, inplace=True)

In [20]:
X_train.drop(['ETHIM','RACER','RACERETH'], axis=1, inplace=True)
X_test.drop(['ETHIM','RACER','RACERETH'], axis=1, inplace=True)

In [21]:
X_train.head(n=5)

Unnamed: 0,VMONTH,VDAYR,WAITTIME,AGE,AGER,RESIDNCE,SEX,ETHUN,RACEUN,ARREMS,...,RX12V3C3,RX12V3C4,SETTYPE,YEAR,CSTRATM,CPSUM,PATWT,EDWT,WINTER,NIGHT
15938,11,3,27.0,58,4,1,1,2,1,1,...,,,3,2013,40300000,24,3201,,0,0
5905,10,3,5.0,91,6,2,1,2,2,1,...,,,3,2013,20213201,100091,3784,,0,1
4636,7,1,45.561676,29,3,1,1,2,2,2,...,,,3,2013,20213201,100075,2214,,0,0
9452,8,1,23.0,20,2,1,2,2,-9,2,...,,,3,2013,20413201,100227,2262,,0,0
7558,2,4,32.0,51,4,3,1,2,1,1,...,,,3,2013,20413201,100242,2108,,1,0


In [22]:
X_train.drop('PAYTYPER', axis=1, inplace=True)
X_test.drop('PAYTYPER', axis=1, inplace=True)

In [23]:
X_train.loc[:,'TEMPF'] = X_train.loc[:,'TEMPF'].apply(pd.to_numeric)
X_test.loc[:,'TEMPF'] = X_test.loc[:,'TEMPF'].apply(pd.to_numeric)

X_train = mean_impute_values(X_train,'TEMPF')
X_test = mean_impute_values(X_test,'TEMPF')

X_train.loc[:,'TEMPF'] = X_train.loc[:,'TEMPF'].apply(lambda x: float(x)/10)
X_test.loc[:,'TEMPF'] = X_test.loc[:,'TEMPF'].apply(lambda x: float(x)/10)

In [24]:
X_train['TEMPF'].head(n=30)

15938     98.200000
5905      98.100000
4636      98.200000
9452      98.200000
7558      99.300000
17878     99.000000
21071     97.800000
20990     98.600000
4537      98.200000
7025      99.300000
2134      97.500000
5212      97.400000
9213      97.900000
2306      97.000000
6106      98.600000
2727      98.282103
4098      99.100000
5233      98.800000
5107     100.000000
18327     98.900000
19242     98.282103
3868      97.900000
12903     98.600000
12763     98.700000
8858      99.400000
8955      97.900000
16360     98.282103
6857      97.100000
6842      97.700000
22073     97.900000
Name: TEMPF, dtype: float64

In [25]:
X_train.loc[:,'PULSE'] = X_train.loc[:,'PULSE'].apply(pd.to_numeric)
X_test.loc[:,'PULSE'] = X_test.loc[:,'PULSE'].apply(pd.to_numeric)

In [26]:
def mean_impute_vitals(data,col):
    temp_mean = data.loc[(data[col] != 998) & (data[col] != -9),col].mean()
    data.loc[(data[col] == 998) | (data[col] == -9),col] = temp_mean
    return data

X_train = mean_impute_vitals(X_train, 'PULSE')
X_test = mean_impute_vitals(X_test, 'PULSE')

In [27]:
X_train.loc[:,'RESPR'] = X_train.loc[:,'RESPR'].apply(pd.to_numeric)
X_test.loc[:,'RESPR'] = X_test.loc[:,'RESPR'].apply(pd.to_numeric)

X_train = mean_impute_values(X_train, 'RESPR')
X_test = mean_impute_values(X_test, 'RESPR')

In [28]:
X_train.loc[:,'BPSYS'] = X_train.loc[:,'BPSYS'].apply(pd.to_numeric)
X_test.loc[:,'BPSYS'] = X_test.loc[:,'BPSYS'].apply(pd.to_numeric)

X_train = mean_impute_values(X_train, 'BPSYS')
X_test = mean_impute_values(X_test, 'BPSYS')

In [29]:
X_train.loc[:,'BPDIAS'] = X_train.loc[:,'BPDIAS'].apply(pd.to_numeric)
X_test.loc[:,'BPDIAS'] = X_test.loc[:,'BPDIAS'].apply(pd.to_numeric)

def mean_impute_bp_diast(data,col):
    temp_mean = data.loc[(data[col] != 998) & (data[col] != -9),col].mean()
    data.loc[(data[col] == 998),col] = 40
    data.loc[(data[col] == -9),col] = temp_mean
    return data

X_train = mean_impute_values(X_train, 'BPDIAS')
X_test = mean_impute_values(X_test, 'BPDIAS')

In [30]:
X_train.loc[:,'POPCT'] = X_train.loc[:,'POPCT'].apply(pd.to_numeric)
X_test.loc[:,'POPCT'] = X_test.loc[:,'POPCT'].apply(pd.to_numeric)

X_train = mean_impute_values(X_train, 'POPCT')
X_test = mean_impute_values(X_test, 'POPCT')

In [31]:
X_train[['TEMPF','PULSE','RESPR','BPSYS','BPDIAS','POPCT']].head(n=20)

Unnamed: 0,TEMPF,PULSE,RESPR,BPSYS,BPDIAS,POPCT
15938,98.2,101.0,22.0,159.0,72.0,98.0
5905,98.1,70.0,18.0,167.0,79.0,96.0
4636,98.2,85.0,20.0,113.0,70.0,98.0
9452,98.2,84.0,20.0,146.0,72.0,98.0
7558,99.3,116.0,18.0,131.0,82.0,96.0
17878,99.0,73.0,16.0,144.0,91.0,99.0
21071,97.8,88.0,18.0,121.0,61.0,98.0
20990,98.6,67.0,16.0,112.0,65.0,95.0
4537,98.2,85.0,20.0,113.0,72.0,99.0
7025,99.3,172.0,40.0,124.0,80.0,100.0


In [32]:
X_train.loc[:,'PAINSCALE'] = X_train.loc[:,'PAINSCALE'].apply(pd.to_numeric)
X_test.loc[:,'PAINSCALE'] = X_test.loc[:,'PAINSCALE'].apply(pd.to_numeric)

def mean_impute_pain(data,col):
    temp_mean = data.loc[(data[col] != -8) & (data[col] != -9), col].mean()
    data.loc[(data[col] == -8) | (data[col] == -9), col] = temp_mean
    return data

X_train = mean_impute_pain(X_train, 'PAINSCALE')
X_test = mean_impute_pain(X_test, 'PAINSCALE')

In [33]:
rfv_codes_path = r"/home/dataskeptic/git_repo/DataSkeptic-Projects/DataSets/NHAMCS_Data_Files/RFV_CODES.csv"
rfv_codes = pd.read_csv(rfv_codes_path, header=0, dtype='str')

In [34]:
from re import sub

def add_rfv_column(data,code,desc,rfv_columns):
    column_name = "rfv_" + sub(' ', '_',desc)
    data[column_name] = (data[rfv_columns] == rfv_code).any(axis=1).astype('int')
    return data

rfv_columns = ['RFV1','RFV2','RFV3']
for (rfv_code,rfv_desc) in zip(rfv_codes['Code'].tolist(),rfv_codes['Description'].tolist()):
    X_train = add_rfv_column(
        X_train,
        rfv_code,
        rfv_desc,
        rfv_columns
    )
    X_test = add_rfv_column(
        X_test,
        rfv_code,
        rfv_desc,
        rfv_columns
    )

# Remove original RFV columns
X_train.drop(rfv_columns, axis=1, inplace=True)
X_test.drop(rfv_columns, axis=1, inplace=True)

X_train.head(n=5)

Unnamed: 0,VMONTH,VDAYR,WAITTIME,AGE,AGER,RESIDNCE,SEX,ETHUN,RACEUN,ARREMS,...,rfv_Entry_of_none_or_no_complaint,rfv_Insufficient_information,rfv_Driver's_license_examination_DOT_,rfv_Illegible_entry,rfv_Insurance_examination_,rfv_Disability_examination_,rfv_Worker’s_comp_exam,rfv_Premarital_examination,rfv_Premarital_blood_test,rfv_Direct_admission_to_hospital
15938,11,3,27.0,58,4,1,1,2,1,1,...,0,0,0,0,0,0,0,0,0,0
5905,10,3,5.0,91,6,2,1,2,2,1,...,0,0,0,0,0,0,0,0,0,0
4636,7,1,45.561676,29,3,1,1,2,2,2,...,0,0,0,0,0,0,0,0,0,0
9452,8,1,23.0,20,2,1,2,2,-9,2,...,0,0,0,0,0,0,0,0,0,0
7558,2,4,32.0,51,4,3,1,2,1,1,...,0,0,0,0,0,0,0,0,0,0


In [35]:
inj_cols = [
    'INJURY','INJR1','INJR2','INJPOISAD',
    'INJPOISADR1','INJPOISADR2','INTENT','INJDETR',
    'INJDETR1','INJDETR2','CAUSE1','CAUSE2',
    'CAUSE3','CAUSE1R','CAUSE2R','CAUSE3R'
]

X_train.drop(inj_cols, axis=1, inplace=True)
X_test.drop(inj_cols, axis=1, inplace=True)

In [36]:
diag_cols=[
    'DIAG1','DIAG2','DIAG3',
    'PRDIAG1','PRDIAG2','PRDIAG3',
    'DIAG1R','DIAG2R','DIAG3R',
]

X_train.drop(diag_cols, axis=1, inplace=True)
X_test.drop(diag_cols, axis=1, inplace=True)

In [37]:
X_train.loc[:,'TOTCHRON'] = X_train.loc[:,'TOTCHRON'].apply(pd.to_numeric)
X_test.loc[:,'TOTCHRON'] = X_test.loc[:,'TOTCHRON'].apply(pd.to_numeric)

X_train = mean_impute_values(X_train,'TOTCHRON')
X_test = mean_impute_values(X_test,'TOTCHRON')

In [38]:
testing_cols = [
    'ABG','BAC','BLOODCX','BNP','BUNCREAT',
    'CARDENZ','CBC','DDIMER','ELECTROL','GLUCOSE',
    'LACTATE','LFT','PTTINR','OTHERBLD','CARDMON',
    'EKG','HIVTEST','FLUTEST','PREGTEST','TOXSCREN',
    'URINE','WOUNDCX','URINECX','OTHRTEST','ANYIMAGE',
    'XRAY','IVCONTRAST','CATSCAN','CTAB','CTCHEST',
    'CTHEAD','CTOTHER','CTUNK','MRI','ULTRASND',
    'OTHIMAGE','TOTDIAG','DIAGSCRN'
]

X_train.drop(testing_cols, axis=1, inplace=True)
X_test.drop(testing_cols, axis=1, inplace=True)

In [39]:
proc_cols = [
    'PROC','BPAP','BLADCATH','CASTSPLINT','CENTLINE',
    'CPR','ENDOINT','INCDRAIN','IVFLUIDS','LUMBAR',
    'NEBUTHER','PELVIC','SKINADH','SUTURE','OTHPROC',
    'TOTPROC'
]

X_train.drop(proc_cols, axis=1, inplace=True)
X_test.drop(proc_cols, axis=1, inplace=True)

In [40]:
med_cols = [
    'MED1','MED2','MED3','MED4','MED5',
    'MED6','MED7','MED8','MED9','MED10',
    'MED11','MED12','GPMED1','GPMED2','GPMED3',
    'GPMED4','GPMED5','GPMED6','GPMED7','GPMED8',
    'GPMED9','GPMED10','GPMED11','GPMED12','NUMGIV',
    'NUMDIS','NUMMED',
]

X_train.drop(med_cols, axis=1, inplace=True)
X_test.drop(med_cols, axis=1, inplace=True)

In [41]:
prov_cols = [
    'NOPROVID','ATTPHYS','RESINT','CONSULT','RNLPN',
    'NURSEPR','PHYSASST','EMT','MHPROV','OTHPROV'
]

X_train.drop(prov_cols, axis=1, inplace=True)
X_test.drop(prov_cols, axis=1, inplace=True)

In [42]:
disp_cols = [
    'NODISP','NOFU','RETRNED','RETREFFU','LEFTBTRI',
    'LEFTAMA','DOA','DIEDED','TRANNH','OTHDISP',
    'ADMIT','ADMTPHYS','BOARDED','LOS','HDDIAG1',
    'HDDIAG2','HDDIAG3','HDDIAG1R','HDDIAG2R','HDDIAG3R',
    'HDSTAT','ADISP','OBSSTAY','STAY24'
]

X_train.drop(disp_cols, axis=1, inplace=True)
X_test.drop(disp_cols, axis=1, inplace=True)

In [43]:
imp_cols = [
    'AGEFL','BDATEFL','SEXFL','ETHNICFL','RACERFL'
]

X_train.drop(imp_cols, axis=1, inplace=True)
X_test.drop(imp_cols, axis=1, inplace=True)

In [44]:
id_cols = [
    'HOSPCODE','PATCODE'
]

X_train.drop(id_cols, axis=1, inplace=True)
X_test.drop(id_cols, axis=1, inplace=True)

In [45]:
emr_cols = [
    'EBILLANYE','EMRED','HHSMUE','EHRINSE','EDEMOGE',
    'EDEMOGER','EPROLSTE','EPROLSTER','EVITALE','EVITALER',
    'ESMOKEE','ESMOKEER','EPNOTESE','EPNOTESER','EMEDALGE',
    'EMEDALGER','ECPOEE','ECPOEER','ESCRIPE','ESCRIPER',
    'EWARNE','EWARNER','EREMINDE','EREMINDER','ECTOEE',
    'ECTOEER','EORDERE','EORDERER','ERESULTE','ERESULTER',
    'EGRAPHE','EGRAPHER','EIMGRESE','EIMGRESER','EPTEDUE',
    'EPTEDUER','ECQME','ECQMER','EGENLISTE','EGENLISTER',
    'EIMMREGE','EIMMREGER','ESUME','ESUMER','EMSGE',
    'EMSGER','EHLTHINFOE','EHLTHINFOER','EPTRECE','EPTRECER',
    'EMEDIDE','EMEDIDER','ESHAREE','ESHAREEHRE','ESHAREWEBE',
    'ESHAREOTHE','ESHAREUNKE','ESHAREREFE','LABRESE1','LABRESE2',
    'LABRESE3','LABRESE4','LABRESUNKE','LABRESREFE','IMAGREPE1',
    'IMAGREPE2','IMAGREPE3','IMAGREPE4','IMAGREPUNKE','IMAGREPREFE',
    'PTPROBE1','PTPROBE2','PTPROBE3','PTPROBE4','PTPROBUNKE',
    'PTPROBREFE','MEDLISTE1','MEDLISTE2','MEDLISTE3','MEDLISTE4',
    'MEDLISTUNKE','MEDLISTREFE','ALGLISTE1','ALGLISTE2','ALGLISTE3',
    'ALGLISTE4','ALGLISTUNKE','ALGLISTREFE','EDPRIM','EDINFO',
    'MUINC','MUYEAR'
]

X_train.drop(emr_cols, axis=1, inplace=True)
X_test.drop(emr_cols, axis=1, inplace=True)

In [46]:
drug_id_cols = [
    'DRUGID1','DRUGID2','DRUGID3','DRUGID4','DRUGID5',
    'DRUGID6','DRUGID7','DRUGID8','DRUGID9','DRUGID10',
    'DRUGID11','DRUGID12'
]

drug_lev1_cols = [
    'RX1V1C1','RX1V1C2','RX1V1C3','RX1V1C4',
    'RX2V1C1','RX2V1C2','RX2V1C3','RX2V1C4',
    'RX3V1C1','RX3V1C2','RX3V1C3','RX3V1C4',
    'RX4V1C1','RX4V1C2','RX4V1C3','RX4V1C4',
    'RX5V1C1','RX5V1C2','RX5V1C3','RX5V1C4',
    'RX6V1C1','RX6V1C2','RX6V1C3','RX6V1C4',
    'RX7V1C1','RX7V1C2','RX7V1C3','RX7V1C4',
    'RX8V1C1','RX8V1C2','RX8V1C3','RX8V1C4',
    'RX9V1C1','RX9V1C2','RX9V1C3','RX9V1C4',
    'RX10V1C1','RX10V1C2','RX10V1C3','RX10V1C4',
    'RX11V1C1','RX11V1C2','RX11V1C3','RX11V1C4',
    'RX12V1C1','RX12V1C2','RX12V1C3','RX12V1C4'
]

drug_lev2_cols = [
    'RX1V2C1','RX1V2C2','RX1V2C3','RX1V2C4',
    'RX2V2C1','RX2V2C2','RX2V2C3','RX2V2C4',
    'RX3V2C1','RX3V2C2','RX3V2C3','RX3V2C4',
    'RX4V2C1','RX4V2C2','RX4V2C3','RX4V2C4',
    'RX5V2C1','RX5V2C2','RX5V2C3','RX5V2C4',
    'RX6V2C1','RX6V2C2','RX6V2C3','RX6V2C4',
    'RX7V2C1','RX7V2C2','RX7V2C3','RX7V2C4',
    'RX8V2C1','RX8V2C2','RX8V2C3','RX8V2C4',
    'RX9V2C1','RX9V2C2','RX9V2C3','RX9V2C4',
    'RX10V2C1','RX10V2C2','RX10V2C3','RX10V2C4',
    'RX11V2C1','RX11V2C2','RX11V2C3','RX11V2C4',
    'RX12V2C1','RX12V2C2','RX12V2C3','RX12V2C4'
]

drug_lev3_cols = [
    'RX1V3C1','RX1V3C2','RX1V3C3','RX1V3C4',
    'RX2V3C1','RX2V3C2','RX2V3C3','RX2V3C4',
    'RX3V3C1','RX3V3C2','RX3V3C3','RX3V3C4',
    'RX4V3C1','RX4V3C2','RX4V3C3','RX4V3C4',
    'RX5V3C1','RX5V3C2','RX5V3C3','RX5V3C4',
    'RX6V3C1','RX6V3C2','RX6V3C3','RX6V3C4',
    'RX7V3C1','RX7V3C2','RX7V3C3','RX7V3C4',
    'RX8V3C1','RX8V3C2','RX8V3C3','RX8V3C4',
    'RX9V3C1','RX9V3C2','RX9V3C3','RX9V3C4',
    'RX10V3C1','RX10V3C2','RX10V3C3','RX10V3C4',
    'RX11V3C1','RX11V3C2','RX11V3C3','RX11V3C4',
    'RX12V3C1','RX12V3C2','RX12V3C3','RX12V3C4'
]

addl_drug_cols = [
    'PRESCR1','CONTSUB1','COMSTAT1','RX1CAT1','RX1CAT2',
    'RX1CAT3','RX1CAT4','PRESCR2','CONTSUB2','COMSTAT2',
    'RX2CAT1','RX2CAT2','RX2CAT3','RX2CAT4','PRESCR3','CONTSUB3',
    'COMSTAT3','RX3CAT1','RX3CAT2','RX3CAT3','RX3CAT4','PRESCR4',
    'CONTSUB4','COMSTAT4','RX4CAT1','RX4CAT2','RX4CAT3',
    'RX4CAT4','PRESCR5','CONTSUB5','COMSTAT5','RX5CAT1',
    'RX5CAT2','RX5CAT3','RX5CAT4','PRESCR6','CONTSUB6',
    'COMSTAT6','RX6CAT1','RX6CAT2','RX6CAT3','RX6CAT4','PRESCR7',
    'CONTSUB7','COMSTAT7','RX7CAT1','RX7CAT2','RX7CAT3',
    'RX7CAT4','PRESCR8','CONTSUB8','COMSTAT8','RX8CAT1',
    'RX8CAT2','RX8CAT3','RX8CAT4','PRESCR9','CONTSUB9',
    'COMSTAT9','RX9CAT1','RX9CAT2','RX9CAT3','RX9CAT4',
    'PRESCR10','CONTSUB10','COMSTAT10','RX10CAT1','RX10CAT2',
    'RX10CAT3','RX10CAT4','PRESCR11','CONTSUB11','COMSTAT11',
    'RX11CAT1','RX11CAT2','RX11CAT3','RX11CAT4','PRESCR12',
    'CONTSUB12','COMSTAT12','RX12CAT1','RX12CAT2','RX12CAT3',
    'RX12CAT4'
]

X_train.drop(drug_id_cols, axis=1, inplace=True)
X_train.drop(drug_lev1_cols, axis=1, inplace=True)
X_train.drop(drug_lev2_cols, axis=1, inplace=True)
X_train.drop(drug_lev3_cols, axis=1, inplace=True)
X_train.drop(addl_drug_cols, axis=1, inplace=True)

X_test.drop(drug_id_cols, axis=1, inplace=True)
X_test.drop(drug_lev1_cols, axis=1, inplace=True)
X_test.drop(drug_lev2_cols, axis=1, inplace=True)
X_test.drop(drug_lev3_cols, axis=1, inplace=True)
X_test.drop(addl_drug_cols, axis=1, inplace=True)

In [47]:
design_cols = ['CSTRATM','CPSUM','PATWT','EDWT']

X_train.drop(design_cols, axis=1, inplace=True)
X_test.drop(design_cols, axis=1, inplace=True)

In [48]:
categ_cols = df_helper.loc[
    df_helper['variable_type'] == 'CATEGORICAL', 'column_name'
]
one_hot_cols = list(set(categ_cols) & set(X_train.columns))

X_train = pd.get_dummies(X_train, columns=one_hot_cols)

X_test = pd.get_dummies(X_test, columns=one_hot_cols)

In [49]:
X_train.loc[:,X_train.columns] = X_train.loc[:,X_train.columns].apply(pd.to_numeric)
X_test.loc[:,X_test.columns] = X_test.loc[:,X_test.columns].apply(pd.to_numeric)

In [50]:
X_train_cols = X_train.columns
X_test_cols = X_test.columns

X_train = X_train.values
X_test = X_test.values

## Building the models

In [51]:
from sklearn.linear_model import LogisticRegression

clfs = [LogisticRegression()]

for clf in clfs:
    clf.fit(X_train, y_train.values.ravel())
    print(type(clf))
    print('Training accuracy: ' + str(clf.score(X_train, y_train.values)))
    print('Validation accuracy: ' + str(clf.score(X_test, y_test.values)))
    coefs = {
        'column': [X_train_cols[i] for i in range(len(X_train_cols))],
        'coef': [clf.coef_[0,i] for i in range(len(X_train_cols))]
    }
    df_coefs = pd.DataFrame(coefs)
    print(df_coefs.sort_values('coef', axis=0, ascending=False))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


<class 'sklearn.linear_model._logistic.LogisticRegression'>
Training accuracy: 0.863200947153159
Validation accuracy: 0.8684422921711057
                                                column      coef
29                                            TOTCHRON  0.064214
799                                          ARREMS_01  0.044415
1                                                  AGE  0.038154
824                                          IMMEDR_02  0.033150
840                                          NOCHRON_0  0.032122
14                                               RESPR  0.027405
838                                          REGDIV_01  0.018929
31                                             SURGDAY  0.018715
893                                        ZONENURS_01  0.018165
856                                            ONO2_01  0.017830
55                             rfv_Chest_pain_soreness  0.017310
777                                    PHYSPRACTRIA_01  0.016926
902               

In [52]:
from sklearn.ensemble import RandomForestClassifier

clfs_rf = [RandomForestClassifier(n_estimators=100)]

for clf in clfs_rf:
    clf.fit(X_train, y_train.values.ravel())
    print(type(clf))
    print('Training accuracy: ' + str(clf.score(X_train, y_train.values)))
    print('Validation accuracy: ' + str(clf.score(X_test, y_test.values)))
    imps = {
        'column': [X_train_cols[i] for i in range(len(X_train_cols))],
        'imp': [clf.feature_importances_[i] for i in range(len(X_train_cols))]
    }
    df_imps = pd.DataFrame(imps)
    print(df_imps.sort_values('imp', axis=0, ascending=False))

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Training accuracy: 0.9999461844796039
Validation accuracy: 0.8860371267150928
                                                column       imp
1                                                  AGE  0.041547
13                                               PULSE  0.027663
15                                               BPSYS  0.026291
16                                              BPDIAS  0.026208
0                                             WAITTIME  0.024478
12                                               TEMPF  0.024179
17                                               POPCT  0.021141
14                                               RESPR  0.020849
29                                            TOTCHRON  0.017294
18                                           PAINSCALE  0.016473
800                                          ARREMS_02  0.016127
824                                          IMMEDR_02  0.015723
799                

In [53]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Scale data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_Tx = scaler.transform(X_train)
X_test_Tx = scaler.transform(X_test)

# Fit models that require scaling (e.g. neural networks)
hl_sizes = [150, 100, 80, 60, 40, 20]
nn_clfs = [MLPClassifier(hidden_layer_sizes=(size,), random_state=2345, verbose=True) for size in hl_sizes]

for num, nn_clf in enumerate(nn_clfs):
    print(str(hl_sizes[num]) + '-unit network:')
    nn_clf.fit(X_train_Tx, y_train.values.ravel())
    print('Training accuracy: ' + str(nn_clf.score(X_train_Tx, y_train.values)))
    print('Validation accuracy: ' + str(nn_clf.score(X_test_Tx, y_test.values)))

150-unit network:
Iteration 1, loss = inf
Iteration 2, loss = 0.26256205
Iteration 3, loss = 0.22600882
Iteration 4, loss = 0.20554737
Iteration 5, loss = 0.18788855
Iteration 6, loss = 0.17089986
Iteration 7, loss = 0.15399132
Iteration 8, loss = 0.13804192
Iteration 9, loss = 0.12242161
Iteration 10, loss = 0.10969702
Iteration 11, loss = 0.09644333
Iteration 12, loss = 0.08600169
Iteration 13, loss = 0.07545349
Iteration 14, loss = 0.06576426
Iteration 15, loss = 0.05744720
Iteration 16, loss = 0.05057766
Iteration 17, loss = 0.04424809
Iteration 18, loss = 0.03876858
Iteration 19, loss = 0.03392434
Iteration 20, loss = 0.03008661
Iteration 21, loss = 0.02604014
Iteration 22, loss = 0.02280625
Iteration 23, loss = 0.02095042
Iteration 24, loss = 0.01865465
Iteration 25, loss = 0.01618499
Iteration 26, loss = 0.01559927
Iteration 27, loss = 0.01284502
Iteration 28, loss = 0.01190597
Iteration 29, loss = 0.01417864
Iteration 30, loss = 0.01043900
Iteration 31, loss = 0.00889180
Iterat