In [1]:
import argparse
import joblib
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from sklearn.compose import ColumnTransformer
#, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer, StandardScaler, OneHotEncoder

import UnknownFeatureGenerator as ufg

In [2]:
# Load the pre-partitioned train and test splits. Both paths are relative to
# the directory the notebook kernel is started from.
print('reading data')
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/partitioned/train.csv",
        "../../data/partitioned/test.csv",
    )
)

reading data


In [3]:
# Every column of the training frame except the label is used as a model input.
# NOTE(review): the printed list includes 'encounter_id' and 'patient_nbr',
# which look like record identifiers rather than predictors -- confirm they
# are meant to be fed to the model.
target_variable = "readmitted"
features = train_df.columns.tolist()
features.remove(target_variable)  # ValueError here means the label column is missing
print(features)

['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']


In [4]:
# Split each partition into a feature frame and a label series, then bucket
# the feature columns by dtype so the preprocessing step can route them.
print('building training and testing datasets')
X_train, y_train = train_df[features], train_df[target_variable]
X_test, y_test = test_df[features], test_df[target_variable]

# Column typing is derived from the training frame only; anything that is not
# a numeric dtype is handled as categorical.
numeric_cols = X_train.select_dtypes(include="number").columns.tolist()
categorical_cols = X_train.select_dtypes(exclude="number").columns.tolist()
print(categorical_cols)

building training and testing datasets
['race', 'gender', 'age', 'weight', 'admission_type_id', 'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']


In [5]:
  
# Name of the column that the feature generator appends to the frame. It is
# shared by the generator and the preprocessor so the two cannot drift apart.
unknown_flag_col = "UNKNOWN_admission_type"

# Stage 1 -- feature generation. Derives `unknown_flag_col` from
# 'admission_type_id'.
# NOTE(review): presumably a 0/1 indicator for an unknown admission type;
# confirm against the UnknownFeatureGenerator module.
feature_gen = Pipeline([
    ("unknown", ufg.UnknownFeatureGenerator("admission_type_id", unknown_flag_col))
])

# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode. Categories unseen during fit are ignored at transform time instead
# of raising.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore')
)

# Stage 2 -- preprocessing. ColumnTransformer drops every column that is not
# listed (remainder='drop' is the default). `numeric_cols` and
# `categorical_cols` come from the raw frame, so the generated column has to
# be listed explicitly or it never reaches the model. Because of this entry,
# the preprocessor must run on the output of `feature_gen`, not on the raw
# frame.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
        ("generated", "passthrough", [unknown_flag_col]),
    ]
)


In [7]:
# Sanity check of the feature-generation stage on its own: fit on the training
# features, apply to the test features, and display the resulting frame.
rez = feature_gen.fit(X_train).transform(X_test)
rez

<class 'pandas.core.frame.DataFrame'>
(20354, 49)


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,UNKNOWN_admission_type
0,151159752,96417243,AfricanAmerican,Female,[70-80),?,Physician Referral,11,7,2,...,No,Steady,No,No,No,No,No,No,Yes,0
1,74937114,23486292,Caucasian,Female,[80-90),?,Transfer from another health care facility,25,17,2,...,No,No,No,No,No,No,No,No,No,0
2,421416038,132312542,?,Male,[70-80),?,Clinic Referral,1,1,3,...,No,Steady,No,No,No,No,No,Ch,Yes,0
3,243417654,1868706,Caucasian,Male,[80-90),?,Physician Referral,6,7,8,...,No,Steady,No,No,No,No,No,No,Yes,0
4,166810920,43926273,Caucasian,Male,[60-70),?,HMO Referral,1,1,3,...,No,Steady,No,No,No,No,No,No,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20349,304821266,114159384,AfricanAmerican,Female,[50-60),?,Court/Law Enforcement,1,7,3,...,No,Steady,No,No,No,No,No,Ch,Yes,0
20350,86554038,60955848,Caucasian,Female,[50-60),?,Physician Referral,1,7,2,...,No,Steady,No,No,No,No,No,Ch,Yes,0
20351,81196470,1367838,Caucasian,Male,[50-60),?,Physician Referral,1,7,3,...,No,No,No,No,No,No,No,No,No,0
20352,36980256,1242459,AfricanAmerican,Female,[30-40),?,Clinic Referral,1,1,3,...,No,Steady,No,No,No,No,No,No,Yes,0


In [8]:
print('training model')
# Random forest hyperparameters are fixed by hand. `random_state` pins the
# bootstrap sampling and per-split feature sub-sampling so that a fresh
# Restart & Run All reproduces the same fitted model and metrics.
rfcl = RandomForestClassifier(
    n_estimators=200,
    min_samples_leaf=8,
    max_depth=9,
    n_jobs=-1,
    random_state=42,
)

# End-to-end estimator: feature generation -> preprocessing -> classifier.
# This cell only assembles the pipeline; no fitting happens here.
model = Pipeline(steps=[
    ('feature_gen', feature_gen),
    ('preprocessor', preprocessor),
    ('rf', rfcl),
])
    

training model


In [9]:
# Show the assembled pipeline's plain-text representation.
print(f"{model}")

Pipeline(steps=[('feature_gen',
                 Pipeline(steps=[('unknown',
                                  <UnknownFeatureGenerator.UnknownFeatureGenerator object at 0x1109b4240>)])),
                ('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['encounter_id',
                                                   'patient_nbr',
                                                   'discharge_disposition_id',
                                                   'a...
                                                   'max_glu_serum', 'A1Cresult',
             

In [10]:

# Fit the whole pipeline on the training split. The fitted pipeline is the
# cell's last expression, so it is also displayed.
model.fit(X=X_train, y=y_train)

<class 'pandas.core.frame.DataFrame'>
(61059, 49)


Pipeline(steps=[('feature_gen',
                 Pipeline(steps=[('unknown',
                                  <UnknownFeatureGenerator.UnknownFeatureGenerator object at 0x1109b4240>)])),
                ('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['encounter_id',
                                                   'patient_nbr',
                                                   'discharge_disposition_id',
                                                   'a...
                                                   'max_glu_serum', 'A1Cresult',
             