In [11]:
import pandas as pd
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.metrics import f1_score, roc_auc_score, make_scorer, recall_score, precision_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV


In [2]:
df = pd.read_csv('data/health_survey.csv')
df.loc[:,'null_cnt'] = df.isnull().sum(axis=1)
df = df[df['null_cnt']<=1].reset_index(drop=True)
df = df.drop('null_cnt', axis=1)
X = df.drop(['is_diabetes','state'], axis=1)
y = df['is_diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
print(X_train.shape, X_test.shape)

(246847, 13) (105792, 13)


In [3]:
preprocessor = ColumnTransformer(transformers=[
                                ('num', Pipeline([
                                    ('num_imputer', SimpleImputer(strategy='median')), 
                                    ('scaler', MinMaxScaler())]), make_column_selector(dtype_include='number')),
                                ('cat', Pipeline([
                                    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
                                    ('one_hot', OneHotEncoder(handle_unknown='ignore'))]), make_column_selector(dtype_include=object))
                                    ], remainder='drop')
scorer = make_scorer(roc_auc_score)
kf = KFold(n_splits=10, shuffle=True, random_state=2)

def ppline(model):
    result = Pipeline([
                    ('preprocessing', preprocessor), 
                    ('over_sampler', SMOTE(random_state=2)),
                   ('model', model)])
    return(result)

In [4]:
param_grid = {
    'model__criterion': ['gini','entropy'],
    'model__max_depth': [7, 9, 11],
    'model__min_samples_leaf': [20, 40],
}

dt_gs = GridSearchCV(
    estimator=ppline(DecisionTreeClassifier(random_state=2)),
    param_grid=param_grid,
    cv=kf,
    scoring=scorer,  
    n_jobs=-1, 
    verbose=1
)

dt_gs.fit(X_train, y_train)

print("train set roc_auc_score : ", roc_auc_score(y_train, dt_gs.predict(X_train)))
print("test set roc_auc_score : ", roc_auc_score(y_test, dt_gs.predict(X_test)))

Fitting 10 folds for each of 12 candidates, totalling 120 fits
train set roc_auc_score :  0.7058107749180837
test set roc_auc_score :  0.704291562781905


In [5]:
dt_gs.best_estimator_['model'].get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 7,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 20,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 2,
 'splitter': 'best'}

In [15]:
param_grid = {
    'model__n_estimators': [100, 300, 500],
    'model__criterion': ['gini','entropy'],
    'model__max_depth': [3, 5, 7],
    'model__min_samples_leaf': [5, 10],
}

rf_gs = RandomizedSearchCV(
    estimator=ppline(RandomForestClassifier(random_state=2)),
    param_distributions=param_grid,
    cv=kf,
    scoring=scorer,  
    n_jobs=-1, 
    verbose=1
)

rf_gs.fit(X_train, y_train)

print("train set roc_auc_score : ", roc_auc_score(y_train, rf_gs.predict(X_train)))
print("test set roc_auc_score : ", roc_auc_score(y_test, rf_gs.predict(X_test)))

Fitting 10 folds for each of 10 candidates, totalling 100 fits
train set roc_auc_score :  0.7259009350382046
test set roc_auc_score :  0.7220977429217869


In [17]:
rf_gs.best_estimator_['model'].get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 7,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2,
 'verbose': 0,
 'warm_start': False}

In [22]:
rf = ppline(RandomForestClassifier(random_state=2, max_depth=10, min_samples_leaf=2, n_estimators=500)).fit(X_train, y_train)

print("train set roc_auc_score : ", roc_auc_score(y_train, rf.predict(X_train)))
print("test set roc_auc_score : ", roc_auc_score(y_test, rf.predict(X_test)))

train set roc_auc_score :  0.733307988078033
test set roc_auc_score :  0.7234134963940452
