Hyperparameter Turing for rf with drug dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, plot_importance, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv("../data/trainDRUG.csv", encoding="utf8")
X = df.drop(labels=['再犯註記'], axis=1).values
y = df['再犯註記'].values
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=0)

In [3]:
model1 = RandomForestClassifier(random_state=42, n_jobs=-1)

In [4]:
n_estimators = np.arange(100, 2000, step=200)
max_features = ["auto", "sqrt", "log2"]
max_depth = [2,4,6,8]
min_samples_split = np.arange(2, 10, step=2)
min_samples_leaf = [1, 2, 3, 4, 5]
bootstrap = [True, False]

In [5]:
param_grid={
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap,
}

In [6]:
from sklearn.model_selection import RandomizedSearchCV
rand_cv = RandomizedSearchCV(model1, param_grid, n_iter=100, cv=5, scoring="roc_auc", n_jobs=-1)
rand_cv.fit(train_X, train_y)
print(rand_cv.best_score_)
print(rand_cv.best_params_) # obtain params by random search

0.7696156105583729
{'n_estimators': 300, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 8, 'bootstrap': True}


In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=1)
for d in [2,3,4,5,6,7,8]:
    rand_cv.best_param_['max_depth'] = d
    print("auc score of max depth[%d] is %f" % (d, round(np.mean(cross_val_score(rand_cv, test_X, test_y, scoring='roc_auc', cv=cv, n_jobs=-1)), 6)))

In [9]:
rand_cv.best_params_['max_depth']=5

In [10]:
rand_cv.best_params_

{'n_estimators': 300,
 'min_samples_split': 4,
 'min_samples_leaf': 3,
 'max_features': 'auto',
 'max_depth': 5,
 'bootstrap': True}