In [9]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
from xgboost.sklearn import XGBClassifier

dataset = load_breast_cancer()

# set dataFrame
# Features keep their column names so downstream output stays readable.
x = pd.DataFrame(dataset.data, columns=dataset.feature_names)
# Target as a flat 1-D array, which is what the scikit-learn estimators expect.
# The previous DataFrame.as_matrix() round-trip was removed in pandas 1.0 and
# raises AttributeError on current installs; taking the array directly yields
# the same values and shape.
y = np.asarray(dataset.target).ravel()

# set pipeline
def _scaled_pca_pipeline(estimator):
    """Wrap ``estimator`` in a StandardScaler -> PCA -> estimator Pipeline.

    The step names ('scl', 'pca', 'est') are relied on by the parameter grids
    defined later ('pca__n_components', 'est__...'), so they must not change.
    Fresh scaler and PCA instances are created on every call.
    """
    return Pipeline([('scl', StandardScaler()), ('pca', PCA()), ('est', estimator)])


pipe_knn = _scaled_pca_pipeline(KNeighborsClassifier())
# NOTE: the misspelled name `pipe_loigistics` is kept on purpose; later cells
# reference it under this exact spelling.
pipe_loigistics = _scaled_pca_pipeline(LogisticRegression(random_state=1))
pipe_rf = _scaled_pca_pipeline(RandomForestClassifier(random_state=1))
pipe_gbr = _scaled_pca_pipeline(GradientBoostingClassifier(random_state=1))
pipe_xgb = _scaled_pca_pipeline(XGBClassifier(random_state=1))

# crossvalidation
# Every model is scored the same way: 10-fold CV, ROC AUC per fold.
cv_settings = dict(cv=10, scoring='roc_auc')

cross_val_score_knn = cross_val_score(pipe_knn, x, y, **cv_settings)
cross_val_score_logistics = cross_val_score(pipe_loigistics, x, y, **cv_settings)
cross_val_score_rf = cross_val_score(pipe_rf, x, y, **cv_settings)
cross_val_score_gbr = cross_val_score(pipe_gbr, x, y, **cv_settings)
cross_val_score_xgb = cross_val_score(pipe_xgb, x, y, **cv_settings)

In [10]:
# print the performance
# One line per model: mean AUC across folds, then the fold-to-fold std.
auc_report = [
    ('AUC_KNN: %.6f', cross_val_score_knn),
    ('AUC_Logistics: %.6f', cross_val_score_logistics),
    ('AUC_RF: %.6f', cross_val_score_rf),
    ('AUC_GBR: %.6f', cross_val_score_gbr),
    ('AUC_XGB: %.6f', cross_val_score_xgb),
]
for mean_fmt, fold_scores in auc_report:
    print(mean_fmt % fold_scores.mean(), '+- %.3f' % fold_scores.std())

AUC_KNN: 0.989155 +- 0.012
AUC_Logistics: 0.996064 +- 0.005
AUC_RF: 0.968443 +- 0.019
AUC_GBR: 0.986562 +- 0.015
AUC_XGB: 0.989917 +- 0.011


In [12]:
# set search space
# Keys follow the Pipeline convention "<step name>__<parameter>", using the
# 'pca' and 'est' step names of the pipelines being tuned.
params_grid_logistics = {
    'pca__n_components': [5, 10, 15],
    'est__penalty': ['l1', 'l2'],
    'est__C': [0.1, 1.0, 10.0, 100.0],
    # Pin a solver that supports both penalties. Without this, scikit-learn
    # >= 0.22 uses 'lbfgs', which rejects penalty='l1', so half of this grid
    # would fail to fit. 'liblinear' was the default solver in older releases.
    'est__solver': ['liblinear'],
}

params_grid_xgb = {
    'pca__n_components': [5, 10, 15],
    'est__n_estimators': [200, 300, 400],
    'est__max_depth': [3, 5, 7]
}

# Exhaustive search over each grid, scored by 10-fold cross-validated ROC AUC.
gs_logistics = GridSearchCV(pipe_loigistics, params_grid_logistics, cv=10, scoring='roc_auc')
gs_xgb = GridSearchCV(pipe_xgb, params_grid_xgb, cv=10, scoring='roc_auc')

gs_logistics.fit(x, y)
# Left as the last expression so the cell still displays the fitted search.
gs_xgb.fit(x, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('est', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_by...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'pca__n_components': [5, 10, 15], 'est__n_estimators': [200, 300, 400], 'est__max_depth': [3, 5, 7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [13]:
# Report the best mean CV score and the winning parameter set of each search.
search_report = [
    ('Best score of logistics: %.6f', 'Best params of logistics:', gs_logistics),
    ('Best score of xgb : %.6f', 'Best params of xgb:', gs_xgb),
]
for score_fmt, params_label, search in search_report:
    print(score_fmt % search.best_score_)
    print(params_label, search.best_params_)

Best score of logistics: 0.996175
Best params of logistics: {'est__C': 1.0, 'est__penalty': 'l2', 'pca__n_components': 15}
Best score of xgb : 0.993922
Best params of xgb: {'est__max_depth': 3, 'est__n_estimators': 300, 'pca__n_components': 10}


In [14]:
# NOTE(review): both searches were fitted on (x, y) and are evaluated here on
# the same x, so these matrices are in-sample results, not an estimate of
# performance on unseen data. Consider a held-out split or out-of-fold
# predictions before drawing conclusions from them.
pred_logistics = gs_logistics.predict(x)
pred_xgb = gs_xgb.predict(x)

print('result of logistics:\n', confusion_matrix(y, pred_logistics))
print('result of xgb:\n', confusion_matrix(y, pred_xgb))

result of logistics:
 [[207   5]
 [  2 355]]
result of xgb:
 [[212   0]
 [  0 357]]


In [15]:
# Predicted class probabilities from the tuned XGBoost search for every row of
# x (the same data the search was fitted on).
proba_xgb = gs_xgb.predict_proba(x)
print(proba_xgb)

[[9.9524796e-01 4.7520576e-03]
 [9.9983031e-01 1.6969482e-04]
 [9.9860287e-01 1.3971312e-03]
 ...
 [9.9661928e-01 3.3807307e-03]
 [9.9929458e-01 7.0543849e-04]
 [3.3771992e-03 9.9662280e-01]]
