In [2]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score

In [3]:
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
dataset

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [4]:
# split data into X and y
X = dataset[:,0:8]
Y = dataset[:,8]
X.shape, Y.shape

((768, 8), (768,))

In [5]:
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [6]:
def isolate():
    # fit model no training data
    xgb_clf = XGBClassifier()
    xgb_clf.fit(X_train, y_train)

    # make predictions for test data
    y_pred = xgb_clf.predict(X_test)

    # evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

isolate()

Accuracy: 74.02%


In [7]:
def isolate():
    # fit model no training data
    rf_clf = RandomForestClassifier(random_state=17)
    rf_clf.fit(X_train, y_train)

    # make predictions for test data
    y_pred = rf_clf.predict(X_test)

    # evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

isolate()

Accuracy: 77.17%


In [8]:
def isolate():
    # fit model no training data
    ab_clf = AdaBoostClassifier(random_state=17)
    ab_clf.fit(X_train, y_train)

    # make predictions for test data
    y_pred = ab_clf.predict(X_test)

    # evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

isolate()

Accuracy: 72.44%


In [9]:
def isolate():
    xgb_clf = XGBClassifier()
    ada_clf = AdaBoostClassifier()
    rf_clf = RandomForestClassifier()

    class DummyEstimator(BaseEstimator):
        def fit(self): pass
        def score(self): pass

    pipe = Pipeline(steps=[
        ('clf', DummyEstimator())
    ])

    cv = GridSearchCV(estimator=pipe, 
                      param_grid=[
                        {'clf': [xgb_clf],
                         'clf__n_estimators': [90, 100, 110]},
                        {'clf': [ada_clf],
                         'clf__n_estimators': [80, 100, 120]},
                        {'clf': [rf_clf],
                         'clf__n_estimators': [70, 100, 130]}
                      ])
    cv.fit(X_train, y_train)

    print(cv.best_estimator_)
    print("Accuracy: %.2f%%" % (cv.best_score_ * 100.0))

isolate()

Pipeline(steps=[('clf', RandomForestClassifier(n_estimators=130))])
Accuracy: 75.67%
{'mean_fit_time': array([0.07287035, 0.08118114, 0.09069028, 0.04184036, 0.05037961,
       0.05903792, 0.03806953, 0.053548  , 0.06744938]), 'std_fit_time': array([0.02308397, 0.01794632, 0.01989827, 0.00061985, 0.00079203,
       0.00014079, 0.00161534, 0.00238485, 0.00065202]), 'mean_score_time': array([0.00093837, 0.0012044 , 0.00102692, 0.00295043, 0.00351319,
       0.00410094, 0.00236769, 0.00307064, 0.00398192]), 'std_score_time': array([2.48886159e-05, 2.70265042e-04, 1.20281301e-04, 5.99725811e-05,
       8.27675308e-05, 6.77227292e-05, 9.31688315e-05, 5.25104458e-05,
       1.41247392e-04]), 'param_clf': masked_array(data=[XGBClassifier(base_score=None, booster=None, callbacks=None,
                                 colsample_bylevel=None, colsample_bynode=None,
                                 colsample_bytree=None, early_stopping_rounds=None,
                                 enable_categori