# ensemble

In [1]:
import numpy as np
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.datasets import load_iris
import xgboost as xgb

### load data

In [2]:
iris = load_iris()
data = iris.data
label = iris.target

In [3]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.4)

In [4]:
np.random.seed(2018)
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2018)

In [5]:
clfs = [
    RandomForestClassifier(n_estimators=100, max_features="log2", max_depth=10, min_samples_leaf=1, bootstrap=True, n_jobs=-1, random_state=1),
    ExtraTreesClassifier(n_estimators=100, criterion="gini", max_features="log2", max_depth=10, min_samples_split=2, min_samples_leaf=1,bootstrap=True, n_jobs=-1, random_state=1),
    GradientBoostingClassifier(learning_rate=0.05),
    AdaBoostClassifier(n_estimators=200)
]

In [6]:
dataset_blend_train = np.zeros((x_train.shape[0], len(clfs)))
dataset_blend_test = np.zeros((x_test.shape[0], len(clfs)))

In [7]:
for i, clf in enumerate(clfs):
    dataset_blend_test_j = np.zeros((x_test.shape[0], n_folds))
    for j,(train_index, test_index) in enumerate(skf.split(x_train, y_train)):
        x_t = x_train[train_index]
        y_t = y_train[train_index]
        clf.fit(x_t, y_t)
        dataset_blend_train[test_index, i] = clf.predict(x_train[test_index])
        dataset_blend_test_j[:, j] = clf.predict(x_test)
    dataset_blend_test[:, i] = dataset_blend_test_j.sum(axis=1) // (n_folds//2 + 1)

### L2. XBD

In [8]:
def cross_validate_xgb(params, x_train, y_train, x_test, y_test, kf, verbose=True,
                      verbose_eval=50, num_boost_round=4000, use_rank=True):
    train_pred = np.zeros((x_train.shape[0]))
    test_pred = np.zeros((x_test.shape[0]))
    dataset_blend_test_j = np.zeros((x_test.shape[0], n_folds))
    for i, (train_index, test_index) in enumerate(skf.split(x_train, y_train)):
        x_train_kf, x_val_kf = x_train[train_index], x_train[test_index]
        y_train_kf, y_val_kf = y_train[train_index], y_train[test_index]
        d_train_kf = xgb.DMatrix(x_train_kf, y_train_kf)
        d_val_kf = xgb.DMatrix(x_val_kf, y_val_kf)
        d_test = xgb.DMatrix(x_test, y_test)
        
        bst = xgb.train(params, d_train_kf, num_boost_round=num_boost_round)
        train_pred[test_index] = bst.predict(d_val_kf, ntree_limit=bst.best_ntree_limit)
        dataset_blend_test_j[:, i] = bst.predict(d_test)
        
    test_pred = dataset_blend_test_j.sum(axis=1) // (n_folds //2 + 1)
    if verbose:
        print('xgb train:{}'.format(classification_report(y_train, train_pred)))
        print('xgb test:{}'.format(classification_report(y_test, test_pred)))
    return train_pred, test_pred

In [9]:
xgb_params = {
    "objective" :  'multi:softmax',
    "eta": 0.1,
    "max_depth": 5,
    "min_child_weight": 10,
    "gamma": 0.70,
    "subsample": 0.76,
    "colsample_bytree": 0.95,
    "nthread": 6,
    "seed": 0,
    'silent': 1,
    'num_class': 3,
}

d_train = xgb.DMatrix(dataset_blend_train, y_train)
d_test = xgb.DMatrix(dataset_blend_test, y_test)

bst = xgb.train(xgb_params, d_train, num_boost_round=10)
preds = bst.predict(d_test)

In [10]:
print(classification_report(y_test, preds))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        14
          1       1.00      0.91      0.95        23
          2       0.92      1.00      0.96        23

avg / total       0.97      0.97      0.97        60



### Logistic Regression

In [12]:
clf = LogisticRegression()
clf.fit(dataset_blend_train, y_train)  
prediction = clf.predict(dataset_blend_test)

In [13]:
print(classification_report(y_test, prediction))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        14
          1       1.00      0.91      0.95        23
          2       0.92      1.00      0.96        23

avg / total       0.97      0.97      0.97        60

