In [1]:
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset bundled with scikit-learn and pull out
# the feature matrix and the label vector.
cancer_data = load_breast_cancer()
X_data, y_label = cancer_data.data, cancer_data.target

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_label,
    test_size=0.2,
    random_state=0,
)

In [2]:
# Base learners for the stacking ensemble.
# Every stochastic estimator gets a fixed seed so that Restart & Run All
# reproduces the same accuracies (KNN has no random component).
knn_clf = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
dt_clf = DecisionTreeClassifier(random_state=0)
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=0)

# Meta-learner that is trained on the base learners' predictions.
lr_final = LogisticRegression(C=10)

In [4]:
# Fit every base learner on the full training split.
for base_clf in (knn_clf, rf_clf, dt_clf):
    base_clf.fit(X_train, y_train)
# Kept as the cell's last expression so the fitted AdaBoost estimator is still displayed.
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(n_estimators=100)

In [6]:
# Predict the held-out test split with each fitted base learner.
knn_pred, rf_pred, dt_pred, ada_pred = [
    clf.predict(X_test) for clf in (knn_clf, rf_clf, dt_clf, ada_clf)
]

# Report the test accuracy of each base learner, one line per model.
accuracy_reports = (
    ('KNN accuracy: {0:.4f}', knn_pred),
    ('RandomForest accuracy: {0:.4f}', rf_pred),
    ('DecisionTree accuracy: {0:.4f}', dt_pred),
    ('AdaBoost accuracy: {0:.4f}', ada_pred),
)
for template, y_hat in accuracy_reports:
    print(template.format(accuracy_score(y_test, y_hat)))

KNN accuracy: 0.9211
RandomForest accuracy: 0.9649
DecisionTree accuracy: 0.9035
AdaBoost accuracy: 0.9561


In [9]:
# Stack the four prediction vectors as rows, then flip so that each row is a
# test sample and each column is one base learner's prediction.
base_preds = [knn_pred, rf_pred, dt_pred, ada_pred]
pred = np.array(base_preds).T
print(pred.shape)

(114, 4)


In [11]:
# Naive stacking: the meta-model is fitted on the base learners' test-set
# predictions together with the test labels, and then scored on those same rows.
# The reported accuracy is therefore likely optimistic (possibly overfitted);
# a cross-validation set is needed for an honest estimate.
final = lr_final.fit(pred, y_test).predict(pred)

print('Final meta model accuracy: {0:.4f}'.format(accuracy_score(y_test, final)))

Final meta model accuracy: 0.9737


### CV-set-based stacking

In [22]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
# Generating train/test data set for final meta model
def _as_positional_array(data):
    """Return ``data`` in a form that supports positional ``data[int_index_array]``."""
    # pandas objects index by label/column with ``obj[int_array]``; plain
    # sequences have no ``.shape``. Both are converted to ndarrays. Anything
    # else (ndarray, sparse matrix, ...) is passed through untouched.
    if hasattr(data, 'iloc') or not hasattr(data, 'shape'):
        return np.asarray(data)
    return data


def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds, random_state=0):
    """Build the meta-model train/test features produced by one base model.

    The training data is split with a shuffled K-fold. For every fold the model
    is fitted on the training part, then used to predict (a) the held-out part,
    which fills the out-of-fold training feature, and (b) the full test data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        NOTE: it is fitted in place, so after the call it is left fitted on the
        training part of the last fold only.
    X_train_n, y_train_n : training features and labels (ndarray, pandas
        object, or nested list / list).
    X_test_n : test features, same column layout as ``X_train_n``.
    n_folds : number of K-fold splits.
    random_state : seed for the K-fold shuffle (default 0, the value that was
        previously hard-coded).

    Returns
    -------
    train_fold_pred : ndarray of shape (n_train, 1)
        Out-of-fold predictions; training feature for the meta-model.
    test_pred_mean : ndarray of shape (n_test, 1)
        Per-sample mean of the ``n_folds`` test predictions; test feature for
        the meta-model. Because it is an average, it can be fractional even
        when ``predict`` returns integer labels.
    """
    X_train_n = _as_positional_array(X_train_n)
    y_train_n = _as_positional_array(y_train_n)
    X_test_n = _as_positional_array(X_test_n)

    # K-fold splitter over the training rows
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    # Output buffers: one out-of-fold column, one test-prediction column per fold
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    print(model.__class__.__name__, 'model start')

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        print('\t fold set', folder_counter, 'start')
        X_tr = X_train_n[train_index]
        y_tr = y_train_n[train_index]
        X_te = X_train_n[valid_index]
        # Fit on this fold's training part only
        model.fit(X_tr, y_tr)
        # Out-of-fold prediction: every training row is predicted exactly once,
        # by a model that did not see it during fitting
        train_fold_pred[valid_index, :] = np.asarray(model.predict(X_te)).reshape(-1, 1)
        # Prediction of the full test data by this fold's model
        test_pred[:, folder_counter] = model.predict(X_test_n)

    # Average the per-fold test predictions into a single column
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [27]:
# Produce the meta-model train/test feature of every base learner with 7-fold CV,
# in the order KNN -> RandomForest -> DecisionTree -> AdaBoost.
(knn_train, knn_test), (rf_train, rf_test), (dt_train, dt_test), (ada_train, ada_test) = [
    get_stacking_base_datasets(clf, X_train, y_train, X_test, 7)
    for clf in (knn_clf, rf_clf, dt_clf, ada_clf)
]

KNeighborsClassifier model start
	 fold set 0 start
	 fold set 1 start
	 fold set 2 start
	 fold set 3 start
	 fold set 4 start
	 fold set 5 start
	 fold set 6 start
RandomForestClassifier model start
	 fold set 0 start
	 fold set 1 start
	 fold set 2 start
	 fold set 3 start
	 fold set 4 start
	 fold set 5 start
	 fold set 6 start
DecisionTreeClassifier model start
	 fold set 0 start
	 fold set 1 start
	 fold set 2 start
	 fold set 3 start
	 fold set 4 start
	 fold set 5 start
	 fold set 6 start
AdaBoostClassifier model start
	 fold set 0 start
	 fold set 1 start
	 fold set 2 start
	 fold set 3 start
	 fold set 4 start
	 fold set 5 start
	 fold set 6 start


In [43]:
# Put the per-model (n_samples, 1) feature columns side by side:
# one column per base learner, one row per sample.
Stack_final_X_train = np.concatenate((knn_train, rf_train, dt_train, ada_train), axis=1)
Stack_final_X_test = np.concatenate((knn_test, rf_test, dt_test, ada_test), axis=1)

In [46]:
# Compare the shapes of the original features with the stacked meta-features.
shape_report_rows = (
    ('original training feature data shape:', X_train.shape,
     'original testing feature data shape:', X_test.shape),
    ('Stacking training feature data shape:', Stack_final_X_train.shape,
     'Stacking testing feature data shape:', Stack_final_X_test.shape),
)
for report_row in shape_report_rows:
    print(*report_row)

original training feature data shape: (455, 30) original testing feature data shape: (114, 30)
Stacking training feature data shape: (455, 4) Stacking testing feature data shape: (114, 4)


In [47]:
# Train the meta-model on the out-of-fold features and the training labels,
# then score it on the stacked test features.
lr_final.fit(Stack_final_X_train, y_train)
stack_final = lr_final.predict(Stack_final_X_test)
stack_final_accuracy = accuracy_score(y_test, stack_final)
print('Final meta model accuracy: {0:.4f}'.format(stack_final_accuracy))

Final meta model accuracy: 0.9737
