# Stacking

In [2]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [3]:
def stacking(models, meta_alg, data_train, targets_train, data_test, targets_test=None, random_state=None, test_size=None, cv=5):
    
    #Если размер тестовой выборки не задан
    if test_size is None:
        meta_mtrx = np.empty((targets_train.shape[0], len(models))) 
       
    #Запускаем цикл по заполнению мета-матрицы и обучению базовых моделей на полном объеме данных
        
        for n, model in enumerate(models):
            meta_mtrx[:, n] = cross_val_predict(model, data_train, 
                                                targets_train, cv=cv, 
                                                method='predict')
            model.fit(data_train, targets_train)
        
        #Обучаем мета-алгоритм на данных из мета-матрицы
        
        meta_alg.fit(meta_mtrx, targets_train)        
        meta_mtrx_test = np.empty((data_test.shape[0], len(models)))
        
        #Заполняем тестовую мета-матрицу предсказаниям базовых моделей
        
        for n, model in enumerate(models):
            meta_mtrx_test[:, n] = model.predict(data_test)

        #Делаем предсказания мета-алгоритма
        
        predicted = meta_alg.predict(meta_mtrx_test)

        #Запускаем проверку: не пустая ли матрица targets_test
        
        if targets_test is None:
            return predicted   
        else:
            print(f'{n} auc: {roc_auc_score(y_test, predicted)}')
        
    
    #Если размер тестовой выборки задан то разбиваем данные
    
    elif test_size > 0 and test_size < 1:

        
        train, valid, train_true, valid_true = train_test_split(data_train, 
                                                    targets_train,
                                                    test_size=test_size,
                                                    random_state=0)
        
      #Определяем мета-матрицу и заполняем значениями в цикле   
        meta_mtrx = np.empty((valid.shape[0], len(models))) 
      #В качестве значений мета-матрицы предсказания базовых моделей
        for n, model in enumerate(models):
            model.fit(data_train, targets_train)
            meta_mtrx[:, n] = model.predict(valid)
        #Обучаем мета-алгоритм на мета-матрице и валидирующей выборки от второго разбиения
        meta_alg.fit(meta_mtrx, valid_true)
    
        meta_mtrx_test = np.empty((x_test.shape[0], len(models))) 
        for n, model in enumerate(models):
            meta_mtrx_test[:, n] = model.predict(x_test)  
            
        predicted = meta_alg.predict(meta_mtrx_test)

        if targets_test is None:
            return predicted   
        else:
            print(f'{n} auc: {roc_auc_score(y_test, predicted)}')     

    else:
        raise ValueError("test_size must be between 0 and 1")


In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier


titanic = pd.read_csv('9.7_titanic.csv')
targets = titanic.Survived
data = titanic.drop(columns='Survived')

x_train, x_test, y_train, y_test = train_test_split(data, 
                                                    targets,
                                                    train_size=0.8,
                                                    random_state=17)

knn = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(random_state=17)
svc = SVC(random_state=17)
models = [knn, lr, svc]

meta = XGBClassifier(n_estimators=40)

In [5]:

stacking(models, meta, x_train, y_train, x_test, y_test, random_state=17, test_size = 0, cv=5)

ValueError: test_size must be between 0 and 1

In [6]:
stacking(models, meta, x_train, y_train, x_test, y_test, random_state=17, test_size = 0.3, cv=5)

2 auc: 0.646291031274231


In [7]:
stacking(models, meta, x_train, y_train, x_test, y_test, random_state=17, cv=5)

2 auc: 0.7601447402429569


In [8]:
stacking(models, meta, x_train, y_train, x_test, random_state=17, test_size = 0.3, cv=5)



array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1], dtype=int64)