In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.datasets import load_breast_cancer

In [5]:
import warnings

In [6]:
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides any warnings raised by the library calls further down.
warnings.filterwarnings(action='ignore')

In [7]:
# Load the breast-cancer dataset; its `.data` and `.target` attributes feed the split below.
cancer = load_breast_cancer()

In [8]:
# Hold out a test set. The seed is fixed so the same rows land in each split on every run;
# without it the partition (and therefore every score computed from it) changes per execution.
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=42)

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
# Base model 1: logistic regression with the library's default settings.
model_logistic = LogisticRegression()

In [11]:
# Fit the logistic-regression model on the training split.
model_logistic.fit(X_train, y_train)

LogisticRegression()

In [12]:
# Score the fitted model on the held-out test split (score() is mean accuracy for sklearn classifiers).
score_logistic = model_logistic.score(X_test, y_test)

In [13]:
# Display the logistic-regression test score.
score_logistic

0.9370629370629371

In [14]:
from sklearn.svm import SVC

In [15]:
# Base model 2: support vector classifier with C=1.
model_svc = SVC(C=1)

In [16]:
# Fit the SVC on the training split.
model_svc.fit(X_train, y_train)

SVC(C=1)

In [17]:
# Score the fitted SVC on the held-out test split.
score_svc = model_svc.score(X_test, y_test)

In [18]:
# Display the SVC test score.
score_svc

0.9370629370629371

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Base model 3: a small random forest (10 trees). The seed is fixed because tree construction
# is randomized; without it the fitted forest and its score differ on every run.
model_forest = RandomForestClassifier(n_estimators=10, random_state=42)

In [21]:
# Fit the random forest on the training split.
model_forest.fit(X_train, y_train)

RandomForestClassifier(n_estimators=10)

In [22]:
# Score the fitted random forest on the held-out test split.
score_forest = model_forest.score(X_test, y_test)

In [23]:
# Display the random-forest test score.
score_forest

0.965034965034965

In [24]:
# Meta-learner for the stacking step: it is fit further down on the base models'
# predictions rather than on the raw features.
lr_final = LogisticRegression(C=10)

In [25]:
from sklearn.model_selection import KFold

In [26]:
from sklearn.metrics import accuracy_score

In [27]:
def get_stacking_datasets(model, x_train_n, y_train_n, x_test_n, n_folds):
    """Build stacking meta-features from one base model via K-fold prediction.

    The training rows are split into ``n_folds`` consecutive (unshuffled)
    folds. For each fold the model is refit on the remaining rows and then
    used to predict (a) the held-out rows, which fills the corresponding rows
    of the training meta-feature, and (b) the full test set. The per-fold
    test predictions are averaged into a single test meta-feature.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        Refit in place once per fold; on return it holds the fit from the
        last fold, not a fit on the full training data.
    x_train_n : array-like with ``.shape`` that supports integer-array indexing
        Training features.
    y_train_n : array-like that supports integer-array indexing
        Training targets, aligned row-for-row with ``x_train_n``.
    x_test_n : array-like with ``.shape``
        Test features.
    n_folds : int
        Number of folds.

    Returns
    -------
    train_fold_pred : numpy.ndarray, shape (n_train, 1)
        Out-of-fold predictions for every training row.
    test_pred_mean : numpy.ndarray, shape (n_test, 1)
        Mean over folds of the predictions on the test rows. Because
        ``predict`` output is averaged, values can be fractional when the
        per-fold models disagree.
    """
    # random_state is deliberately not passed: it has no effect without
    # shuffling, and recent scikit-learn releases reject
    # shuffle=False + random_state with a ValueError. The folds are still
    # the same deterministic, consecutive blocks of rows.
    kf = KFold(n_splits=n_folds, shuffle=False)

    train_fold_pred = np.zeros((x_train_n.shape[0], 1))
    test_pred = np.zeros((x_test_n.shape[0], n_folds))
    print(model.__class__.__name__, '모델 시작')

    for fold_idx, (train_idx, valid_idx) in enumerate(kf.split(x_train_n)):
        print(f"Fold 횟수 : {fold_idx + 1}")
        x_tr = x_train_n[train_idx]
        y_tr = y_train_n[train_idx]
        x_val = x_train_n[valid_idx]

        model.fit(x_tr, y_tr)
        # Each training row is predicted by a model that never saw it.
        train_fold_pred[valid_idx, :] = model.predict(x_val).reshape(-1, 1)
        # One column of test predictions per fold; averaged after the loop.
        test_pred[:, fold_idx] = model.predict(x_test_n)

    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [28]:
# Produce one (train, test) meta-feature pair per base model, using 5 folds each.
train_logistic, test_logistic = get_stacking_datasets(
    model=model_logistic, x_train_n=X_train, y_train_n=y_train, x_test_n=X_test, n_folds=5
)
train_svm, test_svm = get_stacking_datasets(
    model=model_svc, x_train_n=X_train, y_train_n=y_train, x_test_n=X_test, n_folds=5
)
train_forest, test_forest = get_stacking_datasets(
    model=model_forest, x_train_n=X_train, y_train_n=y_train, x_test_n=X_test, n_folds=5
)

LogisticRegression 모델 시작
Fold 횟수 : 1
Fold 횟수 : 2
Fold 횟수 : 3
Fold 횟수 : 4
Fold 횟수 : 5
SVC 모델 시작
Fold 횟수 : 1
Fold 횟수 : 2
Fold 횟수 : 3
Fold 횟수 : 4
Fold 횟수 : 5
RandomForestClassifier 모델 시작
Fold 횟수 : 1
Fold 횟수 : 2
Fold 횟수 : 3
Fold 횟수 : 4
Fold 횟수 : 5


In [29]:
# Join the three base models' training prediction arrays along axis 1 to form
# the meta-learner's training features.
stack_final_x_train = np.concatenate((train_logistic, train_svm, train_forest), axis=1)

In [30]:
# Same layout for the test side: the three base models' test prediction arrays, joined along axis 1.
stack_final_x_test = np.concatenate((test_logistic, test_svm, test_forest), axis=1)

In [31]:
# Train the meta-learner on the stacked predictions, with the original training labels as targets.
lr_final.fit(stack_final_x_train, y_train)

LogisticRegression(C=10)

In [32]:
# Predict test labels from the stacked test features.
stack_final_pred = lr_final.predict(stack_final_x_test)

In [33]:
# Report the meta-learner's accuracy against the test labels, formatted to four decimal places.
print(f"최종 메타모델 정확도 : {accuracy_score(y_test, stack_final_pred):.4f}")

최종 메타모델 정확도 : 0.9580
