In [238]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

model_RF = RandomForestClassifier()
model_DT = DecisionTreeClassifier()
model_LR = LogisticRegression()
model_LGBM = LGBMClassifier()
model_XGB = XGBClassifier()


In [239]:
df = pd.read_csv("./타이타닉/train.csv")
X_test = pd.read_csv("./타이타닉/X_test.csv")
y_test = pd.read_csv("./타이타닉/y_test.csv")
y_train =df["Survived"]
X_train = df.drop("Survived", axis=1)
y_test = y_test["Survived"]

X_train = X_train.select_dtypes(exclude="object")
X_train = X_train.fillna(0)
X_test = X_test.select_dtypes(exclude="object")
X_test = X_test.fillna(0)

X_train_re = X_train.copy()
X_test_re = X_test.copy()

X_train_re.columns = list(range(1,len(X_train.columns)+1))
X_test_re.columns = list(range(1,len(X_train.columns)+1))

In [240]:
# 개별모델 내부에서 CV 적용해 Stacking하는 함수 구현(k=5)
from sklearn.model_selection import KFold

def get_stacking_datasets(model):

    # 최종 메타 모델이 사용할 학습 데이터 반환을 위해서 넘파이 배열을 0으로 만들어서 초기화
    train_fold_pred = np.zeros((X_train_re.shape[0], 1)) # 2차원으로
    test_pred = np.zeros((X_test_re.shape[0], 5)) # n_splits=5
    print(model.__class__.__name__, '모델 시작')
    
    for folder_counter, (train_idx, valid_idx) in enumerate(KFold(n_splits=5, shuffle=True, random_state=42).split(X_train_re)):
        # 개별 모델 내부에서 학습하고 1개의 fold로 예측할 데이터 셋 추출
        print(f" Fold 횟수 : {folder_counter+1}")
        X_train_ = X_train_re.iloc[train_idx]
        y_train_ = y_train.iloc[train_idx]
        X_test_ = X_train_re.iloc[valid_idx]
        
        # 개별 모델이 학습한 후 1개의 fold데이터셋으로 예측값 반환 후 최종 메타모델이 학습할 데이터셋에 첨가
        model.fit(X_train_, y_train_)
        train_fold_pred[valid_idx, :] = model.predict(X_test_).reshape(-1,1)
        # 개별 모델이 원본 데이터셋의 검증 데이터셋을 기반으로 예측 결과값 반환 후 최종 메타모델이 검증할 데이터셋에 첨가
        test_pred[:, folder_counter] = model.predict(X_test_re)
    
    # 개별모델안에서 테스트 데이터셋을 기반으로 예측한 결과값들 mean취해주고 2차원으로 바꾸어주기
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1,1)
    
    return train_fold_pred, test_pred_mean

In [241]:

LR_train, LR_test = get_stacking_datasets(model_LR)
DT_train, DT_test = get_stacking_datasets(model_DT)
RF_train, RF_test = get_stacking_datasets(model_RF)
LGBM_train, LGBM_test = get_stacking_datasets(model_LGBM)

new_x_train = np.concatenate((LR_train, DT_train, RF_train,LGBM_train), axis = 1)
new_x_test = np.concatenate(( LR_test, DT_test,  RF_test,LGBM_test), axis = 1)

# # meta learner
model_RF.fit(new_x_train, y_train) # 최종모델 XGB
y_test = pd.DataFrame(y_test)
y_hat_train = pd.DataFrame(model_RF.predict(new_x_train))
y_hat = pd.DataFrame(model_RF.predict(new_x_test))

LogisticRegression 모델 시작
 Fold 횟수 : 1
 Fold 횟수 : 2
 Fold 횟수 : 3
 Fold 횟수 : 4
 Fold 횟수 : 5
DecisionTreeClassifier 모델 시작
 Fold 횟수 : 1
 Fold 횟수 : 2
 Fold 횟수 : 3
 Fold 횟수 : 4
 Fold 횟수 : 5
RandomForestClassifier 모델 시작
 Fold 횟수 : 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 Fold 횟수 : 2
 Fold 횟수 : 3
 Fold 횟수 : 4
 Fold 횟수 : 5
LGBMClassifier 모델 시작
 Fold 횟수 : 1
 Fold 횟수 : 2
 Fold 횟수 : 3
 Fold 횟수 : 4
 Fold 횟수 : 5


In [242]:
rdict={'model':[], "acc_train":[], "auc_train":[], 'acc_test':[],'precision':[],'recall':[],'f1_score':[], 'AUC_test':[]}

results_train  = (round(accuracy_score(y_train,y_hat_train),2),round(roc_auc_score(y_train,y_hat_train),2))

results = (round(accuracy_score(y_test,y_hat),2),
                round(precision_score(y_test,y_hat),2),
                round(recall_score(y_test,y_hat),2),
                round(f1_score(y_test,y_hat),2),
                round(roc_auc_score(y_test,y_hat),2))

rdict['model'].append("SV"); 
rdict['acc_train'].append(results_train[0])
rdict['auc_train'].append(results_train[1])
rdict['acc_test'].append(results[0])
rdict['precision'].append(results[1])
rdict['recall'].append(results[2])
rdict['f1_score'].append(results[3])
rdict['AUC_test'].append(results[4])

rdf_stacking = pd.DataFrame(data=rdict)
rdf_stacking

Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,SV,0.73,0.69,0.64,0.51,0.47,0.49,0.61
