In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

fold = KFold(n_splits=5, shuffle=True, random_state=156)

lr_rg = LogisticRegression()
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
xgb = XGBClassifier()
lgbm = LGBMClassifier()


In [2]:
#  데이터 로드
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

#  데이터 합치기
df = pd.concat([train, test], axis=0)

# 결측치 대체
df.fillna(0, inplace=True)

# 레이블 int화
df["Survived"] = df["Survived"].astype("int")

# 수치형 데이터만 추출
df = df.select_dtypes(exclude="float")
df = df.select_dtypes(exclude="object")

# 레이블과 피쳐 분리
x = df.drop(columns="Survived", axis=1)
y = df["Survived"]


# 트레인 테스트 분리 
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=15)


train = pd.concat([x_train, y_train], axis=1)


In [None]:
model_ls = [lr_rg, dt_clf, rf_clf, xgb]
average_accuracy = []  

for model in model_ls:
    fold_accuracy = []  # 각 폴드에서의 정확도를 저장할 리스트
    print(model.__class__.__name__, '모델 시작')
    for i, (train_index, test_index) in enumerate(fold.split(train)):
        print("Fold 횟수:", i+1)
        feature = x_train.iloc[train_index]
        target = y_train.iloc[train_index]
        test_predioct = x_train.iloc[test_index] 
        test_test =  y_train.iloc[test_index]
        
        clf = model.fit(feature, target)
        pred = clf.predict(test_predioct)
        
        acc = accuracy_score(test_test, pred)
        fold_accuracy.append(acc)  # 폴드 정확도를 리스트에 추가
    
    avg_acc = sum(fold_accuracy) / len(fold_accuracy)  # 폴드 정확도의 평균 계산
    average_accuracy.append(avg_acc)  # 모델별 평균 정확도를 리스트에 추가

# 모델별 평균 정확도 출력
for model, avg_acc in zip(model_ls, average_accuracy):
    print(f"{model.__class__.__name__}의 평균 정확도:", avg_acc)


In [12]:
# 사용할 모델 ls
model_ls = [lr_rg, dt_clf, rf_clf, xgb]

# 결과를 담을 딕셔너리
rdict={'model':[], 'acc':[],'precision':[],'recall':[],'f1_score':[], 'AUC':[]}


# 스태킹 모델을 위한 test 데이터 만들기
test_list = []
for model in model_ls:
    clf = model.fit(x_train,y_train)
    pred_stk = clf.predict(x_test)
    test_list.append(pred_stk)
x_test_stk = pd.DataFrame(test_list).T
x_test_stk.columns = ["result_lr","result_dt", "result_rf", "result_xgb"] 
y_test.reset_index(drop=True,inplace=True)
x_test_stk["y_test"] = y_test


# 스태킹 모델을 위한 train 데이터 만들기
fold_list = []  # 메타 데이터를 담을 ls
df_pred = pd.DataFrame() # ls담긴 데이터를 데이터프레임으로 만들기 위해 빈 데이터프레임 생성
for model in model_ls:
    pred_list = []
    print(model.__class__.__name__, '모델 시작')
    for count_i, (train_index, test_index) in enumerate(fold.split(train)):
        print("Fold 횟수:", count_i+1)
        feature = x_train.iloc[train_index]
        target = y_train.iloc[train_index]
        test_predioct = x_train.iloc[test_index]
        clf = model.fit(feature, target) 
        pred = clf.predict(test_predioct) 
        
        pred_list.append(pred)

    df_pred["result"] = pd.DataFrame([arr for sublist in pred_list for arr in sublist])
    fold_list.append(df_pred["result"])

df_stk = pd.DataFrame(fold_list).T
df_stk.columns = ["result_lr","result_dt", "result_rf", "result_xgb"]        
y_train.reset_index(drop=True,inplace=True)
df_stk["target"] = y_train

# 메타 데이터 학습을 위한 x_train, y_train 정의
x_train_stk = df_stk.drop(columns="target", axis=1)
y_train_stk = df_stk["target"]

# lgbm이용해서 메타 데이터 학습 후 예측
lgbm.fit(x_train_stk, y_train_stk)
pred_stk = lgbm.predict(x_test_stk.drop(columns="y_test", axis=1))


rdict['model'].append("LGBMClassifier"); 
rdict['acc'].append(accuracy_score(np.array(x_test_stk["y_test"]),pred_stk))
rdict['precision'].append(precision_score(np.array(x_test_stk["y_test"]),pred_stk))
rdict['recall'].append(recall_score(np.array(x_test_stk["y_test"]),pred_stk))
rdict['f1_score'].append(f1_score(np.array(x_test_stk["y_test"]),pred_stk))
rdict['AUC'].append(roc_auc_score(np.array(x_test_stk["y_test"]),pred_stk))
pd.DataFrame(rdict)


LogisticRegression 모델 시작
Fold 횟수: 1
Fold 횟수: 2
Fold 횟수: 3
Fold 횟수: 4
Fold 횟수: 5
DecisionTreeClassifier 모델 시작
Fold 횟수: 1
Fold 횟수: 2
Fold 횟수: 3
Fold 횟수: 4
Fold 횟수: 5
RandomForestClassifier 모델 시작
Fold 횟수: 1
Fold 횟수: 2
Fold 횟수: 3
Fold 횟수: 4
Fold 횟수: 5
XGBClassifier 모델 시작
Fold 횟수: 1
Fold 횟수: 2
Fold 횟수: 3
Fold 횟수: 4
Fold 횟수: 5


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,model,acc,precision,recall,f1_score,AUC
0,LGBMClassifier,0.740458,0.0,0.0,0.0,0.5


In [14]:
def stacking_data_making(model):
    train_pred = np.zeros((x_train.shape[0], 1))
    test_pred = np.zeros((x_test.shape[0], 5))
    print(model.__class__.__name__, '모델 시작')

    for count_i, (train_index, test_index) in enumerate(fold.split(train)):
        print("Fold 횟수:", count_i+1)
        feature = x_train.iloc[train_index]
        target = y_train.iloc[train_index]
        test_predioct = x_train.iloc[test_index]

        model.fit(feature, target) 
        train_pred[test_index, :] = model.predict(test_predioct).reshape(-1,1)
        test_pred[:, count_i] = model.predict(x_test)
        
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)  
    return train_pred, test_pred_mean


LR_train, LR_test = stacking_data_making(lr_rg)
DT_train, DT_test = stacking_data_making(dt_clf)
RF_train, RF_test = stacking_data_making(rf_clf)
xgb_train, xgb_test = stacking_data_making(xgb)


rdict={'model':[], 'acc':[],'precision':[],'recall':[],'f1_score':[], 'AUC':[]}

# 스태킹 모델을 위한 메타 데이터 생성
meta_train = np.concatenate((LR_train, DT_train, RF_train, xgb_train), axis=1)
meta_test = np.concatenate((LR_test, DT_test, RF_test, xgb_test), axis=1)

# 스태킹 모델 학습 및 예측
stacking_model = LGBMClassifier()
stacking_model.fit(meta_train, y_train)
pred_stacking = stacking_model.predict(meta_test)

rdict['model'].append("LGBMClassifier"); 
rdict['acc'].append(accuracy_score(y_test,pred_stacking))
rdict['precision'].append(precision_score(y_test,pred_stacking))
rdict['recall'].append(recall_score(y_test,pred_stacking))
rdict['f1_score'].append(f1_score(y_test,pred_stacking))
rdict['AUC'].append(roc_auc_score(y_test,pred_stacking))
pd.DataFrame(rdict)



LogisticRegression 모델 시작
Fold 횟수: 1
Fold 횟수: 2
Fold 횟수: 3
Fold 횟수: 4
Fold 횟수: 5
DecisionTreeClassifier 모델 시작
Fold 횟수: 1
Fold 횟수: 2
Fold 횟수: 3
Fold 횟수: 4
Fold 횟수: 5
RandomForestClassifier 모델 시작
Fold 횟수: 1
Fold 횟수: 2
Fold 횟수: 3
Fold 횟수: 4
Fold 횟수: 5
XGBClassifier 모델 시작
Fold 횟수: 1
Fold 횟수: 2
Fold 횟수: 3
Fold 횟수: 4
Fold 횟수: 5


Unnamed: 0,model,acc,precision,recall,f1_score,AUC
0,LGBMClassifier,0.748092,0.513158,0.573529,0.541667,0.691404
