In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import  OrdinalEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve

import lightgbm as lgb
import xgboost as xgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

numeric_columns = [
    "임신 시도 또는 마지막 임신 경과 연수",
    "총 생성 배아 수",
    "미세주입된 난자 수",
    "미세주입에서 생성된 배아 수",
    "이식된 배아 수",
    "미세주입 배아 이식 수",
    "저장된 배아 수",
    "미세주입 후 저장된 배아 수",
    "해동된 배아 수",
    "해동 난자 수",
    "수집된 신선 난자 수",
    "저장된 신선 난자 수",
    "혼합된 난자 수",
    "파트너 정자와 혼합된 난자 수",
    "기증자 정자와 혼합된 난자 수",
    "난자 채취 경과일",
    "난자 해동 경과일",
    "난자 혼합 경과일",
    "배아 이식 경과일",
    "배아 해동 경과일"
]

categorical_columns = [
    "시술 시기 코드",
    "시술 당시 나이",
    "시술 유형",
    "특정 시술 유형",
    "배란 자극 여부",
    "배란 유도 유형",
    "단일 배아 이식 여부",
    "착상 전 유전 검사 사용 여부",
    "착상 전 유전 진단 사용 여부",
    "남성 주 불임 원인",
    "남성 부 불임 원인",
    "여성 주 불임 원인",
    "여성 부 불임 원인",
    "부부 주 불임 원인",
    "부부 부 불임 원인",
    "불명확 불임 원인",
    "불임 원인 - 난관 질환",
    "불임 원인 - 남성 요인",
    "불임 원인 - 배란 장애",
    "불임 원인 - 여성 요인",
    "불임 원인 - 자궁경부 문제",
    "불임 원인 - 자궁내막증",
    "불임 원인 - 정자 농도",
    "불임 원인 - 정자 면역학적 요인",
    "불임 원인 - 정자 운동성",
    "불임 원인 - 정자 형태",
    "배아 생성 주요 이유",
    "총 시술 횟수",
    "클리닉 내 총 시술 횟수",
    "IVF 시술 횟수",
    "DI 시술 횟수",
    "총 임신 횟수",
    "IVF 임신 횟수",
    "DI 임신 횟수",
    "총 출산 횟수",
    "IVF 출산 횟수",
    "DI 출산 횟수",
    "난자 출처",
    "정자 출처",
    "난자 기증자 나이",
    "정자 기증자 나이",
    "동결 배아 사용 여부",
    "신선 배아 사용 여부",
    "기증 배아 사용 여부",
    "대리모 여부",
    "PGD 시술 여부",
    "PGS 시술 여부"
]

# === 이상치 대체 (Winsorizing) 함수 정의 ===
def winsorize(df, numeric_cols, factor=1.5):
    df_new = df.copy()
    for col in numeric_cols:
        Q1 = df_new[col].quantile(0.25)
        Q3 = df_new[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - factor * IQR
        upper_bound = Q3 + factor * IQR
        df_new[col] = df_new[col].clip(lower=lower_bound, upper=upper_bound)
    return df_new

train = pd.read_csv('./data/train.csv').drop(columns=['ID'])
test = pd.read_csv('./data/test.csv').drop(columns=['ID'])

train = winsorize(train, numeric_columns, factor=1.5)
test = winsorize(test, numeric_columns, factor=1.5)

X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

# 카테고리형 컬럼들을 문자열로 변환
for col in categorical_columns:
    X[col] = X[col].astype(str)
    test[col] = test[col].astype(str)

ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train_encoded = X.copy()
X_train_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

X_test_encoded = test.copy()
X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

# 수치형 컬럼들을 0으로 채움
X_train_encoded[numeric_columns] = X_train_encoded[numeric_columns].fillna(0)
X_test_encoded[numeric_columns] = X_test_encoded[numeric_columns].fillna(0)

# categorical_feature에 인덱스를 전달
categorical_feature_indices = [X_train_encoded.columns.get_loc(col) for col in categorical_columns]

# 스케일링 적용
scaler = StandardScaler()
X_train_encoded[numeric_columns] = scaler.fit_transform(X_train_encoded[numeric_columns])
X_test_encoded[numeric_columns] = scaler.transform(X_test_encoded[numeric_columns])

# # 데이터 불균형 확인 및 SMOTE 적용
# smote = SMOTE(random_state=42)
# X_train_encoded, y = smote.fit_resample(X_train_encoded, y)

# 데이터 불균형 확인 및 SMOTE + Tomek 적용
smt = SMOTETomek(sampling_strategy=0.8, random_state=42)
X_train_encoded, y = smt.fit_resample(X_train_encoded, y)

# Feature Selection (SelectKBest)
selector = SelectKBest(score_func=mutual_info_classif, k=40)  # 가장 중요한 40개의 변수를 선택
X_train_encoded = selector.fit_transform(X_train_encoded, y)
X_test_encoded = selector.transform(X_test_encoded)

In [2]:
# -------------------------- 3. 하이퍼파라미터 튜닝 --------------------------
# LightGBM 하이퍼파라미터 그리드 (scikit-learn API 사용)
lgb_param_grid = {
    'n_estimators': [1000],
    'learning_rate': [0.01],
    'num_leaves': [70],
    'max_depth': [-1]
}

lgb_est = LGBMClassifier(random_state=42, objective='binary')
grid_lgb = GridSearchCV(lgb_est, lgb_param_grid, scoring='roc_auc', cv=3, n_jobs=-1, verbose=1)
grid_lgb.fit(X_train_encoded, y)
best_lgb = grid_lgb.best_estimator_
print("LightGBM 최적 파라미터:", grid_lgb.best_params_)
# LightGBM 최적 파라미터: {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 1000, 'num_leaves': 70}


Fitting 3 folds for each of 1 candidates, totalling 3 fits




[LightGBM] [Info] Number of positive: 96877, number of negative: 122227
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044740 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8005
[LightGBM] [Info] Number of positive: 96877, number of negative: 122226
[LightGBM] [Info] Number of positive: 96876, number of negative: 122227
[LightGBM] [Info] Number of data points in the train set: 219104, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442151 -> initscore=-0.232438
[LightGBM] [Info] Start training from score -0.232438
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052966 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8931
[LightGBM] [Info] Number of data points in the train set: 219103, number of used features: 40
[Light



[LightGBM] [Info] Number of positive: 145315, number of negative: 183340
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042933 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8851
[LightGBM] [Info] Number of data points in the train set: 328655, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442151 -> initscore=-0.232439
[LightGBM] [Info] Start training from score -0.232439
LightGBM 최적 파라미터: {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 1000, 'num_leaves': 70}


In [None]:
# XGBoost 하이퍼파라미터 그리드
xgb_param_grid = {
    'n_estimators': [300],
    'learning_rate': [0.05],
    'max_depth': [11],
    'subsample': [0.8],
    'colsample_bytree': [0.7],
    'gamma': [0]
}

xgb_est = XGBClassifier(eval_metric='auc', random_state=42)
grid_xgb = GridSearchCV(xgb_est, xgb_param_grid, scoring='roc_auc', cv=3, n_jobs=-1, verbose=1)
grid_xgb.fit(X_train_encoded, y)
best_xgb = grid_xgb.best_estimator_
print("XGBoost 최적 파라미터:", grid_xgb.best_params_)
# XGBoost 최적 파라미터: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}
print("________________________________________________________________________________ \n")

Fitting 3 folds for each of 9 candidates, totalling 27 fits
XGBoost 최적 파라미터: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 11, 'n_estimators': 300, 'subsample': 0.8}
________________________________________________________________________________ 



In [4]:
# -------------------------- 4. Stacking Ensemble 구성 --------------------------
# StackingClassifier: base estimator로 최적의 LightGBM과 XGBoost, 메타 모델로 LogisticRegression 사용
stacking_clf = StackingClassifier(
    estimators=[('lgb', best_lgb), ('xgb', best_xgb)],
    final_estimator=LogisticRegression(C=0.9, penalty='l2', solver='lbfgs',random_state=42),
    cv=3,
    n_jobs=-1,
    passthrough=True
)

# -------------------------- 5. 교차 검증을 통한 성능 비교 --------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_lgb = []
cv_scores_xgb = []
cv_scores_stack = []

# 교차 검증을 위해 numpy 배열로 변환 (SelectKBest 결과는 numpy array)
X_fs = X_train_encoded  # 이미 numpy array
y_array = y.values  # numpy array로 변환

for train_idx, val_idx in cv.split(X_fs, y_array):
    X_cv_train, X_cv_val = X_fs[train_idx], X_fs[val_idx]
    y_cv_train, y_cv_val = y_array[train_idx], y_array[val_idx]
    
    # 개별 모델 예측 (최적 모델로 재학습)
    best_lgb.fit(X_cv_train, y_cv_train)
    lgb_proba = best_lgb.predict_proba(X_cv_val)[:, 1]
    auc_lgb = roc_auc_score(y_cv_val, lgb_proba)
    cv_scores_lgb.append(auc_lgb)
    
    best_xgb.fit(X_cv_train, y_cv_train)
    xgb_proba = best_xgb.predict_proba(X_cv_val)[:, 1]
    auc_xgb = roc_auc_score(y_cv_val, xgb_proba)
    cv_scores_xgb.append(auc_xgb)
    
    # Stacking 앙상블 예측
    stacking_clf.fit(X_cv_train, y_cv_train)
    stack_proba = stacking_clf.predict_proba(X_cv_val)[:, 1]
    auc_stack = roc_auc_score(y_cv_val, stack_proba)
    cv_scores_stack.append(auc_stack)

print("------------- 교차 검증 평균 ROC-AUC -------------")
print(f"LightGBM: {np.mean(cv_scores_lgb):.4f}")
print(f"XGBoost: {np.mean(cv_scores_xgb):.4f}")
print(f"Stacking Ensemble: {np.mean(cv_scores_stack):.4f}")
print("---------------------------------------------------\n")

# ------------- 교차 검증 평균 ROC-AUC -------------
# LightGBM: 0.9141
# XGBoost: 0.9117
# Stacking Ensemble: 0.8943
# ---------------------------------------------------

[LightGBM] [Info] Number of positive: 116252, number of negative: 146672
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020018 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8835
[LightGBM] [Info] Number of data points in the train set: 262924, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442151 -> initscore=-0.232439
[LightGBM] [Info] Start training from score -0.232439




[LightGBM] [Info] Number of positive: 116252, number of negative: 146672
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017285 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8835
[LightGBM] [Info] Number of data points in the train set: 262924, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442151 -> initscore=-0.232439
[LightGBM] [Info] Start training from score -0.232439




[LightGBM] [Info] Number of positive: 77502, number of negative: 97781
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017852 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8056
[LightGBM] [Info] Number of data points in the train set: 175283, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442154 -> initscore=-0.232427
[LightGBM] [Info] Start training from score -0.232427
[LightGBM] [Info] Number of positive: 77501, number of negative: 97782
[LightGBM] [Info] Number of positive: 77501, number of negative: 97781
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045775 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8009
[LightGBM] [Info] Number of data points in the train set: 175283, number of used features: 40
[LightGBM

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 116252, number of negative: 146672
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022940 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8669
[LightGBM] [Info] Number of data points in the train set: 262924, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442151 -> initscore=-0.232439
[LightGBM] [Info] Start training from score -0.232439




[LightGBM] [Info] Number of positive: 116252, number of negative: 146672
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021723 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8669
[LightGBM] [Info] Number of data points in the train set: 262924, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442151 -> initscore=-0.232439
[LightGBM] [Info] Start training from score -0.232439




[LightGBM] [Info] [LightGBM] [Info] Number of positive: 77501, number of negative: 97781
Number of positive: 77502, number of negative: 97781
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8061
[LightGBM] [Info] Number of positive: 77501, number of negative: 97782
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058048 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8905
[LightGBM] [Info] Number of data points in the train set: 175283, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442154 -> initscore=-0.232427
[LightGBM] [Info] Start training from score -0.232427
[LightGBM] [Info] Number of data points

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 116252, number of negative: 146672
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8838
[LightGBM] [Info] Number of data points in the train set: 262924, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442151 -> initscore=-0.232439
[LightGBM] [Info] Start training from score -0.232439




[LightGBM] [Info] Number of positive: 116252, number of negative: 146672
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019821 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8838
[LightGBM] [Info] Number of data points in the train set: 262924, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442151 -> initscore=-0.232439
[LightGBM] [Info] Start training from score -0.232439




[LightGBM] [Info] Number of positive: 77501, number of negative: 97781
[LightGBM] [Info] Number of positive: 77502, number of negative: 97781
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044879 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8904
[LightGBM] [Info] Number of data points in the train set: 175282, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442150 -> initscore=-0.232439
[LightGBM] [Info] Start training from score -0.232439
[LightGBM] [Info] Number of positive: 77501, number of negative: 97782
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8065
[LightGBM] [Info] Number of data points in the train set: 175283, number of used features: 40
[LightGBM

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 116252, number of negative: 146672
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017584 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8664
[LightGBM] [Info] Number of data points in the train set: 262924, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442151 -> initscore=-0.232439
[LightGBM] [Info] Start training from score -0.232439




[LightGBM] [Info] Number of positive: 116252, number of negative: 146672
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014129 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8664
[LightGBM] [Info] Number of data points in the train set: 262924, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442151 -> initscore=-0.232439
[LightGBM] [Info] Start training from score -0.232439




[LightGBM] [Info] Number of positive: 77501, number of negative: 97781
[LightGBM] [Info] Number of positive: 77501, number of negative: 97782
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079902 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8917
[LightGBM] [Info] Number of data points in the train set: 175282, number of used features: 40
[LightGBM] [Info] Number of positive: 77502, number of negative: 97781
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442150 -> initscore=-0.232439
[LightGBM] [Info] Start training from score -0.232439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8000
[LightGBM] [Info] Number of data points in the train set: 175283, number of used features: 40
[LightGBM

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 116252, number of negative: 146672
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018573 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8837
[LightGBM] [Info] Number of data points in the train set: 262924, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442151 -> initscore=-0.232439
[LightGBM] [Info] Start training from score -0.232439




[LightGBM] [Info] Number of positive: 116252, number of negative: 146672
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025007 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8837
[LightGBM] [Info] Number of data points in the train set: 262924, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442151 -> initscore=-0.232439
[LightGBM] [Info] Start training from score -0.232439




[LightGBM] [Info] Number of positive: 77502, number of negative: 97781
[LightGBM] [Info] Number of positive: 77501, number of negative: 97781
[LightGBM] [Info] Number of positive: 77501, number of negative: 97782
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044499 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8053
[LightGBM] [Info] Number of data points in the train set: 175283, number of used features: 40
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8911
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442154 -> initscore=-0.232427
[LightGBM] [Info] Start training from score -0.232427
[LightGBM] [Info] Number of data points

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


------------- 교차 검증 평균 ROC-AUC -------------
LightGBM: 0.8950
XGBoost: 0.8951
Stacking Ensemble: 0.8918
---------------------------------------------------



In [5]:
# -------------------------- 6. 최종 모델 학습 및 제출 파일 생성 --------------------------

# 전체 학습 데이터(Feature Selection 결과)로 Stacking 모델 학습
stacking_clf.fit(X_train_encoded, y)
train_pred_proba = stacking_clf.predict_proba(X_train_encoded)[:, 1]

# 테스트 데이터에 대한 예측 (Stacking 앙상블)
final_pred_proba = stacking_clf.predict_proba(X_test_encoded)[:, 1]

# ROC Curve 계산
fpr, tpr, thresholds = roc_curve(y, train_pred_proba)

# Optimal Cut-off (Youden's Index)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(f"최적 임계값 (ROC 기준): {optimal_threshold:.2f}")

# 최적 임계값 적용
final_predictions = (final_pred_proba >= optimal_threshold).astype(int)

# 제출 파일 생성
sample_submission = pd.read_csv('./data/sample_submission.csv')
sample_submission['probability'] = final_predictions
sample_submission.to_csv('./submit/stacking_ensemble_submit.csv', index=False)

print("최종 제출 파일 'stacking_ensemble_submit.csv' 생성 완료!")



[LightGBM] [Info] Number of positive: 145315, number of negative: 183340
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026213 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8851
[LightGBM] [Info] Number of data points in the train set: 328655, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442151 -> initscore=-0.232439
[LightGBM] [Info] Start training from score -0.232439




[LightGBM] [Info] Number of positive: 96876, number of negative: 122227
[LightGBM] [Info] Number of positive: 96877, number of negative: 122227
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8112
[LightGBM] [Info] Number of data points in the train set: 219103, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442148 -> initscore=-0.232448
[LightGBM] [Info] Start training from score -0.232448
[LightGBM] [Info] Number of positive: 96877, number of negative: 122226
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.090538 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8005
[LightGBM] [Info] Number of data poi

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


최적 임계값 (ROC 기준): 0.55
최종 제출 파일 'stacking_ensemble_submit.csv' 생성 완료!
