<a href="https://colab.research.google.com/github/kyungddin/dofbot_project/blob/main/0225_%E1%84%80%E1%85%A7%E1%86%BC%E1%84%86%E1%85%B5%E1%86%AB_%E1%84%83%E1%85%A1%E1%84%8B%E1%85%B5%E1%84%8B%E1%85%A5%E1%84%90%E1%85%B3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import + CSV

In [None]:
!pip install prince

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import prince

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Read the train and test tables and discard the 'ID' column from each.
train, test = (
    pd.read_csv(csv_path).drop(columns=['ID'])
    for csv_path in ('./train.csv', './test.csv')
)

# EDA

### 단순 drop (수정: 그냥 결측치 많은 거 전부 제거 + 난자 채취 경과일 살리기)

In [None]:
# Columns removed outright from both frames, dropped in a single call per
# frame.
# NOTE(review): the section heading mentions keeping '난자 채취 경과일', yet it
# is dropped here as well - confirm which one is intended.
_simple_drop_cols = [
    '임신 시도 또는 마지막 임신 경과 연수',
    '착상 전 유전 검사 사용 여부',
    '난자 해동 경과일',
    '배아 해동 경과일',
    'PGS 시술 여부',
    'PGD 시술 여부',
    '동결 배아 사용 여부',
    '난자 채취 경과일',
    '난자 혼합 경과일',
    '배아 이식 경과일',
]
train.drop(columns=_simple_drop_cols, inplace=True)
test.drop(columns=_simple_drop_cols, inplace=True)

### DI 시술 여부일 때 결측치 -1 처리

In [None]:
# Micro-injection / egg-mixing count columns are removed from both frames.
_di_drop_cols = [
    '미세주입된 난자 수',
    '미세주입에서 생성된 난자 수',
    '미세주입에서 배아 이식 수',
    '미세주입 후 저장된 배아 수',
    '혼합된 난자 수',
    '파트너 정자와 혼합된 난자 수',
    '기증자 정자와 혼합된 난자 수',
]
train.drop(columns=_di_drop_cols, inplace=True)
test.drop(columns=_di_drop_cols, inplace=True)

# Remaining embryo / egg count columns: missing values become the sentinel -1.
features = [
    '총 생성 배아 수',
    '이식된 배아 수',
    '저장된 배아 수',
    '해동된 배아 수',
    '해동 난자 수',
    '수집된 신선 난자 수',
    '저장된 신선 난자 수',
]
for _df in (train, test):
    _df[features] = _df[features].fillna(-1)

### DI 시술 여부일 때 결측치 -1 처리 후 원 핫 인코딩 (추가됨)

In [None]:
# Flag columns whose missing values become -1 before one-hot encoding.
cols_to_fill = ['신선 배아 사용 여부', '기증 배아 사용 여부', '단일 배아 이식 여부']

for _df in (train, test):
    # Fill missing flags with the sentinel -1.
    _df[cols_to_fill] = _df[cols_to_fill].fillna(-1)
    # These two columns are not used further.
    _df.drop(columns=['대리모 여부', '착상 전 유전 진단 사용 여부'], inplace=True)

# One-hot encode the filled columns; get_dummies replaces the originals with
# indicator columns.
# NOTE(review): train and test are encoded independently, so the resulting
# column sets only match if both frames contain the same category values.
train = train_encoded = pd.get_dummies(train, columns=cols_to_fill)
test = test_encoded = pd.get_dummies(test, columns=cols_to_fill)


### 주/부 count feature 추가

In [None]:
# New count column -> the three source columns that are added together.
_cause_groups = {
    "주 불임 원인 개수": ["남성 주 불임 원인", "여성 주 불임 원인", "부부 주 불임 원인"],
    "부 불임 원인 개수": ["남성 부 불임 원인", "여성 부 불임 원인", "부부 부 불임 원인"],
}

for _df in (train, test):
    for _new_col, (_male, _female, _couple) in _cause_groups.items():
        # Plain '+' is kept on purpose: a missing value in any source column
        # propagates to the sum.
        _df[_new_col] = _df[_male] + _df[_female] + _df[_couple]
    # The source columns are removed once the counts exist.
    _df.drop(
        columns=[_c for _cols in _cause_groups.values() for _c in _cols],
        inplace=True,
    )

### 여성 요인 합체

In [None]:
# Source columns that are added into the single female-factor column.
_female_factor_cols = [
    "불임 원인 - 자궁내막증",
    "불임 원인 - 자궁경부 문제",
    "불임 원인 - 난관 질환",
]

for _df in (train, test):
    _endo, _cervix, _tubal = (_df[_c] for _c in _female_factor_cols)
    _df["불임 원인 - 여성 요인"] = _endo + _cervix + _tubal
    # Drop the three source columns after combining them.
    _df.drop(columns=_female_factor_cols, inplace=True)

### 남성 요인 개수 feature 추가

In [None]:
# Sperm-related cause columns that are added into one male-factor count.
_male_factor_cols = [
    "불임 원인 - 정자 형태",
    "불임 원인 - 정자 운동성",
    "불임 원인 - 정자 면역학적 요인",
    "불임 원인 - 정자 농도",
]

for _df in (train, test):
    _shape, _motility, _immune, _density = (_df[_c] for _c in _male_factor_cols)
    _df["불임 원인 - 남성 요인 개수"] = _shape + _motility + _immune + _density
    # Drop the four source columns after combining them.
    _df.drop(columns=_male_factor_cols, inplace=True)

### 배아 생성 주요 이유 (원핫)

In [None]:
# The embryo-creation-reason column is removed from both frames.
for _df in (train, test):
    _df.drop(columns=['배아 생성 주요 이유'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['배아 생성 주요 이유'] = train['배아 생성 주요 이유'].fillna('Nan')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col_name] = train['배아 생성 주요 이유'].apply(lambda x: 1 if isinstance(x, str) and reason in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col_name] = train['배아 생성 주요 이유'].apply

### 시술 횟수 관련

In [None]:
# Only these labels are accepted in the count columns.
valid_values = ['0회', '1회', '2회', '3회', '4회', '5회', '6회 이상']

# Every "count" column that is validated and converted to an integer.
_count_cols = [
    '총 시술 횟수', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수', 'DI 시술 횟수',
    '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수',
    '총 출산 횟수', 'IVF 출산 횟수', 'DI 출산 횟수',
]

# Count columns removed after the total has been recomputed.
_count_cols_to_drop = [
    '클리닉 내 총 시술 횟수',
    'IVF 시술 횟수', 'DI 시술 횟수',
    'DI 임신 횟수', 'IVF 임신 횟수',
    'DI 출산 횟수', 'IVF 출산 횟수',
]


def _prepare_count_columns(df):
    """Return a cleaned copy of *df* with integer count columns.

    Steps:
      1. Keep only rows whose count columns all hold a label from
         ``valid_values``.
      2. Convert each label to an int ('6회 이상' becomes 6, 'N회' becomes N).
      3. Recompute '총 시술 횟수' as 'IVF 시술 횟수' + 'DI 시술 횟수'.
      4. Drop the columns listed in ``_count_cols_to_drop``.

    The filtered frame is copied explicitly, so the assignments below write
    to an independent DataFrame rather than to a slice of the input (which is
    what triggers pandas' SettingWithCopyWarning).
    """
    keep = df[_count_cols].isin(valid_values).all(axis=1)
    # NOTE(review): this also removes rows from the test frame when applied
    # to it - confirm that is acceptable for whatever consumes the test
    # predictions.
    df = df[keep].copy()

    for col in _count_cols:
        df[col] = df[col].replace('6회 이상', '6')
        df[col] = df[col].str.replace('회', '', regex=True).astype(int)

    df['총 시술 횟수'] = df['IVF 시술 횟수'] + df['DI 시술 횟수']
    df.drop(columns=_count_cols_to_drop, inplace=True)
    return df


train = _prepare_count_columns(train)
test = _prepare_count_columns(test)

### 특정 시술 유형 그룹화 + 원핫

In [None]:
# The specific-treatment-type column is removed from both frames.
for _df in (train, test):
    _df.drop(columns=['특정 시술 유형'], inplace=True)

### 배란 유도 유형 정상화

In [None]:
# The ovulation-induction-type column is removed from both frames.
for _df in (train, test):
    _df.drop(columns=['배란 유도 유형'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["배란 유도 유형"].replace(["세트로타이드 (억제제)", "생식선 자극 호르몬"], "알 수 없음", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["배란 유도 유형"].replace(["세트로타이드 (억제제)", "생식선 자극 호르몬"], "알 수 없음", inplace=True)


### 난자출처/정자출처 원핫인코딩

In [None]:
# Egg-source and sperm-source columns are removed from both frames.
_source_cols = ['난자 출처', '정자 출처']
train.drop(columns=_source_cols, inplace=True)
test.drop(columns=_source_cols, inplace=True)

### 난자/정자 기증자 나이 라벨 인코딩 (수정: 오타 해결 + 결측치 -1 처리)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Ordinal codes for the egg-donor age brackets; "알 수 없음" (unknown) is -1.
age_mapping_nanja = {
    "만20세 이하": 0,
    "만21-25세": 1,
    "만26-30세": 2,
    "만31-35세": 3,
    "알 수 없음": -1
}

for _df in (train, test):
    # Missing ages are treated as "알 수 없음" so that they map to -1.
    # Filling with the string "-1" (as before) left them unmapped, because
    # "-1" is not a key of the mapping and .map() then yields NaN. Doing the
    # fill as part of the expression also avoids the deprecated chained
    # `df[col].fillna(..., inplace=True)` pattern.
    _df["난자 기증자 나이 (라벨 인코딩)"] = (
        _df["난자 기증자 나이"].fillna("알 수 없음").map(age_mapping_nanja)
    )
    # The raw egg-donor age is replaced by its encoded form; the sperm-donor
    # age column is dropped without being encoded.
    _df.drop(columns=["난자 기증자 나이", "정자 기증자 나이"], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["난자 기증자 나이"].fillna("-1", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["난자 기증자 나이"].fillna("-1", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

### 시술 당시 나이 라벨 인코딩

In [None]:
# Age brackets listed from youngest to oldest; the position in the list is
# the encoded value.
_treatment_age_brackets = [
    "만18-34세",
    "만35-37세",
    "만38-39세",
    "만40-42세",
    "만43-44세",
    "만45-50세",
]
age_mapping_treatment = {
    bracket: rank for rank, bracket in enumerate(_treatment_age_brackets)
}
# "알 수 없음" (unknown) gets the sentinel -1.
age_mapping_treatment["알 수 없음"] = -1

for _df in (train, test):
    # Add the encoded column, then remove the raw text column.
    _df["시술 당시 나이 (라벨 인코딩)"] = _df["시술 당시 나이"].map(age_mapping_treatment)
    _df.drop(columns=["시술 당시 나이"], inplace=True)

### 시술 시기 코드 원 핫 인코딩

In [None]:
# The treatment-period-code column is removed from both frames.
for _df in (train, test):
    _df.drop(columns=['시술 시기 코드'], inplace=True)

***

# Label/One-Hot/Numeric Column List

In [None]:
# Columns that are ordinal-encoded.
label_columns = ['불명확 불임 원인', '시술 유형']

# Elapsed-time columns followed by the treatment / pregnancy / birth count
# columns; together they form the numeric column list.
_elapsed_columns = [
    '임신 시도 또는 마지막 임신 경과 연수',
    '난자 혼합 경과일',
    '배아 이식 경과일',
]
_history_count_columns = [
    '총 시술 횟수',
    '클리닉 내 총 시술 횟수',
    'IVF 시술 횟수',
    'DI 시술 횟수',
    '총 임신 횟수',
    'IVF 임신 횟수',
    'DI 임신 횟수',
    '총 출산 횟수',
    'IVF 출산 횟수',
    'DI 출산 횟수',
]
numeric_columns = _elapsed_columns + _history_count_columns

# 결측치 처리 (추가됨: 결측치가 있는 남은 세 개의 정수형 feature를 중앙값으로 채우기)

# 인코딩

In [None]:
# Cast the categorical columns to str in both frames before encoding
# (astype(str) also turns missing values into text).
train[label_columns] = train[label_columns].astype(str)
test[label_columns] = test[label_columns].astype(str)

# Learn the categories from train only; categories that appear only in test
# are encoded as -1.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ordinal_encoder.fit(train[label_columns])
train[label_columns] = ordinal_encoder.transform(train[label_columns])
test[label_columns] = ordinal_encoder.transform(test[label_columns])

# X와 y로 분리

In [None]:
# Separate the target column (y) from the feature columns (X).
_target_col = '임신 성공 여부'
y = train[_target_col]
X = train.drop(columns=[_target_col])

In [None]:
# Make sure the target column is not among the test features.
# errors='ignore' keeps this cell from raising KeyError when the test frame
# has no target column to begin with.
test.drop(columns=['임신 성공 여부'], inplace=True, errors='ignore')

***

# 하이퍼파라미터 (stacking으로 바꿈, 학습모델과 파라미터는 계속 찾아야 할 듯)

In [None]:
"""
import optuna
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE  # SMOTE 임포트

# 데이터 분할
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# SMOTE 적용
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Optuna 하이퍼파라미터 튜닝을 위한 목적 함수
def objective(trial):
    # XGBoost 하이퍼파라미터 설정
    xgb_param = {
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.05, 0.1]),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'n_estimators': trial.suggest_categorical('n_estimators', [100, 500, 1000]),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_uniform('gamma', 0.1, 1.0),  # 추가된 파라미터
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),  # 추가된 파라미터
        'eval_metric': 'logloss',
        'random_state': 42
    }

    # LightGBM 하이퍼파라미터 설정
    lgb_param = {
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.05, 0.1]),
        'num_leaves': trial.suggest_int('num_leaves', 31, 127),
        'n_estimators': trial.suggest_categorical('n_estimators', [100, 500, 1000]),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'max_bin': trial.suggest_int('max_bin', 50, 200),  # 추가된 파라미터
        'random_state': 42
    }

    # CatBoost 하이퍼파라미터 설정
    catboost_param = {
        'iterations': trial.suggest_categorical('iterations', [100, 500, 1000]),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.05, 0.1]),
        'random_seed': 42,
        'verbose': 0
    }

    # XGBoost 모델 학습
    xgb_clf = xgb.XGBClassifier(**xgb_param)
    xgb_clf.fit(X_train_resampled, y_train_resampled)

    # LightGBM 모델 학습
    lgb_clf = lgb.LGBMClassifier(**lgb_param)
    lgb_clf.fit(X_train_resampled, y_train_resampled)

    # CatBoost 모델 학습
    catboost_clf = cb.CatBoostClassifier(**catboost_param)
    catboost_clf.fit(X_train_resampled, y_train_resampled)

    # Stacking 앙상블 기법 적용 (최종 모델: LightGBM)
    stacking_clf = StackingClassifier(
        estimators=[('xgb', xgb_clf), ('lgb', lgb_clf), ('catboost', catboost_clf)],
        final_estimator=lgb.LGBMClassifier(**lgb_param)
    )

    # Stacking 모델 학습
    stacking_clf.fit(X_train_resampled, y_train_resampled)

    # 예측 (ROC-AUC 계산)
    y_pred = stacking_clf.predict_proba(X_valid)[:, 1]
    score = roc_auc_score(y_valid, y_pred)

    return score

# Optuna 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print(f"Best Trial: {study.best_trial.params}")
print(f"Best ROC-AUC Score: {study.best_value}")
"""

'\nimport optuna\nimport xgboost as xgb\nimport lightgbm as lgb\nimport catboost as cb\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.ensemble import StackingClassifier\nfrom imblearn.over_sampling import SMOTE  # SMOTE 임포트\n\n# 데이터 분할\nX_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)\n\n# SMOTE 적용\nsmote = SMOTE(random_state=42)\nX_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)\n\n# Optuna 하이퍼파라미터 튜닝을 위한 목적 함수\ndef objective(trial):\n    # XGBoost 하이퍼파라미터 설정\n    xgb_param = {\n        \'learning_rate\': trial.suggest_categorical(\'learning_rate\', [0.01, 0.05, 0.1]),\n        \'max_depth\': trial.suggest_int(\'max_depth\', 3, 7),\n        \'n_estimators\': trial.suggest_categorical(\'n_estimators\', [100, 500, 1000]),\n        \'subsample\': trial.suggest_uniform(\'subsample\', 0.6, 1.0),\n        \'colsample_bytree\': trial.suggest_uniform(\'colsample

In [None]:
!pip install optuna
!pip install catboost

import optuna
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE  # SMOTE 임포트

# Hold out 20% of the rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Oversample with SMOTE on the training part only; the validation part is
# left as it is.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Objective function for Optuna hyperparameter tuning
def objective(trial):
    """Score one Optuna trial of the XGBoost + LightGBM + CatBoost stack.

    Hyperparameters for the three base models are sampled from ``trial``, a
    StackingClassifier with a LightGBM final estimator is fitted on the
    SMOTE-resampled training data, and the stack is scored on the validation
    split.

    Parameters that share an Optuna name ('learning_rate', 'n_estimators',
    'subsample', 'colsample_bytree') are sampled once per trial, so the
    models that request them receive the same value.

    The base models are not fitted separately beforehand: StackingClassifier
    clones and refits its estimators inside ``fit``, so a separate fit would
    only add training time without affecting the result.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC-AUC of the stacked model's positive-class probabilities on
        ``X_valid`` / ``y_valid``.
    """
    # XGBoost hyperparameters
    xgb_param = {
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.05, 0.1]),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'n_estimators': trial.suggest_categorical('n_estimators', [100, 500, 1000]),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0.1, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 42
    }

    # LightGBM hyperparameters (also used for the final estimator)
    lgb_param = {
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.05, 0.1]),
        'num_leaves': trial.suggest_int('num_leaves', 31, 127),
        'n_estimators': trial.suggest_categorical('n_estimators', [100, 500, 1000]),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'max_bin': trial.suggest_int('max_bin', 50, 200),
        'random_state': 42
    }

    # CatBoost hyperparameters; verbose=0 is set on the estimator itself so
    # that the copies fitted inside the stack stay quiet as well.
    catboost_param = {
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.05, 0.1]),
        'depth': trial.suggest_int('depth', 4, 10),
        'iterations': trial.suggest_categorical('iterations', [100, 500, 1000]),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'random_state': 42,
        'verbose': 0
    }

    # Stacking ensemble (final model: LightGBM)
    stacking_clf = StackingClassifier(
        estimators=[
            ('xgb', xgb.XGBClassifier(**xgb_param)),
            ('lgb', lgb.LGBMClassifier(**lgb_param)),
            ('catboost', cb.CatBoostClassifier(**catboost_param)),
        ],
        final_estimator=lgb.LGBMClassifier(**lgb_param)
    )
    stacking_clf.fit(X_train_resampled, y_train_resampled)

    # ROC-AUC on the untouched validation split
    y_pred = stacking_clf.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, y_pred)

# Run the Optuna search: 50 trials, maximizing the objective's return value.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Report the best parameter set and its score.
_best_params = study.best_trial.params
_best_score = study.best_value
print(f"Best Trial: {_best_params}")
print(f"Best ROC-AUC Score: {_best_score}")



[I 2025-02-25 19:46:33,268] A new study created in memory with name: no-name-9154e3be-52aa-413a-b2eb-2c29a51e9d65


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051090 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3109
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050646 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3109
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 19:50:30,164] Trial 0 finished with value: 0.626375262347887 and parameters: {'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.8840495414686645, 'colsample_bytree': 0.7719776295757311, 'gamma': 0.5559558549474342, 'min_child_weight': 8, 'num_leaves': 45, 'max_bin': 160, 'depth': 10, 'iterations': 100, 'l2_leaf_reg': 7}. Best is trial 0 with value: 0.626375262347887.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1721
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1721
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 19:57:46,288] Trial 1 finished with value: 0.6234907317543286 and parameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 500, 'subsample': 0.7608739790740021, 'colsample_bytree': 0.8045327811570152, 'gamma': 0.11280865208993443, 'min_child_weight': 6, 'num_leaves': 67, 'max_bin': 82, 'depth': 6, 'iterations': 1000, 'l2_leaf_reg': 7}. Best is trial 0 with value: 0.626375262347887.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050813 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2001
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069805 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2001
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 20:01:28,847] Trial 2 finished with value: 0.6700958299263405 and parameters: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.7757844134725829, 'colsample_bytree': 0.8559248183109263, 'gamma': 0.4361392258911754, 'min_child_weight': 10, 'num_leaves': 38, 'max_bin': 98, 'depth': 6, 'iterations': 500, 'l2_leaf_reg': 5}. Best is trial 2 with value: 0.6700958299263405.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054740 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2161
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054725 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2161
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 20:07:08,310] Trial 3 finished with value: 0.6386632971058124 and parameters: {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 1000, 'subsample': 0.7259367269137427, 'colsample_bytree': 0.8796227752961439, 'gamma': 0.38335549323612406, 'min_child_weight': 9, 'num_leaves': 97, 'max_bin': 107, 'depth': 8, 'iterations': 100, 'l2_leaf_reg': 2}. Best is trial 2 with value: 0.6700958299263405.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053576 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2212
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2212
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 20:10:51,164] Trial 4 finished with value: 0.6314459012290079 and parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 500, 'subsample': 0.6259064347624559, 'colsample_bytree': 0.851992898582827, 'gamma': 0.8166512041511594, 'min_child_weight': 5, 'num_leaves': 103, 'max_bin': 110, 'depth': 9, 'iterations': 100, 'l2_leaf_reg': 5}. Best is trial 2 with value: 0.6700958299263405.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053737 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3279
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3279
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 20:14:49,644] Trial 5 finished with value: 0.6384806251630553 and parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7662243873337891, 'colsample_bytree': 0.6880808834779566, 'gamma': 0.562501802577605, 'min_child_weight': 7, 'num_leaves': 71, 'max_bin': 176, 'depth': 10, 'iterations': 100, 'l2_leaf_reg': 9}. Best is trial 2 with value: 0.6700958299263405.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054375 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3296
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051220 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3296
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 20:18:25,138] Trial 6 finished with value: 0.6346199238922785 and parameters: {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.7378579224304239, 'colsample_bytree': 0.7041449916166695, 'gamma': 0.15823489268208818, 'min_child_weight': 7, 'num_leaves': 86, 'max_bin': 177, 'depth': 6, 'iterations': 500, 'l2_leaf_reg': 4}. Best is trial 2 with value: 0.6700958299263405.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056995 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2937
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050454 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2937
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 20:22:28,446] Trial 7 finished with value: 0.6818894401249374 and parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.7645565763125465, 'colsample_bytree': 0.7679347289696092, 'gamma': 0.33854036274760024, 'min_child_weight': 8, 'num_leaves': 90, 'max_bin': 150, 'depth': 7, 'iterations': 500, 'l2_leaf_reg': 2}. Best is trial 7 with value: 0.6818894401249374.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050974 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2266
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2266
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 20:26:07,710] Trial 8 finished with value: 0.6821062004795168 and parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500, 'subsample': 0.7384849385335619, 'colsample_bytree': 0.7138346459142997, 'gamma': 0.525542402781, 'min_child_weight': 7, 'num_leaves': 105, 'max_bin': 113, 'depth': 6, 'iterations': 100, 'l2_leaf_reg': 4}. Best is trial 8 with value: 0.6821062004795168.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056357 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2509
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056082 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2509
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 20:36:29,738] Trial 9 finished with value: 0.6459939305556016 and parameters: {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.8276851966145946, 'colsample_bytree': 0.8133630694712576, 'gamma': 0.5222692384937462, 'min_child_weight': 7, 'num_leaves': 121, 'max_bin': 126, 'depth': 10, 'iterations': 500, 'l2_leaf_reg': 7}. Best is trial 8 with value: 0.6821062004795168.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052733 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1202
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050535 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1202
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 20:47:57,346] Trial 10 finished with value: 0.6361098071138913 and parameters: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 1000, 'subsample': 0.9749483411551833, 'colsample_bytree': 0.988700936365419, 'gamma': 0.8134095236427041, 'min_child_weight': 2, 'num_leaves': 123, 'max_bin': 53, 'depth': 4, 'iterations': 1000, 'l2_leaf_reg': 1}. Best is trial 8 with value: 0.6821062004795168.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049527 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2797
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051736 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2797
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 20:53:01,524] Trial 11 finished with value: 0.6912556615122774 and parameters: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.6568289740037279, 'colsample_bytree': 0.6126412915734233, 'gamma': 0.2974821113362949, 'min_child_weight': 4, 'num_leaves': 108, 'max_bin': 142, 'depth': 7, 'iterations': 500, 'l2_leaf_reg': 3}. Best is trial 11 with value: 0.6912556615122774.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046381 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2779
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2779
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 20:59:04,509] Trial 12 finished with value: 0.681742307227922 and parameters: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 500, 'subsample': 0.6448143923338204, 'colsample_bytree': 0.6036257414002384, 'gamma': 0.24681557072871838, 'min_child_weight': 4, 'num_leaves': 108, 'max_bin': 141, 'depth': 4, 'iterations': 500, 'l2_leaf_reg': 3}. Best is trial 11 with value: 0.6912556615122774.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052285 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 21:00:47,624] Trial 13 finished with value: 0.7190503500398462 and parameters: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.6749058655928091, 'colsample_bytree': 0.6022099899036761, 'gamma': 0.6893891732628178, 'min_child_weight': 3, 'num_leaves': 112, 'max_bin': 131, 'depth': 7, 'iterations': 100, 'l2_leaf_reg': 4}. Best is trial 13 with value: 0.7190503500398462.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051284 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2743
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051748 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2743
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 21:10:45,262] Trial 14 finished with value: 0.6767585598333002 and parameters: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.668084359615099, 'colsample_bytree': 0.6018826593649312, 'gamma': 0.7068627254576458, 'min_child_weight': 2, 'num_leaves': 126, 'max_bin': 139, 'depth': 8, 'iterations': 1000, 'l2_leaf_reg': 3}. Best is trial 13 with value: 0.7190503500398462.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063372 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3127
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045652 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3127
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 21:12:32,296] Trial 15 finished with value: 0.7173532093535704 and parameters: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.678958807779098, 'colsample_bytree': 0.6525147477270202, 'gamma': 0.9321549396431035, 'min_child_weight': 4, 'num_leaves': 113, 'max_bin': 161, 'depth': 7, 'iterations': 100, 'l2_leaf_reg': 1}. Best is trial 13 with value: 0.7190503500398462.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049426 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3198
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055397 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3198
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 21:14:12,168] Trial 16 finished with value: 0.7217312165820484 and parameters: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.6865588458861974, 'colsample_bytree': 0.6559218321651377, 'gamma': 0.9873207672764417, 'min_child_weight': 1, 'num_leaves': 77, 'max_bin': 165, 'depth': 8, 'iterations': 100, 'l2_leaf_reg': 1}. Best is trial 16 with value: 0.7217312165820484.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3584
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3584
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 21:15:49,475] Trial 17 finished with value: 0.631802187811817 and parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.8339972401585221, 'colsample_bytree': 0.6544280166439753, 'gamma': 0.9863779940045377, 'min_child_weight': 1, 'num_leaves': 62, 'max_bin': 194, 'depth': 8, 'iterations': 100, 'l2_leaf_reg': 9}. Best is trial 16 with value: 0.7217312165820484.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051725 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3619
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052770 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3619
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 21:21:24,638] Trial 18 finished with value: 0.640870881706743 and parameters: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 0.6042143306102483, 'colsample_bytree': 0.6458782631600184, 'gamma': 0.7270549488409978, 'min_child_weight': 1, 'num_leaves': 56, 'max_bin': 196, 'depth': 5, 'iterations': 100, 'l2_leaf_reg': 6}. Best is trial 16 with value: 0.7217312165820484.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050910 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3279
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052072 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3279
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 21:23:19,728] Trial 19 finished with value: 0.7190456911780085 and parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6997860087761725, 'colsample_bytree': 0.7244928845736488, 'gamma': 0.8825454070486712, 'min_child_weight': 3, 'num_leaves': 76, 'max_bin': 176, 'depth': 9, 'iterations': 100, 'l2_leaf_reg': 10}. Best is trial 16 with value: 0.7217312165820484.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057476 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1810
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053013 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1810
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 21:25:14,997] Trial 20 finished with value: 0.6317833483845431 and parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.6996113625400618, 'colsample_bytree': 0.9130543446378779, 'gamma': 0.6671671583447653, 'min_child_weight': 2, 'num_leaves': 84, 'max_bin': 87, 'depth': 9, 'iterations': 100, 'l2_leaf_reg': 1}. Best is trial 16 with value: 0.7217312165820484.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048894 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3296
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3296
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 21:27:08,911] Trial 21 finished with value: 0.7179045235173959 and parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.709718382006539, 'colsample_bytree': 0.7340502224524382, 'gamma': 0.8817344693934013, 'min_child_weight': 3, 'num_leaves': 75, 'max_bin': 177, 'depth': 9, 'iterations': 100, 'l2_leaf_reg': 10}. Best is trial 16 with value: 0.7217312165820484.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3109
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054750 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3109
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 21:28:50,724] Trial 22 finished with value: 0.7209546808576068 and parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.68854983925347, 'colsample_bytree': 0.6700417616195948, 'gamma': 0.9956981146300317, 'min_child_weight': 3, 'num_leaves': 79, 'max_bin': 160, 'depth': 8, 'iterations': 100, 'l2_leaf_reg': 8}. Best is trial 16 with value: 0.7217312165820484.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2526
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054843 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2526
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 21:30:35,480] Trial 23 finished with value: 0.7201087236359117 and parameters: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.6030855486793871, 'colsample_bytree': 0.6757264276091982, 'gamma': 0.9846650447440359, 'min_child_weight': 3, 'num_leaves': 93, 'max_bin': 127, 'depth': 8, 'iterations': 100, 'l2_leaf_reg': 8}. Best is trial 16 with value: 0.7217312165820484.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052914 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3074
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054404 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3074
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

[I 2025-02-25 21:32:18,183] Trial 24 finished with value: 0.7185826765562683 and parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6007583918887397, 'colsample_bytree': 0.6762364491246484, 'gamma': 0.9916597319593423, 'min_child_weight': 1, 'num_leaves': 91, 'max_bin': 158, 'depth': 8, 'iterations': 100, 'l2_leaf_reg': 8}. Best is trial 16 with value: 0.7217312165820484.


[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047491 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3216
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152149, number of negative: 152149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052543 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3216
[LightGBM] [Info] Number of data points in the train set: 304298, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

In [None]:
# Disabled cell: an earlier LightGBM-only Optuna search, kept for reference as a
# bare string literal so that none of it executes. The closing delimiter must be
# exactly three quotes; a fourth quote opens a new, unterminated string literal.
"""
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# 데이터를 train, validation으로 나누기
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Optuna objective 함수 정의
def objective(trial):
    # 하이퍼파라미터 범위 설정
    params = {
        "boosting_type": "gbdt",  # 고정
        "objective": "binary",  # 고정
        "metric": "auc",  # 고정
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 1e-1),
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 100),
        "min_gain_to_split": trial.suggest_loguniform("min_gain_to_split", 1e-5, 1e-1),
        "lambda_l1": trial.suggest_uniform("lambda_l1", 0, 1),
        "lambda_l2": trial.suggest_uniform("lambda_l2", 0, 1),
        "min_sum_hessian_in_leaf": trial.suggest_loguniform("min_sum_hessian_in_leaf", 1e-5, 1e-1),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.7, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.7, 1.0),
        "n_estimators": 1000,
        "random_state": 42,
    }

    # 모델 학습
    model = LGBMClassifier(**params)
    model.fit(X_train, y_train)

    # validation 예측
    pred_proba = model.predict_proba(X_val)[:, 1]

    # ROC-AUC 평가
    auc = roc_auc_score(y_val, pred_proba)
    return auc

# Optuna의 최적화 실행
study = optuna.create_study(direction="maximize")  # maximize AUC
study.optimize(objective, n_trials=50)  # 50번의 실험 수행

# 최적의 하이퍼파라미터 출력
print("Best Hyperparameters: ", study.best_params)
print("Best AUC Score: ", study.best_value)
"""

# 학습

In [None]:
# Disabled cell: kept as a bare string literal so that none of it executes.
# The text inside builds a single LGBMClassifier from `study.best_params`,
# fits it on the full (X, y), predicts probabilities for `test`, and writes
# them to the submission CSV.
"""
from lightgbm import LGBMClassifier

# 최적 하이퍼파라미터
best_params = study.best_params

# Optuna로 튜닝된 best_params를 사용해 모델 초기화
model = LGBMClassifier(
    learning_rate=best_params["learning_rate"],
    num_leaves=best_params["num_leaves"],
    max_depth=best_params["max_depth"],
    min_data_in_leaf=best_params["min_data_in_leaf"],
    min_gain_to_split=best_params["min_gain_to_split"],
    lambda_l1=best_params["lambda_l1"],
    lambda_l2=best_params["lambda_l2"],
    min_sum_hessian_in_leaf=best_params["min_sum_hessian_in_leaf"],
    bagging_fraction=best_params["bagging_fraction"],
    bagging_freq=best_params["bagging_freq"],
    feature_fraction=best_params["feature_fraction"],
)

# 모델 학습
model.fit(X, y)

# test 데이터 예측
pred_proba = model.predict_proba(test)[:, 1]

# 예측 결과를 sample_submission에 저장
sample_submission = pd.read_csv('./sample_submission.csv')
sample_submission['probability'] = pred_proba

# 결과를 CSV로 저장
sample_submission.to_csv('./ㄹㅇ최종의최종2.csv', index=False)
"""

In [None]:
import optuna
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import pandas as pd
from sklearn.ensemble import StackingClassifier

# Stacking ensemble: XGBoost, LightGBM and CatBoost base learners combined by a
# LightGBM meta-learner, all configured from the best Optuna trial.
# Relies on `study`, `X`, `y` and `test` defined in earlier cells.

# Best hyperparameters found by Optuna (one flat dict shared by all models).
best_params = study.best_trial.params

# Split the flat dict into per-model keyword arguments.
xgb_params = {
    'learning_rate': best_params['learning_rate'],
    'max_depth': best_params['max_depth'],
    'n_estimators': best_params['n_estimators'],
    'subsample': best_params['subsample'],
    'colsample_bytree': best_params['colsample_bytree'],
    'gamma': best_params['gamma'],
    'min_child_weight': best_params['min_child_weight'],
    'random_state': 42
}

lgb_params = {
    'learning_rate': best_params['learning_rate'],
    'num_leaves': best_params['num_leaves'],
    'n_estimators': best_params['n_estimators'],
    'subsample': best_params['subsample'],
    'colsample_bytree': best_params['colsample_bytree'],
    'max_bin': best_params['max_bin'],
    'random_state': 42
}

catboost_params = {
    'learning_rate': best_params['learning_rate'],
    'depth': best_params['depth'],
    'iterations': best_params['iterations'],
    'l2_leaf_reg': best_params['l2_leaf_reg'],
    'random_state': 42,
    'verbose': 0
}

# Build the base learners WITHOUT fitting them here: StackingClassifier.fit
# clones every estimator and fits the clones itself (on the full data and on
# the CV folds), so a separate fit beforehand would train each model twice.
xgb_clf = xgb.XGBClassifier(**xgb_params)
lgb_clf = lgb.LGBMClassifier(**lgb_params)
catboost_clf = cb.CatBoostClassifier(**catboost_params)

# Stacking ensemble; the meta-learner reuses the LightGBM parameters.
stacking_clf = StackingClassifier(
    estimators=[('xgb', xgb_clf), ('lgb', lgb_clf), ('catboost', catboost_clf)],
    final_estimator=lgb.LGBMClassifier(**lgb_params)
)

# Fit the whole stack (base learners + meta-learner).
stacking_clf.fit(X, y)

# Rebind the names to the clones that were fitted on the full (X, y), so that
# any later use of these variables still sees fitted models.
xgb_clf = stacking_clf.named_estimators_['xgb']
lgb_clf = stacking_clf.named_estimators_['lgb']
catboost_clf = stacking_clf.named_estimators_['catboost']

# Predict the positive-class probability for the test set.
pred_proba = stacking_clf.predict_proba(test)[:, 1]

# Write the predictions into the submission template and save it.
sample_submission = pd.read_csv('./sample_submission.csv')
sample_submission['probability'] = pred_proba
sample_submission.to_csv('./ㄹㅇ최종의최종2.csv', index=False)

print("✅ 최종 예측 완료! 결과 저장됨: ㄹㅇ최종의최종2.csv")
