In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import  OrdinalEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve

import lightgbm as lgb
import xgboost as xgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

# Columns handled as continuous features further down in this cell: they are
# winsorized, NaN-filled with 0 and standardized. The strings are the literal
# CSV header names and must not be altered; the trailing comments are English
# glosses for readers only.
numeric_columns = [
    "임신 시도 또는 마지막 임신 경과 연수",  # years elapsed since trying to conceive / last pregnancy
    "총 생성 배아 수",  # total number of embryos created
    "미세주입된 난자 수",  # number of micro-injected eggs
    "미세주입에서 생성된 배아 수",  # number of embryos created from micro-injection
    "이식된 배아 수",  # number of embryos transferred
    "미세주입 배아 이식 수",  # number of micro-injected embryos transferred
    "저장된 배아 수",  # number of embryos stored
    "미세주입 후 저장된 배아 수",  # number of embryos stored after micro-injection
    "해동된 배아 수",  # number of embryos thawed
    "해동 난자 수",  # number of eggs thawed
    "수집된 신선 난자 수",  # number of fresh eggs collected
    "저장된 신선 난자 수",  # number of fresh eggs stored
    "혼합된 난자 수",  # number of eggs mixed
    "파트너 정자와 혼합된 난자 수",  # number of eggs mixed with partner sperm
    "기증자 정자와 혼합된 난자 수",  # number of eggs mixed with donor sperm
    "난자 채취 경과일",  # elapsed days: egg retrieval
    "난자 해동 경과일",  # elapsed days: egg thaw
    "난자 혼합 경과일",  # elapsed days: egg mixing
    "배아 이식 경과일",  # elapsed days: embryo transfer
    "배아 해동 경과일"  # elapsed days: embryo thaw
]

# Columns handled as categorical features further down in this cell: they are
# cast to str and ordinal-encoded. The strings are the literal CSV header names
# and must not be altered; the trailing comments are English glosses for
# readers only.
categorical_columns = [
    "시술 시기 코드",  # treatment period code
    "시술 당시 나이",  # age at time of treatment
    "시술 유형",  # treatment type
    "특정 시술 유형",  # specific treatment type
    "배란 자극 여부",  # ovulation stimulation used (flag)
    "배란 유도 유형",  # ovulation induction type
    "단일 배아 이식 여부",  # single embryo transfer (flag)
    "착상 전 유전 검사 사용 여부",  # pre-implantation genetic screening used (flag)
    "착상 전 유전 진단 사용 여부",  # pre-implantation genetic diagnosis used (flag)
    "남성 주 불임 원인",  # male primary infertility cause
    "남성 부 불임 원인",  # male secondary infertility cause
    "여성 주 불임 원인",  # female primary infertility cause
    "여성 부 불임 원인",  # female secondary infertility cause
    "부부 주 불임 원인",  # couple primary infertility cause
    "부부 부 불임 원인",  # couple secondary infertility cause
    "불명확 불임 원인",  # unexplained infertility cause
    "불임 원인 - 난관 질환",  # infertility cause - tubal disease
    "불임 원인 - 남성 요인",  # infertility cause - male factor
    "불임 원인 - 배란 장애",  # infertility cause - ovulatory disorder
    "불임 원인 - 여성 요인",  # infertility cause - female factor
    "불임 원인 - 자궁경부 문제",  # infertility cause - cervical problem
    "불임 원인 - 자궁내막증",  # infertility cause - endometriosis
    "불임 원인 - 정자 농도",  # infertility cause - sperm concentration
    "불임 원인 - 정자 면역학적 요인",  # infertility cause - sperm immunological factor
    "불임 원인 - 정자 운동성",  # infertility cause - sperm motility
    "불임 원인 - 정자 형태",  # infertility cause - sperm morphology
    "배아 생성 주요 이유",  # main reason for embryo creation
    "총 시술 횟수",  # total number of treatments
    "클리닉 내 총 시술 횟수",  # total number of treatments at the clinic
    "IVF 시술 횟수",  # number of IVF treatments
    "DI 시술 횟수",  # number of DI treatments
    "총 임신 횟수",  # total number of pregnancies
    "IVF 임신 횟수",  # number of IVF pregnancies
    "DI 임신 횟수",  # number of DI pregnancies
    "총 출산 횟수",  # total number of births
    "IVF 출산 횟수",  # number of IVF births
    "DI 출산 횟수",  # number of DI births
    "난자 출처",  # egg source
    "정자 출처",  # sperm source
    "난자 기증자 나이",  # egg donor age
    "정자 기증자 나이",  # sperm donor age
    "동결 배아 사용 여부",  # frozen embryo used (flag)
    "신선 배아 사용 여부",  # fresh embryo used (flag)
    "기증 배아 사용 여부",  # donated embryo used (flag)
    "대리모 여부",  # surrogate (flag)
    "PGD 시술 여부",  # PGD performed (flag)
    "PGS 시술 여부"  # PGS performed (flag)
]

# === Outlier replacement (winsorizing) helper ===
def winsorize(df, numeric_cols, factor=1.5, *, reference=None):
    """Clip each listed column to its Tukey fences and return a new frame.

    For every column in ``numeric_cols`` the fences are
    ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``; values outside them are
    replaced by the nearest fence. The input frame is never modified.

    Args:
        df: DataFrame whose columns are clipped (a copy is returned).
        numeric_cols: Iterable of column names to clip.
        factor: Multiplier applied to the IQR when building the fences.
        reference: Optional DataFrame from which the quartiles are computed.
            When omitted, quartiles come from ``df`` itself (the original
            behaviour). Pass the training frame here when clipping a
            validation/test frame so both share identical fences.

    Returns:
        A copy of ``df`` with the selected columns clipped.

    Note:
        When a column's IQR is 0 (Q1 == Q3, e.g. a column that is mostly one
        value) both fences coincide and every non-NaN value is clipped to that
        single value, i.e. the column becomes constant.
    """
    quantile_source = df if reference is None else reference
    df_new = df.copy()
    for col in numeric_cols:
        q1, q3 = quantile_source[col].quantile([0.25, 0.75])
        spread = factor * (q3 - q1)
        df_new[col] = df_new[col].clip(lower=q1 - spread, upper=q3 + spread)
    return df_new

# Load the raw data and drop the 'ID' column from both frames.
train = pd.read_csv('./data/train.csv').drop(columns=['ID'])
test = pd.read_csv('./data/test.csv').drop(columns=['ID'])

# Learn the IQR fences from the raw training data only and apply those same
# fences to the test set. Clipping the test set with quantiles computed on the
# test set itself would give train and test different fences; the encoder and
# scaler below are likewise fit on train and merely applied to test.
winsor_factor = 1.5
train_quartiles = train[numeric_columns].quantile([0.25, 0.75])
train_iqr = train_quartiles.loc[0.75] - train_quartiles.loc[0.25]
lower_fence = train_quartiles.loc[0.25] - winsor_factor * train_iqr
upper_fence = train_quartiles.loc[0.75] + winsor_factor * train_iqr

train = winsorize(train, numeric_columns, factor=winsor_factor)
# axis=1 aligns the per-column fences (indexed by column name) with the columns.
test[numeric_columns] = test[numeric_columns].clip(lower=lower_fence, upper=upper_fence, axis=1)

X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

# Cast categorical columns to str so NaN becomes the literal category "nan"
# and mixed int/str values are encoded uniformly.
X[categorical_columns] = X[categorical_columns].astype(str)
test[categorical_columns] = test[categorical_columns].astype(str)

# Categories seen only in the test set are mapped to -1.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train_encoded = X.copy()
X_train_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

X_test_encoded = test.copy()
X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

# Fill missing numeric values with 0.
X_train_encoded[numeric_columns] = X_train_encoded[numeric_columns].fillna(0)
X_test_encoded[numeric_columns] = X_test_encoded[numeric_columns].fillna(0)

# Positional indices of the categorical columns (for a `categorical_feature`
# argument). NOTE(review): nothing in this cell passes these to a model, so the
# ordinal codes are currently treated as ordinary numeric features.
categorical_feature_indices = [X_train_encoded.columns.get_loc(col) for col in categorical_columns]

# Standardize numeric columns; statistics are fit on train and reused for test.
scaler = StandardScaler()
X_train_encoded[numeric_columns] = scaler.fit_transform(X_train_encoded[numeric_columns])
X_test_encoded[numeric_columns] = scaler.transform(X_test_encoded[numeric_columns])

# Balance the classes with SMOTE.
# NOTE(review): SMOTE interpolates between neighbours, so synthetic rows can get
# fractional values in the ordinal-encoded categorical columns; SMOTENC is the
# imbalanced-learn variant intended for mixed numeric/categorical data.
# NOTE(review): every later in-sample score and the stacking CV folds are
# computed on this resampled data, so they are optimistic.
smote = SMOTE(random_state=42)
X_train_encoded, y = smote.fit_resample(X_train_encoded, y)

# Alternative (disabled): SMOTE + Tomek links
# smt = SMOTETomek(sampling_strategy=0.8, random_state=42)
# X_train_encoded, y = smt.fit_resample(X_train_encoded, y)

# # Feature selection (SelectKBest) (disabled)
# selector = SelectKBest(score_func=mutual_info_classif, k=40)  # keep the 40 highest-scoring features
# X_train_encoded = selector.fit_transform(X_train_encoded, y)
# X_test_encoded = selector.transform(X_test_encoded)

print("-------------------------- 1. LightGBM 최적 파라미터로 모델 학습 --------------------------")
# Hyper-parameters of the LightGBM base learner. The GPU platform/device ids
# are specific to the machine this notebook was run on.
lgb_params = dict(
    random_state=42,
    objective='binary',
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=70,
    max_depth=-1,
    device='gpu',  # train on GPU
    gpu_platform_id=0,
    gpu_device_id=2,
)
lgb_est = LGBMClassifier(**lgb_params)

# Fit on the full training matrix, then score the same rows.
# The reported AUC is therefore in-sample, not a validation score.
lgb_est.fit(X_train_encoded, y)
lgb_train_proba = lgb_est.predict_proba(X_train_encoded)
y_train_pred_lgb = lgb_train_proba[:, 1]
roc_auc_lgb = roc_auc_score(y, y_train_pred_lgb)
print(f"LightGBM 모델의 ROC-AUC 점수: {roc_auc_lgb:.4f}")

print("-------------------------- 2. XGBoost 최적 파라미터로 모델 학습 --------------------------")
# Hyper-parameters of the XGBoost base learner.
xgb_params = dict(
    eval_metric='auc',
    random_state=42,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=11,
    subsample=0.8,
    colsample_bytree=0.7,
    gamma=0,
    tree_method='hist',  # histogram tree method, combined with device= for GPU
    device='cuda',       # train on GPU
)
xgb_est = XGBClassifier(**xgb_params)

# Fit on the full training matrix, then score the same rows.
# The reported AUC is therefore in-sample, not a validation score.
xgb_est.fit(X_train_encoded, y)
xgb_train_proba = xgb_est.predict_proba(X_train_encoded)
y_train_pred_xgb = xgb_train_proba[:, 1]
roc_auc_xgb = roc_auc_score(y, y_train_pred_xgb)
print(f"XGBoost 모델의 ROC-AUC 점수: {roc_auc_xgb:.4f}")

print("-------------------------- 3. CatBoost 최적 파라미터로 모델 학습 --------------------------")
# Hyper-parameters of the CatBoost base learner. The device id is specific to
# the machine this notebook was run on.
catboost_params = dict(
    iterations=1000,
    learning_rate=0.01,
    depth=8,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    verbose=100,  # print a progress line every 100 iterations
    task_type='GPU',
    devices='1',
)
catboost_model = CatBoostClassifier(**catboost_params)

# Fit on the full training matrix, then score the same rows.
# The reported AUC is therefore in-sample, not a validation score.
catboost_model.fit(X_train_encoded, y)
cat_train_proba = catboost_model.predict_proba(X_train_encoded)
y_train_pred_cat = cat_train_proba[:, 1]
roc_auc_cat = roc_auc_score(y, y_train_pred_cat)
print(f"CatBoost 모델의 ROC-AUC 점수: {roc_auc_cat:.4f}")

# Stack the three boosted models under a logistic-regression meta-learner.
# StackingClassifier clones and refits the base estimators, so the fits done
# above are not reused. passthrough=True feeds the original features to the
# meta-learner alongside the base-model predictions.
stacking_clf = StackingClassifier(
    estimators=[
        ('lgb', lgb_est),
        ('xgb', xgb_est),
        ('cat', catboost_model)
    ],
    final_estimator=LogisticRegression(C=0.5, penalty='l2', solver='saga', max_iter=500, random_state=42),
    cv=3,
    n_jobs=1,  # presumably sequential because the base learners share GPUs; confirm
    passthrough=True
)

print("-------------------------- 6. 최종 모델 학습 및 제출 파일 생성 --------------------------")

# Fit the stacking ensemble and report its in-sample ROC-AUC.
# Despite its name, y_test_pred_stacking holds predictions for the TRAINING
# rows; the name is kept so other cells that reference it keep working.
stacking_clf.fit(X_train_encoded, y)
y_test_pred_stacking = stacking_clf.predict_proba(X_train_encoded)[:, 1]
ROC_AUC_stacking = roc_auc_score(y, y_test_pred_stacking)
# Fixed label: this line previously printed the stacking score as "CatBoost".
print(f"Stacking 앙상블 모델의 ROC-AUC 점수: {ROC_AUC_stacking:.4f}")

# Predict on the test set with the stacking ensemble.
final_pred_proba = stacking_clf.predict_proba(X_test_encoded)[:, 1]

# Write the submission file.
sample_submission = pd.read_csv('./data/sample_submission.csv')
sample_submission['probability'] = final_pred_proba
# to_csv does not create missing directories; make sure the target exists so a
# long training run is not lost to an OSError at the very last step.
import os
os.makedirs('./submit', exist_ok=True)
sample_submission.to_csv('./submit/stacking_ensemble_submit.csv', index=False)
print("최종 제출 파일 'stacking_ensemble_submit.csv' 생성 완료!")

-------------------------- 1. LightGBM 최적 파라미터로 모델 학습 --------------------------
[LightGBM] [Info] Number of positive: 190123, number of negative: 190123
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 11627
[LightGBM] [Info] Number of data points in the train set: 380246, number of used features: 55
[LightGBM] [Info] Using requested OpenCL platform 0 device 2
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 17 dense feature groups (7.25 MB) transferred to GPU in 0.005510 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM 모델의 ROC-AUC 점수: 0.9123
-------------------------- 2. XGBoost 최적 파라미터로 모델 학습 --------------------------
XGBoost 모델의 ROC-AUC 점수: 0.9323
-------------------------- 3. CatB

Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 9.41ms	remaining: 9.4s
100:	total: 766ms	remaining: 6.82s
200:	total: 1.5s	remaining: 5.97s
300:	total: 2.26s	remaining: 5.25s
400:	total: 3.02s	remaining: 4.51s
500:	total: 3.77s	remaining: 3.75s
600:	total: 4.52s	remaining: 3s
700:	total: 5.28s	remaining: 2.25s
800:	total: 6.04s	remaining: 1.5s
900:	total: 6.8s	remaining: 747ms
999:	total: 7.56s	remaining: 0us
CatBoost 모델의 ROC-AUC 점수: 0.9090
-------------------------- 6. 최종 모델 학습 및 제출 파일 생성 --------------------------
[LightGBM] [Info] Number of positive: 190123, number of negative: 190123
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 11627
[LightGBM] [Info] Number of data points in the train set: 380246, number of used features: 55
[LightGBM] [Info] Using requested OpenCL platform 0 device 2
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Inf

Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 9.15ms	remaining: 9.14s
100:	total: 755ms	remaining: 6.72s
200:	total: 1.5s	remaining: 5.97s
300:	total: 2.23s	remaining: 5.19s
400:	total: 2.95s	remaining: 4.41s
500:	total: 3.65s	remaining: 3.64s
600:	total: 4.36s	remaining: 2.9s
700:	total: 5.08s	remaining: 2.17s
800:	total: 5.79s	remaining: 1.44s
900:	total: 6.5s	remaining: 715ms
999:	total: 7.22s	remaining: 0us
[LightGBM] [Info] Number of positive: 126749, number of negative: 126748
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 11949
[LightGBM] [Info] Number of data points in the train set: 253497, number of used features: 55
[LightGBM] [Info] Using requested OpenCL platform 0 device 2
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 17 dense feature groups (4.84 MB) transferred to GPU 

Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 8.06ms	remaining: 8.05s
100:	total: 678ms	remaining: 6.03s
200:	total: 1.32s	remaining: 5.25s
300:	total: 1.97s	remaining: 4.57s
400:	total: 2.61s	remaining: 3.9s
500:	total: 3.24s	remaining: 3.23s
600:	total: 3.87s	remaining: 2.57s
700:	total: 4.5s	remaining: 1.92s
800:	total: 5.14s	remaining: 1.28s
900:	total: 5.76s	remaining: 633ms
999:	total: 6.39s	remaining: 0us


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 7.44ms	remaining: 7.43s
100:	total: 674ms	remaining: 6s
200:	total: 1.34s	remaining: 5.31s
300:	total: 2.01s	remaining: 4.67s
400:	total: 2.69s	remaining: 4.01s
500:	total: 3.36s	remaining: 3.35s
600:	total: 4.04s	remaining: 2.68s
700:	total: 4.74s	remaining: 2.02s
800:	total: 5.44s	remaining: 1.35s
900:	total: 6.15s	remaining: 675ms
999:	total: 6.85s	remaining: 0us


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 7.6ms	remaining: 7.59s
100:	total: 688ms	remaining: 6.12s
200:	total: 1.37s	remaining: 5.45s
300:	total: 2.06s	remaining: 4.78s
400:	total: 2.74s	remaining: 4.1s
500:	total: 3.4s	remaining: 3.39s
600:	total: 4.1s	remaining: 2.72s
700:	total: 4.77s	remaining: 2.03s
800:	total: 5.44s	remaining: 1.35s
900:	total: 6.14s	remaining: 675ms
999:	total: 6.84s	remaining: 0us




CatBoost 모델의 ROC-AUC 점수: 0.9284
최종 제출 파일 'stacking_ensemble_submit.csv' 생성 완료!


In [None]:
# NOTE(review): this whole cell is disabled (commented out). Before re-enabling:
#   - if it runs after the cell above, X_train_encoded / y are already
#     SMOTE-resampled, so validation folds would contain synthetic rows and the
#     fold scores would be optimistic -- resample inside each fold instead;
#   - the XGBoost model built here uses different settings from `xgb_est`;
#   - cv_scores_catb is collected but never printed in the summary.
# # Cross-validation setup
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# cv_scores_lgb = []
# cv_scores_xgb = []
# cv_scores_stack = []
# cv_scores_catb = []

# # Compare ROC-AUC across models via cross-validation
# for train_idx, val_idx in cv.split(X_train_encoded, y):
#     X_cv_train, X_cv_val = X_train_encoded.iloc[train_idx], X_train_encoded.iloc[val_idx]
#     y_cv_train, y_cv_val = y.iloc[train_idx], y.iloc[val_idx]
    
#     # LightGBM prediction
#     lgb_est.fit(X_cv_train, y_cv_train)
#     lgb_proba = lgb_est.predict_proba(X_cv_val)[:, 1]
#     auc_lgb = roc_auc_score(y_cv_val, lgb_proba)
#     cv_scores_lgb.append(auc_lgb)
    
#     # XGBoost prediction
#     xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
#     xgb_model.fit(X_cv_train, y_cv_train)
#     xgb_proba = xgb_model.predict_proba(X_cv_val)[:, 1]
#     auc_xgb = roc_auc_score(y_cv_val, xgb_proba)
#     cv_scores_xgb.append(auc_xgb)

#     # CatBoost prediction
#     catboost_model.fit(X_cv_train, y_cv_train)
#     cat_proba = catboost_model.predict_proba(X_cv_val)[:, 1]
#     auc_catb = roc_auc_score(y_cv_val, cat_proba)
#     cv_scores_catb.append(auc_catb)
    
#     # Stacking ensemble prediction
#     stacking_clf.fit(X_cv_train, y_cv_train)
#     stack_proba = stacking_clf.predict_proba(X_cv_val)[:, 1]
#     auc_stack = roc_auc_score(y_cv_val, stack_proba)
#     cv_scores_stack.append(auc_stack)

# # Print the mean ROC-AUC per model
# print("------------- 교차 검증 평균 ROC-AUC -------------")
# print(f"LightGBM: {np.mean(cv_scores_lgb):.4f}")
# print(f"XGBoost: {np.mean(cv_scores_xgb):.4f}")
# print(f"Stacking Ensemble: {np.mean(cv_scores_stack):.4f}")
# print("---------------------------------------------------")