# Load packages

In [5]:
import os
import time

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from summarytools import dfSummary
from tqdm import tqdm
from xgboost import XGBClassifier


# 데이터 로드

In [6]:
# Read the preprocessed train/test tables; the first CSV column becomes the row index.
train, test = (
    pd.read_csv("datasets/train_Preprocessed.csv", index_col=0),
    pd.read_csv("datasets/test_Preprocessed.csv", index_col=0),
)

In [7]:
# Split the training table into the target column (y) and the remaining feature columns (X).
y = train['임신 성공 여부']
X = train.drop(columns=y.name)  # y.name is the target column's label

# Grid search
- Best params found for CatBoost (3-fold `GridSearchCV`, scored by ROC AUC): {'depth': 6, 'iterations': 500, 'learning_rate': 0.05}

In [None]:
# 3) Hyperparameter candidates, keyed by model name.
# The CatBoost grid is pinned to a single combination (depth=6, learning_rate=0.05,
# iterations=500), so GridSearchCV evaluates exactly one candidate.
#
# Disabled grids, kept for reference (re-enable together with the matching entry in `models`):
#   XGBoost:      max_depth 1..8; learning_rate 0.075 / 0.08 / 0.085;
#                 n_estimators 300 / 400 / 500 / 600 / 700 / 800
#   RandomForest: max_depth 10 / 20 / 30 / 40 / 50;
#                 n_estimators 100 / 300 / 500 / 700 / 1000;
#                 min_samples_split 2 / 5 / 10 / 15 / 20
param_grid = dict(
    CatBoost=dict(
        depth=[6],
        learning_rate=[0.05],
        iterations=[500],
    ),
)

# 4) Base estimators, keyed by the same names as `param_grid`.
# Disabled alternatives: XGBClassifier(eval_metric="logloss", random_state=42),
# RandomForestClassifier(random_state=42).
models = dict(
    CatBoost=CatBoostClassifier(verbose=0, random_state=42),
)

In [9]:
# Number of parallel workers handed to GridSearchCV (adjust to the machine's CPU core count).
# NOTE(review): each CatBoost worker may itself run multi-threaded, so a large value here
# could oversubscribe the CPU — confirm on the target machine.
N_JOBS = 30

# 5) Grid search, then a 10-fold stratified cross-validation of the winning configuration.
best_models = {}   # model name -> best estimator, refit by GridSearchCV on all of X, y
cv_results = {}    # model name -> (mean AUC, std AUC) across the CV folds
auc_results = {}   # model name -> per-fold AUC scores, in fold order

start_time = time.time()  # wall-clock start for the whole run

for model_name, model in models.items():
    print(f"\n🚀 {model_name} 모델 GridSearch 시작...")
    model_start = time.time()  # wall-clock start for this model

    grid_search = GridSearchCV(
        model, param_grid[model_name],
        scoring="roc_auc", cv=3, n_jobs=N_JOBS, verbose=3
    )

    # fit() is one blocking call, so a tqdm bar around it can only jump from 0 to a single
    # tick when everything is done; the previous bar also hard-coded CatBoost's grid keys and
    # would raise KeyError for any other model. verbose=3 already logs per-fit progress.
    grid_search.fit(X, y)

    # Keep the estimator that GridSearchCV refit on the full data.
    best_models[model_name] = grid_search.best_estimator_
    print(f"✅ {model_name} 최적 파라미터: {grid_search.best_params_}")

    print(f"📌 {model_name} 10-Fold Cross Validation 시작...")
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, val_idx in tqdm(
        skf.split(X, y), total=skf.get_n_splits(), desc=f"🔄 {model_name} CV Progress"
    ):
        X_t, X_v = X.iloc[train_idx], X.iloc[val_idx]
        y_t, y_v = y.iloc[train_idx], y.iloc[val_idx]

        # Train a fresh, unfitted copy per fold. Refitting best_estimator_ directly would
        # overwrite the full-data model stored in best_models with one trained on a
        # single fold's subset, and would rebind the outer loop variable `model`.
        fold_model = clone(grid_search.best_estimator_)
        fold_model.fit(X_t, y_t)
        y_pred = fold_model.predict_proba(X_v)[:, 1]  # probability of the positive class
        auc_scores.append(roc_auc_score(y_v, y_pred))

    # Print the scores sorted so the spread across folds is easy to read.
    sorted_auc = sorted(auc_scores)
    print(f"\n📊 {model_name} 10-Fold AUC Scores (정렬됨) = {sorted_auc}")

    # Summary statistics plus the raw per-fold scores (unsorted, fold order).
    mean_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)
    cv_results[model_name] = (mean_auc, std_auc)
    auc_results[model_name] = auc_scores

    print(f"📊 {model_name} CV AUC: 평균={mean_auc:.6f}, 표준편차={std_auc:.6f}")

    model_end = time.time()
    print(f"🕒 {model_name} 학습 완료! 소요 시간: {model_end - model_start:.2f}초\n")

total_time = time.time() - start_time
print(f"\n🚀 전체 학습 완료! 총 소요 시간: {total_time:.2f}초")


🚀 CatBoost 모델 GridSearch 시작...


🔍 GridSearch CatBoost:   0%|          | 0/80 [00:00<?, ?it/s]

Fitting 3 folds for each of 80 candidates, totalling 240 fits
[CV 1/3] END depth=3, iterations=500, learning_rate=0.07222222222222222;, score=0.736 total time= 2.6min
[CV 2/3] END depth=3, iterations=500, learning_rate=0.08333333333333333;, score=0.736 total time= 2.7min
[CV 1/3] END depth=3, iterations=500, learning_rate=0.09444444444444444;, score=0.736 total time= 2.8min
[CV 1/3] END depth=3, iterations=500, learning_rate=0.12777777777777777;, score=0.736 total time= 2.8min
[CV 2/3] END depth=3, iterations=500, learning_rate=0.05;, score=0.736 total time= 3.0min
[CV 2/3] END depth=3, iterations=500, learning_rate=0.09444444444444444;, score=0.736 total time= 3.1min
[CV 2/3] END depth=3, iterations=500, learning_rate=0.15;, score=0.736 total time= 3.5min
[CV 1/3] END depth=3, iterations=500, learning_rate=0.061111111111111116;, score=0.736 total time= 3.6min
[CV 3/3] END depth=3, iterations=500, learning_rate=0.1388888888888889;, score=0.735 total time= 3.6min
[CV 1/3] END depth=3, i



[CV 2/3] END depth=4, iterations=500, learning_rate=0.09444444444444444;, score=0.736 total time= 4.7min
[CV 3/3] END depth=5, iterations=500, learning_rate=0.05;, score=0.736 total time= 3.6min
[CV 1/3] END depth=5, iterations=500, learning_rate=0.061111111111111116;, score=0.736 total time= 3.5min
[CV 3/3] END depth=4, iterations=500, learning_rate=0.09444444444444444;, score=0.735 total time= 4.7min
[CV 3/3] END depth=4, iterations=500, learning_rate=0.10555555555555556;, score=0.735 total time= 4.7min
[CV 1/3] END depth=4, iterations=500, learning_rate=0.09444444444444444;, score=0.736 total time= 5.3min
[CV 2/3] END depth=4, iterations=500, learning_rate=0.10555555555555556;, score=0.736 total time= 5.1min
[CV 2/3] END depth=5, iterations=500, learning_rate=0.061111111111111116;, score=0.737 total time= 3.7min
[CV 2/3] END depth=4, iterations=500, learning_rate=0.12777777777777777;, score=0.736 total time= 5.6min
[CV 2/3] END depth=4, iterations=500, learning_rate=0.11666666666666

🔍 GridSearch CatBoost:   1%|▏         | 1/80 [54:01<71:07:21, 3241.04s/it]


✅ CatBoost 최적 파라미터: {'depth': 6, 'iterations': 500, 'learning_rate': 0.05}
📌 CatBoost 10-Fold Cross Validation 시작...


🔄 CatBoost CV Progress: 100%|██████████| 10/10 [01:05<00:00,  6.56s/it]


📊 CatBoost 10-Fold AUC Scores (정렬됨) = [0.7318032431770076, 0.7322440729662876, 0.7332837308082667, 0.7351890481972165, 0.7356152270134548, 0.7378390872620723, 0.7392191523052265, 0.7392759172021723, 0.739512002881085, 0.7421287720346093]
📊 CatBoost CV AUC: 평균=0.736611, 표준편차=0.003325
🕒 CatBoost 학습 완료! 소요 시간: 3306.65초


🚀 전체 학습 완료! 총 소요 시간: 3306.66초





In [12]:
# Tabulate the per-fold AUC scores: one column per model, one row per fold.
# (Display only — nothing is written to disk in this cell.)
auc_df = pd.DataFrame(data=auc_results)
display(auc_df)

Unnamed: 0,CatBoost
0,0.732244
1,0.731803
2,0.739219
3,0.735189
4,0.739276
5,0.733284
6,0.739512
7,0.735615
8,0.742129
9,0.737839


# Finalize

In [14]:
# 7) Fit the final model on the complete training set with the selected hyperparameters.
print("\n🚀 최적 하이퍼파라미터로 Final Model 학습 시작...")

# Rebuild the full feature matrix and target from `train`.
y = train['임신 성공 여부']
X = train.drop(columns=['임신 성공 여부'])

# Start from the tuned estimator's parameters, then force quiet logging, a fixed seed,
# and the thread count used for this single (non-parallelised) fit.
best_params = {
    **best_models["CatBoost"].get_params(),
    "verbose": 0,
    "random_state": 42,
    "thread_count": N_JOBS,
}

final_model = CatBoostClassifier(**best_params)
final_model.fit(X, y)

print("✅ Final Model 학습 완료!")

# 8) Score the test set with predict_proba.
print("\n🚀 Test 데이터 예측 (predict_proba) 시작...")

# Reload the preprocessed test table and work on a copy of it.
test = pd.read_csv("datasets/test_Preprocessed.csv", index_col=0)
X_test = test.copy()

# Column 1 of predict_proba is the probability of the positive class (label 1).
positive_class = 1
test_preds = final_model.predict_proba(X_test)[:, positive_class]


🚀 최적 하이퍼파라미터로 Final Model 학습 시작...
✅ Final Model 학습 완료!

🚀 Test 데이터 예측 (predict_proba) 시작...


# Submission

In [15]:
from datetime import datetime

# Fill the submission template's `probability` column with the test-set predictions
# (values are assigned by row position).
sample_submission = pd.read_csv('./submit/sample_submission.csv').assign(
    probability=test_preds
)
display(sample_submission)

Unnamed: 0,ID,probability
0,TEST_00000,0.002888
1,TEST_00001,0.014702
2,TEST_00002,0.161198
3,TEST_00003,0.107650
4,TEST_00004,0.510148
...,...,...
90062,TEST_90062,0.001480
90063,TEST_90063,0.307639
90064,TEST_90064,0.448510
90065,TEST_90065,0.227103


In [16]:
# 제출할 파일명
today = datetime.now().strftime('%m%d_%H%M%S')
file_name = f"./submit/submit_jiho_{today}.csv"
sample_submission.to_csv(file_name, index=False)