In [None]:
import os
import pandas as pd
import lightgbm as lgb
import optuna
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# 1️⃣ 데이터 로드
train_file = ''
test_file = ''


train_data = TabularDataset(train_file)
test_data = TabularDataset(test_file)

# 2️⃣ 타겟 변수 설정
target_column = "임신 성공 여부"  # 사용자의 데이터에 맞게 수정 필요

if target_column not in train_data.columns:
    raise ValueError(f"'{target_column}' 컬럼이 데이터셋에 없습니다. 올바른 타겟 변수를 설정하세요.")

# 3️⃣ AutoGluon을 이용한 결측값 자동 채우기 (타겟 변수를 고려하여 학습)
predictor = TabularPredictor(label=target_column)  # 타겟 컬럼을 사용하여 결측값 채우기
predictor.fit(train_data)  # 🔹 학습 진행 (결측값 자동 처리 포함)

# 4️⃣ 결측값이 채워진 데이터 변환
processed_train = predictor.transform_features(train_data)  # 학습 데이터 변환
processed_test = predictor.transform_features(test_data)  # 테스트 데이터 변환

# 3️⃣ AutoGluon 모델 저장 (결측치 처리 모델)
autogluon_save_path = ""
predictor.save(autogluon_save_path)
print(f"🚀 AutoGluon 전처리 모델 저장 완료! 저장 경로: {autogluon_save_path}")









Loaded data from: /content/drive/MyDrive/Aimers6/train_3.csv | Columns = 61 / 61 | Rows = 256351 -> 256351
Loaded data from: /content/drive/MyDrive/Aimers6/test_3.csv | Columns = 60 / 60 | Rows = 90067 -> 90067
No path specified. Models will be saved in: "AutogluonModels/ag-20250223_004443"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       9.76 GB / 12.67 GB (77.0%)
Disk Space Avail:   73.37 GB / 107.72 GB (68.1%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy 

🚀 AutoGluon 전처리 모델 저장 완료! 저장 경로: /content/drive/MyDrive/Aimers6/autogluon_save.csv


In [None]:
processed_train = predictor.transform_features(train_data)
processed_train[target_column] = train_data[target_column].values

In [None]:
# 3️⃣ AutoGluon 모델 저장 (결측치 처리 모델)
autogluon_save_path = ""
predictor.save(autogluon_save_path)
print(f"🚀 AutoGluon 전처리 모델 저장 완료! 저장 경로: {autogluon_save_path}")

🚀 AutoGluon 전처리 모델 저장 완료! 저장 경로: /content/drive/MyDrive/Aimers6/autogluon_save.csv


In [None]:
# 5️⃣ 학습 & 검증 데이터 분할 (80% 학습, 20% 검증)
train_data, val_data = train_test_split(processed_train, test_size=0.2, random_state=42, stratify=processed_train["임신 성공 여부"])

# 6️⃣ LightGBM 학습을 위한 데이터 준비
X_train = train_data.drop(columns=[target_column])
y_train = train_data[target_column]

X_val = val_data.drop(columns=[target_column])
y_val = val_data[target_column]

train_dataset = lgb.Dataset(X_train, label=y_train)
val_dataset = lgb.Dataset(X_val, label=y_val, reference=train_dataset)

In [None]:
import lightgbm as lgb
print(lgb.__version__)


4.5.0


In [None]:
# 7️⃣ Optuna를 활용한 하이퍼파라미터 최적화 함수
def objective(trial):
    params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
    "num_leaves": trial.suggest_int("num_leaves", 20, 100),
    "max_depth": trial.suggest_int("max_depth", 3, 12),
    "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
    "subsample": trial.suggest_float("subsample", 0.5, 1.0),
    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
    "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
    "feature_pre_filter": False     # ← 추가된 부분
}


    model = lgb.train(
        params,
        train_dataset,
        valid_sets=[val_dataset],
        num_boost_round=100,
        callbacks=[lgb.early_stopping(10)]
    )
    y_val_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, y_val_pred)
    return auc


# 8️⃣ 최적의 하이퍼파라미터 탐색 (Optuna)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)  # 20번의 탐색 수행

best_params = study.best_params
print(f"🔍 최적의 하이퍼파라미터: {best_params}")

# 9️⃣ 최적의 하이퍼파라미터로 모델 학습
# Optuna에서 얻은 best_params에 feature_pre_filter 추가
best_params["feature_pre_filter"] = False

model = lgb.train(
    best_params,
    train_dataset,
    valid_sets=[val_dataset],
    num_boost_round=200,
    callbacks=[lgb.early_stopping(20)]
)


# 🔟 LightGBM 모델 저장
model_save_path = ""
model.save_model(model_save_path)
print(f"🚀 LightGBM 모델 저장 완료! 저장 경로: {model_save_path}")

# 1️⃣1️⃣ 검증 데이터 평가
y_val_pred = model.predict(X_val)
auc = roc_auc_score(y_val, y_val_pred)
print(f"검증 데이터 AUC: {auc:.4f}")

[I 2025-02-23 00:39:24,102] A new study created in memory with name: no-name-07791fe2-eedd-49b1-8776-87f32b3aca56


Training until validation scores don't improve for 10 rounds


[I 2025-02-23 00:39:26,329] Trial 0 finished with value: 0.7364812430691244 and parameters: {'learning_rate': 0.14836056560609476, 'num_leaves': 66, 'max_depth': 9, 'min_child_samples': 7, 'subsample': 0.9649765942851571, 'colsample_bytree': 0.7915792482018329, 'reg_alpha': 0.6082937083821309, 'reg_lambda': 0.36846628558851446}. Best is trial 0 with value: 0.7364812430691244.


Early stopping, best iteration is:
[39]	valid_0's auc: 0.736481
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[56]	valid_0's auc: 0.736426


[I 2025-02-23 00:39:32,361] Trial 1 finished with value: 0.7364262914198454 and parameters: {'learning_rate': 0.09713523082338398, 'num_leaves': 100, 'max_depth': 10, 'min_child_samples': 13, 'subsample': 0.5001141442324484, 'colsample_bytree': 0.6688536934557059, 'reg_alpha': 0.8093902598179066, 'reg_lambda': 0.2698113038809846}. Best is trial 0 with value: 0.7364812430691244.


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[98]	valid_0's auc: 0.736684


[I 2025-02-23 00:39:36,138] Trial 2 finished with value: 0.7366839594213896 and parameters: {'learning_rate': 0.06480417901742733, 'num_leaves': 24, 'max_depth': 9, 'min_child_samples': 12, 'subsample': 0.618718060019438, 'colsample_bytree': 0.8706749273422912, 'reg_alpha': 0.7948607240205208, 'reg_lambda': 0.6385109111883313}. Best is trial 2 with value: 0.7366839594213896.


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.736772


[I 2025-02-23 00:39:40,241] Trial 3 finished with value: 0.7367716729985745 and parameters: {'learning_rate': 0.05039871952099418, 'num_leaves': 30, 'max_depth': 12, 'min_child_samples': 48, 'subsample': 0.7900141481065297, 'colsample_bytree': 0.7940664959585371, 'reg_alpha': 0.9040430527153489, 'reg_lambda': 0.32633162886743694}. Best is trial 3 with value: 0.7367716729985745.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[83]	valid_0's auc: 0.73673


[I 2025-02-23 00:39:45,987] Trial 4 finished with value: 0.7367299956728406 and parameters: {'learning_rate': 0.12120456511571064, 'num_leaves': 41, 'max_depth': 5, 'min_child_samples': 11, 'subsample': 0.6614007921566347, 'colsample_bytree': 0.9188882483408831, 'reg_alpha': 0.01705363775322133, 'reg_lambda': 0.8193677293720205}. Best is trial 3 with value: 0.7367716729985745.


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[91]	valid_0's auc: 0.737024


[I 2025-02-23 00:39:49,972] Trial 5 finished with value: 0.7370236002423367 and parameters: {'learning_rate': 0.19866527241722814, 'num_leaves': 91, 'max_depth': 4, 'min_child_samples': 14, 'subsample': 0.5892309709242851, 'colsample_bytree': 0.5431078059533901, 'reg_alpha': 0.8227486591935316, 'reg_lambda': 0.607069116206866}. Best is trial 5 with value: 0.7370236002423367.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[39]	valid_0's auc: 0.736172


[I 2025-02-23 00:39:52,846] Trial 6 finished with value: 0.7361717107011478 and parameters: {'learning_rate': 0.13508693918959414, 'num_leaves': 93, 'max_depth': 11, 'min_child_samples': 46, 'subsample': 0.7865069044228719, 'colsample_bytree': 0.5759477574213205, 'reg_alpha': 0.8458098848157963, 'reg_lambda': 0.46585008203042977}. Best is trial 5 with value: 0.7370236002423367.


Training until validation scores don't improve for 10 rounds


[I 2025-02-23 00:39:54,707] Trial 7 finished with value: 0.7368164465414143 and parameters: {'learning_rate': 0.18972949459792057, 'num_leaves': 35, 'max_depth': 12, 'min_child_samples': 27, 'subsample': 0.9373404596493198, 'colsample_bytree': 0.8139573903164468, 'reg_alpha': 0.340615855447403, 'reg_lambda': 0.8887880334443373}. Best is trial 5 with value: 0.7370236002423367.


Early stopping, best iteration is:
[36]	valid_0's auc: 0.736816
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[96]	valid_0's auc: 0.736854


[I 2025-02-23 00:40:00,643] Trial 8 finished with value: 0.7368537391313498 and parameters: {'learning_rate': 0.09091000765603624, 'num_leaves': 75, 'max_depth': 5, 'min_child_samples': 40, 'subsample': 0.9583144576234333, 'colsample_bytree': 0.6994257861942199, 'reg_alpha': 0.20872604241874815, 'reg_lambda': 0.3973932621630133}. Best is trial 5 with value: 0.7370236002423367.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[86]	valid_0's auc: 0.737101


[I 2025-02-23 00:40:05,299] Trial 9 finished with value: 0.7371006383726624 and parameters: {'learning_rate': 0.05207700266245276, 'num_leaves': 79, 'max_depth': 12, 'min_child_samples': 36, 'subsample': 0.6967863403199377, 'colsample_bytree': 0.7537099043819682, 'reg_alpha': 0.8267558705867153, 'reg_lambda': 0.49510737360816537}. Best is trial 9 with value: 0.7371006383726624.


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.734818


[I 2025-02-23 00:40:11,839] Trial 10 finished with value: 0.7348183680027256 and parameters: {'learning_rate': 0.01852886915801795, 'num_leaves': 52, 'max_depth': 7, 'min_child_samples': 33, 'subsample': 0.7141969162912416, 'colsample_bytree': 0.9955908955663908, 'reg_alpha': 0.5954559233951567, 'reg_lambda': 0.05124856376111353}. Best is trial 9 with value: 0.7371006383726624.


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.736458


[I 2025-02-23 00:40:15,962] Trial 11 finished with value: 0.7364583475412869 and parameters: {'learning_rate': 0.18827248287541143, 'num_leaves': 82, 'max_depth': 3, 'min_child_samples': 22, 'subsample': 0.5644634560474039, 'colsample_bytree': 0.5112482824660532, 'reg_alpha': 0.9986060073568206, 'reg_lambda': 0.6595837770905377}. Best is trial 9 with value: 0.7371006383726624.


Training until validation scores don't improve for 10 rounds


[I 2025-02-23 00:40:16,805] Trial 12 finished with value: 0.7314199237748873 and parameters: {'learning_rate': 0.012702238479417371, 'num_leaves': 83, 'max_depth': 7, 'min_child_samples': 20, 'subsample': 0.7081263578101162, 'colsample_bytree': 0.6364906258120668, 'reg_alpha': 0.6697516330957684, 'reg_lambda': 0.6543921414606071}. Best is trial 9 with value: 0.7371006383726624.


Early stopping, best iteration is:
[4]	valid_0's auc: 0.73142
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.736049


[I 2025-02-23 00:40:19,373] Trial 13 finished with value: 0.7360487981287294 and parameters: {'learning_rate': 0.1602448238671745, 'num_leaves': 65, 'max_depth': 3, 'min_child_samples': 34, 'subsample': 0.5905198131272461, 'colsample_bytree': 0.7295944016598602, 'reg_alpha': 0.4485924974360949, 'reg_lambda': 0.5789255723198031}. Best is trial 9 with value: 0.7371006383726624.


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[99]	valid_0's auc: 0.735942


[I 2025-02-23 00:40:25,965] Trial 14 finished with value: 0.7359420595432627 and parameters: {'learning_rate': 0.05489113441366479, 'num_leaves': 88, 'max_depth': 5, 'min_child_samples': 37, 'subsample': 0.8593408805490277, 'colsample_bytree': 0.6011109318048031, 'reg_alpha': 0.7332345892334862, 'reg_lambda': 0.7752737034639886}. Best is trial 9 with value: 0.7371006383726624.


Training until validation scores don't improve for 10 rounds


[I 2025-02-23 00:40:27,052] Trial 15 finished with value: 0.7300640655861971 and parameters: {'learning_rate': 0.03590255495306022, 'num_leaves': 74, 'max_depth': 8, 'min_child_samples': 20, 'subsample': 0.506247880771242, 'colsample_bytree': 0.5255317816070448, 'reg_alpha': 0.9877052233447406, 'reg_lambda': 0.9934899610848401}. Best is trial 9 with value: 0.7371006383726624.


Early stopping, best iteration is:
[4]	valid_0's auc: 0.730064
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[76]	valid_0's auc: 0.736851


[I 2025-02-23 00:40:30,375] Trial 16 finished with value: 0.73685139637009 and parameters: {'learning_rate': 0.0843334619953663, 'num_leaves': 54, 'max_depth': 6, 'min_child_samples': 27, 'subsample': 0.6822325725488755, 'colsample_bytree': 0.7049779427132209, 'reg_alpha': 0.47029166946229756, 'reg_lambda': 0.20865584925949732}. Best is trial 9 with value: 0.7371006383726624.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[90]	valid_0's auc: 0.736957


[I 2025-02-23 00:40:33,320] Trial 17 finished with value: 0.7369569407429313 and parameters: {'learning_rate': 0.16940069696581958, 'num_leaves': 99, 'max_depth': 4, 'min_child_samples': 42, 'subsample': 0.610427751969973, 'colsample_bytree': 0.8648181492273216, 'reg_alpha': 0.6094857628058903, 'reg_lambda': 0.5510054719392503}. Best is trial 9 with value: 0.7371006383726624.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[46]	valid_0's auc: 0.7369


[I 2025-02-23 00:40:37,063] Trial 18 finished with value: 0.7368997992074915 and parameters: {'learning_rate': 0.11635699625796309, 'num_leaves': 75, 'max_depth': 10, 'min_child_samples': 32, 'subsample': 0.7712513157552451, 'colsample_bytree': 0.604808744774737, 'reg_alpha': 0.7188101723863091, 'reg_lambda': 0.47121410045000417}. Best is trial 9 with value: 0.7371006383726624.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[54]	valid_0's auc: 0.736464


[I 2025-02-23 00:40:41,737] Trial 19 finished with value: 0.7364642322399091 and parameters: {'learning_rate': 0.07245084340894582, 'num_leaves': 90, 'max_depth': 8, 'min_child_samples': 16, 'subsample': 0.8331724758180747, 'colsample_bytree': 0.755953867622671, 'reg_alpha': 0.8959937011914757, 'reg_lambda': 0.15029728409973175}. Best is trial 9 with value: 0.7371006383726624.


🔍 최적의 하이퍼파라미터: {'learning_rate': 0.05207700266245276, 'num_leaves': 79, 'max_depth': 12, 'min_child_samples': 36, 'subsample': 0.6967863403199377, 'colsample_bytree': 0.7537099043819682, 'reg_alpha': 0.8267558705867153, 'reg_lambda': 0.49510737360816537}
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[99]	valid_0's l2: 0.166392
🚀 LightGBM 모델 저장 완료! 저장 경로: /content/drive/MyDrive/model_save
검증 데이터 AUC: 0.7370


In [None]:
# 1️⃣2️⃣ 테스트 데이터 예측
X_test = processed_test.drop(columns=[target_column], errors="ignore")  # 타겟 컬럼 제거
y_test_pred = model.predict(X_test)

# 1️⃣3️⃣ 예측 결과 저장
submission = pd.DataFrame({target_column: y_test_pred})
submission.to_csv("", index=False)



In [None]:
# 7️⃣ **최적 모델로 테스트 데이터 예측**
y_test_pred_proba = predictor.predict_proba(test_data)

# 🔹 테스트 데이터 확률값 처리
if isinstance(y_test_pred_proba, pd.DataFrame):
    y_test_pred = y_test_pred_proba.iloc[:, 1]  # DataFrame이면 iloc 사용
else:
    y_test_pred = y_test_pred_proba[:, 1]  # numpy 배열이면 직접 사용

# 8️⃣ **최종 제출 파일 생성**
submission['probability'] = y_test_pred  # 확률값 그대로 저장
submission.to_csv('', index=False)
print("🚀 AutoGluon 최적 모델 예측 완료! 제출 파일 생성됨: submission_autogluon.csv")

🚀 AutoGluon 최적 모델 예측 완료! 제출 파일 생성됨: submission_autogluon.csv


In [None]:
# 1️⃣ 제출 파일 불러오기
submission = pd.read_csv('')

# 2️⃣ 기존에 있던 "probability" 컬럼 제거 (만약 존재한다면)
if 'probability' in submission.columns:
    submission.drop(columns=['probability'], inplace=True)

# 3️⃣ 컬럼명 변경: "임신 성공 여부" → "probability"
submission.rename(columns={'임신 성공 여부': 'probability'}, inplace=True)

# 4️⃣ 수정된 파일 저장
submission.to_csv('', index=False)

print("🚀 최종 제출 파일 생성 완료: submission_autogluon_final.csv")

# 5️⃣ Colab에서 다운로드 가능
from google.colab import files
files.download('')

In [None]:
binary_predictions = (y_test_pred >= 0.5).astype(int)
print(binary_predictions.sum())

4670


In [None]:
pip install optuna




In [None]:
pip install autogluon


Collecting autogluon
  Downloading autogluon-1.2-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.core==1.2 (from autogluon.core[all]==1.2->autogluon)
  Downloading autogluon.core-1.2-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.features==1.2 (from autogluon)
  Downloading autogluon.features-1.2-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.tabular==1.2 (from autogluon.tabular[all]==1.2->autogluon)
  Downloading autogluon.tabular-1.2-py3-none-any.whl.metadata (14 kB)
Collecting autogluon.multimodal==1.2 (from autogluon)
  Downloading autogluon.multimodal-1.2-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.timeseries==1.2 (from autogluon.timeseries[all]==1.2->autogluon)
  Downloading autogluon.timeseries-1.2-py3-none-any.whl.metadata (12 kB)
Collecting scikit-learn<1.5.3,>=1.4.0 (from autogluon.core==1.2->autogluon.core[all]==1.2->autogluon)
  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collec

In [None]:
pip install scikit-learn




In [None]:
from google.colab import drive
drive.mount('/content/drive') # 팝업허용 및 로그인

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
processed_test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90067 entries, 0 to 90066
Data columns (total 58 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   시술 유형              90067 non-null  int8    
 1   배란 자극 여부           90067 non-null  int8    
 2   단일 배아 이식 여부        87891 non-null  float64 
 3   남성 주 불임 원인         90067 non-null  int8    
 4   남성 부 불임 원인         90067 non-null  int8    
 5   여성 주 불임 원인         90067 non-null  int8    
 6   여성 부 불임 원인         90067 non-null  int8    
 7   부부 주 불임 원인         90067 non-null  int8    
 8   불명확 불임 원인          90067 non-null  int8    
 9   불임 원인 - 난관 질환      90067 non-null  int8    
 10  불임 원인 - 남성 요인      90067 non-null  int8    
 11  불임 원인 - 배란 장애      90067 non-null  int8    
 12  불임 원인 - 자궁내막증      90067 non-null  int8    
 13  총 생성 배아 수          87891 non-null  float64 
 14  미세주입된 난자 수         87891 non-null  float64 
 15  미세주입에서 생성된 배아 수    87891 non-null  float64 
 16  이식된 

In [None]:
!pip install --upgrade lightgbm



Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 4.5.0
    Uninstalling lightgbm-4.5.0:
      Successfully uninstalled lightgbm-4.5.0
Successfully installed lightgbm-4.6.0


In [None]:
import lightgbm as lgb
print(lgb.__version__)


4.5.0


In [None]:
!pip install --upgrade lightgbm --no-cache-dir




In [None]:
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# 1️⃣ 데이터 로드
train = pd.read_csv('')
test = pd.read_csv('')

# 2️⃣ 타겟 컬럼 설정 및 특징/타겟 분리
target = '임신 성공 여부'
X = train.drop(columns=[target])
y = train[target]

# 3️⃣ 학습 & 검증 데이터 분할 (80% 학습, 20% 검증)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 변환할 컬럼 목록
bad_columns = ['시술 시기 코드', '시술 당시 나이', '시술 유형', 'IVF 임신 횟수',
               'DI 임신 횟수', '총 출산 횟수', 'IVF 출산 횟수', '난자 출처',
               '정자 출처', '난자 기증자 나이', '정자 기증자 나이',
               '배아 이식 경과일', '시술 연속성']

# 학습 데이터와 검증 데이터의 해당 컬럼을 범주형으로 변환
for col in bad_columns:
    X_train[col] = X_train[col].astype('category')
    X_val[col] = X_val[col].astype('category')


# 4️⃣ Optuna Objective 함수 정의 (LightGBM 하이퍼파라미터 튜닝)
def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'n_estimators': 1000,  # 조기 종료를 위해 크게 설정
        'random_state': 42
    }
    model = lgb.LGBMClassifier(**param)
    model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(50)]
)

    y_pred_proba = model.predict_proba(X_val)[:, 1]
    auc_score = roc_auc_score(y_val, y_pred_proba)
    return auc_score

# 5️⃣ Optuna를 통한 하이퍼파라미터 최적화 (n_trials는 상황에 따라 조정)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("Best trial:")
best_trial = study.best_trial
print("  AUC: {:.4f}".format(best_trial.value))
print("  Best Parameters:")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))




[I 2025-02-23 01:46:00,319] A new study created in memory with name: no-name-afd3e810-001c-409b-b813-6bcddb222438


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.736915


[I 2025-02-23 01:46:58,767] Trial 0 finished with value: 0.736915355737874 and parameters: {'num_leaves': 43, 'max_depth': 7, 'learning_rate': 0.007052526219051578, 'min_child_samples': 33, 'subsample': 0.5071812214346069, 'colsample_bytree': 0.7848920906354371, 'reg_alpha': 5.894662818775544e-07, 'reg_lambda': 2.142612224754731e-08}. Best is trial 0 with value: 0.736915355737874.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.736607


[I 2025-02-23 01:48:06,079] Trial 1 finished with value: 0.7366068895248097 and parameters: {'num_leaves': 60, 'max_depth': 12, 'learning_rate': 0.0032572614489326124, 'min_child_samples': 93, 'subsample': 0.7399151645734068, 'colsample_bytree': 0.7770245466730308, 'reg_alpha': 0.0008126927485076498, 'reg_lambda': 7.538120879277646e-08}. Best is trial 0 with value: 0.736915355737874.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[216]	valid_0's auc: 0.736487


[I 2025-02-23 01:48:24,266] Trial 2 finished with value: 0.7364866453177583 and parameters: {'num_leaves': 94, 'max_depth': 12, 'learning_rate': 0.018498148834360684, 'min_child_samples': 100, 'subsample': 0.645819751288932, 'colsample_bytree': 0.969230228309435, 'reg_alpha': 6.909396059382877e-06, 'reg_lambda': 1.182794059849649e-06}. Best is trial 0 with value: 0.736915355737874.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[936]	valid_0's auc: 0.73667


[I 2025-02-23 01:49:03,662] Trial 3 finished with value: 0.7366704667048458 and parameters: {'num_leaves': 22, 'max_depth': 4, 'learning_rate': 0.01665165671202152, 'min_child_samples': 45, 'subsample': 0.923141871692328, 'colsample_bytree': 0.9112336678258346, 'reg_alpha': 0.005981671736573895, 'reg_lambda': 3.860717123122199e-07}. Best is trial 0 with value: 0.736915355737874.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[779]	valid_0's auc: 0.736825


[I 2025-02-23 01:49:46,453] Trial 4 finished with value: 0.736824863606127 and parameters: {'num_leaves': 32, 'max_depth': 6, 'learning_rate': 0.010234254006225596, 'min_child_samples': 8, 'subsample': 0.7302172023422439, 'colsample_bytree': 0.8441669377074317, 'reg_alpha': 1.0281874242972457e-07, 'reg_lambda': 0.001879151862763488}. Best is trial 0 with value: 0.736915355737874.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[127]	valid_0's auc: 0.736839


[I 2025-02-23 01:49:58,301] Trial 5 finished with value: 0.7368390452533125 and parameters: {'num_leaves': 84, 'max_depth': 15, 'learning_rate': 0.03622076600358538, 'min_child_samples': 16, 'subsample': 0.7213805104042881, 'colsample_bytree': 0.7584679872142213, 'reg_alpha': 1.625320002361135e-08, 'reg_lambda': 0.00012264558465852288}. Best is trial 0 with value: 0.736915355737874.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[768]	valid_0's auc: 0.737096


[I 2025-02-23 01:50:48,479] Trial 6 finished with value: 0.7370961801774005 and parameters: {'num_leaves': 129, 'max_depth': 6, 'learning_rate': 0.0102943518421667, 'min_child_samples': 90, 'subsample': 0.5301079648137378, 'colsample_bytree': 0.6336492525346324, 'reg_alpha': 0.04899419487541723, 'reg_lambda': 1.9365789350317248e-08}. Best is trial 6 with value: 0.7370961801774005.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[178]	valid_0's auc: 0.736906


[I 2025-02-23 01:51:01,661] Trial 7 finished with value: 0.7369060879331615 and parameters: {'num_leaves': 84, 'max_depth': 7, 'learning_rate': 0.039833021668690814, 'min_child_samples': 83, 'subsample': 0.7103832361439746, 'colsample_bytree': 0.9013260297370793, 'reg_alpha': 1.5931272470319042, 'reg_lambda': 6.962113162837368e-07}. Best is trial 6 with value: 0.7370961801774005.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.735706


[I 2025-02-23 01:51:42,053] Trial 8 finished with value: 0.7357063658481793 and parameters: {'num_leaves': 93, 'max_depth': 4, 'learning_rate': 0.007434303683929562, 'min_child_samples': 57, 'subsample': 0.9816536046948027, 'colsample_bytree': 0.9425386390505844, 'reg_alpha': 0.020850375708503805, 'reg_lambda': 4.079712371300023e-07}. Best is trial 6 with value: 0.7370961801774005.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.726189


[I 2025-02-23 01:52:31,345] Trial 9 finished with value: 0.7261885081008416 and parameters: {'num_leaves': 58, 'max_depth': 3, 'learning_rate': 0.0014793227724852972, 'min_child_samples': 32, 'subsample': 0.6264853271719599, 'colsample_bytree': 0.5974186939194737, 'reg_alpha': 0.000992970407888723, 'reg_lambda': 9.71188368194938e-07}. Best is trial 6 with value: 0.7370961801774005.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[42]	valid_0's auc: 0.736438


[I 2025-02-23 01:52:39,013] Trial 10 finished with value: 0.7364376627462145 and parameters: {'num_leaves': 148, 'max_depth': 10, 'learning_rate': 0.09289441510274869, 'min_child_samples': 70, 'subsample': 0.5168878628591072, 'colsample_bytree': 0.5958758272992222, 'reg_alpha': 1.4216605421168356, 'reg_lambda': 2.7882927965966346}. Best is trial 6 with value: 0.7370961801774005.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's auc: 0.736843


[I 2025-02-23 01:53:52,207] Trial 11 finished with value: 0.7368426130007566 and parameters: {'num_leaves': 142, 'max_depth': 7, 'learning_rate': 0.004199347453267928, 'min_child_samples': 35, 'subsample': 0.5285034833208084, 'colsample_bytree': 0.6513045609418014, 'reg_alpha': 5.9624693800228025e-06, 'reg_lambda': 1.2264406098378849e-08}. Best is trial 6 with value: 0.7370961801774005.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's auc: 0.736861


[I 2025-02-23 01:55:08,056] Trial 12 finished with value: 0.7368612597920721 and parameters: {'num_leaves': 122, 'max_depth': 9, 'learning_rate': 0.003919305704557461, 'min_child_samples': 61, 'subsample': 0.5901760509489071, 'colsample_bytree': 0.6757986452025206, 'reg_alpha': 4.072072252229103e-06, 'reg_lambda': 4.184730255820131e-05}. Best is trial 6 with value: 0.7370961801774005.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[89]	valid_0's auc: 0.730855


[I 2025-02-23 01:55:17,077] Trial 13 finished with value: 0.7308547346063461 and parameters: {'num_leaves': 119, 'max_depth': 6, 'learning_rate': 0.0011599823612239034, 'min_child_samples': 25, 'subsample': 0.8570605782945031, 'colsample_bytree': 0.6816589523267019, 'reg_alpha': 0.05698160253019941, 'reg_lambda': 0.019604763590335396}. Best is trial 6 with value: 0.7370961801774005.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[816]	valid_0's auc: 0.737227


[I 2025-02-23 01:56:26,997] Trial 14 finished with value: 0.7372272288817197 and parameters: {'num_leaves': 49, 'max_depth': 9, 'learning_rate': 0.008364671002619489, 'min_child_samples': 78, 'subsample': 0.5006296719645948, 'colsample_bytree': 0.5080574065690815, 'reg_alpha': 7.976738056516953e-05, 'reg_lambda': 1.0206476668579171e-08}. Best is trial 14 with value: 0.7372272288817197.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[450]	valid_0's auc: 0.737078


[I 2025-02-23 01:57:08,524] Trial 15 finished with value: 0.7370783454109625 and parameters: {'num_leaves': 64, 'max_depth': 9, 'learning_rate': 0.014278077763319243, 'min_child_samples': 80, 'subsample': 0.5853720415607836, 'colsample_bytree': 0.506129957466819, 'reg_alpha': 0.17190236667618272, 'reg_lambda': 1.7929155488781387e-05}. Best is trial 14 with value: 0.7372272288817197.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	valid_0's auc: 0.734053


[I 2025-02-23 01:57:18,465] Trial 16 finished with value: 0.7340532479853494 and parameters: {'num_leaves': 113, 'max_depth': 11, 'learning_rate': 0.0022864363661272698, 'min_child_samples': 73, 'subsample': 0.8102304190833856, 'colsample_bytree': 0.5253569649413661, 'reg_alpha': 0.00010750985209322862, 'reg_lambda': 0.7641509244187564}. Best is trial 14 with value: 0.7372272288817197.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[151]	valid_0's auc: 0.737165


[I 2025-02-23 01:57:34,664] Trial 17 finished with value: 0.7371649610669808 and parameters: {'num_leaves': 132, 'max_depth': 8, 'learning_rate': 0.03421698704818706, 'min_child_samples': 89, 'subsample': 0.5606355805146028, 'colsample_bytree': 0.5814730857171851, 'reg_alpha': 0.0001128604130754039, 'reg_lambda': 8.499168388903759e-06}. Best is trial 14 with value: 0.7372272288817197.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[201]	valid_0's auc: 0.737018


[I 2025-02-23 01:57:54,649] Trial 18 finished with value: 0.7370184878607741 and parameters: {'num_leaves': 105, 'max_depth': 15, 'learning_rate': 0.029898759786198555, 'min_child_samples': 67, 'subsample': 0.6596429145268587, 'colsample_bytree': 0.5588696836958322, 'reg_alpha': 8.173269171084589e-05, 'reg_lambda': 0.001558650264271045}. Best is trial 14 with value: 0.7372272288817197.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[69]	valid_0's auc: 0.736905


[I 2025-02-23 01:58:02,897] Trial 19 finished with value: 0.7369047924656004 and parameters: {'num_leaves': 73, 'max_depth': 8, 'learning_rate': 0.07948657658517846, 'min_child_samples': 82, 'subsample': 0.5856543954029689, 'colsample_bytree': 0.7158522576922395, 'reg_alpha': 2.372130137057812e-05, 'reg_lambda': 7.736924309646564e-06}. Best is trial 14 with value: 0.7372272288817197.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[295]	valid_0's auc: 0.737149


[I 2025-02-23 01:58:27,796] Trial 20 finished with value: 0.7371486620004024 and parameters: {'num_leaves': 42, 'max_depth': 12, 'learning_rate': 0.024670130775495482, 'min_child_samples': 99, 'subsample': 0.8178952713192191, 'colsample_bytree': 0.5729372105450198, 'reg_alpha': 0.003142996440990787, 'reg_lambda': 0.09003690543803919}. Best is trial 14 with value: 0.7372272288817197.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[136]	valid_0's auc: 0.737032


[I 2025-02-23 01:58:41,252] Trial 21 finished with value: 0.7370318942128139 and parameters: {'num_leaves': 42, 'max_depth': 13, 'learning_rate': 0.0502084766543651, 'min_child_samples': 100, 'subsample': 0.7903511399664711, 'colsample_bytree': 0.5653361695660569, 'reg_alpha': 0.0015482258465849537, 'reg_lambda': 0.04740097770801546}. Best is trial 14 with value: 0.7372272288817197.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[336]	valid_0's auc: 0.737427


[I 2025-02-23 01:59:07,530] Trial 22 finished with value: 0.7374266663609165 and parameters: {'num_leaves': 44, 'max_depth': 10, 'learning_rate': 0.024872783485083465, 'min_child_samples': 90, 'subsample': 0.8712250273596444, 'colsample_bytree': 0.5038568650513761, 'reg_alpha': 0.00018591024122716011, 'reg_lambda': 0.17670316730852656}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[89]	valid_0's auc: 0.736672


[I 2025-02-23 01:59:18,290] Trial 23 finished with value: 0.7366721086231185 and parameters: {'num_leaves': 52, 'max_depth': 10, 'learning_rate': 0.05695400425471503, 'min_child_samples': 89, 'subsample': 0.8846676194488312, 'colsample_bytree': 0.5213081642461509, 'reg_alpha': 5.488698360301668e-05, 'reg_lambda': 0.0008486466213493061}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[542]	valid_0's auc: 0.737243


[I 2025-02-23 01:59:51,008] Trial 24 finished with value: 0.7372433581973763 and parameters: {'num_leaves': 22, 'max_depth': 10, 'learning_rate': 0.02276949007696137, 'min_child_samples': 77, 'subsample': 0.9875267399965062, 'colsample_bytree': 0.618939055475643, 'reg_alpha': 8.781214287521604e-07, 'reg_lambda': 0.006170455023042319}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[631]	valid_0's auc: 0.737286


[I 2025-02-23 02:00:25,988] Trial 25 finished with value: 0.7372864372090844 and parameters: {'num_leaves': 20, 'max_depth': 10, 'learning_rate': 0.022647922444218763, 'min_child_samples': 49, 'subsample': 0.976986691880069, 'colsample_bytree': 0.6217048826181759, 'reg_alpha': 6.993934201588206e-07, 'reg_lambda': 0.005393311381155952}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[544]	valid_0's auc: 0.737223


[I 2025-02-23 02:00:57,163] Trial 26 finished with value: 0.7372233196470412 and parameters: {'num_leaves': 22, 'max_depth': 11, 'learning_rate': 0.023301746923385675, 'min_child_samples': 47, 'subsample': 0.9996831198281039, 'colsample_bytree': 0.6259725356855774, 'reg_alpha': 5.12386454608218e-07, 'reg_lambda': 0.010629001245163474}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[611]	valid_0's auc: 0.737155


[I 2025-02-23 02:01:30,490] Trial 27 finished with value: 0.7371545665529335 and parameters: {'num_leaves': 32, 'max_depth': 13, 'learning_rate': 0.013980036417271559, 'min_child_samples': 50, 'subsample': 0.9635898712382546, 'colsample_bytree': 0.7286004820267592, 'reg_alpha': 1.554488984770747e-08, 'reg_lambda': 0.2819807927343664}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[223]	valid_0's auc: 0.737067


[I 2025-02-23 02:01:44,090] Trial 28 finished with value: 0.7370672222584556 and parameters: {'num_leaves': 33, 'max_depth': 10, 'learning_rate': 0.059404460191118565, 'min_child_samples': 61, 'subsample': 0.9409570493257866, 'colsample_bytree': 0.6929644233972871, 'reg_alpha': 6.922384067372208e-07, 'reg_lambda': 5.127194964807356}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[410]	valid_0's auc: 0.736935


[I 2025-02-23 02:02:04,868] Trial 29 finished with value: 0.736935172917124 and parameters: {'num_leaves': 20, 'max_depth': 11, 'learning_rate': 0.022288832719785674, 'min_child_samples': 43, 'subsample': 0.8923741430562183, 'colsample_bytree': 0.7998376689347826, 'reg_alpha': 1.9018220528317527e-07, 'reg_lambda': 0.006608266472067094}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[669]	valid_0's auc: 0.737158


[I 2025-02-23 02:02:57,456] Trial 30 finished with value: 0.7371580459504825 and parameters: {'num_leaves': 35, 'max_depth': 14, 'learning_rate': 0.011927490229414668, 'min_child_samples': 65, 'subsample': 0.9247149605655659, 'colsample_bytree': 0.5407616981134964, 'reg_alpha': 2.0791232995947065e-06, 'reg_lambda': 0.22074658331624664}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	valid_0's auc: 0.732603


[I 2025-02-23 02:03:06,555] Trial 31 finished with value: 0.7326026290744813 and parameters: {'num_leaves': 45, 'max_depth': 9, 'learning_rate': 0.006088741284152748, 'min_child_samples': 75, 'subsample': 0.9572381751011088, 'colsample_bytree': 0.5063749947131442, 'reg_alpha': 2.296614686429684e-05, 'reg_lambda': 0.00024696979541687666}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[866]	valid_0's auc: 0.736989


[I 2025-02-23 02:04:10,483] Trial 32 finished with value: 0.7369889363099504 and parameters: {'num_leaves': 50, 'max_depth': 8, 'learning_rate': 0.006932352925965469, 'min_child_samples': 54, 'subsample': 0.8605404308362129, 'colsample_bytree': 0.6214080379372738, 'reg_alpha': 0.000330769308827882, 'reg_lambda': 0.006317688602563322}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[263]	valid_0's auc: 0.737145


[I 2025-02-23 02:04:35,990] Trial 33 finished with value: 0.7371446276860975 and parameters: {'num_leaves': 69, 'max_depth': 10, 'learning_rate': 0.018433188174496202, 'min_child_samples': 82, 'subsample': 0.8955005277300194, 'colsample_bytree': 0.5391765661095707, 'reg_alpha': 7.144069777479653e-08, 'reg_lambda': 0.04088059411011074}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's auc: 0.737123


[I 2025-02-23 02:05:33,413] Trial 34 finished with value: 0.7371227040071044 and parameters: {'num_leaves': 28, 'max_depth': 9, 'learning_rate': 0.008391266352709719, 'min_child_samples': 76, 'subsample': 0.767152033230242, 'colsample_bytree': 0.6512083273250818, 'reg_alpha': 1.0741945637047554e-06, 'reg_lambda': 0.8234703316777913}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.737064


[I 2025-02-23 02:06:42,962] Trial 35 finished with value: 0.7370643881129485 and parameters: {'num_leaves': 38, 'max_depth': 11, 'learning_rate': 0.005378985062109048, 'min_child_samples': 92, 'subsample': 0.9938326398788265, 'colsample_bytree': 0.6023930591331036, 'reg_alpha': 1.2392827268590453e-05, 'reg_lambda': 0.00047737014583876827}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[307]	valid_0's auc: 0.737145


[I 2025-02-23 02:07:06,498] Trial 36 finished with value: 0.7371453305144753 and parameters: {'num_leaves': 26, 'max_depth': 12, 'learning_rate': 0.02821486514662505, 'min_child_samples': 42, 'subsample': 0.6873747458463524, 'colsample_bytree': 0.5547073104132461, 'reg_alpha': 0.0004584905528234981, 'reg_lambda': 9.873483379960035e-08}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[364]	valid_0's auc: 0.737167


[I 2025-02-23 02:07:39,470] Trial 37 finished with value: 0.7371669553921381 and parameters: {'num_leaves': 53, 'max_depth': 8, 'learning_rate': 0.018063279400355712, 'min_child_samples': 95, 'subsample': 0.8465600139145777, 'colsample_bytree': 0.501731529445876, 'reg_alpha': 2.3211117049781435e-07, 'reg_lambda': 0.0037209748433599645}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[332]	valid_0's auc: 0.736891


[I 2025-02-23 02:08:04,644] Trial 38 finished with value: 0.7368911826109934 and parameters: {'num_leaves': 77, 'max_depth': 13, 'learning_rate': 0.01259597065832873, 'min_child_samples': 87, 'subsample': 0.9236874011482785, 'colsample_bytree': 0.8280764256085135, 'reg_alpha': 4.9324820840644455e-08, 'reg_lambda': 0.12402517843672195}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[223]	valid_0's auc: 0.736917


[I 2025-02-23 02:08:17,809] Trial 39 finished with value: 0.7369166899205576 and parameters: {'num_leaves': 27, 'max_depth': 10, 'learning_rate': 0.04173325589058552, 'min_child_samples': 78, 'subsample': 0.9695187152234194, 'colsample_bytree': 0.775840160368018, 'reg_alpha': 2.1740721470381452e-06, 'reg_lambda': 9.009189955514e-08}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[737]	valid_0's auc: 0.73702


[I 2025-02-23 02:09:06,804] Trial 40 finished with value: 0.7370197555328626 and parameters: {'num_leaves': 63, 'max_depth': 7, 'learning_rate': 0.009156641932267208, 'min_child_samples': 59, 'subsample': 0.9058182535371833, 'colsample_bytree': 0.6608908271884448, 'reg_alpha': 0.0067718838659263135, 'reg_lambda': 9.401467959552224e-05}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[526]	valid_0's auc: 0.737156


[I 2025-02-23 02:09:37,021] Trial 41 finished with value: 0.737155729992 and parameters: {'num_leaves': 21, 'max_depth': 11, 'learning_rate': 0.020699918847225166, 'min_child_samples': 50, 'subsample': 0.9866722744407503, 'colsample_bytree': 0.6370582207202362, 'reg_alpha': 2.569405423577271e-07, 'reg_lambda': 0.014017818882341835}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[536]	valid_0's auc: 0.7374


[I 2025-02-23 02:10:08,999] Trial 42 finished with value: 0.7373995637897657 and parameters: {'num_leaves': 26, 'max_depth': 11, 'learning_rate': 0.025841424153462043, 'min_child_samples': 46, 'subsample': 0.9982833182671234, 'colsample_bytree': 0.6170476012445443, 'reg_alpha': 5.347537765209642e-07, 'reg_lambda': 0.01441527869018412}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[459]	valid_0's auc: 0.737103


[I 2025-02-23 02:10:40,562] Trial 43 finished with value: 0.7371028937767228 and parameters: {'num_leaves': 38, 'max_depth': 9, 'learning_rate': 0.015475467607399854, 'min_child_samples': 38, 'subsample': 0.9464539254179557, 'colsample_bytree': 0.6130089857212626, 'reg_alpha': 1.271482439389527e-05, 'reg_lambda': 0.0021456571452398154}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[230]	valid_0's auc: 0.736961


[I 2025-02-23 02:10:59,839] Trial 44 finished with value: 0.7369611120492083 and parameters: {'num_leaves': 46, 'max_depth': 12, 'learning_rate': 0.029867194245033823, 'min_child_samples': 29, 'subsample': 0.9400027497192552, 'colsample_bytree': 0.5845930079509678, 'reg_alpha': 3.7091444663353125e-08, 'reg_lambda': 0.03188004717530164}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[567]	valid_0's auc: 0.737009


[I 2025-02-23 02:11:30,175] Trial 45 finished with value: 0.7370086174899237 and parameters: {'num_leaves': 27, 'max_depth': 10, 'learning_rate': 0.011278366479983468, 'min_child_samples': 21, 'subsample': 0.7559424356466077, 'colsample_bytree': 0.9938535117406113, 'reg_alpha': 2.4124200115984638e-06, 'reg_lambda': 0.4522450667691628}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[238]	valid_0's auc: 0.737078


[I 2025-02-23 02:11:44,684] Trial 46 finished with value: 0.7370782908127128 and parameters: {'num_leaves': 38, 'max_depth': 12, 'learning_rate': 0.041591673631721386, 'min_child_samples': 55, 'subsample': 0.967162743018923, 'colsample_bytree': 0.7476481492338078, 'reg_alpha': 6.3089185575238655e-06, 'reg_lambda': 2.609192779884718}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[215]	valid_0's auc: 0.736969


[I 2025-02-23 02:12:00,211] Trial 47 finished with value: 0.736969131043046 and parameters: {'num_leaves': 57, 'max_depth': 9, 'learning_rate': 0.026974558255036078, 'min_child_samples': 6, 'subsample': 0.9165672694241035, 'colsample_bytree': 0.7111549866874063, 'reg_alpha': 1.0410411711917225e-06, 'reg_lambda': 0.0009659534470525425}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[47]	valid_0's auc: 0.731276


[I 2025-02-23 02:12:08,119] Trial 48 finished with value: 0.731275712921609 and parameters: {'num_leaves': 30, 'max_depth': 10, 'learning_rate': 0.002548553076049082, 'min_child_samples': 71, 'subsample': 0.8803626141460202, 'colsample_bytree': 0.5372977017487901, 'reg_alpha': 0.00024750383237826104, 'reg_lambda': 2.0505496845054537e-06}. Best is trial 22 with value: 0.7374266663609165.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[711]	valid_0's auc: 0.737069


[I 2025-02-23 02:12:42,336] Trial 49 finished with value: 0.7370692066566584 and parameters: {'num_leaves': 25, 'max_depth': 5, 'learning_rate': 0.017288229286515255, 'min_child_samples': 85, 'subsample': 0.8408610038870797, 'colsample_bytree': 0.8655249022493532, 'reg_alpha': 3.735126863128714e-07, 'reg_lambda': 0.08557085951988354}. Best is trial 22 with value: 0.7374266663609165.


Best trial:
  AUC: 0.7374
  Best Parameters:
    num_leaves: 44
    max_depth: 10
    learning_rate: 0.024872783485083465
    min_child_samples: 90
    subsample: 0.8712250273596444
    colsample_bytree: 0.5038568650513761
    reg_alpha: 0.00018591024122716011
    reg_lambda: 0.17670316730852656


In [None]:
# 6️⃣ 최적 파라미터로 최종 모델 학습
best_params = best_trial.params
# 추가로 고정 파라미터 설정
best_params.update({
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'n_estimators': 1000,
    'random_state': 42
})

final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(50)]
)


# 7️⃣ 검증 데이터 평가
y_val_pred_proba = final_model.predict_proba(X_val)[:, 1]
y_val_pred = (y_val_pred_proba >= 0.5).astype(int)
accuracy = accuracy_score(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)
auc = roc_auc_score(y_val, y_val_pred_proba)

print("Validation Metrics:")
print(f"🔹 Accuracy: {accuracy:.4f}")
print(f"🔹 F1-score: {f1:.4f}")
print(f"🔹 AUC: {auc:.4f}")

# 테스트 데이터 전처리: 학습 데이터와 동일한 카테고리로 설정
for col in bad_columns:
    if col in test.columns:
        test[col] = test[col].astype('category')
        test[col] = test[col].cat.set_categories(X_train[col].cat.categories)


# 8️⃣ 테스트 데이터 예측
# 테스트 데이터는 타겟 컬럼이 없으므로, 바로 예측 진행
y_test_pred_proba = final_model.predict_proba(test)[:, 1]

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[336]	valid_0's auc: 0.737427
Validation Metrics:
🔹 Accuracy: 0.7459
🔹 F1-score: 0.1800
🔹 AUC: 0.7374


In [None]:
# 9️⃣ 제출 파일 생성
submission = pd.read_csv('/')
submission[target] = y_test_pred_proba
submission.to_csv('', index=False)

print("🚀 LightGBM 최적 모델 예측 완료! 제출 파일 생성됨: submission_lightgbm.csv")

# 10️⃣ Colab에서 제출 파일 다운로드
from google.colab import files
files.download('')


🚀 LightGBM 최적 모델 예측 완료! 제출 파일 생성됨: submission_lightgbm.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# 1️⃣ 제출 파일 불러오기
submission = pd.read_csv('')

# 2️⃣ 기존에 있던 "probability" 컬럼 제거 (만약 존재한다면)
if 'probability' in submission.columns:
    submission.drop(columns=['probability'], inplace=True)

# 3️⃣ 컬럼명 변경: "임신 성공 여부" → "probability"
submission.rename(columns={'임신 성공 여부': 'probability'}, inplace=True)

# 4️⃣ 수정된 파일 저장
submission.to_csv('', index=False)

print("🚀 최종 제출 파일 생성 완료: submission_lightgbm_final.csv")

# 5️⃣ Colab에서 다운로드 가능
from google.colab import files
files.download('')

🚀 최종 제출 파일 생성 완료: submission_lightgbm_final.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
binary_predictions = (y_test_pred >= 0.5).astype(int)
print(binary_predictions.sum())

4749
