In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import prince

from sklearn.preprocessing import  OrdinalEncoder
from sklearn.ensemble import ExtraTreesClassifier

train = pd.read_csv('./train.csv').drop(columns=['ID'])
test = pd.read_csv('./test.csv').drop(columns=['ID'])

In [3]:
#'시술 시기 코드' 열 제거
train = train.drop(columns=['시술 시기 코드'])
test = test.drop(columns=['시술 시기 코드'])

#'임신 시도 또는 마지막 임신 경과 연수' 열 제거
train = train.drop(columns=['임신 시도 또는 마지막 임신 경과 연수'])
test = test.drop(columns=['임신 시도 또는 마지막 임신 경과 연수'])

#'시술 유형' 열 제거
train = train.drop(columns=['시술 유형'])
test = test.drop(columns=['시술 유형'])

#'특정 시술 유형' 그룹화
def categorize_treatment(treatment):
    if pd.isna(treatment) or "Unknown" in treatment:
        return "Unknown"
    elif "IVF" in treatment:
        return "IVF 기반"
    elif "ICSI" in treatment:
        return "ICSI 기반"
    else:
        return "기타"

# 새로운 그룹 컬럼 생성
train["특정 시술 유형"] = train["특정 시술 유형"].apply(categorize_treatment)

#'배란 유도 유형' 열 제거
train = train.drop(columns=['배란 유도 유형'])
test = test.drop(columns=['배란 유도 유형'])

#'착상 전 유전 검사 사용 여부' 열 제거
train = train.drop(columns=['착상 전 유전 검사 사용 여부'])
test = test.drop(columns=['착상 전 유전 검사 사용 여부'])

# 정자 면역학적 요인 drop
train = train.drop('불임 원인 - 정자 면역학적 요인', axis=1)
test = test.drop('불임 원인 - 정자 면역학적 요인', axis=1)

# 남성 요인 MCA
features_male = ["불임 원인 - 남성 요인", "불임 원인 - 정자 농도", "불임 원인 - 정자 운동성", "불임 원인 - 정자 형태"]
subset_train = train[features_male].copy()  # 선택한 feature만 사용
subset_test = test[features_male].copy()

# prince 라이브러리로 MCA 모델 훈련 (n_components=1로 차원 축소)
mca = prince.MCA(n_components=1)

# MCA 모델을 훈련시키고 차원 축소된 데이터 얻기
mca_result_train = mca.fit_transform(subset_train)
mca_result_test = mca.fit_transform(subset_test)

# 기존 남성 요인 feature 삭제
train = train.drop(columns=features_male)
test = test.drop(columns=features_male)

# 차원 축소된 데이터를 원본 데이터에 새로운 열로 추가
train["불임 원인 - 남성 요인"] = mca_result_train
test["불임 원인 - 남성 요인"] = mca_result_test

# 여성 요인 OR 진행
train['불임 원인 - 여성 요인'] = train[['불임 원인 - 난관 질환', '불임 원인 - 배란 장애', '불임 원인 - 자궁경부 문제', '불임 원인 - 자궁내막증']].any(axis=1).astype(int)
test['불임 원인 - 여성 요인'] = test[['불임 원인 - 난관 질환', '불임 원인 - 배란 장애', '불임 원인 - 자궁경부 문제', '불임 원인 - 자궁내막증']].any(axis=1).astype(int)

# 나머지 여성 feature drop
features_fe = ['불임 원인 - 난관 질환', '불임 원인 - 배란 장애', '불임 원인 - 자궁경부 문제', '불임 원인 - 자궁내막증']
train = train.drop(columns = features_fe)
test = test.drop(columns = features_fe)

train["배아 생성 주요 이유"] = train["배아 생성 주요 이유"].apply(lambda x: 1 if isinstance(x, str) and "현재 시술용" in x else 0)
test["배아 생성 주요 이유"] = test["배아 생성 주요 이유"].apply(lambda x: 1 if isinstance(x, str) and "현재 시술용" in x else 0)

# drop
train = train.drop(columns=['IVF 임신 횟수', 'IVF 출산 횟수', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수'])
test = test.drop(columns=['IVF 임신 횟수', 'IVF 출산 횟수', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수'])

# feature 53 ~ 62 전처리 진행

# 난자 출처 : '알 수 없음' 데이터 삭제
train = train[train['난자 출처'] != '알 수 없음']



# 정자 출처 : '미할당', '배우자 제공' 데이터 삭제
train = train[train['정자 출처'] != '미할당']
train = train[train['정자 출처'] != '배우자 및 기증 제공']



#drop
train.drop(['난자 기증자 나이'], axis=1, inplace=True)
train.drop(['정자 기증자 나이'], axis=1, inplace=True)
train.drop(['신선 배아 사용 여부'], axis=1, inplace=True)
train.drop(['기증 배아 사용 여부'], axis=1, inplace=True)
train.drop(['대리모 여부'], axis=1, inplace=True)
train.drop(['PGD 시술 여부'], axis=1, inplace=True)
train.drop(['PGS 시술 여부'], axis=1, inplace=True)


test.drop(['난자 기증자 나이'], axis=1, inplace=True)
test.drop(['정자 기증자 나이'], axis=1, inplace=True)
test.drop(['신선 배아 사용 여부'], axis=1, inplace=True)
test.drop(['기증 배아 사용 여부'], axis=1, inplace=True)
test.drop(['대리모 여부'], axis=1, inplace=True)
test.drop(['PGD 시술 여부'], axis=1, inplace=True)
test.drop(['PGS 시술 여부'], axis=1, inplace=True)

# '난자 채취 경과일' 열 제거
train = train.drop(columns=['난자 채취 경과일'])
test = test.drop(columns=['난자 채취 경과일'])

# '난자 해동 경과일' 열 제거
train = train.drop(columns=['난자 해동 경과일'])
test = test.drop(columns=['난자 해동 경과일'])

# '난자 혼합 경과일' 열 제거
train = train.drop(columns=['난자 혼합 경과일'])
test = test.drop(columns=['난자 혼합 경과일'])

# '배아 이식 경과일'의 평균값 계산 (결측치는 제외한 값으로 평균 계산), 반올림 처리
mean_value = round(train['배아 이식 경과일'].mean())  # 반올림 처리


# 결측치를 평균값으로 대체
train['배아 이식 경과일'].fillna(mean_value, inplace=True)

# '배아 이식 경과일'의 평균값 계산 (결측치는 제외한 값으로 평균 계산), 반올림 처리
mean_value = round(test['배아 이식 경과일'].mean())  # 반올림 처리

# 결측치를 평균값으로 대체
test['배아 이식 경과일'].fillna(mean_value, inplace=True)

# '배아 해동 경과일'별 데이터 개수 세기
total_counts_5 = train['배아 해동 경과일'].value_counts()
test_total_counts_5 = test['배아 해동 경과일'].value_counts()

# 특정 컬럼의 결측치 개수 계산
missing_count = train['배아 해동 경과일'].isna().sum()
test_missing_count = test['배아 해동 경과일'].isna().sum()

# 원본 데이터의 비율 계산
value_ratios = total_counts_5 / total_counts_5.sum()
test_value_ratios = test_total_counts_5 / test_total_counts_5.sum()

# 각 값에 대해 채울 개수 계산
fill_counts = (value_ratios * missing_count).round().astype(int)
test_fill_counts = (test_value_ratios * test_missing_count).round().astype(int)

# 결측치 샘플링
filled_values = np.concatenate([
    np.full(count, value) for value, count in fill_counts.items()
]) if len(fill_counts) > 0 else np.array([])

test_filled_values = np.concatenate([
    np.full(count, value) for value, count in test_fill_counts.items()
]) if len(test_fill_counts) > 0 else np.array([])

# 결측치 개수 맞추기
if len(filled_values) < missing_count:
    extra_values = np.random.choice(list(fill_counts.index), missing_count - len(filled_values), replace=True)
    filled_values = np.concatenate([filled_values, extra_values])
elif len(filled_values) > missing_count:
    filled_values = np.random.choice(filled_values, missing_count, replace=False)

if len(test_filled_values) < test_missing_count:
    extra_values = np.random.choice(list(test_fill_counts.index), test_missing_count - len(test_filled_values), replace=True)
    test_filled_values = np.concatenate([test_filled_values, extra_values])
elif len(test_filled_values) > test_missing_count:
    test_filled_values = np.random.choice(test_filled_values, test_missing_count, replace=False)

# 배열을 섞어 랜덤 배치
np.random.shuffle(filled_values)
np.random.shuffle(test_filled_values)

# 결측치 채우기
train.loc[train['배아 해동 경과일'].isna(), '배아 해동 경과일'] = filled_values
test.loc[test['배아 해동 경과일'].isna(), '배아 해동 경과일'] = test_filled_values

#난자 수, 배아수 관련 drop할 column들
columns_to_drop = [
    '미세주입된 난자 수', '미세주입 배아 이식 수', '저장된 배아 수', '미세주입 후 저장된 배아 수',
    '해동된 배아 수', '해동 난자 수', '수집된 신선 난자 수', '저장된 신선 난자 수',
    '혼합된 난자 수', '파트너 정자와 혼합된 난자 수', '기증자 정자와 혼합된 난자 수'
]

train = train.drop(columns=columns_to_drop)
test = test.drop(columns=columns_to_drop)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['배아 이식 경과일'].fillna(mean_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['배아 이식 경과일'].fillna(mean_value, inplace=True)


In [5]:
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

categorical_columns = [
    "시술 당시 나이",
    "특정 시술 유형",
    "배란 자극 여부",
    "단일 배아 이식 여부",
    "착상 전 유전 진단 사용 여부",
    "불임 원인 - 남성 요인",
    "불임 원인 - 여성 요인",
    "배아 생성 주요 이유",
    "총 시술 횟수",
    "DI 시술 횟수",
    "총 임신 횟수",
    "DI 임신 횟수",
    "총 출산 횟수",
    "DI 출산 횟수",
    "난자 출처",
    "정자 출처",
    "동결 배아 사용 여부",
    "남성 주 불임 원인",
    "남성 부 불임 원인",
    "여성 주 불임 원인",
    "여성 부 불임 원인",
    "부부 주 불임 원인",
    "부부 부 불임 원인",
    "불명확 불임 원인"
]

numeric_columns = [
    "총 생성 배아 수",
    "미세주입에서 생성된 배아 수",
    "이식된 배아 수",
    "배아 이식 경과일",
    "배아 해동 경과일"
]

# 범주형 데이터의 모든 자료형을 문자열로 변환
for col in categorical_columns:
    X[col] = X[col].astype(str)
    test[col] = test[col].astype(str)


# 범주의 정수화 (by sklearn 전처리)
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# test encoding 시 기존에 없던 범주를 발견하면 -1로 처리

X_train_encoded = X.copy()
X_train_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

X_test_encoded = test.copy()
X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

# 결측치 처리
X_train_encoded[numeric_columns] = X_train_encoded[numeric_columns].fillna(0)
X_test_encoded[numeric_columns] = X_test_encoded[numeric_columns].fillna(0)

In [None]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Optuna의 objective 함수 정의
def objective(trial, X_train_encoded, y):
    # 탐색할 하이퍼파라미터 정의
    param = {
        'n_estimators': trial.suggest_categorical('n_estimators', [100, 200, 300, 500, 1000]),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.05, log=True),  # 더 작은 범위로 조정
        'max_depth': trial.suggest_int('max_depth', 3, 9),  # 더 넓은 범위로
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),  # 더 좁은 범위
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),  # 더 좁은 범위
        'num_leaves': trial.suggest_int('num_leaves', 30, 60),  # 더 큰 범위
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 0.5, log=True),  # L1 정규화, 작은 범위
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 5, log=True),  # L2 정규화, 작은 범위
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),  # 최소한의 데이터 샘플
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 1e2, log=True)  # 자식 노드의 최소 가중치
    }

    # LGBMClassifier 모델 생성
    model = lgb.LGBMClassifier(
        random_state=42,
        **param
    )

    # 데이터셋을 훈련 세트와 검증 세트로 분리
    X_train_encoded, X_val_encoded, y_train, y_val = train_test_split(X_train_encoded, y, test_size=0.2, random_state=42)

    # 모델 훈련
    model.fit(X_train_encoded, y_train)

    # 검증 세트에서 예측 수행
    y_pred = model.predict_proba(X_val_encoded)[:, 1]

    # ROC-AUC 점수 반환
    return roc_auc_score(y_val, y_pred)

# Optuna study 생성 및 최적화 실행
study = optuna.create_study(direction='maximize')  # ROC-AUC를 최대화
study.optimize(lambda trial: objective(trial, X_train_encoded, y), n_trials=100)  # 100번의 실험 수행

# 최적의 하이퍼파라미터와 ROC-AUC 점수 출력
print("Best Parameters:", study.best_params)
print("Best ROC-AUC Score:", study.best_value)

[I 2025-02-16 21:22:16,283] A new study created in memory with name: no-name-1635fda3-bd94-4eb4-8f26-21fed2eeaa84


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017839 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:22:20,218] Trial 0 finished with value: 0.7255517670217648 and parameters: {'n_estimators': 500, 'learning_rate': 0.006513747122210546, 'max_depth': 5, 'subsample': 0.9822918096518112, 'colsample_bytree': 0.9461598349039417, 'num_leaves': 40, 'reg_alpha': 0.43105416700612936, 'reg_lambda': 0.36334735074610996, 'min_child_samples': 34, 'min_child_weight': 0.07802512210308159}. Best is trial 0 with value: 0.7255517670217648.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:22:23,151] Trial 1 finished with value: 0.7249632154623105 and parameters: {'n_estimators': 200, 'learning_rate': 0.007879568275751073, 'max_depth': 9, 'subsample': 0.8150568969470678, 'colsample_bytree': 0.7992155636706503, 'num_leaves': 36, 'reg_alpha': 0.06132792803826115, 'reg_lambda': 0.794520323923645, 'min_child_samples': 44, 'min_child_weight': 27.82501011998228}. Best is trial 0 with value: 0.7255517670217648.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018094 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:22:36,755] Trial 2 finished with value: 0.7249193340765692 and parameters: {'n_estimators': 1000, 'learning_rate': 0.0366514417555454, 'max_depth': 8, 'subsample': 0.772236881743983, 'colsample_bytree': 0.7035718860952913, 'num_leaves': 51, 'reg_alpha': 0.38150725963046705, 'reg_lambda': 0.5309043313199208, 'min_child_samples': 42, 'min_child_weight': 0.14771452467778073}. Best is trial 0 with value: 0.7255517670217648.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016296 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:22:44,938] Trial 3 finished with value: 0.7275029113200283 and parameters: {'n_estimators': 500, 'learning_rate': 0.008118082676554452, 'max_depth': 9, 'subsample': 0.8944618838856633, 'colsample_bytree': 0.7476521403624405, 'num_leaves': 56, 'reg_alpha': 0.02291131313291152, 'reg_lambda': 0.1998405300370868, 'min_child_samples': 71, 'min_child_weight': 0.45792971875981614}. Best is trial 3 with value: 0.7275029113200283.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017867 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:22:48,078] Trial 4 finished with value: 0.7259962495915699 and parameters: {'n_estimators': 200, 'learning_rate': 0.009919226131931707, 'max_depth': 8, 'subsample': 0.9275482043798371, 'colsample_bytree': 0.8357225264324276, 'num_leaves': 48, 'reg_alpha': 0.06838131045215852, 'reg_lambda': 0.014511854949252877, 'min_child_samples': 38, 'min_child_weight': 2.673333613071544}. Best is trial 3 with value: 0.7275029113200283.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018883 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:22:50,050] Trial 5 finished with value: 0.7276148767383782 and parameters: {'n_estimators': 100, 'learning_rate': 0.038688254788791274, 'max_depth': 9, 'subsample': 0.9080874821635619, 'colsample_bytree': 0.9020573031123819, 'num_leaves': 54, 'reg_alpha': 0.02563637986787885, 'reg_lambda': 0.24726042481976873, 'min_child_samples': 74, 'min_child_weight': 69.22367158371702}. Best is trial 5 with value: 0.7276148767383782.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015469 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:22:51,922] Trial 6 finished with value: 0.7261298003646885 and parameters: {'n_estimators': 300, 'learning_rate': 0.03384189401325302, 'max_depth': 3, 'subsample': 0.8256219416353459, 'colsample_bytree': 0.7600453406584572, 'num_leaves': 41, 'reg_alpha': 0.047962837718700024, 'reg_lambda': 4.523584427082471, 'min_child_samples': 95, 'min_child_weight': 3.1171742063432806}. Best is trial 5 with value: 0.7276148767383782.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:22:56,947] Trial 7 finished with value: 0.7272250191835615 and parameters: {'n_estimators': 1000, 'learning_rate': 0.017757742868199526, 'max_depth': 3, 'subsample': 0.7390362206733563, 'colsample_bytree': 0.9700231743255618, 'num_leaves': 36, 'reg_alpha': 0.017380983489789916, 'reg_lambda': 0.18844383128197356, 'min_child_samples': 43, 'min_child_weight': 33.366973928632795}. Best is trial 5 with value: 0.7276148767383782.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019584 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:23:03,131] Trial 8 finished with value: 0.7263254147724432 and parameters: {'n_estimators': 500, 'learning_rate': 0.004932483201561643, 'max_depth': 8, 'subsample': 0.839173090764897, 'colsample_bytree': 0.9201377328670759, 'num_leaves': 36, 'reg_alpha': 0.4257046103924453, 'reg_lambda': 0.1331120102419164, 'min_child_samples': 79, 'min_child_weight': 27.146115200185946}. Best is trial 5 with value: 0.7276148767383782.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022473 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:23:10,132] Trial 9 finished with value: 0.7274599138443376 and parameters: {'n_estimators': 500, 'learning_rate': 0.008161738908707796, 'max_depth': 9, 'subsample': 0.7208946888627332, 'colsample_bytree': 0.9397153616047617, 'num_leaves': 45, 'reg_alpha': 0.08491268401498224, 'reg_lambda': 1.565291165735074, 'min_child_samples': 91, 'min_child_weight': 1.2132929783421411}. Best is trial 5 with value: 0.7276148767383782.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018616 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:23:11,648] Trial 10 finished with value: 0.7210501540163693 and parameters: {'n_estimators': 100, 'learning_rate': 0.0014606397937435664, 'max_depth': 6, 'subsample': 0.9986019213690722, 'colsample_bytree': 0.882744764175709, 'num_leaves': 60, 'reg_alpha': 0.15767063399573508, 'reg_lambda': 0.022834887724550553, 'min_child_samples': 10, 'min_child_weight': 0.002834533540655384}. Best is trial 5 with value: 0.7276148767383782.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015520 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:23:13,500] Trial 11 finished with value: 0.7233745911655121 and parameters: {'n_estimators': 100, 'learning_rate': 0.0027201927047893976, 'max_depth': 7, 'subsample': 0.9055808822404526, 'colsample_bytree': 0.8659670169195781, 'num_leaves': 57, 'reg_alpha': 0.011941287344728239, 'reg_lambda': 0.06511370281131632, 'min_child_samples': 69, 'min_child_weight': 0.01186935563212548}. Best is trial 5 with value: 0.7276148767383782.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016227 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:23:15,889] Trial 12 finished with value: 0.7261634964350876 and parameters: {'n_estimators': 100, 'learning_rate': 0.019232056600302066, 'max_depth': 9, 'subsample': 0.8957953642121651, 'colsample_bytree': 0.7018747895247048, 'num_leaves': 54, 'reg_alpha': 0.028409696786233108, 'reg_lambda': 0.06180890976323309, 'min_child_samples': 66, 'min_child_weight': 93.32606971945135}. Best is trial 5 with value: 0.7276148767383782.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:23:19,613] Trial 13 finished with value: 0.7272547934901226 and parameters: {'n_estimators': 300, 'learning_rate': 0.04927659669235577, 'max_depth': 6, 'subsample': 0.9444731595312401, 'colsample_bytree': 0.781559076450512, 'num_leaves': 54, 'reg_alpha': 0.02604762769305331, 'reg_lambda': 0.10490647210360174, 'min_child_samples': 78, 'min_child_weight': 0.027693180023526803}. Best is trial 5 with value: 0.7276148767383782.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020464 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:23:21,724] Trial 14 finished with value: 0.7256593191513253 and parameters: {'n_estimators': 100, 'learning_rate': 0.016106876134819093, 'max_depth': 7, 'subsample': 0.880672259864128, 'colsample_bytree': 0.9021556841986508, 'num_leaves': 60, 'reg_alpha': 0.010677631755466996, 'reg_lambda': 1.2334158196066762, 'min_child_samples': 60, 'min_child_weight': 0.704700510420891}. Best is trial 5 with value: 0.7276148767383782.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:23:25,881] Trial 15 finished with value: 0.7232671705267029 and parameters: {'n_estimators': 500, 'learning_rate': 0.0033725928074157543, 'max_depth': 5, 'subsample': 0.8677817220003086, 'colsample_bytree': 0.9988552386457163, 'num_leaves': 49, 'reg_alpha': 0.028238055443780077, 'reg_lambda': 0.31735023718534927, 'min_child_samples': 82, 'min_child_weight': 7.158333383328283}. Best is trial 5 with value: 0.7276148767383782.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017724 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:23:27,957] Trial 16 finished with value: 0.722973194775476 and parameters: {'n_estimators': 100, 'learning_rate': 0.0014517222790497223, 'max_depth': 7, 'subsample': 0.952535072883016, 'colsample_bytree': 0.8353299160817629, 'num_leaves': 54, 'reg_alpha': 0.018585466559678947, 'reg_lambda': 0.04239228396079983, 'min_child_samples': 57, 'min_child_weight': 0.0010106560872868242}. Best is trial 5 with value: 0.7276148767383782.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017904 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[I 2025-02-16 21:23:35,091] Trial 17 finished with value: 0.7269775765482279 and parameters: {'n_estimators': 500, 'learning_rate': 0.024753318277983313, 'max_depth': 9, 'subsample': 0.7958927633724855, 'colsample_bytree': 0.7417702801386711, 'num_leaves': 47, 'reg_alpha': 0.042949238135606006, 'reg_lambda': 0.21103566002018465, 'min_child_samples': 99, 'min_child_weight': 0.35400203488780535}. Best is trial 5 with value: 0.7276148767383782.


[LightGBM] [Info] Number of positive: 52474, number of negative: 147465
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018831 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 199939, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262450 -> initscore=-1.033273
[LightGBM] [Info] Start training from score -1.033273


[W 2025-02-16 21:23:37,382] Trial 18 failed with parameters: {'n_estimators': 200, 'learning_rate': 0.012247397759418346, 'max_depth': 8, 'subsample': 0.8639265059112904, 'colsample_bytree': 0.810894357146377, 'num_leaves': 31, 'reg_alpha': 0.11502511823550932, 'reg_lambda': 2.6592427217244032, 'min_child_samples': 71, 'min_child_weight': 9.581174531363141} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\USER\anaconda3\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_30456\1682182320.py", line 42, in <lambda>
    study.optimize(lambda trial: objective(trial, X_train_encoded, y), n_trials=100)  # 100번의 실험 수행
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_30456\1682182320.py", line 32, in objective
    model.fit(X_train_e