In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import prince

from sklearn.preprocessing import  OrdinalEncoder
from sklearn.ensemble import ExtraTreesClassifier

train = pd.read_csv('./train.csv').drop(columns=['ID'])
test = pd.read_csv('./test.csv').drop(columns=['ID'])

In [9]:
#'임신 시도 또는 마지막 임신 경과 연수' 열 제거
train = train.drop(columns=['난자 해동 경과일'])
test = test.drop(columns=['난자 해동 경과일'])

#'시술 유형' 열 제거
train = train.drop(columns=['PGS 시술 여부'])
test = test.drop(columns=['PGS 시술 여부'])

#'시술 유형' 열 제거
train = train.drop(columns=['PGD 시술 여부'])
test = test.drop(columns=['PGD 시술 여부'])

#'시술 유형' 열 제거
train = train.drop(columns=['착상 전 유전 검사 사용 여부'])
test = test.drop(columns=['착상 전 유전 검사 사용 여부'])

#'시술 유형' 열 제거
train = train.drop(columns=['임신 시도 또는 마지막 임신 경과 연수'])
test = test.drop(columns=['임신 시도 또는 마지막 임신 경과 연수'])

#'시술 유형' 열 제거
train = train.drop(columns=['배아 해동 경과일'])
test = test.drop(columns=['배아 해동 경과일'])

In [10]:
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

# 범주형 컬럼과 수치형 컬럼 자동 추출
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numeric_columns = X.select_dtypes(include=['number']).columns.tolist()

# 범주형 데이터의 모든 자료형을 문자열로 변환
for col in categorical_columns:
    X[col] = X[col].astype(str)
    test[col] = test[col].astype(str)

# 범주의 정수화 (by sklearn 전처리)
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train_encoded = X.copy()
X_train_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

X_test_encoded = test.copy()
X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

# 결측치 처리
#X_train_encoded[numeric_columns] = X_train_encoded[numeric_columns].fillna(0)
#X_test_encoded[numeric_columns] = X_test_encoded[numeric_columns].fillna(0)

In [11]:
# 1. 라이브러리 import
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import numpy as np

# 2. Stratified K-Fold 설정 (데이터 불균형 고려)
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
auc_scores = []

# 3. LightGBM 모델 설정
model = lgb.LGBMClassifier(random_state=42, objective='binary')

# 4. Cross Validation 수행 
for train_idx, val_idx in kf.split(X_train_encoded, y):
    X_train_part, X_val = X_train_encoded.iloc[train_idx], X_train_encoded.iloc[val_idx]
    y_train_part, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # 모델 학습
    model.fit(X_train_part, y_train_part)
    
    # Validation 데이터 예측 확률
    val_pred_proba = model.predict_proba(X_val)[:, 1]
    
    # ROC-AUC 점수 계산
    roc_auc = roc_auc_score(y_val, val_pred_proba)
    auc_scores.append(roc_auc)

# 5. 평균 ROC-AUC 점수 출력
mean_auc = np.mean(auc_scores)
print(f"ROC score: {mean_auc:.10f}") # dacon과 같이 소수점 10자리 출력!

[LightGBM] [Info] Number of positive: 59605, number of negative: 171110
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033599 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 700
[LightGBM] [Info] Number of data points in the train set: 230715, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258349 -> initscore=-1.054567
[LightGBM] [Info] Start training from score -1.054567
[LightGBM] [Info] Number of positive: 59606, number of negative: 171110
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032308 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 706
[LightGBM] [Info] Number of data points in the train set: 230716, number of used features: 58
[LightGBM] [Info

***