In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from lightgbm import early_stopping

# Load the final train/test tables that already contain the combined
# derived features.
train_df_combo = pd.read_csv('../preprocessed_data/train_processed_combo.csv')
test_df_combo = pd.read_csv('../preprocessed_data/test_processed_combo.csv')

# Split the training table into the feature matrix (X) and the target (y).
# 'ID' is dropped from the features; 'stress_score' is the target.
X = train_df_combo.drop(['ID', 'stress_score'], axis=1)
y = train_df_combo['stress_score']

# Select the test features by the training column names, in the training
# column order, so the matrix later passed to predict() has the same layout
# as the one used for fitting regardless of how the test CSV orders its
# columns. A feature missing from the test file raises a KeyError here.
X_test = test_df_combo.drop('ID', axis=1)[X.columns]

# K-Fold cross-validation layout: 5 shuffled folds with a fixed seed.
n_splits = 5
kf = KFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)

# LightGBM hyperparameters (described by the author as the best ones found).
lgbm_params = dict(
    n_estimators=700,
    learning_rate=0.05,
    max_depth=15,
    random_state=42,
    n_jobs=-1,
)

# One validation RMSE is appended per fold.
rmse_scores = []
# Accumulator for the test predictions, one slot per test row.
test_preds = np.zeros(X_test.shape[0])

# K-Fold cross-validation: fit one model per fold, score it on the held-out
# rows, and add its share of the test prediction to the accumulator.
for fold, (fit_rows, holdout_rows) in enumerate(kf.split(X, y)):
    print(f"Fold {fold+1}/{n_splits} 시작")
    X_train, X_val = X.iloc[fit_rows], X.iloc[holdout_rows]
    y_train, y_val = y.iloc[fit_rows], y.iloc[holdout_rows]

    # Train LightGBM with the held-out fold as the evaluation set; the
    # early-stopping callback uses 100 rounds and verbose=False.
    lgbm_model = LGBMRegressor(**lgbm_params)
    lgbm_model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(100, verbose=False)],
    )

    # RMSE of this fold's model on its held-out rows.
    y_pred_val = lgbm_model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    rmse_scores.append(rmse)
    print(f"Fold {fold+1} RMSE: {rmse:.4f}")

    # Each fold contributes 1/n_splits of the final test prediction, so the
    # accumulator ends up as the mean over all fold models.
    test_preds += lgbm_model.predict(X_test) / n_splits

# Report the mean validation RMSE over all folds.
print(f"평균 교차 검증 RMSE: {np.mean(rmse_scores):.4f}")

# Build the submission file: use the sample submission as the template, set
# its 'stress_score' column to the averaged test predictions (assigned by
# position), and write it without the index.
submission_df = pd.read_csv('../data/sample_submission.csv').assign(
    stress_score=test_preds
)
submission_df.to_csv('submission_lgbm_best_combo.csv', index=False)

# Confirmation message for the notebook output.
print("최고점 데이터와 최적의 LightGBM 모델을 사용한 최종 제출 파일 'submission_lgbm_best_combo.csv'가 성공적으로 생성되었습니다.")

Fold 1/5 시작
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000332 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1766
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 53
[LightGBM] [Info] Start training from score 0.481912
Fold 1 RMSE: 0.2321
Fold 2/5 시작
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000316 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1767
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 53
[LightGBM] [Info] Start training from score 0.486092
Fold 2 RMSE: 0.2428
Fold 3/5 시작
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000266 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1769
[LightGBM] [Info] Number of data points in the