In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# 1. Load the data
# The three CSVs are read relative to the current working directory.
try:
    train_df = pd.read_csv('../data/train.csv')
    test_df = pd.read_csv('../data/test.csv')
    submission_df = pd.read_csv('../data/sample_submission.csv')
except FileNotFoundError:
    print("파일 경로를 다시 확인해주세요.")
    # `exit()` is an interactive helper injected by the `site` module and
    # reports success (status 0). Raising SystemExit works in every
    # interpreter setup and signals the failure with a non-zero status.
    raise SystemExit(1) from None

# 2. Preprocessing and feature-engineering function (based on the best-scoring model)
def preprocess_and_feature_engineer(df):
    """Return a cleaned, one-hot-encoded copy of ``df`` with derived features.

    The input frame is not modified. Steps, in order:

    1. Drop the ``gender`` column when present.
    2. Replace negative ``bone_density`` values with 0.
    3. Fill missing ``medical_history``, ``family_medical_history`` and
       ``edu_level`` with the literal ``'unknown'``.
    4. Impute missing ``mean_working`` with the median of its
       (``smoke_status``, ``edu_level``) group, then with the overall median
       for anything still missing.
    5. One-hot encode the categorical columns (``drop_first=True``).
    6. Append the derived numeric columns ``BMI``, ``Pulse_Pressure``,
       ``bp_interaction``, ``cholesterol_glucose_ratio``,
       ``bone_density_bmi_interaction`` and ``blood_pressure_sum``.

    Args:
        df: DataFrame containing the columns referenced above.

    Returns:
        A new DataFrame with the transformations applied.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    df_processed = df.copy()

    if 'gender' in df_processed.columns:
        df_processed = df_processed.drop('gender', axis=1)

    # Negative readings are floored at zero.
    df_processed.loc[df_processed['bone_density'] < 0, 'bone_density'] = 0

    fill_unknown_cols = ['medical_history', 'family_medical_history', 'edu_level']
    df_processed[fill_unknown_cols] = df_processed[fill_unknown_cols].fillna('unknown')

    # Compute the per-group median separately and use it only to fill gaps.
    # Assigning the transform result directly would blank out observed values
    # on rows whose `smoke_status` is missing, because groupby drops null keys
    # and returns NaN for those rows.
    group_median = (
        df_processed
        .groupby(['smoke_status', 'edu_level'])['mean_working']
        .transform('median')
    )
    df_processed['mean_working'] = df_processed['mean_working'].fillna(group_median)
    # Fallback for rows whose group had no usable median.
    df_processed['mean_working'] = df_processed['mean_working'].fillna(df_processed['mean_working'].median())

    categorical_cols = ['activity', 'smoke_status', 'sleep_pattern',
                        'medical_history', 'family_medical_history', 'edu_level']
    df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True)

    # Height is divided by 100 before squaring (assumes centimetres - TODO confirm).
    df_processed['BMI'] = df_processed['weight'] / (df_processed['height'] / 100) ** 2
    df_processed['Pulse_Pressure'] = df_processed['systolic_blood_pressure'] - df_processed['diastolic_blood_pressure']
    df_processed['bp_interaction'] = df_processed['systolic_blood_pressure'] * df_processed['diastolic_blood_pressure']
    # +1 in the denominator avoids division by zero when glucose is 0.
    df_processed['cholesterol_glucose_ratio'] = df_processed['cholesterol'] / (df_processed['glucose'] + 1)
    df_processed['bone_density_bmi_interaction'] = df_processed['bone_density'] * df_processed['BMI']
    df_processed['blood_pressure_sum'] = df_processed['systolic_blood_pressure'] + df_processed['diastolic_blood_pressure']

    return df_processed

# 3. Apply the preprocessing function to both datasets
train_final = preprocess_and_feature_engineer(train_df.copy())
test_final = preprocess_and_feature_engineer(test_df.copy())

# 4. Align the train and test columns
# One-hot encoding is done per frame, so only columns present in both are kept.
# The list follows the train frame's column order rather than set iteration
# order: string hashing is randomised per process, so a set-derived order
# would change the feature order (and potentially the fitted trees) between runs.
train_cols = set(train_final.columns)
test_cols = set(test_final.columns)
non_feature_cols = {'ID', 'stress_score'}
common_cols = [
    col for col in train_final.columns
    if col in test_cols and col not in non_feature_cols
]

low_importance_features = [
    'sleep_pattern_sleep difficulty',
    'activity_moderate',
    'mean_working',
    'family_medical_history_unknown'
]
final_cols = [col for col in common_cols if col not in low_importance_features]

X_train_full = train_final[final_cols]
y_train_full = train_final['stress_score']

# 5. Set up the tuned LGBM and XGBoost models
best_lgbm_params = {'learning_rate': 0.1, 'n_estimators': 400, 'num_leaves': 40}
best_xgb_params = {'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 600}

lgb_model = lgb.LGBMRegressor(random_state=42, **best_lgbm_params)
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, **best_xgb_params)

# 6. Measure cross-validated RMSE for a range of blend weights
print("--- 앙상블 가중치 최적화 시작 ---")

# The fold predictions do not depend on the blend weight, so each model's
# out-of-fold predictions are computed once (10 fits) and reused for every
# weight instead of refitting both models per weight (90 fits).
kf = KFold(n_splits=5, shuffle=True, random_state=42)
lgb_oof = np.zeros(len(train_final))
xgb_oof = np.zeros(len(train_final))

for fold, (train_index, val_index) in enumerate(kf.split(X_train_full, y_train_full)):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train = y_train_full.iloc[train_index]

    lgb_model.fit(X_train, y_train)
    lgb_oof[val_index] = lgb_model.predict(X_val)

    xgb_model.fit(X_train, y_train)
    xgb_oof[val_index] = xgb_model.predict(X_val)

best_rmse = float('inf')
best_weights = None
weights_to_test = np.arange(0.1, 1.0, 0.1)

for lgb_weight in weights_to_test:
    xgb_weight = 1 - lgb_weight

    oof_predictions = (lgb_weight * lgb_oof) + (xgb_weight * xgb_oof)
    current_rmse = np.sqrt(mean_squared_error(y_train_full, oof_predictions))

    print(f"LGBM 가중치: {lgb_weight:.1f}, XGBoost 가중치: {xgb_weight:.1f}, RMSE: {current_rmse:.5f}")

    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_weights = (lgb_weight, xgb_weight)

print("---")
print(f"**최적의 앙상블 가중치**: LGBM {best_weights[0]:.1f}, XGBoost {best_weights[1]:.1f}")
print(f"**최소 RMSE**: {best_rmse:.5f}")
print("---")

# 7. Train the final models and write the submission file (using the best weights)
print("--- 최종 모델 학습 및 제출 파일 생성 시작 ---")

# Refit on the full training set before predicting the test set.
lgb_model.fit(X_train_full, y_train_full)
xgb_model.fit(X_train_full, y_train_full)

lgb_test_preds = lgb_model.predict(test_final[final_cols])
xgb_test_preds = xgb_model.predict(test_final[final_cols])

final_predictions = (best_weights[0] * lgb_test_preds) + (best_weights[1] * xgb_test_preds)

# NOTE(review): assumes sample_submission.csv rows are in the same order as
# test.csv - confirm, since predictions are assigned positionally.
submission_df['stress_score'] = final_predictions
submission_df.to_csv('submission.csv', index=False)

print("✅ submission.csv 파일 생성 완료!")
print("\n테스트 데이터 예측값 (상위 5개):\n", final_predictions[:5])

--- 앙상블 가중치 최적화 시작 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000327 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2642
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 26
[LightGBM] [Info] Start training from score 0.481912


[WinError 2] 지정된 파일을 찾을 수 없습니다
  File "c:\Users\rladud\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\rladud\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\rladud\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Use

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2642
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 26
[LightGBM] [Info] Start training from score 0.486092
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000288 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2639
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 26
[LightGBM] [Info] Start training from score 0.481937
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000308 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2640
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 26
[LightGBM] [Info] Start traini