In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# 1. 데이터 불러오기
try:
    train_df = pd.read_csv('../data/train.csv')
    test_df = pd.read_csv('../data/test.csv')
    submission_df = pd.read_csv('../data/sample_submission.csv')
except FileNotFoundError:
    print("파일 경로를 다시 확인해주세요.")
    exit()

# 2. 전처리 및 파생 변수 생성 함수 (최고점 모델 기준)
def preprocess_and_feature_engineer(df):
    df_processed = df.copy()
    
    if 'gender' in df_processed.columns:
        df_processed = df_processed.drop('gender', axis=1)

    df_processed.loc[df_processed['bone_density'] < 0, 'bone_density'] = 0
    df_processed[['medical_history', 'family_medical_history', 'edu_level']] = df_processed[['medical_history', 'family_medical_history', 'edu_level']].fillna('unknown')
    df_processed['mean_working'] = df_processed.groupby(['smoke_status', 'edu_level'])['mean_working'].transform(lambda x: x.fillna(x.median()))
    df_processed['mean_working'] = df_processed['mean_working'].fillna(df_processed['mean_working'].median())
    
    categorical_cols = ['activity', 'smoke_status', 'sleep_pattern',
                        'medical_history', 'family_medical_history', 'edu_level']
    df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True)
    
    df_processed['BMI'] = df_processed['weight'] / (df_processed['height'] / 100) ** 2
    df_processed['Pulse_Pressure'] = df_processed['systolic_blood_pressure'] - df_processed['diastolic_blood_pressure']
    df_processed['bp_interaction'] = df_processed['systolic_blood_pressure'] * df_processed['diastolic_blood_pressure']
    df_processed['cholesterol_glucose_ratio'] = df_processed['cholesterol'] / (df_processed['glucose'] + 1)
    df_processed['bone_density_bmi_interaction'] = df_processed['bone_density'] * df_processed['BMI']
    df_processed['blood_pressure_sum'] = df_processed['systolic_blood_pressure'] + df_processed['diastolic_blood_pressure']

    return df_processed

# 3. 데이터에 함수 적용
train_final = preprocess_and_feature_engineer(train_df.copy())
test_final = preprocess_and_feature_engineer(test_df.copy())

# 4. 학습 데이터와 테스트 데이터의 컬럼 일치시키기
train_cols = set(train_final.columns)
test_cols = set(test_final.columns)
common_cols = list(train_cols.intersection(test_cols))
common_cols = [col for col in common_cols if col not in ['ID', 'stress_score']]

low_importance_features = [
    'sleep_pattern_sleep difficulty',
    'activity_moderate',
    'mean_working',
    'family_medical_history_unknown'
]
final_cols = [col for col in common_cols if col not in low_importance_features]

X_train_full = train_final[final_cols]
y_train_full = train_final['stress_score']

# 5. 최적화된 LGBM, XGBoost, 그리고 CatBoost 모델 준비
best_lgbm_params = {'learning_rate': 0.1, 'n_estimators': 400, 'num_leaves': 40}
best_xgb_params = {'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 600}

lgb_model = lgb.LGBMRegressor(random_state=42, **best_lgbm_params)
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, **best_xgb_params)
cat_model = CatBoostRegressor(random_state=42, verbose=0) # 기본 파라미터로 시작

# 6. K-Fold 교차 검증을 통한 RMSE 측정 (세 모델 앙상블)
print("--- 3개 모델 앙상블 로컬 점수 측정 시작 ---")

kf = KFold(n_splits=5, shuffle=True, random_state=42)
lgb_oof = np.zeros(len(train_final))
xgb_oof = np.zeros(len(train_final))
cat_oof = np.zeros(len(train_final))

for fold, (train_index, val_index) in enumerate(kf.split(X_train_full, y_train_full)):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    lgb_model.fit(X_train, y_train)
    lgb_oof[val_index] = lgb_model.predict(X_val)
    
    xgb_model.fit(X_train, y_train)
    xgb_oof[val_index] = xgb_model.predict(X_val)
    
    cat_model.fit(X_train, y_train)
    cat_oof[val_index] = cat_model.predict(X_val)

# 가중치 앙상블 (초기 가중치 0.33, 0.33, 0.34)
oof_predictions = (lgb_oof * 0.33) + (xgb_oof * 0.33) + (cat_oof * 0.34)
oof_rmse = np.sqrt(mean_squared_error(y_train_full, oof_predictions))

print(f"**교차 검증 RMSE 예상 점수**: {oof_rmse:.5f}")
print("---")

# 7. 최종 모델 학습 및 제출 파일 생성 (임시 가중치 적용)
print("--- 최종 모델 학습 및 제출 파일 생성 시작 ---")

lgb_model.fit(X_train_full, y_train_full)
xgb_model.fit(X_train_full, y_train_full)
cat_model.fit(X_train_full, y_train_full)

lgb_test_preds = lgb_model.predict(test_final[final_cols])
xgb_test_preds = xgb_model.predict(test_final[final_cols])
cat_test_preds = cat_model.predict(test_final[final_cols])

final_predictions = (lgb_test_preds * 0.33) + (xgb_test_preds * 0.33) + (cat_test_preds * 0.34)

submission_df['stress_score'] = final_predictions
submission_df.to_csv('submission.csv', index=False)

print("✅ submission.csv 파일 생성 완료!")

--- 3개 모델 앙상블 로컬 점수 측정 시작 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000326 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2642
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 26
[LightGBM] [Info] Start training from score 0.481912


[WinError 2] 지정된 파일을 찾을 수 없습니다
  File "c:\Users\rladud\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\rladud\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\rladud\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Use

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000229 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2642
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 26
[LightGBM] [Info] Start training from score 0.486092
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000211 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2639
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 26
[LightGBM] [Info] Start training from score 0.481937
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2640
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 26
[LightGBM] [Info] Start traini

In [12]:

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# 1. 데이터 불러오기
try:
    train_df = pd.read_csv('../data/train.csv')
    test_df = pd.read_csv('../data/test.csv')
    submission_df = pd.read_csv('../data/sample_submission.csv')
except FileNotFoundError:
    print("파일 경로를 다시 확인해주세요.")
    exit()

# 2. 전처리 및 파생 변수 생성 함수
def preprocess_and_feature_engineer(df):
    df_processed = df.copy()
    
    if 'gender' in df_processed.columns:
        df_processed = df_processed.drop('gender', axis=1)

    df_processed.loc[df_processed['bone_density'] < 0, 'bone_density'] = 0
    df_processed[['medical_history', 'family_medical_history', 'edu_level']] = df_processed[['medical_history', 'family_medical_history', 'edu_level']].fillna('unknown')
    df_processed['mean_working'] = df_processed.groupby(['smoke_status', 'edu_level'])['mean_working'].transform(lambda x: x.fillna(x.median()))
    df_processed['mean_working'] = df_processed['mean_working'].fillna(df_processed['mean_working'].median())
    
    categorical_cols = ['activity', 'smoke_status', 'sleep_pattern',
                        'medical_history', 'family_medical_history', 'edu_level']
    df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True)
    
    df_processed['BMI'] = df_processed['weight'] / (df_processed['height'] / 100) ** 2
    df_processed['Pulse_Pressure'] = df_processed['systolic_blood_pressure'] - df_processed['diastolic_blood_pressure']
    df_processed['bp_interaction'] = df_processed['systolic_blood_pressure'] * df_processed['diastolic_blood_pressure']
    df_processed['cholesterol_glucose_ratio'] = df_processed['cholesterol'] / (df_processed['glucose'] + 1)
    df_processed['bone_density_bmi_interaction'] = df_processed['bone_density'] * df_processed['BMI']
    df_processed['blood_pressure_sum'] = df_processed['systolic_blood_pressure'] + df_processed['diastolic_blood_pressure']

    return df_processed

# 3. 데이터에 함수 적용
train_final = preprocess_and_feature_engineer(train_df.copy())
test_final = preprocess_and_feature_engineer(test_df.copy())

# 4. 학습 데이터와 테스트 데이터의 컬럼 일치시키기
train_cols = set(train_final.columns)
test_cols = set(test_final.columns)
common_cols = list(train_cols.intersection(test_cols))
common_cols = [col for col in common_cols if col not in ['ID', 'stress_score']]

low_importance_features = [
    'sleep_pattern_sleep difficulty',
    'activity_moderate',
    'mean_working',
    'family_medical_history_unknown'
]
final_cols = [col for col in common_cols if col not in low_importance_features]

X_train_full = train_final[final_cols]
y_train_full = train_final['stress_score']

# 5. 최적화된 LGBM, XGBoost, CatBoost 모델 준비
best_lgbm_params = {'learning_rate': 0.1, 'n_estimators': 400, 'num_leaves': 40}
best_xgb_params = {'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 600}
cat_model = CatBoostRegressor(random_state=42, verbose=0)

lgb_model = lgb.LGBMRegressor(random_state=42, **best_lgbm_params)
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, **best_xgb_params)

# 6. 다양한 가중치 조합으로 교차 검증 RMSE 측정 (CatBoost 포함)
print("--- 3개 모델 앙상블 가중치 최적화 시작 ---")

best_rmse = float('inf')
best_weights = None
weights_to_test = [(i / 10, j / 10, k / 10) for i in range(1, 9) for j in range(1, 9) for k in range(1, 9) if i + j + k == 10]

for lgb_weight, xgb_weight, cat_weight in weights_to_test:
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof_predictions = np.zeros(len(train_final))
    
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_full, y_train_full)):
        X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
        y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

        lgb_model.fit(X_train, y_train)
        lgb_preds = lgb_model.predict(X_val)
        
        xgb_model.fit(X_train, y_train)
        xgb_preds = xgb_model.predict(X_val)
        
        cat_model.fit(X_train, y_train)
        cat_preds = cat_model.predict(X_val)

        oof_predictions[val_index] = (lgb_weight * lgb_preds) + (xgb_weight * xgb_preds) + (cat_weight * cat_preds)

    current_rmse = np.sqrt(mean_squared_error(y_train_full, oof_predictions))
    
    print(f"LGBM:{lgb_weight:.1f}, XGBoost:{xgb_weight:.1f}, CatBoost:{cat_weight:.1f}, RMSE:{current_rmse:.5f}")
    
    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_weights = (lgb_weight, xgb_weight, cat_weight)

print("---")
print(f"**최적의 앙상블 가중치**: LGBM {best_weights[0]:.1f}, XGBoost {best_weights[1]:.1f}, CatBoost {best_weights[2]:.1f}")
print(f"**최소 RMSE**: {best_rmse:.5f}")
print("---")

# 7. 최종 모델 학습 및 제출 파일 생성 (최적 가중치 적용)
print("--- 최종 모델 학습 및 제출 파일 생성 시작 ---")

lgb_model.fit(X_train_full, y_train_full)
xgb_model.fit(X_train_full, y_train_full)
cat_model.fit(X_train_full, y_train_full)

lgb_test_preds = lgb_model.predict(test_final[final_cols])
xgb_test_preds = xgb_model.predict(test_final[final_cols])
cat_test_preds = cat_model.predict(test_final[final_cols])

final_predictions = (best_weights[0] * lgb_test_preds) + (best_weights[1] * xgb_test_preds) + (best_weights[2] * cat_test_preds)

submission_df['stress_score'] = final_predictions
submission_df.to_csv('submission.csv', index=False)

print("✅ submission.csv 파일 생성 완료!")
print("\n테스트 데이터 예측값 (상위 5개):\n", final_predictions[:5])

--- 3개 모델 앙상블 가중치 최적화 시작 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000321 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2642
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 26
[LightGBM] [Info] Start training from score 0.481912
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000294 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2642
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 26
[LightGBM] [Info] Start training from score 0.486092
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000195 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2639
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 26
[

In [13]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint

# 1. 데이터 불러오기
try:
    train_df = pd.read_csv('../data/train.csv')
    test_df = pd.read_csv('../data/test.csv')
    submission_df = pd.read_csv('../data/sample_submission.csv')
except FileNotFoundError:
    print("파일 경로를 다시 확인해주세요.")
    exit()

# 2. 전처리 및 파생 변수 생성 함수
def preprocess_and_feature_engineer(df):
    df_processed = df.copy()
    
    if 'gender' in df_processed.columns:
        df_processed = df_processed.drop('gender', axis=1)

    df_processed.loc[df_processed['bone_density'] < 0, 'bone_density'] = 0
    df_processed[['medical_history', 'family_medical_history', 'edu_level']] = df_processed[['medical_history', 'family_medical_history', 'edu_level']].fillna('unknown')
    df_processed['mean_working'] = df_processed.groupby(['smoke_status', 'edu_level'])['mean_working'].transform(lambda x: x.fillna(x.median()))
    df_processed['mean_working'] = df_processed['mean_working'].fillna(df_processed['mean_working'].median())
    
    categorical_cols = ['activity', 'smoke_status', 'sleep_pattern',
                        'medical_history', 'family_medical_history', 'edu_level']
    df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True)
    
    df_processed['BMI'] = df_processed['weight'] / (df_processed['height'] / 100) ** 2
    df_processed['Pulse_Pressure'] = df_processed['systolic_blood_pressure'] - df_processed['diastolic_blood_pressure']
    df_processed['bp_interaction'] = df_processed['systolic_blood_pressure'] * df_processed['diastolic_blood_pressure']
    df_processed['cholesterol_glucose_ratio'] = df_processed['cholesterol'] / (df_processed['glucose'] + 1)
    df_processed['bone_density_bmi_interaction'] = df_processed['bone_density'] * df_processed['BMI']
    df_processed['blood_pressure_sum'] = df_processed['systolic_blood_pressure'] + df_processed['diastolic_blood_pressure']

    return df_processed

# 3. 데이터에 함수 적용
train_final = preprocess_and_feature_engineer(train_df.copy())

# 4. 학습 데이터와 테스트 데이터의 컬럼 일치시키기
train_cols = set(train_final.columns)
common_cols = list(train_cols)
common_cols = [col for col in common_cols if col not in ['ID', 'stress_score']]

low_importance_features = [
    'sleep_pattern_sleep difficulty',
    'activity_moderate',
    'mean_working',
    'family_medical_history_unknown'
]
final_cols = [col for col in common_cols if col not in low_importance_features]

X_train_full = train_final[final_cols]
y_train_full = train_final['stress_score']

# 5. CatBoost 모델의 하이퍼파라미터 튜닝
print("--- CatBoost 하이퍼파라미터 튜닝 시작 ---")

cat_model_base = CatBoostRegressor(random_state=42, verbose=0)

param_distributions = {
    'learning_rate': uniform(0.01, 0.1),
    'depth': randint(4, 10),
    'l2_leaf_reg': uniform(1, 10),
    'iterations': randint(200, 1000)
}

random_search = RandomizedSearchCV(
    estimator=cat_model_base,
    param_distributions=param_distributions,
    n_iter=50,  # 테스트할 파라미터 조합의 수. 시간이 많이 걸리면 줄여.
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

random_search.fit(X_train_full, y_train_full)

best_params = random_search.best_params_
best_rmse = -random_search.best_score_

print("---")
print(f"**최적의 CatBoost 파라미터**: {best_params}")
print(f"**최적 파라미터에서의 RMSE**: {best_rmse:.5f}")
print("---")

--- CatBoost 하이퍼파라미터 튜닝 시작 ---
Fitting 5 folds for each of 50 candidates, totalling 250 fits
---
**최적의 CatBoost 파라미터**: {'depth': 9, 'iterations': 926, 'l2_leaf_reg': np.float64(6.208342600258237), 'learning_rate': np.float64(0.10611720243493492)}
**최적 파라미터에서의 RMSE**: 0.23849
---


In [14]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# 1. 데이터 불러오기
try:
    train_df = pd.read_csv('../data/train.csv')
    test_df = pd.read_csv('../data/test.csv')
    submission_df = pd.read_csv('../data/sample_submission.csv')
except FileNotFoundError:
    print("파일 경로를 다시 확인해주세요.")
    exit()

# 2. 전처리 및 파생 변수 생성 함수
def preprocess_and_feature_engineer(df):
    df_processed = df.copy()
    
    if 'gender' in df_processed.columns:
        df_processed = df_processed.drop('gender', axis=1)

    df_processed.loc[df_processed['bone_density'] < 0, 'bone_density'] = 0
    df_processed[['medical_history', 'family_medical_history', 'edu_level']] = df_processed[['medical_history', 'family_medical_history', 'edu_level']].fillna('unknown')
    df_processed['mean_working'] = df_processed.groupby(['smoke_status', 'edu_level'])['mean_working'].transform(lambda x: x.fillna(x.median()))
    df_processed['mean_working'] = df_processed['mean_working'].fillna(df_processed['mean_working'].median())
    
    categorical_cols = ['activity', 'smoke_status', 'sleep_pattern',
                        'medical_history', 'family_medical_history', 'edu_level']
    df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True)
    
    df_processed['BMI'] = df_processed['weight'] / (df_processed['height'] / 100) ** 2
    df_processed['Pulse_Pressure'] = df_processed['systolic_blood_pressure'] - df_processed['diastolic_blood_pressure']
    df_processed['bp_interaction'] = df_processed['systolic_blood_pressure'] * df_processed['diastolic_blood_pressure']
    df_processed['cholesterol_glucose_ratio'] = df_processed['cholesterol'] / (df_processed['glucose'] + 1)
    df_processed['bone_density_bmi_interaction'] = df_processed['bone_density'] * df_processed['BMI']
    df_processed['blood_pressure_sum'] = df_processed['systolic_blood_pressure'] + df_processed['diastolic_blood_pressure']

    return df_processed

# 3. 데이터에 함수 적용
train_final = preprocess_and_feature_engineer(train_df.copy())
test_final = preprocess_and_feature_engineer(test_df.copy())

# 4. 학습 데이터와 테스트 데이터의 컬럼 일치시키기
train_cols = set(train_final.columns)
test_cols = set(test_final.columns)
common_cols = list(train_cols.intersection(test_cols))
common_cols = [col for col in common_cols if col not in ['ID', 'stress_score']]

low_importance_features = [
    'sleep_pattern_sleep difficulty',
    'activity_moderate',
    'mean_working',
    'family_medical_history_unknown'
]
final_cols = [col for col in common_cols if col not in low_importance_features]

X_train_full = train_final[final_cols]
y_train_full = train_final['stress_score']

# 5. 최적화된 LGBM, XGBoost, 그리고 CatBoost 모델 준비
best_lgbm_params = {'learning_rate': 0.1, 'n_estimators': 400, 'num_leaves': 40}
best_xgb_params = {'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 600}
best_cat_params = {'depth': 9, 'iterations': 926, 'l2_leaf_reg': 6.20834, 'learning_rate': 0.10611}

lgb_model = lgb.LGBMRegressor(random_state=42, **best_lgbm_params)
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, **best_xgb_params)
cat_model = CatBoostRegressor(random_state=42, verbose=0, **best_cat_params)

# 6. 다양한 가중치 조합으로 교차 검증 RMSE 측정 (최적화된 CatBoost 포함)
print("--- 3개 모델 앙상블 가중치 최적화 시작 (CatBoost 튜닝 적용) ---")

best_rmse = float('inf')
best_weights = None
weights_to_test = [(i / 10, j / 10, k / 10) for i in range(1, 9) for j in range(1, 9) for k in range(1, 9) if i + j + k == 10]

for lgb_weight, xgb_weight, cat_weight in weights_to_test:
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof_predictions = np.zeros(len(train_final))
    
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_full, y_train_full)):
        X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
        y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

        lgb_model.fit(X_train, y_train)
        lgb_preds = lgb_model.predict(X_val)
        
        xgb_model.fit(X_train, y_train)
        xgb_preds = xgb_model.predict(X_val)
        
        cat_model.fit(X_train, y_train)
        cat_preds = cat_model.predict(X_val)

        oof_predictions[val_index] = (lgb_weight * lgb_preds) + (xgb_weight * xgb_preds) + (cat_weight * cat_preds)

    current_rmse = np.sqrt(mean_squared_error(y_train_full, oof_predictions))
    
    print(f"LGBM:{lgb_weight:.1f}, XGBoost:{xgb_weight:.1f}, CatBoost:{cat_weight:.1f}, RMSE:{current_rmse:.5f}")
    
    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_weights = (lgb_weight, xgb_weight, cat_weight)

print("---")
print(f"**최적의 앙상블 가중치**: LGBM {best_weights[0]:.1f}, XGBoost {best_weights[1]:.1f}, CatBoost {best_weights[2]:.1f}")
print(f"**최소 RMSE**: {best_rmse:.5f}")
print("---")

--- 3개 모델 앙상블 가중치 최적화 시작 (CatBoost 튜닝 적용) ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000399 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2642
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 26
[LightGBM] [Info] Start training from score 0.481912
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000200 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2642
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 26
[LightGBM] [Info] Start training from score 0.486092
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2639
[LightGBM] [Info] Number of data points in the train set: 2400, number of us

In [15]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# 1. 데이터 불러오기
try:
    train_df = pd.read_csv('../data/train.csv')
    test_df = pd.read_csv('../data/test.csv')
    submission_df = pd.read_csv('../data/sample_submission.csv')
except FileNotFoundError:
    print("파일 경로를 다시 확인해주세요.")
    exit()

# 2. 전처리 및 파생 변수 생성 함수
def preprocess_and_feature_engineer(df):
    df_processed = df.copy()
    
    if 'gender' in df_processed.columns:
        df_processed = df_processed.drop('gender', axis=1)

    df_processed.loc[df_processed['bone_density'] < 0, 'bone_density'] = 0
    df_processed[['medical_history', 'family_medical_history', 'edu_level']] = df_processed[['medical_history', 'family_medical_history', 'edu_level']].fillna('unknown')
    df_processed['mean_working'] = df_processed.groupby(['smoke_status', 'edu_level'])['mean_working'].transform(lambda x: x.fillna(x.median()))
    df_processed['mean_working'] = df_processed['mean_working'].fillna(df_processed['mean_working'].median())
    
    categorical_cols = ['activity', 'smoke_status', 'sleep_pattern',
                        'medical_history', 'family_medical_history', 'edu_level']
    df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True)
    
    df_processed['BMI'] = df_processed['weight'] / (df_processed['height'] / 100) ** 2
    df_processed['Pulse_Pressure'] = df_processed['systolic_blood_pressure'] - df_processed['diastolic_blood_pressure']
    df_processed['bp_interaction'] = df_processed['systolic_blood_pressure'] * df_processed['diastolic_blood_pressure']
    df_processed['cholesterol_glucose_ratio'] = df_processed['cholesterol'] / (df_processed['glucose'] + 1)
    df_processed['bone_density_bmi_interaction'] = df_processed['bone_density'] * df_processed['BMI']
    df_processed['blood_pressure_sum'] = df_processed['systolic_blood_pressure'] + df_processed['diastolic_blood_pressure']

    return df_processed

# 3. 데이터에 함수 적용
train_final = preprocess_and_feature_engineer(train_df.copy())
test_final = preprocess_and_feature_engineer(test_df.copy())

# 4. 학습 데이터와 테스트 데이터의 컬럼 일치시키기
train_cols = set(train_final.columns)
test_cols = set(test_final.columns)
common_cols = list(train_cols.intersection(test_cols))
common_cols = [col for col in common_cols if col not in ['ID', 'stress_score']]

low_importance_features = [
    'sleep_pattern_sleep difficulty',
    'activity_moderate',
    'mean_working',
    'family_medical_history_unknown'
]
final_cols = [col for col in common_cols if col not in low_importance_features]

X_train_full = train_final[final_cols]
y_train_full = train_final['stress_score']
X_test_full = test_final[final_cols]

# 5. 최적화된 LGBM, XGBoost, 그리고 CatBoost 모델 준비
best_lgbm_params = {'learning_rate': 0.1, 'n_estimators': 400, 'num_leaves': 40}
best_xgb_params = {'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 600}
best_cat_params = {'depth': 9, 'iterations': 926, 'l2_leaf_reg': 6.20834, 'learning_rate': 0.10611}

lgb_model = lgb.LGBMRegressor(random_state=42, **best_lgbm_params)
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, **best_xgb_params)
cat_model = CatBoostRegressor(random_state=42, verbose=0, **best_cat_params)

# 6. 최종 모델 학습 및 제출 파일 생성 (최적 가중치 적용)
print("--- 최종 모델 학습 및 제출 파일 생성 시작 ---")

lgb_model.fit(X_train_full, y_train_full)
xgb_model.fit(X_train_full, y_train_full)
cat_model.fit(X_train_full, y_train_full)

lgb_test_preds = lgb_model.predict(X_test_full)
xgb_test_preds = xgb_model.predict(X_test_full)
cat_test_preds = cat_model.predict(X_test_full)

best_weights = (0.1, 0.4, 0.5)
final_predictions = (best_weights[0] * lgb_test_preds) + (best_weights[1] * xgb_test_preds) + (best_weights[2] * cat_test_preds)

submission_df['stress_score'] = final_predictions
submission_df.to_csv('submission.csv', index=False)

print("✅ submission.csv 파일 생성 완료!")
print("\n테스트 데이터 예측값 (상위 5개):\n", final_predictions[:5])

--- 최종 모델 학습 및 제출 파일 생성 시작 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000345 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2653
[LightGBM] [Info] Number of data points in the train set: 3000, number of used features: 26
[LightGBM] [Info] Start training from score 0.482130
✅ submission.csv 파일 생성 완료!

테스트 데이터 예측값 (상위 5개):
 [0.59615304 0.87606538 0.21383194 0.38757051 0.55990377]
