In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint

# 1. Load the training set, the test set and the sample submission.
try:
    train_df = pd.read_csv('../data/train.csv')
    test_df = pd.read_csv('../data/test.csv')
    submission_df = pd.read_csv('../data/sample_submission.csv')
except FileNotFoundError as err:
    print("파일 경로를 다시 확인해주세요.")
    # Stop with a non-zero status instead of calling exit(): that helper is
    # injected by the `site` module for interactive sessions and, called with
    # no argument, reports success (status 0) even though loading failed.
    raise SystemExit(1) from err

# 2. Preprocessing and feature-engineering function
def preprocess_and_feature_engineer(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw frame and append derived features.

    The work is done on a copy, so the caller's frame is not modified.

    Steps, in order:

    * drop ``gender`` when the column is present;
    * replace negative ``bone_density`` values with 0;
    * fill missing ``medical_history``, ``family_medical_history`` and
      ``edu_level`` with the literal ``'unknown'``;
    * fill missing ``mean_working`` with the median of the row's
      (``smoke_status``, ``edu_level``) group, then with the overall median;
    * one-hot encode the categorical columns, dropping the first level;
    * add BMI, blood-pressure and cholesterol/glucose features.

    All medians and dummy levels are derived from ``df`` itself, so two
    frames processed separately can end up with different imputed values
    and different dummy columns.

    Args:
        df: Frame holding the raw columns referenced above.

    Returns:
        A new frame with the cleaned, encoded and derived columns.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    df_processed = df.copy()

    if 'gender' in df_processed.columns:
        df_processed = df_processed.drop('gender', axis=1)

    df_processed.loc[df_processed['bone_density'] < 0, 'bone_density'] = 0

    unknown_fill_cols = ['medical_history', 'family_medical_history', 'edu_level']
    df_processed[unknown_fill_cols] = df_processed[unknown_fill_cols].fillna('unknown')

    # Compute the per-group median separately and use it only to fill gaps.
    # Assigning the transform result directly would overwrite valid values
    # with NaN on rows whose grouping key is missing, because groupby drops
    # those rows from every group.
    group_median = (
        df_processed
        .groupby(['smoke_status', 'edu_level'])['mean_working']
        .transform('median')
    )
    df_processed['mean_working'] = df_processed['mean_working'].fillna(group_median)
    # Fallback for rows whose group had no usable median.
    df_processed['mean_working'] = df_processed['mean_working'].fillna(
        df_processed['mean_working'].median()
    )

    categorical_cols = ['activity', 'smoke_status', 'sleep_pattern',
                        'medical_history', 'family_medical_history', 'edu_level']
    df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True)

    # height / 100 presumably converts centimetres to metres -- TODO confirm units.
    df_processed['BMI'] = df_processed['weight'] / (df_processed['height'] / 100) ** 2
    df_processed['Pulse_Pressure'] = df_processed['systolic_blood_pressure'] - df_processed['diastolic_blood_pressure']
    df_processed['bp_interaction'] = df_processed['systolic_blood_pressure'] * df_processed['diastolic_blood_pressure']
    # The +1 keeps the denominator non-zero when glucose is 0.
    df_processed['cholesterol_glucose_ratio'] = df_processed['cholesterol'] / (df_processed['glucose'] + 1)
    df_processed['bone_density_bmi_interaction'] = df_processed['bone_density'] * df_processed['BMI']
    df_processed['blood_pressure_sum'] = df_processed['systolic_blood_pressure'] + df_processed['diastolic_blood_pressure']

    return df_processed

# 3. Apply the preprocessing function to the training data
train_final = preprocess_and_feature_engineer(train_df.copy())

# 4. Select the feature columns (only the training frame is used here)
# Kept as a module-level name in case other code relies on it.
train_cols = set(train_final.columns)
# Walk the DataFrame's own column order rather than the set: iteration order
# of a set of strings changes between interpreter runs (hash randomization),
# which would shuffle the feature order and make results irreproducible even
# with a fixed random_state.
common_cols = [col for col in train_final.columns if col not in ['ID', 'stress_score']]

# Features excluded from the model. The name says "low importance"; the
# ranking that produced this list is not shown here.
low_importance_features = [
    'sleep_pattern_sleep difficulty',
    'activity_moderate',
    'mean_working',
    'family_medical_history_unknown'
]
final_cols = [col for col in common_cols if col not in low_importance_features]

X_train_full = train_final[final_cols]
y_train_full = train_final['stress_score']

# 5. Hyperparameter tuning for the LightGBM model
print("--- LGBM 하이퍼파라미터 튜닝 시작 ---")
lgbm_model_base = lgb.LGBMRegressor(random_state=42)
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so the learning
# rate ranges over [0.01, 0.11]; randint(low, high) excludes `high`.
lgbm_param_distributions = dict(
    learning_rate=uniform(0.01, 0.1),
    n_estimators=randint(200, 1000),
    num_leaves=randint(20, 60),
    max_depth=randint(5, 15),
)
# 50 sampled candidates, each scored with shuffled 5-fold CV on negated RMSE.
lgbm_random_search = RandomizedSearchCV(
    lgbm_model_base,
    lgbm_param_distributions,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
lgbm_random_search.fit(X_train_full, y_train_full)
best_lgbm_params = lgbm_random_search.best_params_
# The scorer is negated RMSE; flip the sign to report a plain RMSE.
best_lgbm_rmse = -lgbm_random_search.best_score_
for report_line in (
    "---",
    f"**최적의 LGBM 파라미터**: {best_lgbm_params}",
    f"**최적 파라미터에서의 RMSE**: {best_lgbm_rmse:.5f}",
    "---",
):
    print(report_line)

# 6. Hyperparameter tuning for the XGBoost model
print("--- XGBoost 하이퍼파라미터 튜닝 시작 ---")
# NOTE(review): the estimator and the search below both request n_jobs=-1;
# nested parallelism like this can cause thread contention -- consider
# pinning one of the two. Left unchanged here.
xgb_model_base = xgb.XGBRegressor(random_state=42, n_jobs=-1)
# scipy's uniform(loc, scale) samples from [loc, loc + scale]:
# learning_rate in [0.01, 0.11], subsample / colsample_bytree in [0.6, 1.0].
xgb_param_distributions = dict(
    learning_rate=uniform(0.01, 0.1),
    n_estimators=randint(200, 1000),
    max_depth=randint(5, 15),
    subsample=uniform(0.6, 0.4),
    colsample_bytree=uniform(0.6, 0.4),
)
# 50 sampled candidates, each scored with shuffled 5-fold CV on negated RMSE.
xgb_random_search = RandomizedSearchCV(
    xgb_model_base,
    xgb_param_distributions,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
xgb_random_search.fit(X_train_full, y_train_full)
best_xgb_params = xgb_random_search.best_params_
# The scorer is negated RMSE; flip the sign to report a plain RMSE.
best_xgb_rmse = -xgb_random_search.best_score_
for report_line in (
    "---",
    f"**최적의 XGBoost 파라미터**: {best_xgb_params}",
    f"**최적 파라미터에서의 RMSE**: {best_xgb_rmse:.5f}",
    "---",
):
    print(report_line)

--- LGBM 하이퍼파라미터 튜닝 시작 ---
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[WinError 2] 지정된 파일을 찾을 수 없습니다
  File "c:\Users\rladud\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\rladud\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\rladud\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Use

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000430 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2653
[LightGBM] [Info] Number of data points in the train set: 3000, number of used features: 26
[LightGBM] [Info] Start training from score 0.482130
---
**최적의 LGBM 파라미터**: {'learning_rate': np.float64(0.02988424040888052), 'max_depth': 12, 'n_estimators': 874, 'num_leaves': 54}
**최적 파라미터에서의 RMSE**: 0.24308
---
--- XGBoost 하이퍼파라미터 튜닝 시작 ---
Fitting 5 folds for each of 50 candidates, totalling 250 fits
---
**최적의 XGBoost 파라미터**: {'colsample_bytree': np.float64(0.6943939678995823), 'learning_rate': np.float64(0.0356068322761324), 'max_depth': 13, 'n_estimators': 406, 'subsample': np.float64(0.7710164073434198)}
**최적 파라미터에서의 RMSE**: 0.23128
---


In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb

# 1. Load the training set, the test set and the sample submission.
try:
    train_df = pd.read_csv('../data/train.csv')
    test_df = pd.read_csv('../data/test.csv')
    submission_df = pd.read_csv('../data/sample_submission.csv')
except FileNotFoundError as err:
    print("파일 경로를 다시 확인해주세요.")
    # Stop with a non-zero status instead of calling exit(): that helper is
    # injected by the `site` module for interactive sessions and, called with
    # no argument, reports success (status 0) even though loading failed.
    raise SystemExit(1) from err

# 2. 전처리 및 파생 변수 생성 함수
def preprocess_and_feature_engineer(df):
    df_processed = df.copy()
    
    if 'gender' in df_processed.columns:
        df_processed = df_processed.drop('gender', axis=1)

    df_processed.loc[df_processed['bone_density'] < 0, 'bone_density'] = 0
    df_processed[['medical_history', 'family_medical_history', 'edu_level']] = df_processed[['medical_history', 'family_medical_history', 'edu_level']].fillna('unknown')
    df_processed['mean_working'] = df_processed.groupby(['smoke_status', 'edu_level'])['mean_working'].transform(lambda x: x.fillna(x.median()))
    df_processed['mean_working'] = df_processed['mean_working'].fillna(df_processed['mean_working'].median())
    
    categorical_cols = ['activity', 'smoke_status', 'sleep_pattern',
                        'medical_history', 'family_medical_history', 'edu_level']
    df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True)
    
    df_processed['BMI'] = df_processed['weight'] / (df_processed['height'] / 100) ** 2
    df_processed['Pulse_Pressure'] = df_processed['systolic_blood_pressure'] - df_processed['diastolic_blood_pressure']
    df_processed['bp_interaction'] = df_processed['systolic_blood_pressure'] * df_processed['diastolic_blood_pressure']
    df_processed['cholesterol_glucose_ratio'] = df_processed['cholesterol'] / (df_processed['glucose'] + 1)
    df_processed['bone_density_bmi_interaction'] = df_processed['bone_density'] * df_processed['BMI']
    df_processed['blood_pressure_sum'] = df_processed['systolic_blood_pressure'] + df_processed['diastolic_blood_pressure']

    return df_processed

# 3. Apply the preprocessing function to both data sets
train_final = preprocess_and_feature_engineer(train_df.copy())
test_final = preprocess_and_feature_engineer(test_df.copy())

# 4. Align the feature columns of the training and test data
train_cols = set(train_final.columns)
test_cols = set(test_final.columns)
# Keep the columns present in both frames, walking the training frame's own
# column order. Building the list from a set intersection would give an
# order that changes between interpreter runs (hash randomization), which
# would shuffle the feature order and make results irreproducible even
# with a fixed random_state.
common_cols = [
    col for col in train_final.columns
    if col in test_cols and col not in ['ID', 'stress_score']
]

# Features excluded from the model. The name says "low importance"; the
# ranking that produced this list is not shown here.
low_importance_features = [
    'sleep_pattern_sleep difficulty',
    'activity_moderate',
    'mean_working',
    'family_medical_history_unknown'
]
final_cols = [col for col in common_cols if col not in low_importance_features]

X_train_full = train_final[final_cols]
y_train_full = train_final['stress_score']
X_test_full = test_final[final_cols]

# 5. Build the XGBoost model with the tuned hyperparameters
# NOTE(review): the float values are copies of the random-search output
# truncated to five decimals, so this model is not exactly the configuration
# that was cross-validated -- confirm that is acceptable.
best_xgb_params = dict(
    colsample_bytree=0.69439,
    learning_rate=0.03560,
    max_depth=13,
    n_estimators=406,
    subsample=0.77101,
)
xgb_model = xgb.XGBRegressor(**best_xgb_params, random_state=42, n_jobs=-1)

# 6. Fit on the full training set and write the submission file
print("--- 최종 XGBoost 모델 학습 및 제출 파일 생성 시작 ---")

xgb_model.fit(X_train_full, y_train_full)
xgb_test_preds = xgb_model.predict(X_test_full)

# Predictions are assigned positionally; this assumes the sample submission
# rows are in the same order as the test set -- TODO confirm.
submission_df['stress_score'] = xgb_test_preds
submission_df.to_csv('submission_xgb_solo.csv', index=False)

print("✅ submission_xgb_solo.csv 파일 생성 완료!")
print("\n테스트 데이터 예측값 (상위 5개):\n", xgb_test_preds[:5])

--- 최종 XGBoost 모델 학습 및 제출 파일 생성 시작 ---
✅ submission_xgb_solo.csv 파일 생성 완료!

테스트 데이터 예측값 (상위 5개):
 [0.54613173 0.8554774  0.19040878 0.45324594 0.52282214]
