In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head(5)

Unnamed: 0,ID,gender,age,height,weight,cholesterol,systolic_blood_pressure,diastolic_blood_pressure,glucose,bone_density,activity,smoke_status,medical_history,family_medical_history,sleep_pattern,edu_level,mean_working,stress_score
0,TRAIN_0000,F,72,161.49,58.47,279.84,165,100,143.35,0.87,moderate,ex-smoker,high blood pressure,diabetes,sleep difficulty,bachelors degree,,0.63
1,TRAIN_0001,M,88,179.87,77.6,257.37,178,111,146.94,0.07,moderate,ex-smoker,,diabetes,normal,graduate degree,,0.83
2,TRAIN_0002,M,47,182.47,89.93,226.66,134,95,142.61,1.18,light,ex-smoker,,,normal,high school diploma,9.0,0.7
3,TRAIN_0003,M,69,185.78,68.63,206.74,158,92,137.26,0.48,intense,ex-smoker,high blood pressure,,oversleeping,graduate degree,,0.17
4,TRAIN_0004,F,81,164.63,71.53,255.92,171,116,129.37,0.34,moderate,ex-smoker,diabetes,diabetes,sleep difficulty,bachelors degree,,0.36


In [4]:
train.isnull().sum()

ID                             0
gender                         0
age                            0
height                         0
weight                         0
cholesterol                    0
systolic_blood_pressure        0
diastolic_blood_pressure       0
glucose                        0
bone_density                   0
activity                       0
smoke_status                   0
medical_history             1289
family_medical_history      1486
sleep_pattern                  0
edu_level                    607
mean_working                1032
stress_score                   0
dtype: int64

In [5]:
# 결측값 있는 칼럼(column) 확인
missing_columns_train = train.columns[train.isnull().sum() > 0]
missing_columns_train

Index(['medical_history', 'family_medical_history', 'edu_level',
       'mean_working'],
      dtype='object')

In [6]:
train[missing_columns_train].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   medical_history         1711 non-null   object 
 1   family_medical_history  1514 non-null   object 
 2   edu_level               2393 non-null   object 
 3   mean_working            1968 non-null   float64
dtypes: float64(1), object(3)
memory usage: 93.9+ KB


In [7]:
categorical_na_cols = []
numerical_na_cols = []
# 결측값이 있는 각 칼럼에 대해 데이터 타입 확인
for col in missing_columns_train:
    # 해당 칼럼이 범주형(object 또는 category)인지 확인
    if col in train.select_dtypes(include=['object', 'category']).columns:
        categorical_na_cols.append(col)
    # 해당 칼럼이 수치형(int 또는 float)인지 확인
    elif col in train.select_dtypes(include=['int', 'float']).columns:
        numerical_na_cols.append(col)

print("결측값이 있는 범주형 변수:", categorical_na_cols)
print("결측값이 있는 수치형 변수:", numerical_na_cols)

결측값이 있는 범주형 변수: ['medical_history', 'family_medical_history', 'edu_level']
결측값이 있는 수치형 변수: ['mean_working']


In [8]:
# 범주형 변수의 결측값을 Unknown으로 대체
for col in categorical_na_cols:
    train[col] = train[col].fillna('Unknown')
    test[col] = test[col].fillna('Unknown')

In [9]:
# 수치형 변수의 결측값을 KNN Imputer로 대체
imputer = KNNImputer(n_neighbors=5)
train[['mean_working']] = imputer.fit_transform(train[['mean_working']])
test[['mean_working']] = imputer.transform(test[['mean_working']])

In [10]:
# mean_working에 대해 중앙값 대체
median_value = train['mean_working'].median()

train['mean_working'] = train['mean_working'].fillna(median_value)
test['mean_working'] = test['mean_working'].fillna(median_value)

In [11]:
# BMI 추가 
train['BMI'] = train['weight'] / (train['height'] / 100) ** 2
test['BMI'] = test['weight'] / (test['height'] / 100) ** 2

In [12]:
# Label Encoding 적용 열 - 범주형 데이터
categorical_cols = train.select_dtypes(include='object').columns.drop('ID')

for feature in categorical_cols:
    le = LabelEncoder()
    le = le.fit(train[feature])
    train[feature] = le.transform(train[feature])
    
    for label in np.unique(test[feature]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    
    test[feature] = le.transform(test[feature])

In [13]:
x_train = train.drop(['ID', 'stress_score'], axis = 1)
y_train = train['stress_score']

test = test.drop('ID', axis = 1)

In [17]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 1.0/150.0 MB 3.9 MB/s eta 0:00:39
    --------------------------------------- 2.6/150.0 MB 5.4 MB/s eta 0:00:28
   - -------------------------------------- 4.5/150.0 MB 6.4 MB/s eta 0:00:23
   - -------------------------------------- 6.3/150.0 MB 7.2 MB/s eta 0:00:21
   -- ------------------------------------- 8.7/150.0 MB 7.7 MB/s eta 0:00:19
   -- ------------------------------------- 9.7/150.0 MB 7.5 MB/s eta 0:00:19
   -- ------------------------------------- 10.2/150.0 MB 6.9 MB/s eta 0:00:21
   -- ------------------------------------- 11.0/150.0 MB 6.4 MB/s eta 0:00:22
   --- ------------------------------------ 12.8/150.0 MB 6.5 MB/s eta 0:00:21
   --


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
import optuna
from sklearn.model_selection import cross_val_score, KFold
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# 데이터 준비
X = x_train  # 특성
y = y_train  # 타겟

# 공통 CV 전략
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# 모델별 Optuna objective 함수
def objective(trial):
    model_name = trial.suggest_categorical("model", ["lgbm", "xgb", "rf"])
    if model_name == "lgbm":
        params = {
            "num_leaves": trial.suggest_int("num_leaves", 20, 100),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "random_state": 42,
        }
        model = LGBMRegressor(**params)
    elif model_name == "xgb":
        params = {
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "random_state": 42,
        }
        model = XGBRegressor(**params)
    else:  # RandomForest
        params = {
            "max_depth": trial.suggest_int("max_depth", 3, 15),
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
            "random_state": 42,
        }
        model = RandomForestRegressor(**params)
    
    # 음의 MSE(작을수록 좋음)
    score = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_squared_error").mean()
    return -score  # Optuna는 최소화 기준

# Optuna 실행
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)  # n_trials는 상황에 따라 조절

print("Best trial:")
print(study.best_trial.params)
print("Best RMSE:", np.sqrt(study.best_value))

[I 2025-07-17 16:46:29,976] A new study created in memory with name: no-name-ad14d2bd-af19-48c8-8e6a-b63818a9dc9f


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000723 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1705
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 17
[LightGBM] [Info] Start training from score 0.481912
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000673 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1706
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 17
[LightGBM] [Info] Start training from score 0.486092
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000619 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1704
[LightGBM] [Info] Number of data points in the train set: 

[I 2025-07-17 16:46:36,167] Trial 0 finished with value: 0.060436032639426096 and parameters: {'model': 'lgbm', 'num_leaves': 52, 'max_depth': 8, 'learning_rate': 0.10232661919070739, 'n_estimators': 979}. Best is trial 0 with value: 0.060436032639426096.




[I 2025-07-17 16:46:38,555] Trial 1 finished with value: 0.06847367756324149 and parameters: {'model': 'xgb', 'max_depth': 4, 'learning_rate': 0.04127820547731363, 'n_estimators': 473, 'subsample': 0.8682700828936363, 'colsample_bytree': 0.9804280220666708}. Best is trial 0 with value: 0.060436032639426096.
[I 2025-07-17 16:47:26,469] Trial 2 finished with value: 0.06391586069493024 and parameters: {'model': 'rf', 'max_depth': 15, 'n_estimators': 856, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.060436032639426096.
[I 2025-07-17 16:48:14,934] Trial 3 finished with value: 0.06391130763261871 and parameters: {'model': 'rf', 'max_depth': 15, 'n_estimators': 775, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.060436032639426096.
[I 2025-07-17 16:48:32,723] Trial 4 finished with value: 0.058777196949509204 and parameters: {'model': 'xgb', 'max_depth': 9, 'learning_rate': 0.1188333251383155, 'n_estimators': 450, 'subsample': 0.6355855227636542, 'colsample_bytree': 0.8103827

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000729 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1705
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 17
[LightGBM] [Info] Start training from score 0.481912
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001201 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1706
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 17
[LightGBM] [Info] Start training from score 0.486092
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000270 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1704
[LightGBM] [Info] Number of data points in the train set: 

[I 2025-07-17 16:48:34,134] Trial 5 finished with value: 0.0689379207578064 and parameters: {'model': 'lgbm', 'num_leaves': 71, 'max_depth': 4, 'learning_rate': 0.15464946179338523, 'n_estimators': 312}. Best is trial 4 with value: 0.058777196949509204.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004997 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1705
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 17
[LightGBM] [Info] Start training from score 0.481912
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000850 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1706
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 17
[LightGBM] [Info] Start training from score 0.486092
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000617 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1704
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 17
[LightGBM] [Info] Start traini

[I 2025-07-17 16:48:38,636] Trial 6 finished with value: 0.07272406295196414 and parameters: {'model': 'lgbm', 'num_leaves': 42, 'max_depth': 6, 'learning_rate': 0.018204200747368282, 'n_estimators': 408}. Best is trial 4 with value: 0.058777196949509204.




[I 2025-07-17 16:48:45,066] Trial 7 finished with value: 0.05994985335817822 and parameters: {'model': 'xgb', 'max_depth': 7, 'learning_rate': 0.13825338372964857, 'n_estimators': 271, 'subsample': 0.8415943438679497, 'colsample_bytree': 0.6388169070280605}. Best is trial 4 with value: 0.058777196949509204.
[W 2025-07-17 16:48:58,014] Trial 8 failed with parameters: {'model': 'rf', 'max_depth': 6, 'n_estimators': 580, 'max_features': 'sqrt'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\a3599\anaconda3\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\a3599\AppData\Local\Temp\ipykernel_3752\98016037.py", line 47, in objective
    score = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_squared_error").mean()
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\a3599\anaconda

KeyboardInterrupt: 

In [None]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import optuna
import numpy as np
from sklearn.metrics import mean_squared_error

# 1. 각 모델을 (최적 파라미터로) 학습
lgbm_best = LGBMRegressor(num_leaves=28, max_depth=9, learning_rate=0.12386189308599076, n_estimators=979, random_state=42)
xgb_best = XGBRegressor(max_depth=10, learning_rate=0.06088753981020666, n_estimators=595, subsample=0.9268259172450455, colsample_bytree=0.8156490359901913, random_state=42)
rf_best  = RandomForestRegressor(max_depth=15, n_estimators=458, max_features='sqrt', random_state=42)

lgbm_best.fit(x_train, y_train)
xgb_best.fit(x_train, y_train)
rf_best.fit(x_train, y_train)

# 2. train set에서 예측값 저장
pred_lgbm = lgbm_best.predict(x_train)
pred_xgb  = xgb_best.predict(x_train)
pred_rf   = rf_best.predict(x_train)

# 3. Optuna로 앙상블 가중치 튜닝
def ensemble_objective(trial):
    w1 = trial.suggest_float("w1", 0, 1)
    w2 = trial.suggest_float("w2", 0, 1)
    w3 = trial.suggest_float("w3", 0, 1)
    if w1 + w2 + w3 == 0:
        return float("inf")
    ensemble_pred = (w1 * pred_lgbm + w2 * pred_xgb + w3 * pred_rf) / (w1 + w2 + w3)
    score = np.sqrt(mean_squared_error(y_train, ensemble_pred))
    return score

study_ens = optuna.create_study(direction="minimize")
study_ens.optimize(ensemble_objective, n_trials=50)

print("Best ensemble weights:", study_ens.best_params)
print("Best ensemble RMSE:", study_ens.best_value)

# 4. 단일 모델(XGBoost) RMSE와 비교
rmse_xgb_only = np.sqrt(mean_squared_error(y_train, pred_xgb))
print("XGBoost 단일 모델 RMSE:", rmse_xgb_only)

# 5. test 데이터에 최적 가중치 적용
w1, w2, w3 = study_ens.best_params["w1"], study_ens.best_params["w2"], study_ens.best_params["w3"]
pred_lgbm_test = lgbm_best.predict(test)
pred_xgb_test  = xgb_best.predict(test)
pred_rf_test   = rf_best.predict(test)
ensemble_pred_test = (w1 * pred_lgbm_test + w2 * pred_xgb_test + w3 * pred_rf_test) / (w1 + w2 + w3)

# 6. 더 좋은 쪽으로 제출
if study_ens.best_value < rmse_xgb_only:
    print("앙상블이 더 좋으므로 앙상블 예측값을 사용합니다.")
    final_pred = ensemble_pred_test
else:
    print("단일 XGBoost가 더 좋으므로 단일 모델 예측값을 사용합니다.")
    final_pred = pred_xgb_test

# 7. 제출 파일 생성
submission = pd.read_csv('sample_submission.csv')
submission['stress_score'] = final_pred
submission.to_csv('submission.csv', index=False)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000255 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1713
[LightGBM] [Info] Number of data points in the train set: 3000, number of used features: 17
[LightGBM] [Info] Start training from score 0.482130


[I 2025-07-17 14:46:16,181] A new study created in memory with name: no-name-1270ad80-c742-4c5f-858a-05a1b83dd805
[I 2025-07-17 14:46:16,183] Trial 0 finished with value: 0.06211064785167615 and parameters: {'w1': 0.6372487987018085, 'w2': 0.616843517179041, 'w3': 0.7649958721047152}. Best is trial 0 with value: 0.06211064785167615.
[I 2025-07-17 14:46:16,184] Trial 1 finished with value: 0.01973650598065638 and parameters: {'w1': 0.9139159275338749, 'w2': 0.7148212933636224, 'w3': 0.18328082027800296}. Best is trial 1 with value: 0.01973650598065638.
[I 2025-07-17 14:46:16,185] Trial 2 finished with value: 0.024135211466216708 and parameters: {'w1': 0.7282054255072767, 'w2': 0.04262040539157008, 'w3': 0.09817717745819032}. Best is trial 1 with value: 0.01973650598065638.
[I 2025-07-17 14:46:16,186] Trial 3 finished with value: 0.06733454640168794 and parameters: {'w1': 0.7967891533041962, 'w2': 0.5252780047175322, 'w3': 0.9208326389996416}. Best is trial 1 with value: 0.01973650598065

Best ensemble weights: {'w1': 0.8150998425571588, 'w2': 0.7378206165305172, 'w3': 0.0020760021779370483}
Best ensemble RMSE: 0.006580766466688857
XGBoost 단일 모델 RMSE: 0.0010607905499861305
단일 XGBoost가 더 좋으므로 단일 모델 예측값을 사용합니다.


In [None]:
model = XGBRegressor(
    max_depth=10,
    learning_rate=0.06088753981020666,
    n_estimators=595,
    subsample=0.9268259172450455,
    colsample_bytree=0.8156490359901913,
    random_state=42
)
model.fit(x_train, y_train)
pred = model.predict(test)

In [None]:
submission = pd.read_csv('sample_submission.csv')

In [None]:
submission['stress_score'] = pred
submission.head()

Unnamed: 0,ID,stress_score
0,TEST_0000,0.51951
1,TEST_0001,0.875738
2,TEST_0002,0.384725
3,TEST_0003,0.438501
4,TEST_0004,0.574812


In [None]:
submission.to_csv('submit.csv', index=False)