## 1. 데이터 불러오기

In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
TRAIN_CSV = r"D:\새 폴더 (2)\LGAimers\open\train.csv" # 각자 컴퓨터 파일 경로로 바꾸셔야할 듯
TEST_CSV  = r"D:\새 폴더 (2)\LGAimers\open\test.csv"

train = pd.read_csv(TRAIN_CSV)
test  = pd.read_csv(TEST_CSV)

In [3]:
#train = pd.read_csv('./train.csv')
#test = pd.read_csv('./test.csv')

"""
[train.csv]                                         [test.csv]
ID : 샘플별 고유 ID                                 ID : 샘플별 고유 ID
gender : 성별                                       gender : 성별
age : 연령                                          age : 연령
height : 키(cm)                                     height : 키(cm)
weight : 몸무게(kg)                                 weight : 몸무게(kg)
cholesterol : 콜레스테롤 수치                       cholesterol : 콜레스테롤 수치
systolic_blood_pressure : 수축기 혈압               systolic_blood_pressure : 수축기 혈압
diastolic_blood_pressure : 이완기 혈압              diastolic_blood_pressure : 이완기 혈압
glucose : 혈당 수치(mg/dL)                          glucose : 혈당 수치(mg/dL)
bone_density : 골밀도(g/cm²)                        bone_density : 골밀도(g/cm²)
activity : 생활시 운동 강도                         activity : 생활시 운동 강도
smoke_status : 흡연 상태                            smoke_status : 흡연 상태
medical_history : 만성질환                          medical_history : 만성질환
family_medical_history : 가족력                     family_medical_history : 가족력
sleep_pattern : 수면패턴                            sleep_pattern : 수면패턴
edu_level : 학력                                    edu_level : 학력
mean_working : 1주일당 평균 근로 시간               mean_working : 1주일당 평균 근로 시간
stress_score : (TARGET) 스트레스 점수                    
"""

display(train.shape)

(3000, 18)

## 2. 데이터 전처리

In [4]:
# 카테고리형 변수의 결측값을 최빈값으로 대체
# 최빈값 -> 가장 많이 관측된 값
categorical_na_cols = train.select_dtypes(include=['object', 'category']).columns.drop('ID')

for col in categorical_na_cols:
    # # 학습 데이터에서 최빈값 계산
    # most_frequent = train[col].mode()[0]
    # print(f"{col}: {most_frequent}")
    # # 학습 데이터와 테스트 데이터 모두 해당 칼럼의 최빈값으로 대체
    # train[col] = train[col].fillna(most_frequent)
    # test[col] = test[col].fillna(most_frequent)

    train[col] = train[col].fillna('None')
    test[col] = test[col].fillna('None')

    print(f"{col}: {train[col].unique()}")

gender: ['F' 'M']
activity: ['moderate' 'light' 'intense']
smoke_status: ['ex-smoker' 'non-smoker' 'current-smoker']
medical_history: ['high blood pressure' 'None' 'diabetes' 'heart disease']
family_medical_history: ['diabetes' 'None' 'high blood pressure' 'heart disease']
sleep_pattern: ['sleep difficulty' 'normal' 'oversleeping']
edu_level: ['bachelors degree' 'graduate degree' 'high school diploma' 'None']


In [5]:
# case1: mean_working 결측치 -> 0
#train['mean_working'] = train['mean_working'].fillna(0)
#test['mean_working'] = test['mean_working'].fillna(0)

#display(train.columns)

In [6]:
# case2: mean_working에 대해 중앙값 대체
#median_value = train['mean_working'].median()

#train_missing = train['mean_working'].isna()
#test_missing = test['mean_working'].isna()

#train['mean_working'] = train['mean_working'].fillna(median_value)
#test['mean_working'] = test['mean_working'].fillna(median_value)

#train['is_working'] = (~train_missing)
#test['is_working'] = (~test_missing)

#display(train[['mean_working','is_working']].head())

In [7]:
# case 3 : mean_working 결측치를 평균값으로 대체
mean_value = train['mean_working'].mean()

train_missing = train['mean_working'].isna()
test_missing = test['mean_working'].isna()

train['mean_working'] = train['mean_working'].fillna(mean_value).astype(int)
test['mean_working'] = test['mean_working'].fillna(mean_value).astype(int)

train['is_working'] = (~train_missing)
test['is_working'] = (~test_missing)

display(train[['mean_working','is_working']].head())

Unnamed: 0,mean_working,is_working
0,8,False
1,8,False
2,9,True
3,8,False
4,8,False


In [8]:
# One-Hot Encoding 적용 열 - gender, smoke_status, medical_history, family_medical_history
# 원핫 인코딩 => 카테고리끼리 순서 없을때 사용
# smoke_status는 좀 애매하긴 한듯
def one_hot_encoding(df):
    one_hot_encoding_cols = ['gender', 'smoke_status', 'medical_history', 'family_medical_history']
    encoding_df = pd.get_dummies(df, columns=one_hot_encoding_cols)
    return encoding_df

train = one_hot_encoding(train)
test = one_hot_encoding(test)

display(train.columns)
display(train.shape)

Index(['ID', 'age', 'height', 'weight', 'cholesterol',
       'systolic_blood_pressure', 'diastolic_blood_pressure', 'glucose',
       'bone_density', 'activity', 'sleep_pattern', 'edu_level',
       'mean_working', 'stress_score', 'is_working', 'gender_F', 'gender_M',
       'smoke_status_current-smoker', 'smoke_status_ex-smoker',
       'smoke_status_non-smoker', 'medical_history_None',
       'medical_history_diabetes', 'medical_history_heart disease',
       'medical_history_high blood pressure', 'family_medical_history_None',
       'family_medical_history_diabetes',
       'family_medical_history_heart disease',
       'family_medical_history_high blood pressure'],
      dtype='object')

(3000, 28)

In [9]:
# Label Encoding 적용 열 - activity, sleep_pattern, edu_level
# 라벨 인코딩 => 카테고리끼리 순서 있을때 사용
def label_encoding(df):
    label_encoding_cols = ['activity', 'sleep_pattern', 'edu_level']
    for feature in label_encoding_cols:
        le = LabelEncoder()
        le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

train = label_encoding(train)
test = label_encoding(test)

## 3. 피처 엔지니어링

In [10]:
# BMI => 몸무게[kg] / 키[cm]^2
# 혈압차(맥압) => 수축성 혈압 - 이완성 혈압
def add_features(df):
    df['BMI'] = df['weight'] / ((df['height']/100)**2)
    df['bp_diff'] = df['systolic_blood_pressure'] - df['diastolic_blood_pressure']
    return df

train = add_features(train)
test = add_features(test)

In [11]:
display(train['mean_working'].describe())

# working_group 피처 만들기
bins = [3, 8, 9, 16]  # 적절한 경계값 설정
labels = [0, 1, 2]     # 저, 중하, 중상, 고 근무

train['working_group'] = pd.cut(train['mean_working'], bins=bins, labels=labels, include_lowest=True)
test['working_group'] = pd.cut(test['mean_working'], bins=bins, labels=labels, include_lowest=True)

overwork_threshold = train['mean_working'].quantile(0.95)  # 상위 5% 기준값
train['is_overwork'] = (train['mean_working'] >= overwork_threshold).astype(int)
test['is_overwork'] = (test['mean_working'] >= overwork_threshold).astype(int)

display(train['working_group'].value_counts())
display(train['is_overwork'].value_counts())

count    3000.000000
mean        8.470333
std         1.362500
min         4.000000
25%         8.000000
50%         8.000000
75%         9.000000
max        16.000000
Name: mean_working, dtype: float64

working_group
0    1920
2     543
1     537
Name: count, dtype: int64

is_overwork
0    2803
1     197
Name: count, dtype: int64

In [12]:
# mean_working & sleep_pattern 피처 엔지니어링

train['work_sleep_imbalance'] = train['mean_working'] / (train['sleep_pattern'] + 1)
test['work_sleep_imbalance'] = test['mean_working'] / (test['sleep_pattern'] + 1)

display(train[['work_sleep_imbalance', 'stress_score']])

for col in train.columns:
    if col not in ['ID', 'stress_score']:
        corr_val = train['stress_score'].corr(train[col])
        print(f"{col} ↔ stress_score 상관계수: {corr_val:.4f}")

Unnamed: 0,work_sleep_imbalance,stress_score
0,2.666667,0.63
1,8.000000,0.83
2,9.000000,0.70
3,4.000000,0.17
4,2.666667,0.36
...,...,...
2995,4.000000,0.02
2996,9.000000,0.16
2997,9.000000,0.16
2998,4.000000,0.18


age ↔ stress_score 상관계수: 0.0187
height ↔ stress_score 상관계수: -0.0057
weight ↔ stress_score 상관계수: 0.0113
cholesterol ↔ stress_score 상관계수: 0.0213
systolic_blood_pressure ↔ stress_score 상관계수: 0.0156
diastolic_blood_pressure ↔ stress_score 상관계수: 0.0254
glucose ↔ stress_score 상관계수: -0.0061
bone_density ↔ stress_score 상관계수: -0.0226
activity ↔ stress_score 상관계수: 0.0070
sleep_pattern ↔ stress_score 상관계수: 0.0002
edu_level ↔ stress_score 상관계수: -0.0271
mean_working ↔ stress_score 상관계수: 0.1402
is_working ↔ stress_score 상관계수: -0.0228
gender_F ↔ stress_score 상관계수: 0.0136
gender_M ↔ stress_score 상관계수: -0.0136
smoke_status_current-smoker ↔ stress_score 상관계수: 0.0261
smoke_status_ex-smoker ↔ stress_score 상관계수: -0.0320
smoke_status_non-smoker ↔ stress_score 상관계수: 0.0087
medical_history_None ↔ stress_score 상관계수: -0.0504
medical_history_diabetes ↔ stress_score 상관계수: 0.0245
medical_history_heart disease ↔ stress_score 상관계수: 0.0000
medical_history_high blood pressure ↔ stress_score 상관계수: 0.0373
family_medical

## 4. 교차검증

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

In [14]:
import optuna
from xgboost import XGBRegressor, callback
from sklearn.model_selection import cross_val_score, KFold

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
!pip install -U xgboost




[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\tree4\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [16]:
# working_group 컬럼을 정수형으로 변환
train['working_group'] = train['working_group'].cat.codes
test ['working_group'] = test ['working_group'].cat.codes

# (만약 다른 category 컬럼이 남아 있다면 동일하게 변환)
for df in (train, test):
    for col in df.select_dtypes('category').columns:
        df[col] = df[col].cat.codes

In [17]:
feature_cols = [c for c in train.columns if c not in ['ID','stress_score']] # ID와 stress_score를 제외한 나머지 칼럼 이름만 뽑아서 리스트
x_train = train[feature_cols]
y_train = train['stress_score']

y_binned = pd.qcut(y_train, q= 5, labels = False)
kf = KFold(n_splits = 5, shuffle = True, random_state = 42) # Stratified K-Fold 설정: 5fold, 데이터 분포를 유지하며 섞기

rmse_scores = []

In [18]:
def objective(trial):
    params = {
        'n_stimators': trial.suggest_int('n_stimators', 100, 1000, step=100),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
        'max_depth': trial.suggest_int('max_depth',3,10),
        'min_child_weight': trial.suggest_int('min_child_weight',1,10),
        'subsample': trial.suggest_uniform('subsample',0.5,1.0),
        'colsample_bytree':trial.suggest_uniform('colsample_bytree',0.5,1.0),
        'gamma': trial.suggest_loguniform('gamma',1e-8,10.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha',1e-8,10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda',1e-8,10.0),
        'objective': 'reg:squarederror',
        'random_state': 42,
        'enable_categorical': True
    }
    model = XGBRegressor(**params)

    scores = -cross_val_score(
        model,
        x_train, y_train,
        scoring = 'neg_root_mean_squared_error',
        cv = kf,
        n_jobs = -1
    )
    return scores.mean()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print('Best RMSE: ', study.best_value)
print('Best params: ', study.best_params)

[I 2025-07-24 00:39:20,688] A new study created in memory with name: no-name-7afc0679-a173-4669-b56e-4871dbf4897e
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
  'subsample': trial.suggest_uniform('subsample',0.5,1.0),
  'colsample_bytree':trial.suggest_uniform('colsample_bytree',0.5,1.0),
  'gamma': trial.suggest_loguniform('gamma',1e-8,10.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha',1e-8,10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda',1e-8,10.0),
[I 2025-07-24 00:39:25,477] Trial 0 finished with value: 0.26478543159255363 and parameters: {'n_stimators': 200, 'learning_rate': 0.04231665424472106, 'max_depth': 8, 'min_child_weight': 7, 'subsample': 0.8120283117709135, 'colsample_bytree': 0.8943349701981138, 'gamma': 0.14851078283187225, 'reg_alpha': 0.00012388348086318394, 'reg_lambda': 0.0004428840187278762}. Best is trial 0 with value: 0.26478543159255363.
[I 2025-07-24 00:39:30,233] Trial 1 finished with value: 0.2761902736258318 an

Best RMSE:  0.24213858950327652
Best params:  {'n_stimators': 500, 'learning_rate': 0.11254159327349363, 'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.7281300077223388, 'colsample_bytree': 0.7848493377059798, 'gamma': 4.1224017201510745e-07, 'reg_alpha': 0.0005559030996231133, 'reg_lambda': 0.2915804860350388}


In [19]:
for fold, (tr_idx, val_idx) in enumerate(kf.split(x_train, y_binned),1): # 각 fold별 학습 & 검증
    X_tr, X_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    # tr_idx에는 4개의 fold 인덱스(학습셋) val_idx에는 1개의 fold 인덱스(검증셋)
    # 5번 반복 시 데이터의 모든 부분을 한번씩 검증셋 역할(5Fold 교차검증)
    # 4) XGBoost 모델 생성
    model = XGBRegressor(
        n_stimators=1000,
        learning_rate=0.1334220186786167,
        random_state=42,
        objective='reg:squarederror',
        eval_metric='rmse',
        enable_categorical = True
    )

    # 5) 학습 (early stopping 포함)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        #early_stopping_rounds=50,
        verbose=False
    )

    # 6) 검증 예측 & RMSE 계산
    val_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, val_pred)
    rmse = np.sqrt(mse)
    rmse_scores.append(rmse)
    print(f"Fold {fold} RMSE: {rmse:.4f}")

# 7) 전체 Fold RMSE 요약
print(f"\nCV RMSE Mean: {np.mean(rmse_scores):.4f}, Std: {np.std(rmse_scores):.4f}")

# 8) 최종 모델 학습 (전체 학습 데이터)
final_model = XGBRegressor(
    n_stimators=1000,
    learning_rate=0.1334220186786167,
    max_depth=10,
    subsample=0.908646389154687,
    colsample_bytree=0.7627227639552128,
    random_state=42,
    objective='reg:squarederror'
)
final_model.fit(x_train, y_train)

Parameters: { "n_stimators" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 1 RMSE: 0.2492


Parameters: { "n_stimators" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 2 RMSE: 0.2566


Parameters: { "n_stimators" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 3 RMSE: 0.2574


Parameters: { "n_stimators" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 4 RMSE: 0.2708


Parameters: { "n_stimators" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 5 RMSE: 0.2551

CV RMSE Mean: 0.2578, Std: 0.0071


Parameters: { "n_stimators" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7627227639552128
,device,
,early_stopping_rounds,
,enable_categorical,False


In [20]:
X_test = test[feature_cols]
submission = pd.read_csv("D:\새 폴더 (2)\LGAimers\open\sample_submission.csv")
preds = final_model.predict(X_test)
submission['stress_score'] = preds
submission.to_csv('submission.csv', index=False)

print(submission.head())

          ID  stress_score
0  TEST_0000      0.479582
1  TEST_0001      0.860080
2  TEST_0002      0.373561
3  TEST_0003      0.417946
4  TEST_0004      0.458300
