In [1]:
# 1. 라이브러리 불러오기
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

# 2. 데이터 불러오기
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

# 3. 결측치 처리
fillna_cat_cols = ['medical_history', 'family_medical_history', 'edu_level']
for col in fillna_cat_cols:
    train[col] = train[col].fillna('Missing')
    test[col] = test[col].fillna('Missing')

train['mean_working'] = train['mean_working'].fillna(train['mean_working'].median())
test['mean_working'] = test['mean_working'].fillna(train['mean_working'].median())

# 4. 파생 피처 생성 함수 정의
def encode_sleep(sleep):
    if sleep == 'normal':
        return 0
    elif sleep == 'sleep difficulty':
        return 1
    elif sleep == 'oversleeping':
        return 0.5
    else:
        return 0.5

def encode_activity(activity):
    if activity == 'light':
        return 0.5
    elif activity == 'moderate':
        return 1
    elif activity == 'intense':
        return 1.5
    else:
        return 1

def add_features(df):
    # BMI
    df['bmi'] = df['weight'] / (df['height'] / 100) ** 2
    # 혈압차이
    df['bp_diff'] = df['systolic_blood_pressure'] - df['diastolic_blood_pressure']
    # 근무시간 구간
    df['working_group'] = pd.cut(df['mean_working'], bins=[-1, 0, 20, 40, 60, 168], labels=[0, 1, 2, 3, 4])
    # 병력/가족력 스코어
    df['has_medical_history'] = df['medical_history'].notnull().astype(int)
    df['has_family_history'] = df['family_medical_history'].notnull().astype(int)
    df['health_risk_score'] = df['has_medical_history'] + df['has_family_history']
    # lifestyle score
    df['lifestyle_score'] = df.apply(
        lambda row: encode_activity(row['activity']) - encode_sleep(row['sleep_pattern']),
        axis=1
    )
    return df

# 파생 피처 적용
train = add_features(train)
test = add_features(test)

# 5. 범주형 인코딩
cat_cols = train.select_dtypes(include='object').drop(columns='ID').columns

for col in cat_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]]).astype(str))
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

# 6. 피처 및 타겟 정의
X = train.drop(columns=['ID', 'stress_score'])
y = train['stress_score']
X_test = test.drop(columns=['ID'])


In [2]:
# 6. 데이터 분할
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# 범주형 컬럼들을 category에서 int로 변환 (XGBoost 용)
X_train_xgb = X_train.copy()
X_valid_xgb = X_valid.copy()

for col in X_train_xgb.select_dtypes(['category']).columns:
    X_train_xgb[col] = X_train_xgb[col].cat.codes
    X_valid_xgb[col] = X_valid_xgb[col].cat.codes

In [7]:
# 생략 가능: 이미 int로 되어 있음
X_train_xgb = X_train.copy()
X_valid_xgb = X_valid.copy()
X_test_xgb = X_test.copy()


In [4]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

# ✅ LightGBM 최적 모델 (이전에 튜닝된 best params 사용)
lgb_model = lgb.LGBMRegressor(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=-1,
    n_estimators=500,
    num_leaves=63,
    subsample=0.8,
    random_state=42
)
lgb_model.fit(X_train, y_train)
lgb_val_pred = lgb_model.predict(X_valid)
lgb_test_pred = lgb_model.predict(X_test)

X_test_xgb = X_test.copy()

# ✅ XGBoost 최적 모델 (튜닝된 best params 기반)
xgb_model = xgb.XGBRegressor(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=500,
    subsample=0.8,
    random_state=42
)
xgb_model.fit(X_train_xgb, y_train)
xgb_val_pred = xgb_model.predict(X_valid_xgb)
xgb_test_pred = xgb_model.predict(X_test_xgb)

# ✅ 앙상블 (평균)
ensemble_val_pred = (lgb_val_pred + xgb_val_pred) / 2
ensemble_test_pred = (lgb_test_pred + xgb_test_pred) / 2




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1790
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 19
[LightGBM] [Info] Start training from score 0.481912


In [5]:
# 8. 검증 점수 출력
from sklearn.metrics import mean_absolute_error

ensemble_val_pred = (lgb_val_pred + xgb_val_pred) / 2
mae = mean_absolute_error(y_valid, ensemble_val_pred)
print(f"LGBM MAE: {mean_absolute_error(y_valid, lgb_val_pred):.4f}")
print(f"XGB MAE: {mean_absolute_error(y_valid, xgb_val_pred):.4f}")
print(f"Ensemble MAE: {mean_absolute_error(y_valid, ensemble_val_pred):.4f}")


LGBM MAE: 0.1637
XGB MAE: 0.1642
Ensemble MAE: 0.1604


In [6]:
# 9. 위에서 predict()한 결과를 합치는 것

ensemble_test_pred = (lgb_test_pred + xgb_test_pred) / 2

In [20]:
# 10. 제출 파일 생성
submission = sample_submission.copy()
submission['stress_score'] = ensemble_test_pred
submission.to_csv('ensemble_submission.csv', index=False)
print("✅ 앙상블 제출 파일 저장 완료: ensemble_submission.csv")

✅ 앙상블 제출 파일 저장 완료: ensemble_submission.csv
