In [17]:
# 1. Import libraries (every import used by the notebook lives in this cell)
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [18]:
# 2. Load the data
# Read the training set, the test set and the submission template in one pass.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train.csv',
        './data/test.csv',
        './data/sample_submission.csv',
    )
)

In [None]:
# Missing-value check: print the per-column NaN count of each frame.
print("🧼 Train 결측치 개수:")
print(train.isnull().sum())

print("\n🧼 Test 결측치 개수:")
print(test.isnull().sum())

# Or see it at a glance: heatmap of the train NaN mask (rows x columns).
# plt / sns come from the import cell at the top of the notebook; the explicit
# Axes object avoids relying on pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(10, 4))
sns.heatmap(train.isnull(), cbar=False, ax=ax)
ax.set_title("Train 결측치 분포")
plt.show()


In [20]:
# 3. Missing-value handling
# For these categorical columns a missing entry becomes the explicit label
# 'Missing', applied identically to the train and test frames.
fillna_cat_cols = ['medical_history', 'family_medical_history', 'edu_level']
for frame in (train, test):
    for name in fillna_cat_cols:
        frame[name] = frame[name].fillna('Missing')

In [21]:
# Impute mean_working in both frames with the median of the *training* column.
# The median is taken once up front; filling NaNs with the median does not move
# the median, so this is the same value the test frame would otherwise receive.
mean_working_median = train['mean_working'].median()
for frame in (train, test):
    frame['mean_working'] = frame['mean_working'].fillna(mean_working_median)

In [22]:
# 4. 인코딩
from sklearn.preprocessing import LabelEncoder

In [23]:
# Label-encode every object-dtype column of train except the ID column.
# Each encoder is fitted on the train and test values stacked together, so a
# label that occurs in either frame receives a code.
cat_cols = train.select_dtypes(include='object').columns.drop('ID')
for feature in cat_cols:
    le = LabelEncoder().fit(pd.concat([train[feature], test[feature]]))
    for frame in (train, test):
        frame[feature] = le.transform(frame[feature])

In [24]:
# 5. Define features and target
# The target is the stress_score column; the features are everything in train
# except the ID and the target. The test frame only needs its ID removed.
y = train['stress_score']
X = train.drop(['ID', 'stress_score'], axis=1)
X_test = test.drop(['ID'], axis=1)

In [25]:
# 6. Split the data
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

In [26]:
# Separate copies of the split feature frames for XGBoost.
# No category -> int conversion is required: the frames were loaded with
# pd.read_csv without a dtype argument and every object column has already been
# label-encoded to integers, so no column has the pandas `category` dtype.
# (The modelling cell further down rebuilds these copies together with the
# test-set copy.)
X_train_xgb = X_train.copy()
X_valid_xgb = X_valid.copy()

In [28]:
# lgb, xgb and the sklearn helpers are imported once in the first cell of the
# notebook, so nothing is re-imported here.

# LightGBM model (uses the previously tuned best params).
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
# only takes effect when `subsample_freq` (bagging_freq) is > 0. It is left at
# its default here, so subsample=0.8 is presumably inactive. Kept unchanged so
# the fitted model stays the same -- confirm whether bagging was intended and
# re-tune if it gets enabled.
lgb_model = lgb.LGBMRegressor(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=-1,
    n_estimators=500,
    num_leaves=63,
    subsample=0.8,
    random_state=42,
)
lgb_model.fit(X_train, y_train)
lgb_val_pred = lgb_model.predict(X_valid)
lgb_test_pred = lgb_model.predict(X_test)

# XGBoost inputs: independent copies of the train / validation / test features.
# No category -> int conversion is required: every object column was
# label-encoded to integers earlier and nothing in this notebook creates a
# pandas `category` dtype, so such a conversion would never touch a column.
X_train_xgb = X_train.copy()
X_valid_xgb = X_valid.copy()
X_test_xgb = X_test.copy()

# XGBoost model (based on the tuned best params).
xgb_model = xgb.XGBRegressor(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=500,
    subsample=0.8,
    random_state=42,
)
xgb_model.fit(X_train_xgb, y_train)
xgb_val_pred = xgb_model.predict(X_valid_xgb)
xgb_test_pred = xgb_model.predict(X_test_xgb)

# Ensemble: unweighted mean of the two models' predictions.
ensemble_val_pred = (lgb_val_pred + xgb_val_pred) / 2
ensemble_test_pred = (lgb_test_pred + xgb_test_pred) / 2




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000286 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1449
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 16
[LightGBM] [Info] Start training from score 0.481912


In [29]:
# 8. Print the validation score
# mean_absolute_error is already imported in the first cell of the notebook, so
# it is not imported again here.
# The ensemble prediction is the unweighted mean of the LightGBM and XGBoost
# validation predictions; it is scored against the held-out targets with MAE.
ensemble_val_pred = (lgb_val_pred + xgb_val_pred) / 2
mae = mean_absolute_error(y_valid, ensemble_val_pred)
print(f"🎯 앙상블 Validation MAE: {mae:.4f}")


🎯 앙상블 Validation MAE: 0.1654


In [30]:
# 9. Predict on the test data
# Each fitted model scores its own copy of the test features; the submission
# value is the unweighted mean of the two prediction arrays.
lgb_test_pred, xgb_test_pred = (
    lgb_model.predict(X_test),
    xgb_model.predict(X_test_xgb),
)
ensemble_test_pred = (lgb_test_pred + xgb_test_pred) / 2

In [31]:
# 10. Create the submission file
# Work on a copy of the sample submission, overwrite/add its stress_score
# column with the ensemble predictions, and write it out without the index.
# NOTE(review): assumes sample_submission rows are in the same order as test --
# confirm against the data files.
submission = sample_submission.assign(stress_score=ensemble_test_pred)
submission.to_csv('ensemble_submission.csv', index=False)
print("✅ 앙상블 제출 파일 저장 완료: ensemble_submission.csv")

✅ 앙상블 제출 파일 저장 완료: ensemble_submission.csv
