In [14]:
# 1. 라이브러리 불러오기
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [15]:
# 2. 데이터 불러오기
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

In [None]:
# 결측치 확인
print("🧼 Train 결측치 개수:")
print(train.isnull().sum())

print("\n🧼 Test 결측치 개수:")
print(test.isnull().sum())

# 또는 한눈에 보기
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 4))
sns.heatmap(train.isnull(), cbar=False)
plt.title("Train 결측치 분포")
plt.show()


In [16]:
# 3. 결측치 처리
fillna_cat_cols = ['medical_history', 'family_medical_history', 'edu_level']
for col in fillna_cat_cols:
    train[col] = train[col].fillna('Missing')
    test[col] = test[col].fillna('Missing')

In [17]:
train['mean_working'] = train['mean_working'].fillna(train['mean_working'].median())
test['mean_working'] = test['mean_working'].fillna(train['mean_working'].median())

In [18]:
# 4. 인코딩
from sklearn.preprocessing import LabelEncoder

In [19]:
cat_cols = train.select_dtypes(include='object').drop(columns='ID').columns
for col in cat_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [20]:
# 5. 피처 및 타겟 정의
X = train.drop(columns=['ID', 'stress_score'])
y = train['stress_score']
X_test = test.drop(columns=['ID'])

In [21]:
# 6. 데이터 분할
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# 7. LightGBM 모델 학습
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

#(튜닝된 best_params 직접 사용)
lgb_model = lgb.LGBMRegressor(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=-1,
    n_estimators=500,
    num_leaves=63,
    subsample=0.8,
    random_state=42
)

lgb_model.fit(X_train, y_train)

# 최적 파라미터 및 결과 출력
print("✅ Best Parameters:", grid_search.best_params_)
print("🔍 Best MAE (negated):", -grid_search.best_score_)


val_pred = lgb_model.predict(X_valid)
mae = mean_absolute_error(y_valid, val_pred)
print(f"Validation MAE: {mae:.4f}")



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000327 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1449
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 16
[LightGBM] [Info] Start training from score 0.481912
✅ Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 500, 'num_leaves': 63, 'subsample': 0.8}
🔍 Best MAE (negated): 0.2054743002852144
Validation MAE: 0.1671


In [24]:
# 8. 검증 점수 출력
val_pred = lgb_model.predict(X_valid)
mae = mean_absolute_error(y_valid, val_pred)
print(f"Validation MAE: {mae:.4f}")

Validation MAE: 0.1671


In [25]:
# 9. 테스트 데이터 예측
test_pred = lgb_model.predict(X_test)

In [26]:
# 10. 제출 파일 생성
submission = sample_submission.copy()
submission['stress_score'] = test_pred
submission.to_csv('baseline_lgbm_submission.csv', index=False)
print("✅ 제출 파일 저장 완료: baseline_lgbm_submission.csv")

✅ 제출 파일 저장 완료: baseline_lgbm_submission.csv
