In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

# 1. Load the data
# Read the training set, the test set and the sample submission template.
try:
    train_df = pd.read_csv('../data/train.csv')
    test_df = pd.read_csv('../data/test.csv')
    submission_df = pd.read_csv('../data/sample_submission.csv')
except FileNotFoundError as err:
    print("파일 경로를 다시 확인해주세요.")
    # Stop with a non-zero status. The bare exit() helper is injected by the
    # `site` module for interactive use and, called without an argument,
    # would end the process with status 0 even though loading failed.
    raise SystemExit(1) from err

# 2. Preprocessing and feature-engineering function
def preprocess_and_feature_engineer(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df* with encoded and derived features.

    The input frame is left untouched; all work happens on a copy.

    Steps, in order:
      1. Clamp negative ``bone_density`` values to 0.
      2. Replace missing ``medical_history``, ``family_medical_history`` and
         ``edu_level`` entries with the literal ``'unknown'``.
      3. Impute missing ``mean_working`` with the median of the row's
         (``smoke_status``, ``edu_level``) group, then with the overall
         median for anything still missing.
      4. One-hot encode the categorical columns (first level dropped).
      5. Add ``BMI``, ``Hypertension_flag``, ``Pulse_Pressure``,
         ``bp_interaction`` and ``cholesterol_glucose_ratio``.
      6. Bucket ``mean_working`` into hour ranges and one-hot encode the
         buckets (first level dropped).

    Args:
        df: Frame containing the raw columns referenced above plus
            ``weight``, ``height``, ``systolic_blood_pressure``,
            ``diastolic_blood_pressure``, ``cholesterol`` and ``glucose``.

    Returns:
        A new DataFrame with the transformed and added columns.

    Raises:
        KeyError: If a required column is missing from *df*.
    """
    df_processed = df.copy()

    # Negative bone density readings are clamped to 0.
    df_processed.loc[df_processed['bone_density'] < 0, 'bone_density'] = 0

    fill_unknown_cols = ['medical_history', 'family_medical_history', 'edu_level']
    df_processed[fill_unknown_cols] = df_processed[fill_unknown_cols].fillna('unknown')

    # Group-wise median imputation. The median is computed per group and used
    # only to fill gaps, so an observed value is never replaced -- including
    # on rows whose smoke_status is missing and which therefore belong to no
    # group (their group median is NaN and they fall through to the overall
    # median below only if mean_working itself is missing).
    group_median = (
        df_processed
        .groupby(['smoke_status', 'edu_level'])['mean_working']
        .transform('median')
    )
    df_processed['mean_working'] = df_processed['mean_working'].fillna(group_median)
    df_processed['mean_working'] = df_processed['mean_working'].fillna(
        df_processed['mean_working'].median()
    )

    categorical_cols = ['gender', 'activity', 'smoke_status', 'sleep_pattern',
                        'medical_history', 'family_medical_history', 'edu_level']
    df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True)

    systolic = df_processed['systolic_blood_pressure']
    diastolic = df_processed['diastolic_blood_pressure']

    # height is divided by 100 before squaring (cm -> m, presumably).
    df_processed['BMI'] = df_processed['weight'] / (df_processed['height'] / 100) ** 2
    df_processed['Hypertension_flag'] = ((systolic >= 140) | (diastolic >= 90)).astype(int)
    df_processed['Pulse_Pressure'] = systolic - diastolic
    df_processed['bp_interaction'] = systolic * diastolic
    # +1 in the denominator avoids division by zero when glucose is 0.
    df_processed['cholesterol_glucose_ratio'] = df_processed['cholesterol'] / (df_processed['glucose'] + 1)

    # Left-closed buckets: [0, 8), [8, 12), [12, 16), [16, inf).
    # The top edge is open-ended to match the '16h~' label; with a finite
    # edge of 24, values >= 24 became NaN and were encoded as all zeros,
    # which is indistinguishable from the dropped '~8h' baseline.
    bins = [0, 8, 12, 16, np.inf]
    labels = ['~8h', '8~12h', '12~16h', '16h~']
    df_processed['Working_Hour_Group'] = pd.cut(
        df_processed['mean_working'], bins=bins, labels=labels, right=False
    )
    df_processed = pd.get_dummies(
        df_processed,
        columns=['Working_Hour_Group'],
        prefix='Working_Hour_Group',
        drop_first=True,
    )
    return df_processed

# 3. Apply the preprocessing function to both data sets
train_final = preprocess_and_feature_engineer(train_df)
test_final = preprocess_and_feature_engineer(test_df)

# 4. Align the train and test columns
# Each frame is one-hot encoded on its own, so a category level seen in only
# one of them yields a column the other lacks; only shared columns are kept.
train_cols = set(train_final.columns)
test_cols = set(test_final.columns)
# Walk train_final.columns rather than converting the set intersection to a
# list: set iteration order for strings varies between interpreter runs
# (hash randomisation), which would shuffle the feature order given to the
# model and make results non-reproducible despite the fixed random_state.
non_feature_cols = {'ID', 'stress_score'}
common_cols = [
    col for col in train_final.columns
    if col in test_cols and col not in non_feature_cols
]

# Features judged to have low importance
low_importance_features = ['sleep_pattern_sleep difficulty', 'activity_moderate', 'mean_working']

# Final feature list with the low-importance features removed
final_cols = [col for col in common_cols if col not in low_importance_features]

X_train = train_final[final_cols]
y_train = train_final['stress_score']
X_test = test_final[final_cols]

# 5. Train the final model and predict (best parameters applied)
model_params = {
    'random_state': 42,
    'n_estimators': 300,
    'learning_rate': 0.1,
    'num_leaves': 40,
}
lgb_model = lgb.LGBMRegressor(**model_params)
lgb_model.fit(X_train, y_train)
# Predictions for the test rows, in the row order of X_test.
test_predictions = lgb_model.predict(X_test)

# 6. Create the submission file
# Store the predictions in the 'stress_score' column of the sample
# submission frame and write it out without the index column.
submission_path = '../data/submission_filtered.csv'
submission_df['stress_score'] = test_predictions
submission_df.to_csv(submission_path, index=False)

print(
    "---",
    "submission_filtered.csv 파일이 성공적으로 생성되었습니다! 이 파일을 제출해 보세요.",
    sep="\n",
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2300
[LightGBM] [Info] Number of data points in the train set: 3000, number of used features: 29
[LightGBM] [Info] Start training from score 0.482130
---
submission_filtered.csv 파일이 성공적으로 생성되었습니다! 이 파일을 제출해 보세요.
