In [None]:
# -*- coding: utf-8 -*-
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from catboost import CatBoostRegressor

# =========================================================
# 1) Load & Feature Engineering
# =========================================================
# Read the training set, the test set and the submission template from the
# current working directory.
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')
# The target name is taken to be the last column of train.csv; the rest of
# the script refers to the target only through this variable.
target = train.columns[-1]

def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with derived feature columns appended.

    Each step runs only when the columns it needs are present, so frames
    carrying just part of the schema are accepted. The input frame itself
    is left unmodified.

    Steps, in order:
      * missing values in object-dtype columns become the string ``'Null'``;
      * ``BMI`` = weight / (height / 100) ** 2, denominator floored at 1e-9;
      * ``pulse_pressure`` = systolic - diastolic and
        ``map_pressure`` = diastolic + pulse_pressure / 3;
      * ``glu_chol_ratio`` = glucose / cholesterol, denominator floored
        at 1e-9;
      * ``h_x_w`` = height * weight;
      * ``_activity_level`` (light=0, moderate=1, high=2, anything else=-1)
        and ``age_x_activity`` = age * level; when ``activity`` is absent
        but ``age`` is present, ``age_x_activity`` is the constant 0;
      * the ``'Null'`` placeholder in ``medical_history`` and
        ``family_medical_history`` is swapped for a fixed Korean label.
    """
    out = df.copy()

    def has(*names):
        # True when every requested column exists in the working frame.
        return all(name in out.columns for name in names)

    # Simple placeholder for missing strings (the categorical pipeline
    # handles these columns again later).
    object_cols = [name for name in out.columns if out[name].dtype == 'object']
    for name in object_cols:
        out[name] = out[name].fillna('Null')

    has_height_weight = has('height', 'weight')

    # BMI
    if has_height_weight:
        height_m = out['height'] / 100.0
        out['BMI'] = out['weight'] / np.clip(height_m ** 2, 1e-9, None)

    # Blood-pressure derived features
    if has('systolic_blood_pressure', 'diastolic_blood_pressure'):
        systolic = out['systolic_blood_pressure']
        diastolic = out['diastolic_blood_pressure']
        pulse = systolic - diastolic
        out['pulse_pressure'] = pulse
        out['map_pressure'] = diastolic + (pulse / 3.0)

    # Glucose / cholesterol ratio
    if has('glucose', 'cholesterol'):
        safe_cholesterol = np.clip(out['cholesterol'], 1e-9, None)
        out['glu_chol_ratio'] = out['glucose'] / safe_cholesterol

    # Simple interaction; kept after the ratio so column order is unchanged.
    if has_height_weight:
        out['h_x_w'] = out['height'] * out['weight']

    if has('age', 'activity'):
        level_by_name = {'light': 0, 'moderate': 1, 'high': 2}
        # Labels outside the mapping (including the 'Null' placeholder) get -1.
        levels = out['activity'].map(level_by_name).fillna(-1).astype(int)
        out['_activity_level'] = levels
        out['age_x_activity'] = out['age'] * levels
    elif has('age'):
        out['age_x_activity'] = 0

    # Context-specific replacement of the generic missing placeholder.
    placeholder_labels = (
        ('medical_history', '만성질환 없음'),
        ('family_medical_history', '없음'),
    )
    for column, label in placeholder_labels:
        if column in out.columns:
            out[column] = out[column].replace('Null', label)

    return out

# Apply the same feature engineering to both splits.
train, test = add_features(train), add_features(test)

# Remove identifier columns; a spelling that is absent is simply skipped.
for col in ('ID', 'id'):
    train = train.drop(columns=[col], errors='ignore')
    test = test.drop(columns=[col], errors='ignore')

# =========================================================
# 2) Preprocessor (Full train 기준으로 fit)
#    - OneHotEncoder 버전 호환
# =========================================================
def make_preprocessor(Xdf: pd.DataFrame):
    """Build an unfitted ``ColumnTransformer`` for the columns of ``Xdf``.

    Numeric columns (excluding the module-level ``target`` name) are
    median-imputed and standardized. Object/category columns are imputed
    with the most frequent value and one-hot encoded with unknown
    categories ignored. All other columns are dropped.

    The one-hot encoder is requested with sparse output using whichever
    keyword the installed scikit-learn accepts.
    """
    numeric_names = Xdf.select_dtypes(include=[np.number]).columns
    num_cols = [name for name in numeric_names if name != target]
    cat_cols = list(Xdf.select_dtypes(include=['object', 'category']).columns)

    try:
        # sklearn >= 1.2 spells the flag ``sparse_output``.
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
    except TypeError:
        # Older sklearn only knows ``sparse``.
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=True)

    numeric_branch = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
    ])
    categorical_branch = Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", encoder),
    ])

    return ColumnTransformer(
        transformers=[
            ("num", numeric_branch, num_cols),
            ("cat", categorical_branch, cat_cols),
        ],
        remainder='drop',
    )

# Build the transformer from the engineered training frame, fit it on the
# training features only (target column removed), then apply the fitted
# transformer to the test frame.
pre_full = make_preprocessor(train)
X_full = pre_full.fit_transform(train.drop(columns=[target]))
X_test = pre_full.transform(test)

# Standardize the target; this scaler is reused after prediction to map the
# model output back to the original target scale.
ss_full = StandardScaler()
y_full_sc = ss_full.fit_transform(train[[target]].values).ravel()

# =========================================================
# 3) CatBoost 단일 모델 학습
# =========================================================
# CatBoost — fixed parameters (best values from the log)
cat_params_fixed = {
    'iterations': 2400,
    'depth': 10,
    'learning_rate': 0.029752101640529684,
    'l2_leaf_reg': 1.8538789372311681,
    'bagging_temperature': 1.9290162265703152,
    'random_strength': 0.7961477702014803,
    'loss_function': 'RMSE',
    'random_seed': 42,
    'verbose': False
}
# Train a single model on the full training matrix against the standardized
# target. `verbose=False` is passed both in the parameters and to `fit`.
cat_full = CatBoostRegressor(**cat_params_fixed)
cat_full.fit(X_full, y_full_sc, verbose=False)
# Predictions come out in standardized target units; undo the scaling with
# the scaler that was fitted on the training target.
pred_test_sc = cat_full.predict(X_test)
pred_test = ss_full.inverse_transform(pred_test_sc.reshape(-1,1)).ravel()

# =========================================================
# 4) 결과 저장
# =========================================================
def postprocess(p):
    """Round predictions to two decimals, then clamp them to ``[0, 1]``."""
    rounded = np.round(p, 2)
    return np.clip(rounded, 0, 1)

# Start from a copy of the submission template, set the target column to the
# post-processed predictions and write the CSV without the index.
# NOTE(review): assumes sample_submission rows are in the same order as
# test.csv — confirm.
sub_catboost = sample_submission.copy()
sub_catboost[target] = postprocess(pred_test)
sub_catboost.to_csv('result_catboost_single.csv', index=False)

print("📁 Saved → result_catboost_single.csv")

📁 Saved → result_catboost_single.csv


Best params: {'n_estimators': 531, 'max_depth': 36, 'max_features': 'sqrt', 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}
0.15222
