<a href="https://colab.research.google.com/github/kahram-y/first-repository/blob/master/DATAthon/smokerpred2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("seaborn-v0_8")
sns.set(font_scale=1.1)

In [4]:
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv


In [5]:
train_full = pd.read_csv("/content/train.csv")
test_full = pd.read_csv("/content/test.csv")

# EDA

In [None]:
print("Train Shape:", train_full.shape)
print("Test Shape:", test_full.shape)

In [None]:
train_full.head(), test_full.head()

In [None]:
train_full.info()

In [None]:
# 통계 요약
train_full.describe().T.head()

In [None]:
# 결측치 비율 확인
missing = train_full.isnull().mean().sort_values(ascending=False)
missing[missing > 0]   # 결측치가 있는 변수만 출력

In [None]:
# Target 분포 확인
plt.figure(figsize=(5,4))
sns.countplot(data=train_full, x="smoking")
plt.title("Target Distribution (0 = Non-Smoker, 1 = Smoker)")
plt.show()

train_full["smoking"].value_counts(normalize=True)

In [None]:
# 수치형 변수 리스트 추출
num_cols = train_full.select_dtypes(include=['int64','float64']).columns.tolist()
num_cols.remove("smoking")  # 타깃 제외

# 상위 9개만 샘플 시각화
sample_cols = num_cols[:9]

train_full[sample_cols].hist(figsize=(15,10), bins=30)
plt.suptitle("Numeric Feature Distributions", fontsize=15)
plt.show()

In [None]:
# 상관관계 히트맵
plt.figure(figsize=(12,10))
corr = train_full[num_cols + ["smoking"]].corr()

sns.heatmap(corr, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation Heatmap")
plt.show()

# 타깃과 상관 높은 상위 변수 TOP10
corr["smoking"].abs().sort_values(ascending=False).head(10)

# Variance Inflation Factor(VIF) 분석


In [None]:
# 박스플롯으로 이상치 탐색
plt.figure(figsize=(15,10))
for i, col in enumerate(sample_cols, 1):
    plt.subplot(3,3,i)
    sns.boxplot(x=train_full[col], color="skyblue")
    plt.title(col)
plt.tight_layout()
plt.show()

In [None]:
# 타깃 분류에 따라 분포가 다른지 비교
feature = sample_cols[2]  # 아무 변수 하나 선택 (예: 0-첫 번째 컬럼)

plt.figure(figsize=(6,4))
sns.kdeplot(train_full[train_full["smoking"]==0][feature], label="Non-Smoker")
sns.kdeplot(train_full[train_full["smoking"]==1][feature], label="Smoker")
plt.title(f"KDE Distribution by smoking: {feature}")
plt.legend()
plt.show()

# Feature Engineering

In [6]:
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [7]:
# 제거할 컬럼 정의 (Urine protein(요단백), dental caries(치아 우식) 두 변수 모두 심한 불균형 + 직접적 연관성이 낮아 노이즈 제거 효과 있음)
cols_to_drop = ["id", "Urine protein", "dental caries"]

# 타깃 분리
X = train_full.drop("smoking", axis=1)
y = train_full["smoking"]

# 제거 전 데이터셋
X_before = X.copy()

# 제거 후 데이터셋
X_after = X.drop(cols_to_drop, axis=1)

# 동일한 Train/Validation 분할
X_train_b, X_val_b, y_train_b, y_val_b = train_test_split(X_before, y, test_size=0.2, random_state=42, stratify=y)
X_train_a, X_val_a, y_train_a, y_val_a = train_test_split(X_after, y, test_size=0.2, random_state=42, stratify=y)

test  = test_full.drop(cols_to_drop, axis=1)

In [8]:
# 이상치 처리 (이상치 처리에 강건한 RobustScaler 또는 Winsorization(상위/하위 1~2% 컷))

# 수치형 컬럼만 선택
num_cols = X_after.select_dtypes(include=['int64', 'float64']).columns.tolist()

# -----------------------------------------
# 1) IQR Winsorization 파이프라인
# -----------------------------------------

# IQR 기반 Winsorization 함수
def remove_outliers_iqr(df, cols):
    df = df.copy()
    for c in cols:
        Q1 = df[c].quantile(0.25)
        Q3 = df[c].quantile(0.75)
        IQR = Q3 - Q1
        low = Q1 - 1.5 * IQR
        high = Q3 + 1.5 * IQR
        df[c] = df[c].clip(lower=low, upper=high)
    return df

# 이상치 처리
X_iqr = remove_outliers_iqr(X_after, num_cols)

# Train/Valid Split
X_train_iqr, X_valid_iqr, y_train_iqr, y_valid_iqr = train_test_split(
    X_iqr, y, test_size=0.2, random_state=42
)

# -----------------------------------------
# 2) RobustScaler 파이프라인 (이상치 처리에 강건. Logistic Regression / SVM / XGBoost 같은 모델에서 효과가 더 좋음. 단, Tree 모델에는 필요 없음.)
# -----------------------------------------

# RobustScaler 적용
rs = RobustScaler()
X_robust_scaled = pd.DataFrame(rs.fit_transform(X_after[num_cols]), columns=num_cols)
test_scaled = rs.transform(test)    # test에도 동일 적용

# Train/Valid Split
X_train_rb, X_valid_rb, y_train_rb, y_valid_rb = train_test_split(
    X_robust_scaled, y, test_size=0.2, random_state=42
)

In [9]:
# Feature 생성 (비율 변수 / 차이 변수 / 조합 변수 등)

# 1) HRV(심박수 변동성) Proxy 생성  : RMSSD 비슷한 변동성
# 2) 분산, 범위, 로그 변환
# 3) Interaction(교차항) 자동 생성

def feature_engineering(df):
    df = df.copy()
    cols = df.select_dtypes(include=['int64', 'float64']).columns

    # 1) HRV Proxy
    for c in cols:
        df[f"{c}_sq"] = df[c] ** 2
        df[f"{c}_abs"] = df[c].abs()

    # 2) 범위/분산 특징
    df["var_all"] = df[cols].var(axis=1)
    df["range_all"] = df[cols].max(axis=1) - df[cols].min(axis=1)
    # 로그 변환 (양수만)
    for c in cols:
        if (df[c] > 0).sum() > 0:
            df[f"{c}_log"] = np.log1p(df[c].clip(lower=0))

    # 3) 상호작용 (상위 5개 변수만 예시)
    top5 = cols[:5]
    for i in range(len(top5)):
        for j in range(i+1, len(top5)):
            df[f"inter_{top5[i]}_{top5[j]}"] = df[top5[i]] * df[top5[j]]
    return df

# 피처 엔지니어링
X_fe = feature_engineering(X_iqr)
test_fe = feature_engineering(test)

# Split
X_train, X_valid, y_train, y_valid = train_test_split(X_fe, y, test_size=0.2, random_state=42)

In [None]:
# # 상관관계 분석 및 차원 축소(PCA 등)

# # 수치형 컬럼만 스케일링
# scaler = StandardScaler()
# X_scaled_num = scaler.fit_transform(X[num_cols])
# test_scaled_num = scaler.transform(test[num_cols])

# # PCA
# pca = PCA(n_components=0.95)
# X_pca = pca.fit_transform(X_scaled_num)
# test_pca = pca.transform(test_scaled_num)

# # PCA 데이터프레임 생성
# X_pca_df = pd.DataFrame(X_pca).add_prefix("pca_")
# test_pca_df = pd.DataFrame(test_pca).add_prefix("pca_")

# # 타깃 붙이기
# X_pca_df["smoking"] = y

# Modeling

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
!pip install catboost
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [11]:
# Baseline 모델링 (Logistic Regression, CatBoost / LightGBM / XGBoost)

# Logistic Regression
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train, y_train)
lr_pred = lr.predict_proba(X_valid)[:,1]
print("LR AUC:", roc_auc_score(y_valid, lr_pred))

# LightGBM
lgb = LGBMClassifier(n_estimators=1000, learning_rate=0.02)
lgb.fit(X_train, y_train)
lgb_pred = lgb.predict_proba(X_valid)[:,1]
print("LGB AUC:", roc_auc_score(y_valid, lgb_pred))

# XGBoost
xgb = XGBClassifier(
    n_estimators=800,
    learning_rate=0.03,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss'
)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict_proba(X_valid)[:,1]
print("XGB AUC:", roc_auc_score(y_valid, xgb_pred))

# CatBoost
cat = CatBoostClassifier(iterations=700, learning_rate=0.03, depth=6, verbose=0)
cat.fit(X_train, y_train)
cat_pred = cat.predict_proba(X_valid)[:,1]
print("Cat AUC:", roc_auc_score(y_valid, cat_pred))

# 모델 평가 함수
def evaluate_models(X_train, X_valid, y_train, y_valid):

    models = {
        "lr": LogisticRegression(max_iter=2000),
        "lgb": LGBMClassifier(n_estimators=1000, learning_rate=0.02),
        "xgb": XGBClassifier(
            n_estimators=800,
            learning_rate=0.03,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss"
        ),
        "cat": CatBoostClassifier(iterations=700, learning_rate=0.03, depth=6, verbose=0)
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        pred = model.predict_proba(X_valid)[:, 1]
        auc = roc_auc_score(y_valid, pred)
        results[name] = auc
        print(f"{name} AUC: {auc:.4f}")

    return results

LR AUC: 0.8357170276696426
[LightGBM] [Info] Number of positive: 55584, number of negative: 71820
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7647
[LightGBM] [Info] Number of data points in the train set: 127404, number of used features: 84
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.436281 -> initscore=-0.256268
[LightGBM] [Info] Start training from score -0.256268
LGB AUC: 0.8626491081081665
XGB AUC: 0.8637751087300883
Cat AUC: 0.8612561573293939


In [13]:
# -----------------------------------------
# 컬럼 제거 전/후 모델 성능 비교 (evaluate_models 사용)
# -----------------------------------------

print("\n===== 컬럼 제거 전(Base Dataset) 모델 성능 =====\n")
before_drop_scores = evaluate_models(X_train_b, X_val_b,y_train_b, y_val_b)

print("\n===== 컬럼 제거 후(Feature Dropped Dataset) 모델 성능 =====\n")
after_drop_scores = evaluate_models(X_train_a, X_val_a,y_train_a, y_val_a)

# 비교 테이블 생성
compare_drop_df = pd.DataFrame({
    "Before Drop": before_drop_scores,
    "After Drop": after_drop_scores
})

print("\n\n================= 컬럼 제거 전/후 AUC 비교 =================\n")
print(compare_drop_df)


===== 컬럼 제거 전(Base Dataset) 모델 성능 =====

lr AUC: 0.8132
[LightGBM] [Info] Number of positive: 55722, number of negative: 71682
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052445 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2390
[LightGBM] [Info] Number of data points in the train set: 127404, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.437365 -> initscore=-0.251865
[LightGBM] [Info] Start training from score -0.251865
lgb AUC: 0.8640
xgb AUC: 0.8653
cat AUC: 0.8621

===== 컬럼 제거 후(Feature Dropped Dataset) 모델 성능 =====

lr AUC: 0.8285
[LightGBM] [Info] Number of positive: 55722, number of negative: 71682
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026666 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2126
[LightGBM] [Info] Number of data points in the train set: 127404,

In [14]:
# IQR Winsorization 이상치 처리 후 성능
auc_iqr = evaluate_models(X_train_iqr, X_valid_iqr, y_train_iqr, y_valid_iqr)
print("===== IQR Winsorization AUC Scores =====")
print(auc_iqr)

# RobustScaler 이상치 처리 후 성능
auc_rb = evaluate_models(X_train_rb, X_valid_rb, y_train_rb, y_valid_rb)
print("===== RobustScaler AUC Scores =====")
print(auc_rb)

# -----------------------------------------
# 결과 비교
# -----------------------------------------
print("\n\n======== 최종 비교: Winsorization vs RobustScaler =========\n")

compare_df = pd.DataFrame({
    "IQR Winsorization": auc_iqr,
    "RobustScaler": auc_rb
})

print(compare_df)

lr AUC: 0.8352
[LightGBM] [Info] Number of positive: 55584, number of negative: 71820
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007130 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1373
[LightGBM] [Info] Number of data points in the train set: 127404, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.436281 -> initscore=-0.256268
[LightGBM] [Info] Start training from score -0.256268
lgb AUC: 0.8633
xgb AUC: 0.8643
cat AUC: 0.8613
===== IQR Winsorization AUC Scores =====
{'lr': np.float64(0.8351761246507425), 'lgb': np.float64(0.863331843886183), 'xgb': np.float64(0.8643118009328202), 'cat': np.float64(0.8613431235735212)}
lr AUC: 0.8305
[LightGBM] [Info] Number of positive: 55584, number of negative: 71820
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009440 s

In [16]:
# -----------------------------------------
#  Feature Engineering "이전" vs "이후" 성능 비교
# -----------------------------------------

print("\n\n================= Feature Engineering 이전 성능 =================\n")
fe_before_auc = evaluate_models(X_train_rb, X_valid_rb, y_train_rb, y_valid_rb)

print("\n\n================= Feature Engineering 이후 성능 =================\n")
fe_after_auc = evaluate_models(X_train, X_valid, y_train, y_valid)

# -----------------------------------------
# 결과 비교 테이블
# -----------------------------------------
fe_compare_df = pd.DataFrame({
    "Before_FE (기본 변수)": fe_before_auc,
    "After_FE (FE 적용)": fe_after_auc
})

print("\n\n================= Feature Engineering 전/후 AUC 비교 =================\n")
print(fe_compare_df)




lr AUC: 0.8305
[LightGBM] [Info] Number of positive: 55584, number of negative: 71820
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2123
[LightGBM] [Info] Number of data points in the train set: 127404, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.436281 -> initscore=-0.256268
[LightGBM] [Info] Start training from score -0.256268
lgb AUC: 0.8635
xgb AUC: 0.8642
cat AUC: 0.8615



lr AUC: 0.8357
[LightGBM] [Info] Number of positive: 55584, number of negative: 71820
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033346 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7647
[LightGBM] [Info] Number o

In [17]:
# Stacking Ensemble

from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC

estimators = [
    ('lgb', LGBMClassifier(n_estimators=500)),
    ('xgb', XGBClassifier(n_estimators=400, eval_metric="logloss")),
    ('cat', CatBoostClassifier(iterations=400, verbose=0))
]

stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    passthrough=False
)

stack_model.fit(X_train, y_train)
stack_pred = stack_model.predict_proba(X_valid)[:,1]

print("Stacking AUC:", roc_auc_score(y_valid, stack_pred))

[LightGBM] [Info] Number of positive: 55584, number of negative: 71820
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033097 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7647
[LightGBM] [Info] Number of data points in the train set: 127404, number of used features: 84
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.436281 -> initscore=-0.256268
[LightGBM] [Info] Start training from score -0.256268
[LightGBM] [Info] Number of positive: 44467, number of negative: 57456
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036696 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7639
[LightGBM] [Info] Number of data points in the train set: 101923, number of used features: 84
[LightGBM] [Info

In [18]:
# Cross Validation (5-fold 또는 10-fold Stratified K-Fold. 의료/바이오 데이터는 안정성 위해 10-fold 선호)

from sklearn.model_selection import StratifiedKFold

def cv_score(model, X, y, folds=10):
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in kf.split(X, y):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_tr, y_tr)
        pred = model.predict_proba(X_val)[:,1]
        auc = roc_auc_score(y_val, pred)
        scores.append(auc)

    return np.mean(scores), scores

mean_auc, all_scores = cv_score(LGBMClassifier(n_estimators=600), X, y)
print("CV Mean AUC:", mean_auc)
print("Fold Scores:", all_scores)

[LightGBM] [Info] Number of positive: 62687, number of negative: 80643
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015869 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2414
[LightGBM] [Info] Number of data points in the train set: 143330, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.437361 -> initscore=-0.251878
[LightGBM] [Info] Start training from score -0.251878
[LightGBM] [Info] Number of positive: 62687, number of negative: 80643
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014655 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2417
[LightGBM] [Info] Number of data points in the train set: 143330, number of used features: 23
[LightGBM] [Info

In [None]:
# Soft-Voting Ensemble (3개 모델 평균)


# Hyperparameter Tuning

In [19]:
!pip install optuna
import optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [20]:
# Optuna Hyperparameter Tuning — LightGBM

def objective(trial):

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 400, 2000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        "num_leaves": trial.suggest_int("num_leaves", 16, 128)
    }

    model = LGBMClassifier(**params)

    score, _ = cv_score(model, X, y, folds=5)
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("Best LGBM Score:", study.best_value)
print("Best LGBM Params:", study.best_params)

[I 2025-12-01 00:45:41,417] A new study created in memory with name: no-name-b612b7f3-0771-4bb5-9a6d-1548c47d625a


[LightGBM] [Info] Number of positive: 55722, number of negative: 71682
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2399
[LightGBM] [Info] Number of data points in the train set: 127404, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.437365 -> initscore=-0.251865
[LightGBM] [Info] Start training from score -0.251865


[W 2025-12-01 00:46:36,260] Trial 0 failed with parameters: {'n_estimators': 1925, 'max_depth': 6, 'learning_rate': 0.09144764329737867, 'colsample_bytree': 0.7302519594370103, 'subsample': 0.5715978071263, 'num_leaves': 100} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipython-input-996460303.py", line 16, in objective
    score, _ = cv_score(model, X, y, folds=5)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-4213427385.py", line 14, in cv_score
    pred = model.predict_proba(X_val)[:,1]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lightgbm/sklearn.py", line 1627, in predict_proba
    result = super().predict(
             ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lightgbm/sk

KeyboardInterrupt: 

In [None]:
# Optuna Hyperparameter Tuning — XGBoost

def xgb_objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 400, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "eval_metric": "logloss"
    }

    model = XGBClassifier(**params, use_label_encoder=False)
    score, _ = cv_score(model, X, y, folds=5)
    return score

study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(xgb_objective, n_trials=30)

print("Best XGB Score:", study_xgb.best_value)
print("Best XGB Params:", study_xgb.best_params)

In [None]:
# Optuna Hyperparameter Tuning — CatBoost

def cat_objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 300, 1500),
        "depth": trial.suggest_int("depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 5),
        "random_strength": trial.suggest_float("random_strength", 0.1, 10),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "loss_function": "Logloss",
        "verbose": 0
    }

    model = CatBoostClassifier(**params)
    score, _ = cv_score(model, X, y, folds=5)
    return score

study_cat = optuna.create_study(direction="maximize")
study_cat.optimize(cat_objective, n_trials=30)

print("Best CatBoost Score:", study_cat.best_value)
print("Best CatBoost Params:", study_cat.best_params)

In [None]:
# ============================================================
# Optuna 결과로 튜닝된 최적 하이퍼파라미터로 모델 재정의
# ============================================================

best_lgb_params = study.best_params
best_xgb_params = study_xgb.best_params
best_cat_params = study_cat.best_params

best_lgb_params['eval_metric'] = 'logloss'
best_xgb_params['eval_metric'] = 'logloss'
best_cat_params['loss_function'] = 'Logloss'
best_cat_params['verbose'] = 0


# ============================================================
# 튜닝된 개별 모델 정의
# ============================================================

lgb_best = LGBMClassifier(**best_lgb_params)
xgb_best = XGBClassifier(**best_xgb_params, use_label_encoder=False)
cat_best = CatBoostClassifier(**best_cat_params)


# ============================================================
# Stacking Ensemble 모델 구성 (튜닝된 모델 기반)
# ============================================================

from sklearn.ensemble import StackingClassifier

estimators_best = [
    ('lgb', lgb_best),
    ('xgb', xgb_best),
    ('cat', cat_best)
]

stack_final = StackingClassifier(
    estimators=estimators_best,
    final_estimator=LogisticRegression(max_iter=3000),
    passthrough=False,
    n_jobs=-1
)


# ============================================================
# 모델 피팅 (Train 데이터 전체 사용)
# ============================================================

stack_final.fit(X_fe, y)

# 검증 AUC 체크
valid_pred = stack_final.predict_proba(X_valid)[:, 1]
valid_auc = roc_auc_score(y_valid, valid_pred)

print("\n==============================")
print("Final Stacking AUC (with Tuned Models):", valid_auc)
print("==============================\n")


# ============================================================
# Test 데이터 예측
# ============================================================

test_pred = stack_final.predict_proba(test_fe)[:, 1]

submission = pd.DataFrame({
    "id": test_full["id"],
    "smoking": test_pred
})

submission.head()

# CSV 저장
submission.to_csv("stacking_optuna_submission.csv", index=False)
print("📁 Submission file saved: stacking_optuna_submission.csv")


# 해석

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import shap
shap.initjs()

In [None]:
# Feature Importance + SHAP 기반 모델 (XGBoost / LightGBM / CatBoost) 해석

def plot_feature_importance(model, feature_names, model_name="model"):
    importance = model.feature_importances_
    idx = np.argsort(importance)[::-1]

    plt.figure(figsize=(8, 12))
    sns.barplot(
        x=importance[idx][:30],
        y=np.array(feature_names)[idx][:30],
        orient="h"
    )
    plt.title(f"{model_name} Feature Importance (Top 30)")
    plt.xlabel("Importance")
    plt.ylabel("Features")
    plt.show()


# LightGBM
plot_feature_importance(lgb, X_train.columns, "LightGBM")

# XGBoost
plot_feature_importance(xgb, X_train.columns, "XGBoost")

# CatBoost
plot_feature_importance(cat, X_train.columns, "CatBoost")

In [None]:
# SHAP 기반 모델 해석

# SHAP Explainer 생성
explainer_lgb = shap.TreeExplainer(lgb)
explainer_xgb = shap.TreeExplainer(xgb)
explainer_cat = shap.TreeExplainer(cat)

shap_lgb = explainer_lgb.shap_values(X_valid)
shap_xgb = explainer_xgb.shap_values(X_valid)
shap_cat = explainer_cat.shap_values(X_valid)

In [None]:
# SHAP Summary Plot (가장 중요한 해석 플롯; 색(Feature 값의 크기, x축(SHAP 영향력): 0보다 오른쪽 → 흡연일 확률 ↑, 중요 Feature 순서 확인 가능)

# LightGBM
shap.summary_plot(shap_lgb, X_valid, plot_type="dot", max_display=30)

# XGBoost
shap.summary_plot(shap_xgb, X_valid, plot_type="dot", max_display=30)

# CatBoost
shap.summary_plot(shap_cat, X_valid, plot_type="dot", max_display=30)

In [None]:
# SHAP Bar Plot (가장 평균적으로 중요한 변수만 시각화)

# LightGBM
shap.summary_plot(shap_lgb, X_valid, plot_type="bar", max_display=30)

# XGBoost
shap.summary_plot(shap_xgb, X_valid, plot_type="bar", max_display=30)

# CatBoost
shap.summary_plot(shap_cat, X_valid, plot_type="bar", max_display=30)

In [None]:
# SHAP Dependence Plot (변수 간 상호작용 분석; 가장 중요한 feature 1~2개만 사용)

top_feature = X_train.columns[np.argmax(lgb.feature_importances_)]
shap.dependence_plot(top_feature, shap_lgb, X_valid)

In [None]:
# SHAP Force Plot (개별 샘플 분석; 특정 validation 예측 1개 선택)

# SHAP JS 초기화
shap.initjs()

# 예시: 첫 번째 validation 샘플
i = 0

# 단일 샘플만 DataFrame 형태로 선택
X_sample = X_valid.iloc[i:i+1]

# SHAP value 계산
shap_value_sample = explainer_lgb.shap_values(X_sample)

# Force Plot
shap.plots.force(
    explainer_lgb.expected_value,  # <- base value 추가
    shap_value_sample,
    X_sample,
    matplotlib=True
)

In [None]:
# Partial Dependence Plot (PDP)

In [None]:
# # 최종 모델 선택
# final_model = stack_model  # Stacking Ensemble을 최종 모델로 선택

# # 학습 데이터 기준 컬럼 가져오기
# train_cols = X_train.columns

# # 테스트 데이터 컬럼 맞추기
# for c in train_cols:
#     if c not in test_fe.columns:
#         test_fe[c] = 0  # 없는 컬럼은 0으로 채움

# # 학습 데이터에는 있는데 테스트 데이터에 없는 컬럼 제거
# test_fe = test_fe[train_cols]

# # test 데이터 예측
# test_pred = final_model.predict_proba(test_fe)[:,1]

# # 제출 파일 생성
# submission = pd.DataFrame({
#     "id": pd.read_csv("/content/test.csv")["id"],  # 원본 test.csv에서 id 가져오기
#     "smoking": test_pred
# })

# # CSV 저장
# submission.to_csv("submission.csv", index=False)
# print("제출 파일 생성 완료: submission.csv")