In [None]:
pip install pytorch-tabnet

In [None]:
# Clone the practice repository into the current working directory; the train
# CSV read below lives under its data/ folder.
!git clone https://github.com/kenkang99/FISA_MachineLearning-prac.git

In [None]:
import pandas as pd

# Read the training table from the cloned repository and preview it.
train_csv_path = 'FISA_MachineLearning-prac/data/train.csv'
data_orig = pd.read_csv(train_csv_path)
data_orig

In [None]:
# Work on a copy of the raw table without the 'ID' column; data_orig stays intact.
data = data_orig.drop(columns=['ID'])
data

In [None]:
from sklearn.model_selection import train_test_split

# Pull the label column out first; every remaining column is a model input.
y = data['target']
X = data.drop(columns=['target'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_valid.shape, y_valid.shape)

In [None]:
import numpy as np
from sklearn.metrics import f1_score
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.metrics import Metric

# 1) Macro-F1 for multi-class targets (recommended)
class MacroF1(Metric):
    """Macro-averaged F1, written as a pytorch-tabnet ``Metric`` subclass."""

    def __init__(self):
        # A larger score is better, so the metric is flagged as one to maximize.
        self._maximize = True
        self._name = "macro_f1"

    def __call__(self, y_true, y_score):
        """Return the macro F1 of the arg-max predictions.

        y_true: ground-truth integer labels.
        y_score: per-class scores; presumably shaped (N, C) -- the arg-max
            over axis 1 is taken as the predicted label.
        """
        predicted_labels = np.argmax(y_score, axis=1)
        macro_f1 = f1_score(y_true, predicted_labels, average="macro")
        return macro_f1


In [None]:
import numpy as np
import torch
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# --- Custom Macro-F1 metric (used for TabNet's own evaluation during fit) ---
class MacroF1(Metric):
    """Score predictions with scikit-learn's macro-averaged F1."""

    def __init__(self):
        self._name = "macro_f1"
        # Higher is better, so the metric is marked as one to maximize.
        self._maximize = True

    def __call__(self, y_true, y_score):
        """Turn per-class scores into labels via arg-max and return macro F1.

        y_score is presumably an (N, C) array of class probabilities/logits;
        only its arg-max along axis 1 is used.
        """
        hard_labels = np.argmax(y_score, axis=1)
        return f1_score(y_true, hard_labels, average="macro")


# 1) Standardize the features and cast them to float32.
#    The scaler is fitted on the training split only and then reused, unchanged,
#    on the validation split.
scaler = StandardScaler()
X_train1 = scaler.fit_transform(X_train).astype(np.float32)
X_valid1 = scaler.transform(X_valid).astype(np.float32)


# 2) Labels -> flat int64 array
def to_int64(y):
    """Convert a label container to a 1-D ``int64`` NumPy array.

    Objects exposing ``to_numpy`` (e.g. a pandas Series) are converted through
    it; anything else goes through ``np.asarray``.
    """
    values = y.to_numpy() if hasattr(y, "to_numpy") else np.asarray(y)
    return values.astype(np.int64).ravel()


y_train1 = to_int64(y_train)
y_valid1 = to_int64(y_valid)

# --- Cross-validation setup ---
cv = 10
skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

# Best fold score seen so far and the classifier that produced it.
best_f1, best_clf = -1.0, None

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_train1, y_train1), start=1):
    X_tr, X_va = X_train1[tr_idx], X_train1[va_idx]
    y_tr, y_va = y_train1[tr_idx], y_train1[va_idx]

    clf_fold = TabNetClassifier()

    # The fold's held-out part is the eval set, monitored with the custom
    # macro-F1 metric. Note that the metric is handed over as the class itself,
    # not as an instance. num_workers / drop_last are left at their defaults.
    clf_fold.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_name=["valid"],
        eval_metric=[MacroF1],
        max_epochs=100,
        patience=10,
        batch_size=128,
        virtual_batch_size=128,
    )

    # Re-score the fold directly with macro-F1 on its held-out part.
    va_f1 = f1_score(y_va, clf_fold.predict(X_va), average="macro")
    print(f"[Fold {fold:02d}] macro-F1: {va_f1:.5f}")

    # Keep the classifier object of the best-scoring fold.
    if va_f1 > best_f1:
        best_f1, best_clf = va_f1, clf_fold

# --- The final clf is the model from the best-scoring fold ---
clf = best_clf
print(f"\nBest CV macro-F1: {best_f1:.5f}")

# --- Score and predict on the original hold-out (validation) split ---
preds = clf.predict(X_valid1)
holdout_f1 = f1_score(y_valid1, preds, average="macro")
print(f"Holdout macro-F1: {holdout_f1:.5f}")


In [None]:
import numpy as np

feature_names = list(X_train.columns)  # column names, used to label each importance
fi = clf.feature_importances_          # importance vector of the selected model

# Print the K largest importances, biggest first, with a 1-based rank.
K = 30
order = np.argsort(fi)[::-1][:K]
for rank, feat_idx in enumerate(order, start=1):
    print(f"{rank:>2}. {feature_names[feat_idx]}: {fi[feat_idx]:.10f}")


In [None]:
# Paths are relative to the working directory, the same way train.csv is read
# from the cloned repository, so the cell also works outside /content.
test = pd.read_csv('FISA_MachineLearning-prac/data/test.csv')

test_x = test.drop(columns=['ID'])
# The model was fitted on standardized float32 features, so the test features
# must go through the same already-fitted scaler before prediction.
# NOTE(review): assumes test.csv has the same feature columns, in the same
# order, as the training features -- confirm against the data.
test_x1 = scaler.transform(test_x).astype(np.float32)
preds = clf.predict(test_x1)

submission = pd.read_csv('FISA_MachineLearning-prac/data/sample_submission.csv')
submission['target'] = preds

submission.to_csv('./baseline_submit_tabnet_cv.csv', index=False, encoding='utf-8-sig')

# Show the submission table as the cell output.
submission