In [1]:
import os
import math
import time
import argparse
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss, Module
from torch.optim.lr_scheduler import ExponentialLR
from torchmetrics import AUROC, Accuracy, MeanSquaredError
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

from torch_frame import stype
from torch_frame.data import Dataset, DataLoader
from torch_frame.datasets import (
    ForestCoverType, KDDCensusIncome, DataFrameBenchmark,
    AdultCensusIncome, BankMarketing, Dota2
)
from torch_frame.gbdt import CatBoost, LightGBM, XGBoost
from torch_frame.nn import (
    EmbeddingEncoder, FTTransformer, LinearBucketEncoder,
    LinearEncoder, LinearPeriodicEncoder, ResNet, TabNet, TabTransformer
)
from torch_frame.nn.models import (
    MLP, ExcelFormer, Trompt
)
from torch_frame.typing import TaskType



# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
import optuna

  from .autonotebook import tqdm as notebook_tqdm


# DL

In [3]:
# Load the raw table from a CSV file given by a path relative to the working directory.
df = pd.read_csv("thyroid_cancer_risk_data.csv")

In [4]:
# Classification task flag: the train/evaluate helpers that read this global use
# cross-entropy loss and accuracy when it is True, and MSE / RMSE otherwise.
is_classification = True

In [5]:
from torch_frame import numerical, categorical, text_embedded, embedding

# Semantic type (stype) declared for each column passed to torch_frame.
# "Patient_ID" is intentionally left out of the mapping.
# NOTE(review): "Nodule_Size" is declared categorical — confirm it is not a
# continuous measurement that should be numerical.
col_to_stype = dict(
    Age=numerical,
    Gender=categorical,
    Country=categorical,
    Ethnicity=categorical,
    Family_History=categorical,
    Radiation_Exposure=categorical,
    Iodine_Deficiency=categorical,
    Smoking=categorical,
    Obesity=categorical,
    Diabetes=categorical,
    TSH_Level=numerical,
    T3_Level=numerical,
    T4_Level=numerical,
    Nodule_Size=categorical,
    Thyroid_Cancer_Risk=categorical,
    Diagnosis=categorical,
)

# Wrap the DataFrame; "Diagnosis" is the prediction target.
dataset = Dataset(
    df=df,
    col_to_stype=col_to_stype,
    target_col="Diagnosis",
)
dataset.materialize()

# Fractional slices: first 60% -> train, next 10% -> validation, last 30% -> test.
# NOTE(review): no shuffle is applied in this cell, so the split follows the
# dataset's existing row order — confirm the CSV is not ordered by label.
train_dataset = dataset[:0.6]
val_dataset = dataset[0.6:0.7]
test_dataset = dataset[0.7:]

### ResNet / FT-T

In [6]:
# Command-line style configuration; each row is (flag, type, default, choices).
parser = argparse.ArgumentParser()
for _flag, _kind, _default, _choices in (
    ("--dataset", str, "adult", None),
    ("--numerical_encoder_type", str, "linear",
     ["linear", "linearbucket", "linearperiodic"]),
    ("--model_type", str, "fttransformer", ["fttransformer", "resnet"]),
    ("--channels", int, 256, None),
    ("--num_layers", int, 4, None),
    ("--batch_size", int, 512, None),
    ("--lr", float, 0.0001, None),
    ("--epochs", int, 100, None),
    ("--seed", int, 0, None),
):
    parser.add_argument(_flag, type=_kind, default=_default, choices=_choices)
# Boolean switch; kept as the final expression of the cell.
parser.add_argument("--compile", action="store_true")

_StoreTrueAction(option_strings=['--compile'], dest='compile', nargs=0, const=True, default=False, type=None, choices=None, required=False, help=None, metavar=None)

In [None]:
# Inside Jupyter, parse an explicit argument list instead of relying on sys.argv.
args = parser.parse_args([
    # Optional overrides (uncomment to replace the parser defaults):
    #'--dataset', 'adult',
    #'--numerical_encoder_type', 'linear',
    #'--model_type', 'resnet',       # fttransformer : FT-T / resnet : ResNet
    #'--channels', '256',
    #'--num_layers', '4',
    #'--batch_size', '256',  # number of rows processed per batch
    #'--lr', '0.0001',
    '--epochs', '10',
    #'--seed', '0'
])

# Apply the parsed seed: --seed was previously parsed but never used, so weight
# initialisation and DataLoader shuffling differed on every run.
# The trailing semicolon suppresses the Generator repr in the cell output.
torch.manual_seed(args.seed);

In [None]:
# Build the numerical encoder chosen by --numerical_encoder_type.
# Previously `numerical_encoder` was referenced here before any cell defined it,
# which raises NameError on a fresh kernel.
if args.numerical_encoder_type == "linear":
    numerical_encoder = LinearEncoder()
elif args.numerical_encoder_type == "linearbucket":
    numerical_encoder = LinearBucketEncoder()
else:
    numerical_encoder = LinearPeriodicEncoder()

# One encoder per semantic type present in the dataset.
stype_encoder_dict = {
    stype.categorical: EmbeddingEncoder(),
    stype.numerical: numerical_encoder,
}

if is_classification:
    # dataset.num_classes is not used: the original note reports that it raised
    # an error about StatType.COUNT being missing (unconfirmed).
    # Set by hand to the number of unique values of the target column.
    output_channels = 2
else:
    output_channels = 1

In [9]:
def train_one_epoch(model, loader, optimizer):
    """Train ``model`` for one pass over ``loader``.

    Uses cross-entropy when the global ``is_classification`` is true and MSE
    otherwise. Each batch is moved to the global ``device`` first.

    Returns:
        The sample-weighted mean of the per-batch losses.
    """
    model.train()
    total_loss = 0
    seen = 0

    for batch in loader:
        batch = batch.to(device)
        out = model(batch)

        if is_classification:
            batch_loss = F.cross_entropy(out, batch.y.long())
        else:
            batch_loss = F.mse_loss(out.view(-1), batch.y.view(-1))

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Weight the batch loss by its size so the final mean is per sample.
        n_rows = len(batch.y)
        total_loss += batch_loss.item() * n_rows
        seen += n_rows

    return total_loss / seen

@torch.no_grad()
def evaluate(model, loader):
    """Score ``model`` on ``loader`` without tracking gradients.

    Returns:
        Accuracy (higher is better) when the global ``is_classification`` is
        true, otherwise RMSE (lower is better).
    """
    model.eval()
    running = 0
    seen = 0

    for batch in loader:
        batch = batch.to(device)
        out = model(batch)

        if not is_classification:
            # Accumulate the summed squared error; averaged after the loop.
            running += F.mse_loss(
                out.view(-1), batch.y.view(-1), reduction='sum'
            ).item()
        else:
            # Count predictions whose arg-max class equals the label.
            running += (out.argmax(dim=-1) == batch.y).sum().item()
        seen += len(batch.y)

    mean = running / seen
    return mean if is_classification else mean ** 0.5

In [13]:
# ──────────────────────── Optuna objective ──────────────────────── #
def objective(trial: optuna.trial.Trial) -> float:
    """Train one candidate configuration and return its validation score.

    Samples the hyper-parameters from ``trial``, builds fresh data loaders,
    encoders and a model, trains for 10 epochs and reports the validation
    score to Optuna after every epoch so the pruner can stop weak trials.

    Args:
        trial: The Optuna trial that supplies the hyper-parameter suggestions.

    Returns:
        The validation score of the last epoch, as returned by ``evaluate``.

    Raises:
        optuna.TrialPruned: When the pruner decides to stop the trial early.
    """
    # ── 1) Hyper-parameters to search ─────────────────────────── #
    # Only "fttransformer" is searched; add "resnet" to the list to search ResNet too.
    model_type             = trial.suggest_categorical("model_type", ["fttransformer"])
    numerical_encoder_type = trial.suggest_categorical(
        "numerical_encoder_type", ["linear", "linearbucket", "linearperiodic"])
    channels   = trial.suggest_categorical("channels", [128, 256, 512])
    num_layers = trial.suggest_int("num_layers", 2, 6)
    batch_size = trial.suggest_categorical("batch_size", [128, 256, 512])
    lr         = trial.suggest_float("lr", 1e-5, 1e-3, log=True)

    # ── 2) Data loaders (rebuilt because the batch size varies per trial) ── #
    train_loader = DataLoader(
        train_dataset.tensor_frame, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(
        val_dataset.tensor_frame,   batch_size=batch_size)

    # ── 3) Per-trial encoders ─────────────────────────────────── #
    if numerical_encoder_type == "linear":
        numerical_encoder = LinearEncoder()
    elif numerical_encoder_type == "linearbucket":
        numerical_encoder = LinearBucketEncoder()
    else:
        numerical_encoder = LinearPeriodicEncoder()

    stype_encoder_dict = {
        stype.categorical: EmbeddingEncoder(),
        stype.numerical:   numerical_encoder,
    }

    output_channels = 2  # number of Diagnosis classes

    # ── 4) Model ──────────────────────────────────────────────── #
    if model_type == "fttransformer":
        model = FTTransformer(
            channels=channels,
            out_channels=output_channels,
            num_layers=num_layers,
            col_stats=dataset.col_stats,
            col_names_dict=train_dataset.tensor_frame.col_names_dict,
            stype_encoder_dict=stype_encoder_dict,
        ).to(device)
    else:
        # Pass the encoders to ResNet as well; without this argument the
        # sampled numerical_encoder_type had no effect on ResNet trials.
        model = ResNet(
            channels=channels,
            out_channels=output_channels,
            num_layers=num_layers,
            col_stats=dataset.col_stats,
            col_names_dict=train_dataset.tensor_frame.col_names_dict,
            stype_encoder_dict=stype_encoder_dict,
        ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # ── 5) Training loop (fixed at 10 epochs) with pruning reports ── #
    for epoch in range(1, 11):  # epochs 1..10
        train_one_epoch(model, train_loader, optimizer)
        val_score = evaluate(model, val_loader)

        # Report this epoch's score so the pruner can compare trials.
        trial.report(val_score, step=epoch)

        # Stop early when the pruner judges the trial unpromising.
        if trial.should_prune():
            raise optuna.TrialPruned()

    # Direction: maximize (accuracy) for classification, minimize (RMSE) for regression.
    return val_score

In [None]:
# ──────────────────────── Run the Optuna study ──────────────────────── #
# Accuracy is maximised for classification; RMSE is minimised otherwise.
direction = "minimize" if not is_classification else "maximize"
pruner = optuna.pruners.MedianPruner(n_warmup_steps=3)

study = optuna.create_study(pruner=pruner, direction=direction)
# Budget: 50 trials or a one-hour timeout.
study.optimize(objective, timeout=60 * 60, n_trials=50)

# ──────────────────────── Inspect the result ──────────────────────── #
best_trial = study.best_trial
print(f"Best {direction} value : {best_trial.value:.4f}")
print("Best hyper-parameters  :")
for k, v in best_trial.params.items():
    print(f"  {k:25s}: {v}")

[I 2025-05-21 05:59:28,793] A new study created in memory with name: no-name-0794afc2-0c3b-410b-945c-7f105f5edec0
[I 2025-05-21 06:02:01,766] Trial 0 finished with value: 0.830927641167897 and parameters: {'model_type': 'fttransformer', 'numerical_encoder_type': 'linearperiodic', 'channels': 256, 'num_layers': 4, 'batch_size': 128, 'lr': 0.00014885411789079686}. Best is trial 0 with value: 0.830927641167897.
[I 2025-05-21 06:03:45,475] Trial 1 finished with value: 0.830927641167897 and parameters: {'model_type': 'fttransformer', 'numerical_encoder_type': 'linearbucket', 'channels': 256, 'num_layers': 5, 'batch_size': 256, 'lr': 3.0031027460910578e-05}. Best is trial 0 with value: 0.830927641167897.
[I 2025-05-21 06:05:04,514] Trial 2 finished with value: 0.830927641167897 and parameters: {'model_type': 'fttransformer', 'numerical_encoder_type': 'linear', 'channels': 128, 'num_layers': 3, 'batch_size': 512, 'lr': 2.583559406337251e-05}. Best is trial 0 with value: 0.830927641167897.
[I 

- 선택된 파라미터로 다시 학습

In [None]:
# ─── 1. Fetch the best hyper-parameters found by the study ─────── #
best_params    = study.best_trial.params
model_type             = best_params["model_type"]
numerical_encoder_type = best_params["numerical_encoder_type"]
channels   = best_params["channels"]
num_layers = best_params["num_layers"]
batch_size = best_params["batch_size"]
lr         = best_params["lr"]

print("▶ Best hyper-parameters")
for k, v in best_params.items():
    print(f"  {k:25s}: {v}")

# ─── 2. Data loaders (train + val merged) ──────────────────────── #
full_train_dataset = dataset[:0.7]           # train 60% + val 10% -> 70%
full_train_loader  = DataLoader(
    full_train_dataset.tensor_frame, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(
    test_dataset.tensor_frame, batch_size=batch_size)

# ─── 3. Encoders and model ─────────────────────────────────────── #
if numerical_encoder_type == "linear":
    numerical_encoder = LinearEncoder()
elif numerical_encoder_type == "linearbucket":
    numerical_encoder = LinearBucketEncoder()
else:
    numerical_encoder = LinearPeriodicEncoder()

stype_encoder_dict = {
    stype.categorical: EmbeddingEncoder(),
    stype.numerical:   numerical_encoder,
}

output_channels = 2     # number of Diagnosis classes
if model_type == "fttransformer":
    final_model = FTTransformer(
        channels=channels,
        out_channels=output_channels,
        num_layers=num_layers,
        col_stats=dataset.col_stats,
        col_names_dict=full_train_dataset.tensor_frame.col_names_dict,
        stype_encoder_dict=stype_encoder_dict,
    ).to(device)
else:
    # Pass the encoders to ResNet too; without this argument the selected
    # numerical_encoder_type was ignored for ResNet.
    final_model = ResNet(
        channels=channels,
        out_channels=output_channels,
        num_layers=num_layers,
        col_stats=dataset.col_stats,
        col_names_dict=full_train_dataset.tensor_frame.col_names_dict,
        stype_encoder_dict=stype_encoder_dict,
    ).to(device)

optimizer = torch.optim.AdamW(final_model.parameters(), lr=lr)

# ─── 4. Training (fixed at 10 epochs) ──────────────────────────── #
for epoch in range(1, 11):
    train_one_epoch(final_model, full_train_loader, optimizer)
    # Score on the training loader itself, only to monitor fitting progress.
    train_acc = evaluate(final_model, full_train_loader)
    print(f"[Fin] Epoch {epoch:02d} | Train Acc: {train_acc:.4f}")

# ─── 5. Final test ─────────────────────────────────────────────── #
test_metric = evaluate(final_model, test_loader)
metric_name = "Accuracy" if is_classification else "RMSE"
print(f"\n★ Final {metric_name} on TEST set: {test_metric:.4f}")


▶ Best hyper-parameters
  model_type               : resnet
  numerical_encoder_type   : linearperiodic
  channels                 : 512
  num_layers               : 5
  batch_size               : 512
  lr                       : 0.00015036039946508727
[Fin] Epoch 01 | Train Acc: 0.8276
[Fin] Epoch 02 | Train Acc: 0.8276
[Fin] Epoch 03 | Train Acc: 0.8276
[Fin] Epoch 04 | Train Acc: 0.8276
[Fin] Epoch 05 | Train Acc: 0.8276
[Fin] Epoch 06 | Train Acc: 0.8276
[Fin] Epoch 07 | Train Acc: 0.8276
[Fin] Epoch 08 | Train Acc: 0.8276
[Fin] Epoch 09 | Train Acc: 0.8276
[Fin] Epoch 10 | Train Acc: 0.8276

★ Final Accuracy on TEST set: 0.8268


### TabNet

In [None]:
# Classification task; the device is kept as a plain string name here.
is_classification = True
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# ─────────────────── DataLoader factory ──────────────────── #
def make_loaders(batch_size, use_val=True):
    """Build the train / validation / test DataLoaders for one batch size.

    Args:
        batch_size: Batch size used for every returned loader.
        use_val: When True (hyper-parameter search), train on ``train_dataset``
            and also return a loader over ``val_dataset``. When False (final
            fit), the validation rows are folded into the training loader
            (``dataset[:0.7]``, i.e. the 60% train + 10% val slices) and no
            validation loader is returned.

    Returns:
        ``(train_loader, val_loader, test_loader)``; ``val_loader`` is ``None``
        when ``use_val`` is False.
    """
    if use_val:
        train_tf = train_dataset.tensor_frame
        val_loader = DataLoader(val_dataset.tensor_frame, batch_size=batch_size)
    else:
        # Previously the validation rows were silently dropped in this case,
        # although the final-fit caller expects train and val to be merged.
        train_tf = dataset[:0.7].tensor_frame
        val_loader = None

    train_loader = DataLoader(train_tf, batch_size=batch_size, shuffle=True)
    test_loader  = DataLoader(test_dataset.tensor_frame, batch_size=batch_size)
    return train_loader, val_loader, test_loader


# ─────────────────── Shared train / evaluate helpers ──────────────────── #
def train_one_epoch(model, loader, optimizer):
    """Run one cross-entropy training pass over ``loader``.

    Returns:
        The sample-weighted mean of the per-batch losses.
    """
    model.train()
    total_loss = 0
    seen = 0
    for batch in loader:
        batch = batch.to(device)
        logits = model(batch)
        batch_loss = F.cross_entropy(logits, batch.y.long())

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so the returned value is a per-sample mean.
        n_rows = len(batch.y)
        total_loss += batch_loss.item() * n_rows
        seen += n_rows
    return total_loss / seen


@torch.no_grad()
def evaluate(model, loader):
    """Return the accuracy of ``model`` over ``loader`` (no gradient tracking)."""
    model.eval()
    hits = 0
    seen = 0
    for batch in loader:
        batch = batch.to(device)
        predicted = model(batch).argmax(dim=-1)
        # Count rows whose arg-max class equals the label.
        hits += (predicted == batch.y).sum().item()
        seen += len(batch.y)
    return hits / seen


# ─────────────────── Optuna objective ──────────────────── #
def objective(trial):
    """Train one TabNet configuration and return its final validation accuracy.

    Reports the validation accuracy after each of the 10 epochs and raises
    ``optuna.TrialPruned`` when the pruner asks to stop.
    """
    # 1) Sample the hyper-parameters.
    channels = trial.suggest_categorical("channels", [128, 256, 512])
    gamma = trial.suggest_float("gamma", 1.0, 2.0)
    num_layers = trial.suggest_int("num_layers", 2, 6)
    batch_size = trial.suggest_categorical("batch_size", [128, 256, 512])
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)

    # 2) Data loaders for this batch size (the test loader is not needed here).
    train_loader, val_loader, _ = make_loaders(batch_size, use_val=True)

    # 3) Model: the same width is used for the attention and feature splits.
    model = TabNet(
        out_channels=2,  # number of classes
        num_layers=num_layers,
        split_feat_channels=channels,
        split_attn_channels=channels,
        gamma=gamma,
        col_stats=dataset.col_stats,
        col_names_dict=train_dataset.tensor_frame.col_names_dict,
    )
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = ExponentialLR(optimizer, gamma=0.95)

    # 4) Train for 10 epochs, reporting to the pruner after each one.
    for epoch in range(1, 11):
        train_one_epoch(model, train_loader, optimizer)
        val_acc = evaluate(model, val_loader)
        scheduler.step()

        trial.report(val_acc, step=epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # The study maximises this value.
    return val_acc


# ─────────────────── Run the Optuna study ──────────────────── #
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=3),
    direction="maximize",
)
# Budget: 50 trials or a one-hour timeout.
study.optimize(objective, timeout=60 * 60, n_trials=50)

print("Best trial:", study.best_trial.value)
print("Best params:", study.best_trial.params)


In [None]:
# ─────────────────── Final fit with the best parameters ──────────────────── #
bp = study.best_trial.params
# use_val=False: no validation loader is requested for the final fit.
train_loader, _, test_loader = make_loaders(bp["batch_size"], use_val=False)

final_model = TabNet(
    out_channels=2,
    num_layers=bp["num_layers"],
    split_feat_channels=bp["channels"],
    split_attn_channels=bp["channels"],
    gamma=bp["gamma"],
    col_stats=dataset.col_stats,
    col_names_dict=train_dataset.tensor_frame.col_names_dict,
)
final_model = final_model.to(device)

optimizer = torch.optim.Adam(final_model.parameters(), lr=bp["lr"])
scheduler = ExponentialLR(optimizer, gamma=0.95)

# Train for 10 epochs again, decaying the learning rate after each one.
for epoch in range(1, 11):
    train_one_epoch(final_model, train_loader, optimizer)
    scheduler.step()

test_acc = evaluate(final_model, test_loader)
print(f"\n★ FINAL Accuracy on TEST set: {test_acc:.4f}")

### TabTransformer

In [None]:
# Classification problem; the device is kept as a plain string name here.
is_classification = True
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# ────────────────── DataLoader helper ────────────────── #
def make_loaders(batch_size, use_val=True):
    """Build the train / validation / test DataLoaders for one batch size.

    Args:
        batch_size: Batch size used for every returned loader.
        use_val: When True (hyper-parameter search), train on ``train_dataset``
            and also return a loader over ``val_dataset``. When False (final
            fit), the validation rows are folded into the training loader
            (``dataset[:0.7]``, i.e. the 60% train + 10% val slices) and no
            validation loader is returned.

    Returns:
        ``(train_loader, val_loader, test_loader)``; ``val_loader`` is ``None``
        when ``use_val`` is False.
    """
    if use_val:
        tr_tf = train_dataset.tensor_frame
        va_loader = DataLoader(val_dataset.tensor_frame, batch_size=batch_size)
    else:
        # Previously the validation rows were silently dropped in this case,
        # although the final-fit caller is labelled "train+val".
        tr_tf = dataset[:0.7].tensor_frame
        va_loader = None

    tr_loader = DataLoader(tr_tf, batch_size=batch_size, shuffle=True)
    te_loader = DataLoader(test_dataset.tensor_frame, batch_size=batch_size)
    return tr_loader, va_loader, te_loader

# ────────────────── Shared training / evaluation ────────────────── #
def train_one_epoch(model, loader, optim):
    """Run one cross-entropy training pass over ``loader``.

    Returns:
        The sample-weighted mean of the per-batch losses.
    """
    model.train()
    total_loss = 0
    seen = 0
    for batch in loader:
        batch = batch.to(device)
        logits = model(batch)
        batch_loss = F.cross_entropy(logits, batch.y.long())

        optim.zero_grad()
        batch_loss.backward()
        optim.step()

        # Weight by batch size so the returned value is a per-sample mean.
        n_rows = len(batch.y)
        total_loss += batch_loss.item() * n_rows
        seen += n_rows
    return total_loss / seen

@torch.no_grad()
def evaluate(model, loader):
    """Return the accuracy of ``model`` over ``loader`` (no gradient tracking)."""
    model.eval()
    hits = 0
    seen = 0
    for batch in loader:
        batch = batch.to(device)
        predicted = model(batch).argmax(dim=-1)
        # Count rows whose arg-max class equals the label.
        hits += (predicted == batch.y).sum().item()
        seen += len(batch.y)
    return hits / seen

# ────────────────── Optuna objective ────────────────── #
def objective(trial):
    """Train one TabTransformer configuration and return its final validation accuracy.

    Reports the validation accuracy after each of the 10 epochs and raises
    ``optuna.TrialPruned`` when the pruner asks to stop.
    """
    # 1) Hyper-parameters to search (Optuna stores them under the quoted names).
    channels = trial.suggest_categorical("channels", [128, 256, 512])
    num_heads = trial.suggest_categorical("num_heads", [4, 8, 16])
    num_layers = trial.suggest_int("num_layers", 2, 6)
    encoder_pad_size = trial.suggest_int("encoder_pad_size", 1, 4)
    attn_dropout = trial.suggest_float("attn_dropout", 0.1, 0.5)
    ffn_dropout = trial.suggest_float("ffn_dropout", 0.1, 0.5)
    batch_size = trial.suggest_categorical("batch_size", [128, 256, 512])
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)

    # 2) Data loaders for this batch size (the test loader is not needed here).
    tr_loader, va_loader, _ = make_loaders(batch_size, use_val=True)

    # 3) Model.
    model = TabTransformer(
        channels=channels,
        out_channels=2,
        num_layers=num_layers,
        num_heads=num_heads,
        encoder_pad_size=encoder_pad_size,
        attn_dropout=attn_dropout,
        ffn_dropout=ffn_dropout,
        col_stats=dataset.col_stats,
        col_names_dict=train_dataset.tensor_frame.col_names_dict,
    )
    model = model.to(device)

    optim = torch.optim.Adam(model.parameters(), lr=lr)
    sched = ExponentialLR(optim, gamma=0.95)

    # 4) Train for 10 epochs, reporting to the pruner after each one.
    for ep in range(1, 11):
        train_one_epoch(model, tr_loader, optim)
        val_acc = evaluate(model, va_loader)
        sched.step()

        trial.report(val_acc, step=ep)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # The study maximises this accuracy.
    return val_acc

# ────────────────── Run the study ────────────────── #
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=3),
    direction="maximize",
)
# Budget: 40 trials or a one-hour timeout.
study.optimize(objective, timeout=60 * 60, n_trials=40)

print("◇ Best val ACC:", study.best_trial.value)
print("◇ Best params:")
for k, v in study.best_trial.params.items():
    print(f"  {k:20s}: {v}")

In [None]:
# ────────────────── Final fit with the best parameters ────────────────── #
bp = study.best_trial.params
# use_val=False: no validation loader is requested for the final fit.
tr_loader, _, te_loader = make_loaders(bp["batch_size"], use_val=False)

# Optuna stores each parameter under the name passed to trial.suggest_*,
# i.e. "encoder_pad_size", "attn_dropout" and "ffn_dropout". The previous keys
# ("encoder_pad_sz", "attn_dp", "ffn_dp") were only local dict keys inside
# objective() and raised KeyError here.
final_model = TabTransformer(
    channels      = bp["channels"],
    out_channels  = 2,
    num_layers    = bp["num_layers"],
    num_heads     = bp["num_heads"],
    encoder_pad_size = bp["encoder_pad_size"],
    attn_dropout  = bp["attn_dropout"],
    ffn_dropout   = bp["ffn_dropout"],
    col_stats     = dataset.col_stats,
    col_names_dict= train_dataset.tensor_frame.col_names_dict,
).to(device)

optim = torch.optim.Adam(final_model.parameters(), lr=bp["lr"])
sched = ExponentialLR(optim, gamma=0.95)

# Train for 10 epochs again, decaying the learning rate after each one.
for ep in range(1, 11):
    train_one_epoch(final_model, tr_loader, optim)
    sched.step()

test_acc = evaluate(final_model, te_loader)
print(f"\n★ FINAL Test ACC: {test_acc:.4f}")