In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, Subset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

# -----------------------------
# Load data
# -----------------------------
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Labels come from the 'target' column; every remaining column except the
# 'ID' column is used as a model input.
train_y = train['target'].values
train_x = train.drop(columns=['ID', 'target']).values
test_x = test.drop(columns=['ID']).values

# -----------------------------
# Scaling
# -----------------------------
# Standardization statistics are estimated on the training features only and
# then applied unchanged to the test features.
# NOTE(review): the scaler is fitted on *all* training rows before the K-fold
# split further down, so each fold's validation rows contribute to the
# statistics — confirm this is acceptable for the reported validation loss.
scaler = StandardScaler()
scaler.fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

# Convert to torch tensors: float32 features, int64 (long) class labels.
X_train = torch.tensor(train_x, dtype=torch.float32)
X_test = torch.tensor(test_x, dtype=torch.float32)
y_train = torch.tensor(train_y, dtype=torch.long)

# -----------------------------
# FT-Transformer definition
# -----------------------------
class FeatureTokenizer(nn.Module):
    """Project a feature vector into the model's embedding space.

    A single ``nn.Linear`` maps the last dimension of the input from
    ``num_features`` to ``d_model``. Note that this yields one embedding per
    input row, not one token per feature.

    Args:
        num_features: Size of the last dimension of the input.
        d_model: Size of the produced embedding.
    """

    def __init__(self, num_features: int, d_model: int) -> None:
        super().__init__()
        self.linear = nn.Linear(in_features=num_features, out_features=d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the linear projection of ``x`` (last dim becomes ``d_model``)."""
        embedded = self.linear(x)
        return embedded


class FTTransformer(nn.Module):
    """Transformer-encoder classifier that reads its prediction off a CLS token.

    The input is embedded by ``FeatureTokenizer`` into one ``d_model``-sized
    token, a learnable CLS token is placed in front of it, the resulting
    sequence is passed through a ``nn.TransformerEncoder``, and the encoded
    CLS position is mapped to class logits by a linear head.

    Args:
        num_features: Number of input features per sample.
        num_classes: Number of output logits.
        d_model: Embedding width used throughout the encoder.
        n_heads: Number of attention heads per encoder layer.
        n_layers: Number of stacked encoder layers.
        dropout: Dropout probability inside each encoder layer.
    """

    def __init__(self, num_features, num_classes, d_model=128, n_heads=4, n_layers=4, dropout=0.1):
        super().__init__()
        self.tokenizer = FeatureTokenizer(num_features, d_model)

        # Feed-forward width is four times the embedding width; tensors are
        # laid out batch-first.
        layer_kwargs = dict(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=4 * d_model,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_kwargs),
            num_layers=n_layers,
        )

        # Learnable CLS embedding, shared by every sample in a batch.
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))
        self.fc_out = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return class logits for ``x``.

        Assumes ``x`` is a 2-D ``(batch, num_features)`` tensor, which is how
        the training loop in this file calls it.
        """
        # Insert a sequence axis so the embedding becomes a one-token sequence.
        token = self.tokenizer(x)
        token = token.unsqueeze(1)

        # Broadcast the single CLS parameter across the batch and prepend it.
        cls = self.cls_token.expand(x.shape[0], -1, -1)
        sequence = torch.cat((cls, token), dim=1)

        encoded = self.transformer(sequence)
        # Position 0 is the CLS token; only it feeds the classification head.
        return self.fc_out(encoded[:, 0])


# -----------------------------
# EarlyStopping class
# -----------------------------
class EarlyStopping:
    """Signal when a monitored validation loss has stopped improving.

    Call the instance once per epoch with the current validation loss. When
    the loss has failed to improve for ``patience`` consecutive calls,
    ``early_stop`` is set to ``True``; the caller is responsible for checking
    that flag and leaving its training loop.

    Args:
        patience: Number of consecutive non-improving calls tolerated before
            ``early_stop`` is set.
        verbose: If true, print the counter on every non-improving call.
        min_delta: Minimum decrease below ``best_loss`` that counts as an
            improvement. The default of ``0.0`` accepts any strict decrease.

    Attributes set by the constructor: ``counter`` (consecutive non-improving
    calls so far), ``best_loss`` (lowest accepted loss, initially ``np.inf``)
    and ``early_stop``.
    """

    def __init__(self, patience=10, verbose=False, *, min_delta=0.0):
        self.patience = patience
        self.verbose = verbose
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = np.inf
        self.early_stop = False

    def __call__(self, val_loss):
        """Record ``val_loss`` and update ``counter`` / ``early_stop``."""
        if val_loss < self.best_loss - self.min_delta:
            # Improvement: remember it and restart the patience window.
            self.best_loss = val_loss
            self.counter = 0
            return

        # No (sufficient) improvement. A NaN loss also lands here because
        # every comparison with NaN is false.
        self.counter += 1
        if self.verbose:
            print(f"EarlyStopping counter {self.counter}/{self.patience}")
        if self.counter >= self.patience:
            self.early_stop = True


# -----------------------------
# Training setup
# -----------------------------
# Hyper-parameters shared by every fold.
batch_size = 128
epochs = 200
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Problem dimensions derived from the prepared data.
num_features = X_train.size(1)
num_classes = np.unique(train_y).size

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Accumulator for the per-fold class probabilities on the test set:
# one row per test sample, one column per class.
all_preds = np.zeros((test_x.shape[0], num_classes))

# -----------------------------
# 5-fold cross-validation
# -----------------------------
# For every fold: train a fresh model with early stopping on the fold's
# validation loss, restore the weights from the best validation epoch, and add
# that model's softmax probabilities on the test set to `all_preds`.

# The dataset is identical for every fold; only the index subsets differ.
full_dataset = TensorDataset(X_train, y_train)
n_splits = kfold.get_n_splits()

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
    print(f"\n===== Fold {fold+1} / {n_splits} =====")

    train_subset = Subset(full_dataset, train_idx)
    val_subset = Subset(full_dataset, val_idx)

    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

    model = FTTransformer(num_features, num_classes, d_model=128, n_heads=4, n_layers=3, dropout=0.2).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.7, patience=3)
    early_stopping = EarlyStopping(patience=10, verbose=True)

    # Early stopping only fires `patience` epochs *after* the best epoch, so
    # the best weights are kept aside and restored before predicting.
    best_val_loss = float('inf')
    best_state = None

    for epoch in range(epochs):
        # ---- Training ----
        model.train()
        running_loss = 0.0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            # `criterion` returns the batch mean; weight it by the batch size
            # so a smaller final batch does not skew the epoch average.
            running_loss += loss.item() * batch_x.size(0)
        avg_train_loss = running_loss / len(train_subset)

        # ---- Validation ----
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item() * batch_x.size(0)
        avg_val_loss = val_loss / len(val_subset)

        scheduler.step(avg_val_loss)
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # Clone: state_dict() tensors share storage with the live
            # parameters and would otherwise keep changing during training.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

        early_stopping(avg_val_loss)
        if early_stopping.early_stop:
            print("⏹️ Early stopping triggered")
            break

    # ---- Test prediction ----
    # `best_state` stays None only if no epoch produced a finite, improving
    # validation loss; in that case the last-epoch weights are used as-is.
    if best_state is not None:
        model.load_state_dict(best_state)

    model.eval()
    with torch.no_grad():
        X_test = X_test.to(device)
        outputs = model(X_test)
        # Probabilities are summed (not averaged) across folds.
        all_preds += torch.softmax(outputs, dim=1).cpu().numpy()

# -----------------------------
# Final prediction (ensemble over the folds' softmax outputs)
# -----------------------------
# `all_preds` holds the softmax outputs accumulated over the folds; the class
# with the largest accumulated probability is chosen for each test row.
final_preds = all_preds.argmax(axis=1)

# -----------------------------
# Write the submission file
# -----------------------------
# An existing CSV is loaded, its 'target' column is overwritten with the
# predictions, and the result is saved under a new name.
submission = pd.read_csv('../data/5fold_submit.csv')
submission['target'] = final_preds
submission.to_csv(
    '../data/fttransformer_5fold_submit.csv',
    encoding='utf-8-sig',
    index=False,
)
print("✅ 최종 제출 파일 생성 완료")



===== Fold 1 / 5 =====
Epoch 1/200 | Train Loss: 1.4391 | Val Loss: 0.9231
Epoch 2/200 | Train Loss: 0.8656 | Val Loss: 0.7782
Epoch 3/200 | Train Loss: 0.7743 | Val Loss: 0.7194
Epoch 4/200 | Train Loss: 0.7163 | Val Loss: 0.6545


KeyboardInterrupt: 