In [1]:
import numpy as np
import pandas as pd
import os
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler

In [2]:
# Reproducibility setup and compute-device selection
def set_seed(seed: int = 42) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA),
    switches cuDNN to deterministic mode with autotuning disabled, and
    exports the seed as ``PYTHONHASHSEED``.

    Args:
        seed: Value handed to each generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Python-level generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Trade cuDNN autotuning speed for repeatable kernels
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(42)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Подготовка данных

In [3]:
# Read the training table, then separate the binary target from the predictors.
df = pd.read_csv('train.csv')
y = df['smoking']
# NOTE(review): every non-target column stays in X, including any identifier
# column such as the `id` used for the submission file — confirm that feeding
# it to the model is intended.
X = df.drop('smoking', axis=1)

In [4]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training part only, then apply them to both parts.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [5]:
class CustomDataset(Dataset):
    """In-memory dataset pairing a float32 feature matrix with float32 labels.

    Args:
        X: Feature matrix, one row per sample. Anything ``np.asarray`` can turn
            into a numeric array is accepted (NumPy array, DataFrame, nested list).
        y: One label per sample. Any array-like is accepted (pandas Series,
            NumPy array, list); labels are stored as a column of shape
            ``(n_samples, 1)``.

    Raises:
        ValueError: If ``X`` and ``y`` hold a different number of samples.
    """

    def __init__(self, X, y):
        # np.asarray treats Series / DataFrame / ndarray / list uniformly, so the
        # labels do not need a pandas-only `.values` attribute. torch.tensor
        # always copies, so later changes to the source arrays cannot leak in.
        self.X = torch.tensor(np.asarray(X), dtype=torch.float32)
        self.y = torch.tensor(np.asarray(y), dtype=torch.float32).reshape(-1, 1)

        if len(self.X) != len(self.y):
            raise ValueError(
                f"X has {len(self.X)} samples but y has {len(self.y)} labels"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap the scaled splits so that DataLoader can batch them.
train_dataset = CustomDataset(X=X_train, y=y_train)
val_dataset = CustomDataset(X=X_val, y=y_val)

In [6]:
# Mini-batch size used by the loaders in this notebook.
BATCH_SIZE = 32

# Only the training loader reshuffles every epoch; validation keeps its row order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    num_workers=4,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    num_workers=4,
    shuffle=False,
)



# Реализуем модель TabNet, которая является одной из лучших архитектур для табличных данных

Сначала реализуем GLU слои

Затем слой FeatureTransformer, преобразующий исходные фичи в скрытое представление

Также реализуем AttentiveTransformer, который нужен для определения важности фичей на каждом шаге

In [7]:
class GLU(nn.Module):
    """Gated linear unit block: Linear -> BatchNorm -> Dropout -> sigmoid gate.

    The linear layer emits ``2 * output_dim`` channels. After normalisation and
    dropout the first half is the candidate value and the second half, squashed
    by a sigmoid, gates it element-wise.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output features (after gating).
        p: Dropout probability.
    """

    def __init__(self, input_dim, output_dim, p=0.2):
        super().__init__()
        doubled = 2 * output_dim  # value half + gate half
        self.fc = nn.Linear(input_dim, doubled)
        self.bn = nn.BatchNorm1d(doubled)
        self.drop = nn.Dropout(p)

    def forward(self, x):
        """Map ``(batch, input_dim)`` to ``(batch, output_dim)``."""
        hidden = self.drop(self.bn(self.fc(x)))
        value, gate = torch.chunk(hidden, 2, dim=1)
        return value * gate.sigmoid()

class FeatureTransformer(nn.Module):
    """Stack of GLU blocks: a shared trunk followed by a chain of per-step blocks.

    Args:
        input_dim: Width of the raw feature vector.
        shared: Number of GLU blocks in the shared trunk; the first one maps
            ``input_dim -> hidden``, the rest ``hidden -> hidden``.
        steps: Number of ``hidden -> hidden`` GLU blocks applied after the trunk.
        hidden: Hidden width of every block.
        p: Dropout probability passed to every GLU.
    """

    def __init__(self, input_dim, shared=2, steps=2, hidden=32, p=0.2):
        super().__init__()

        trunk = []
        in_features = input_dim
        for _ in range(shared):
            trunk.append(GLU(in_features, hidden, p))
            in_features = hidden  # every block after the first is hidden -> hidden
        self.shared = nn.ModuleList(trunk)

        self.steps = nn.ModuleList([GLU(hidden, hidden, p) for _ in range(steps)])

    def forward(self, x):
        """Run the trunk, then the step blocks one after another.

        Returns:
            A tuple ``(last, per_step)``: the output of the final block and the
            list holding the output of each step block, in order.
        """
        hidden = x
        for block in self.shared:
            hidden = block(hidden)

        per_step = []
        for block in self.steps:
            hidden = block(hidden)
            per_step.append(hidden)

        return hidden, per_step

class AttentiveTransformer(nn.Module):
    """Produce a per-feature attention mask from a hidden representation.

    Args:
        input_dim: Width of the hidden representation fed to ``forward``.
        output_dim: Number of mask entries produced per row.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)
        self.bn = nn.BatchNorm1d(output_dim)

        # Replace the default Linear initialisation: Xavier-normal weights, zero bias.
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x, prior):
        """Return a mask whose rows sum to 1.

        Args:
            x: Hidden representation of shape ``(batch, input_dim)``.
            prior: Multiplicative prior applied to the normalised scores before
                the softmax; must broadcast against ``(batch, output_dim)``.
        """
        scores = self.bn(self.fc(x)) * prior
        return scores.softmax(dim=1)

# Реализуем пошаговую модель TabNet

In [10]:
class TabNetClassifier(nn.Module):
    """Simplified TabNet-style binary classifier.

    A ``FeatureTransformer`` produces one hidden vector per decision step; the
    logit is the sum of a shared linear head applied to each of them. In
    parallel, one ``AttentiveTransformer`` per step produces a feature mask and
    updates a running prior.

    NOTE(review): the masks only feed the next step's attention input and the
    sparsity term. The logits are built from ``step_outputs``, which are
    computed from the unmasked input, so the masks do not influence the
    prediction.

    NOTE(review): each mask row is a softmax output and sums to 1, so the
    sparsity term below evaluates to ``sparsity * n_steps / input_dim`` whatever
    the mask values are. It acts as a constant offset, not a regulariser.

    Args:
        input_dim: Number of input features.
        n_steps: Number of decision steps (at least 1).
        shared_layers: Number of shared GLU blocks in the feature transformer.
        hidden_dim: Hidden width used throughout the network.
        gamma: Relaxation factor used when updating the prior,
            ``prior *= (gamma - mask)``.
        sparsity: Weight of the mask term returned alongside the logits.
        p: Dropout probability for the GLU blocks.

    Raises:
        ValueError: If ``n_steps`` is smaller than 1.
    """

    def __init__(self, input_dim, n_steps=3, shared_layers=2, hidden_dim=32, gamma=1.3, sparsity=1e-4, p=0.2):
        super().__init__()
        # With zero steps forward() has nothing to sum or stack; fail early and clearly.
        if n_steps < 1:
            raise ValueError(f"n_steps must be at least 1, got {n_steps}")

        self.feature_transformer = FeatureTransformer(
            input_dim,
            shared=shared_layers,
            steps=n_steps,
            hidden=hidden_dim,
            p=p
        )
        self.attentive_trans = nn.ModuleList([
            AttentiveTransformer(hidden_dim, input_dim)
            for _ in range(n_steps)
        ])

        # Project the masked raw features back to hidden width for the next attention step.
        self.mask_projectors = nn.ModuleList([
            nn.Linear(input_dim, hidden_dim)
            for _ in range(n_steps)
        ])

        self.fc = nn.Linear(hidden_dim, 1)
        self.gamma = gamma
        self.sparsity = sparsity
        # Buffer (not a parameter): moves with the module across devices, is never trained.
        self.register_buffer('prior', torch.ones(1, input_dim))

    def forward(self, x):
        """Compute logits and the mask term for a batch.

        Args:
            x: Feature batch of shape ``(batch, input_dim)``.

        Returns:
            A tuple ``(logits, mask_loss)`` where ``logits`` has shape
            ``(batch,)`` — also for a batch of a single row — and ``mask_loss``
            is a scalar tensor.
        """
        features, step_outputs = self.feature_transformer(x)
        total_out = 0
        prior = self.prior.expand(x.size(0), -1)
        masks = []

        for step in range(len(self.attentive_trans)):
            mask = self.attentive_trans[step](features, prior)
            masks.append(mask)

            # Input for the next step's attention: masked raw features, projected.
            masked_features = x * mask
            projected_features = self.mask_projectors[step](masked_features)
            features = F.relu(projected_features)

            # The prediction accumulates the shared head over this step's hidden output.
            total_out += self.fc(step_outputs[step])

            prior = prior * (self.gamma - mask)

        mask_loss = self.sparsity * torch.mean(
            torch.sum(torch.stack([m.mean(dim=0) for m in masks]), dim=0)
        )
        # Drop only the trailing singleton dimension. A bare squeeze() would also
        # collapse a one-row batch to a 0-d tensor, which callers cannot iterate over.
        return total_out.squeeze(-1), mask_loss

# Train-loop и eval-loop

In [11]:
def train_model(model, optimizer, train_loader, val_loader, epochs=30, lr=1e-3,
                checkpoint_path='best_model.pth'):
    """Train ``model`` with BCE-with-logits loss and report ROC AUC after every epoch.

    The model is moved to the notebook-level ``device``. After each epoch the
    validation AUC is computed with ``eval_model``; whenever it improves, the
    state dict is written to ``checkpoint_path``.

    Args:
        model: Module whose forward pass returns ``(logits, mask_loss)``.
        optimizer: Optimizer already bound to the parameters of ``model``.
        train_loader: Yields ``(features, labels)`` training batches.
        val_loader: Loader handed to ``eval_model`` after each epoch.
        epochs: Number of passes over ``train_loader``.
        lr: Unused — the learning rate is whatever ``optimizer`` was built
            with. Kept so existing calls keep working.
        checkpoint_path: File that receives the state dict of the epoch with
            the best validation AUC.

    Returns:
        The same ``model`` object, holding the weights of the LAST epoch. The
        best-validation weights are only available from ``checkpoint_path``.
    """
    model.to(device)
    criterion = nn.BCEWithLogitsLoss()
    best_auc = float('-inf')  # guarantees the first epoch is checkpointed

    for epoch in range(epochs):
        model.train()
        train_preds, train_true = [], []

        for x, y in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
            x, y = x.to(device), y.to(device)
            # Flatten labels to (batch,). Unlike squeeze(), reshape(-1) keeps a
            # 1-D vector when the batch holds a single row.
            targets = y.reshape(-1)
            optimizer.zero_grad()

            logits, mask_loss = model(x)
            logits = logits.reshape(-1)  # same shape as targets, whatever the batch size
            loss = criterion(logits, targets) + mask_loss

            loss.backward()
            optimizer.step()

            # Store plain floats rather than 0-d tensors for the metric computation.
            train_preds.extend(torch.sigmoid(logits.detach()).cpu().tolist())
            train_true.extend(targets.cpu().tolist())

        val_auc = eval_model(model, val_loader)
        train_auc = roc_auc_score(train_true, train_preds)
        print(f"Epoch {epoch+1} | Train AUC: {train_auc:.4f} | Val AUC: {val_auc:.4f}")

        # Save best model
        if val_auc > best_auc:
            best_auc = val_auc
            torch.save(model.state_dict(), checkpoint_path)

    return model

In [12]:
def eval_model(model, loader):
    """Compute ROC AUC of ``model`` over every batch of ``loader``.

    Switches the model to eval mode (it is left in that mode) and runs without
    gradient tracking. Inputs are moved to the notebook-level ``device``.

    Args:
        model: Module whose forward pass returns ``(logits, mask_loss)``.
        loader: Yields ``(features, labels)`` batches.

    Returns:
        ROC AUC between the collected labels and the sigmoid of the logits.
    """
    model.eval()
    preds, true = [], []

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            logits, _ = model(x)
            # reshape(-1) keeps both sides 1-D even when the final batch holds a
            # single row; squeeze() would yield a 0-d value that extend() rejects.
            preds.extend(torch.sigmoid(logits).reshape(-1).cpu().tolist())
            true.extend(y.reshape(-1).cpu().tolist())

    return roc_auc_score(true, preds)


# Обучим модель и посмотрим на качество во время обучения

Наверное, если есть время, лучше запустить Optuna для поиска лучших гиперпараметров и использовать для этого кросс-валидацию

In [22]:
# Two decision steps on top of three shared GLU blocks, 64 hidden units.
model = TabNetClassifier(
    X_train.shape[1],
    hidden_dim=64,
    n_steps=2,
    shared_layers=3,
    sparsity=1e-3,
)

# AdamW with a fixed learning rate; train for 15 epochs.
optimizer = optim.AdamW(params=model.parameters(), lr=1e-3)
train_model(model, optimizer, train_loader, val_loader, epochs=15)

Epoch 1: 100%|██████████| 375/375 [00:04<00:00, 85.38it/s] 


Epoch 1 | Train AUC: 0.8419 | Val AUC: 0.8790


Epoch 2: 100%|██████████| 375/375 [00:03<00:00, 105.43it/s]


Epoch 2 | Train AUC: 0.8630 | Val AUC: 0.8833


Epoch 3: 100%|██████████| 375/375 [00:03<00:00, 96.84it/s] 


Epoch 3 | Train AUC: 0.8683 | Val AUC: 0.8815


Epoch 4: 100%|██████████| 375/375 [00:03<00:00, 113.23it/s]


Epoch 4 | Train AUC: 0.8743 | Val AUC: 0.8822


Epoch 5: 100%|██████████| 375/375 [00:03<00:00, 109.47it/s]


Epoch 5 | Train AUC: 0.8733 | Val AUC: 0.8837


Epoch 6: 100%|██████████| 375/375 [00:04<00:00, 87.63it/s] 


Epoch 6 | Train AUC: 0.8748 | Val AUC: 0.8805


Epoch 7: 100%|██████████| 375/375 [00:03<00:00, 113.47it/s]


Epoch 7 | Train AUC: 0.8767 | Val AUC: 0.8798


Epoch 8: 100%|██████████| 375/375 [00:03<00:00, 114.59it/s]


Epoch 8 | Train AUC: 0.8758 | Val AUC: 0.8828


Epoch 9: 100%|██████████| 375/375 [00:04<00:00, 79.99it/s]


Epoch 9 | Train AUC: 0.8770 | Val AUC: 0.8810


Epoch 10: 100%|██████████| 375/375 [00:03<00:00, 112.22it/s]


Epoch 10 | Train AUC: 0.8792 | Val AUC: 0.8828


Epoch 11: 100%|██████████| 375/375 [00:03<00:00, 111.16it/s]


Epoch 11 | Train AUC: 0.8792 | Val AUC: 0.8818


Epoch 12: 100%|██████████| 375/375 [00:04<00:00, 85.06it/s]


Epoch 12 | Train AUC: 0.8792 | Val AUC: 0.8844


Epoch 13: 100%|██████████| 375/375 [00:03<00:00, 103.45it/s]


Epoch 13 | Train AUC: 0.8785 | Val AUC: 0.8796


Epoch 14: 100%|██████████| 375/375 [00:03<00:00, 116.19it/s]


Epoch 14 | Train AUC: 0.8807 | Val AUC: 0.8845


Epoch 15: 100%|██████████| 375/375 [00:05<00:00, 72.55it/s] 


Epoch 15 | Train AUC: 0.8812 | Val AUC: 0.8831


TabNetClassifier(
  (feature_transformer): FeatureTransformer(
    (shared): ModuleList(
      (0): GLU(
        (fc): Linear(in_features=23, out_features=128, bias=True)
        (bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (drop): Dropout(p=0.2, inplace=False)
      )
      (1-2): 2 x GLU(
        (fc): Linear(in_features=64, out_features=128, bias=True)
        (bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (drop): Dropout(p=0.2, inplace=False)
      )
    )
    (steps): ModuleList(
      (0-1): 2 x GLU(
        (fc): Linear(in_features=64, out_features=128, bias=True)
        (bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (drop): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (attentive_trans): ModuleList(
    (0-1): 2 x AttentiveTransformer(
      (fc): Linear(in_features=64, out_features=23, bias=True)
      (bn): BatchNorm1d(23, eps=

# Сделаем финальный submission

In [23]:
test_df = pd.read_csv("test.csv")

# Reuse the scaler fitted on the training split; the test columns must match
# the columns the scaler was fitted on.
X_test = scaler.transform(test_df)

# The test set has no labels, but CustomDataset requires them: pass placeholder zeros.
test_dataset = CustomDataset(
    X_test,
    pd.Series([0] * len(X_test)),
)
test_loader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=BATCH_SIZE,
)

In [25]:
# Predictions below use the weights currently held in `model`. Uncomment the
# next line to score with the best-validation checkpoint saved during training.
# model.load_state_dict(torch.load("best_model.pth"))
model.eval()

# Collect one probability per test row, in loader order.
test_preds = []
with torch.no_grad():
    for x, _ in test_loader:
        x = x.to(device)
        logits, _ = model(x)
        # reshape(-1) keeps a 1-D array even when the last batch holds a single
        # row; a 0-d array cannot be passed to list.extend().
        probs = torch.sigmoid(logits).reshape(-1).cpu().numpy()
        test_preds.extend(probs)


result_df = pd.DataFrame({
    "id": test_df["id"],
    "smoking": test_preds  # rename to "predict" if the submission format requires that column name
})

result_df.to_csv("submission.csv", index=False)

# Preview the submission as the cell output (the last expression is what the notebook displays).
result_df.head()