In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
DATA_PATH = './datasets/data/test/adult.csv'
PLOTS_DIR = './images'
MODEL_PATH = './models'

In [4]:
# Параметры TabNet
params = {
    'batch_size': 128,
    'n_d': 4,
    'n_a': 4,
    'n_steps': 3,
    'gamma': 0.9,
    'n_independent': 2,
    'n_shared': 1,
    'epochs': 10,
    'learning_rate': 0.01,
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Устройство:", device)

Устройство: cuda


In [5]:
data = pd.read_csv(DATA_PATH)
data = data.dropna()
data = data.fillna(0).replace([np.inf, -np.inf], 0)
data.columns = data.columns.str.strip()
print(f"Датасет: {data.shape[0]} строк, {data.shape[1]} столбцов")

Датасет: 48842 строк, 15 столбцов


In [7]:
# Проверка данных
data = data.dropna()
data = data.replace([np.inf, -np.inf], 0)
print(f"Датасет: {data.shape[0]} строк, {data.shape[1]} столбцов")

# Определение числовых столбцов
numerical_cols = ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
target_col = 'income'

# Проверка наличия числовых столбцов и целевого столбца
all_expected_cols = numerical_cols + [target_col]
missing_cols = [col for col in all_expected_cols if col not in data.columns]
if missing_cols:
    print(f"Отсутствуют столбцы: {missing_cols}")
    raise ValueError("Некоторые ожидаемые столбцы отсутствуют в датасете")

# Проверка типов данных для числовых столбцов
print("\nТипы данных для числовых столбцов:")
for col in numerical_cols:
    print(f"{col}: {data[col].dtype}")
    if not np.issubdtype(data[col].dtype, np.number):
        print(f"Внимание: {col} содержит нечисловые данные!")
        raise ValueError(f"Столбец {col} должен быть числовым")

# Распределение классов
print("\nРаспределение классов:")
print(data[target_col].value_counts(normalize=True) * 100)

# Подготовка данных
X = data[numerical_cols]  # Используем только числовые столбцы
y = data[target_col]

# Кодирование целевой переменной
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
class_names = label_encoder.classes_
print("\nКлассы:", class_names)

# Нормализация числовых признаков
scaler = StandardScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X, columns=numerical_cols)

# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print("\nРазмер тренировочного набора:", X_train.shape)
print("Размер тестового набора:", X_test.shape)

# Подготовка тензоров для TabNet и MLP
X_train_tensor = torch.FloatTensor(X_train.values).to(device)
y_train_tensor = torch.LongTensor(y_train).to(device)
X_test_tensor = torch.FloatTensor(X_test.values).to(device)
y_test_tensor = torch.LongTensor(y_test).to(device)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)

Датасет: 48842 строк, 15 столбцов

Типы данных для числовых столбцов:
age: int64
fnlwgt: int64
educational-num: int64
capital-gain: int64
capital-loss: int64
hours-per-week: int64

Распределение классов:
income
<=50K    76.071823
>50K     23.928177
Name: proportion, dtype: float64

Классы: ['<=50K' '>50K']

Размер тренировочного набора: (39073, 6)
Размер тестового набора: (9769, 6)


In [8]:
results = []

In [9]:
def evaluate_model(y_true, y_pred, y_scores, model_name, training_time):
    accuracy = accuracy_score(y_true, y_pred) * 100
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)
    return {
        'model': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'fpr': fpr,
        'tpr': tpr,
        'time': training_time  # Добавляем время обучения
    }

In [11]:
print("\nОбучение TabNet...")
start_time = time.time()
tabnet_model = TabNetClassifier(
    n_d=params['n_d'],
    n_a=params['n_a'],
    n_steps=params['n_steps'],
    gamma=params['gamma'],
    n_independent=params['n_independent'],
    n_shared=params['n_shared'],
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=params['learning_rate'], weight_decay=1e-4),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    scheduler_params={'mode': 'min', 'factor': 0.5, 'patience': 2},
    verbose=1
)
tabnet_model.fit(
    X_train.values, y_train,  # Используем .values для преобразования в NumPy
    eval_set=[(X_test.values, y_test)],  # То же для X_test
    eval_metric=['accuracy'],
    max_epochs=params['epochs'],
    patience=3,
    batch_size=params['batch_size']
)
tabnet_time = time.time() - start_time

y_pred_tabnet = tabnet_model.predict(X_test.values)
y_scores_tabnet = tabnet_model.predict_proba(X_test.values)[:, 1]
results.append(evaluate_model(y_test, y_pred_tabnet, y_scores_tabnet, 'TabNet', tabnet_time))
print(f"TabNet: Время обучения = {tabnet_time:.2f} сек")


Обучение TabNet...




epoch 0  | loss: 0.43217 | val_0_accuracy: 0.81237 |  0:00:09s
epoch 1  | loss: 0.4054  | val_0_accuracy: 0.81841 |  0:00:16s
epoch 2  | loss: 0.40076 | val_0_accuracy: 0.82291 |  0:00:23s
epoch 3  | loss: 0.39962 | val_0_accuracy: 0.81861 |  0:00:29s
epoch 4  | loss: 0.39494 | val_0_accuracy: 0.82608 |  0:00:35s
epoch 5  | loss: 0.39583 | val_0_accuracy: 0.82608 |  0:00:42s
epoch 6  | loss: 0.39488 | val_0_accuracy: 0.82793 |  0:00:48s
epoch 7  | loss: 0.39153 | val_0_accuracy: 0.82444 |  0:00:54s
epoch 8  | loss: 0.39158 | val_0_accuracy: 0.82404 |  0:01:00s
epoch 9  | loss: 0.39009 | val_0_accuracy: 0.82086 |  0:01:07s

Early stopping occurred at epoch 9 with best_epoch = 6 and best_val_0_accuracy = 0.82793




TabNet: Время обучения = 71.09 сек


In [12]:
# Класс MLP (без изменений)
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, output_dim=2):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# Обучение MLP с выводом в стиле TabNet
print("\nОбучение MLP...")
start_time = time.time()
mlp_model = MLP(input_dim=X_train.shape[1]).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=params['learning_rate'], weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

best_val_loss = float('inf')
patience = 3
counter = 0
epoch_start_time = start_time

for epoch in range(params['epochs']):
    mlp_model.train()
    train_loss = 0
    train_batches = 0
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = mlp_model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_batches += 1

    train_loss /= train_batches

    mlp_model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0
    y_pred = []
    y_true = []
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = mlp_model(data)
            loss = criterion(output, target)
            val_loss += loss.item()
            pred = output.argmax(dim=1)
            val_correct += pred.eq(target).sum().item()
            val_total += target.size(0)
            y_pred.extend(pred.cpu().numpy())
            y_true.extend(target.cpu().numpy())

    val_loss /= len(test_loader)
    val_accuracy = val_correct / val_total
    scheduler.step(val_loss)

    # Вывод в стиле TabNet
    epoch_time = time.time() - epoch_start_time
    print(f"epoch {epoch} | loss: {train_loss:.5f} | val_0_accuracy: {val_accuracy:.5f} | {int(epoch_time // 60):02d}:{int(epoch_time % 60):02d}s")

mlp_time = time.time() - start_time

# Оценка модели
mlp_model.eval()
y_pred_mlp = []
y_scores_mlp = []
with torch.no_grad():
    for data, _ in test_loader:
        data = data.to(device)
        output = mlp_model(data)
        scores = torch.softmax(output, dim=1)[:, 1]
        y_scores_mlp.extend(scores.cpu().numpy())
        y_pred_mlp.extend(output.argmax(dim=1).cpu().numpy())

results.append(evaluate_model(y_test, y_pred_mlp, y_scores_mlp, 'MLP', mlp_time))
print(f"MLP: Время обучения = {mlp_time:.2f} сек")


Обучение MLP...
epoch 0 | loss: 0.40821 | val_0_accuracy: 0.82301 | 00:01s
epoch 1 | loss: 0.39774 | val_0_accuracy: 0.82772 | 00:01s
epoch 2 | loss: 0.39522 | val_0_accuracy: 0.82700 | 00:02s
epoch 3 | loss: 0.39580 | val_0_accuracy: 0.82639 | 00:03s
epoch 4 | loss: 0.39570 | val_0_accuracy: 0.82250 | 00:04s
epoch 5 | loss: 0.39582 | val_0_accuracy: 0.82854 | 00:05s
epoch 6 | loss: 0.39017 | val_0_accuracy: 0.83222 | 00:06s
epoch 7 | loss: 0.38949 | val_0_accuracy: 0.82741 | 00:07s
epoch 8 | loss: 0.38856 | val_0_accuracy: 0.83110 | 00:08s
epoch 9 | loss: 0.38912 | val_0_accuracy: 0.83151 | 00:08s
MLP: Время обучения = 8.88 сек


In [13]:
print("\nОбучение XGBoost...")
start_time = time.time()
xgb_model = XGBClassifier(
    max_depth=6,
    learning_rate=params['learning_rate'],
    n_estimators=params['epochs'],
    random_state=42,
    n_jobs=-1
)
xgb_model.fit(X_train, y_train)
xgb_time = time.time() - start_time

y_pred_xgb = xgb_model.predict(X_test)
y_scores_xgb = xgb_model.predict_proba(X_test)[:, 1]
results.append(evaluate_model(y_test, y_pred_xgb, y_scores_xgb, 'XGBoost', xgb_time))
print(f"XGBoost: Время обучения = {xgb_time:.2f} сек")


Обучение XGBoost...
XGBoost: Время обучения = 0.05 сек


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
print("\nОбучение Random Forest...")
start_time = time.time()
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)
rf_time = time.time() - start_time

y_pred_rf = rf_model.predict(X_test)
y_scores_rf = rf_model.predict_proba(X_test)[:, 1]
results.append(evaluate_model(y_test, y_pred_rf, y_scores_rf, 'Random Forest', rf_time))
print(f"Random Forest: Время обучения = {rf_time:.2f} сек")


Обучение Random Forest...
Random Forest: Время обучения = 0.34 сек


In [15]:
results_df = pd.DataFrame(results)
results_df = results_df[['model', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'time']]  # Добавляем time
results_df = results_df.round(4)
print("\nСравнение моделей:")
print(results_df)


Сравнение моделей:
           model  accuracy  precision  recall      f1  roc_auc     time
0         TabNet   82.7925     0.7386  0.4350  0.5475   0.8533  71.0943
1            MLP   83.1508     0.7420  0.4538  0.5632   0.8546   8.8777
2        XGBoost   76.0672     0.0000  0.0000  0.0000   0.8470   0.0473
3  Random Forest   84.1232     0.8176  0.4333  0.5664   0.8659   0.3434
