In [38]:
import os
import sys


notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

In [39]:
from scripts.utils import merge_frames
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")


context = pd.read_csv("../data/context_df.csv")
pq = pd.read_parquet("../data/test_task.parquet")
percent_columns = [
	"inflation", "key_rate", "deposit_1", "deposit_3", "deposit_6", "deposit_12",
	"fa_delta", "usd_delta", "IMOEX_delta", "RGBI_delta"
    ]
df = merge_frames(pq, context)
df.head()

Unnamed: 0,date,cus_class,quarter_idx,context_data_from,context_data_to,quarter,inflation,key_rate,deposit_1,deposit_3,deposit_6,deposit_12,fa_delta,usd_delta,IMOEX_delta,RGBI_delta
0,2016-07-01,105.0,0,2016-07-01,2016-09-30,3,7.34,10.5,6.14,7.05,6.86,8.17,1.52,-5.3,1.85,2.48
1,2016-07-01,1.0,0,2016-07-01,2016-09-30,3,7.34,10.5,6.14,7.05,6.86,8.17,1.52,-5.3,1.85,2.48
2,2016-07-01,103.0,0,2016-07-01,2016-09-30,3,7.34,10.5,6.14,7.05,6.86,8.17,1.52,-5.3,1.85,2.48
3,2016-07-01,106.0,0,2016-07-01,2016-09-30,3,7.34,10.5,6.14,7.05,6.86,8.17,1.52,-5.3,1.85,2.48
4,2016-07-01,106.0,0,2016-07-01,2016-09-30,3,7.34,10.5,6.14,7.05,6.86,8.17,1.52,-5.3,1.85,2.48


In [40]:
df_fe = df.copy()

# Сезонные признаки
df_fe["year"] = df_fe["date"].dt.year
df_fe["month"] = df_fe["date"].dt.month
df_fe["quarter"] = df_fe["date"].dt.quarter

# Feature engineering
df_fe["deposit_spread"] = df_fe["deposit_12"] - df_fe["deposit_1"]
df_fe["usd_inverted"] = -df_fe["usd_delta"]
df_fe["fa_vs_usd"] = df_fe["fa_delta"] - df_fe["usd_delta"]

# Квартальное изменение инфляции внутри группы
df_fe = df_fe.sort_values(by=["quarter_idx", "date"])
df_fe["diff_inflation"] = df_fe.groupby("quarter_idx")["inflation"].diff().fillna(0)

base_percent_columns = [
    "inflation", "key_rate", "deposit_1", "deposit_3", "deposit_6", "deposit_12",
    "fa_delta", "usd_delta", "IMOEX_delta", "RGBI_delta"
]
engineered_columns = [
    "deposit_spread", "usd_inverted", "fa_vs_usd", "diff_inflation",
    "year", "month", "quarter"
]
final_columns = base_percent_columns + engineered_columns


X = df_fe[final_columns].values
y = df_fe["cus_class"].astype(int).values

In [41]:
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import torch

le = LabelEncoder()
y_encoded = le.fit_transform(y)

class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_encoded), y=y_encoded)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)


In [42]:
from sklearn.model_selection import StratifiedShuffleSplit

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_idx, test_val_idx = next(splitter.split(X, df_fe["quarter"]))

X_train, y_train = X[train_idx], y_encoded[train_idx]
X_temp, y_temp = X[test_val_idx], y_encoded[test_val_idx]

val_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_idx, test_idx = next(val_splitter.split(X_temp, df_fe.iloc[test_val_idx]["quarter"]))

X_val, y_val = X_temp[val_idx], y_temp[val_idx]
X_test, y_test = X_temp[test_idx], y_temp[test_idx]


In [43]:
from pytorch_tabnet.tab_model import TabNetClassifier
import mlflow
import mlflow.pytorch

device = "cuda" if torch.cuda.is_available() else "cpu"
weights_tensor = class_weights_tensor.to(device)

model = TabNetClassifier(
    n_d=64,
    n_a=64,
    n_steps=5,
    gamma=1.5,
    lambda_sparse=1e-4,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-3),
    mask_type='sparsemax',
    device_name=device,
    verbose=10
)




In [44]:
from sklearn.metrics import f1_score
from mlflow.models import infer_signature

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("tabnet_sota_baseline")
with mlflow.start_run():
    model.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_val, y_val)],
        eval_metric=["accuracy"],
        loss_fn=torch.nn.CrossEntropyLoss(weight=weights_tensor),
        max_epochs=200,
        patience=20,
        batch_size=128,
        drop_last=False
    )

    preds = model.predict(X_test)
    f1_macro = f1_score(y_test, preds, average="macro")
    f1_micro = f1_score(y_test, preds, average="micro")

    input_example = torch.tensor(X_train[:5], dtype=torch.float32).to(device)
    output_example = model.predict(input_example.cpu().numpy())

    mlflow.log_metric("f1_macro", f1_macro)
    mlflow.log_metric("f1_micro", f1_micro)
    mlflow.log_param("model", "TabNet_baseline")
    mlflow.sklearn.log_model(
        model, name="TabNet_baseline",
        signature=infer_signature(X_train[:5], output_example),
    )


2025/07/30 19:22:24 INFO mlflow.tracking.fluent: Experiment with name 'tabnet_sota_baseline' does not exist. Creating a new experiment.


epoch 0  | loss: 3.22878 | val_0_accuracy: 0.04537 |  0:00:02s
epoch 10 | loss: 2.24267 | val_0_accuracy: 0.20333 |  0:00:27s
epoch 20 | loss: 2.1813  | val_0_accuracy: 0.28819 |  0:00:52s
epoch 30 | loss: 2.14988 | val_0_accuracy: 0.33975 |  0:01:16s
epoch 40 | loss: 2.13121 | val_0_accuracy: 0.28688 |  0:01:39s
epoch 50 | loss: 2.14612 | val_0_accuracy: 0.26567 |  0:02:02s

Early stopping occurred at epoch 50 with best_epoch = 30 and best_val_0_accuracy = 0.33975




🏃 View run unruly-carp-664 at: http://localhost:5000/#/experiments/606614082137499270/runs/212b8c207eea4e31a430ef7bbd8be04f
🧪 View experiment at: http://localhost:5000/#/experiments/606614082137499270


In [45]:
from sklearn.metrics import classification_report


print("Test F1 macro:", f1_macro)
print("Test F1 micro:", f1_micro)
print(classification_report(y_test, preds, zero_division=0))


Test F1 macro: 0.18236020484335805
Test F1 micro: 0.3171288743882545
              precision    recall  f1-score   support

           0       0.36      0.28      0.31       407
           1       0.07      0.46      0.12        13
           2       0.00      0.00      0.00       530
           3       0.56      0.36      0.44       787
           4       0.16      0.54      0.25        28
           5       0.06      0.62      0.10        21
           6       0.04      0.09      0.06        58
           7       0.62      0.84      0.71       388
           8       0.00      0.00      0.00         8
           9       0.00      0.00      0.00        51
          10       0.08      0.12      0.10        91
          11       0.38      0.44      0.41       177
          12       0.00      0.00      0.00        13
          13       0.31      0.47      0.38       114
          14       0.26      0.11      0.15       278
          15       0.09      0.27      0.14        45
          16

In [46]:
model.save_model("../models/tabnet_baseline_model")

Successfully saved model at ../models/tabnet_baseline_model.zip


'../models/tabnet_baseline_model.zip'

# Проверим качество на бустинге

In [47]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, f1_score


model_c = CatBoostClassifier(
    iterations=200,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    verbose=0,
    random_seed=42
)

model_c.fit(X_train, y_train)

y_pred = model_c.predict(X_test)

print("Test F1 macro:", f1_score(y_test, y_pred, average="macro"))
print("Test F1 micro:", f1_score(y_test, y_pred, average="micro"))
print(classification_report(y_test, y_pred, zero_division=0))


Test F1 macro: 0.16129717652145248
Test F1 micro: 0.4515497553017945
              precision    recall  f1-score   support

           0       0.35      0.43      0.39       407
           1       0.00      0.00      0.00        13
           2       0.41      0.30      0.35       530
           3       0.48      0.71      0.57       787
           4       0.00      0.00      0.00        28
           5       0.00      0.00      0.00        21
           6       0.00      0.00      0.00        58
           7       0.62      0.88      0.73       388
           8       0.00      0.00      0.00         8
           9       0.00      0.00      0.00        51
          10       0.50      0.01      0.02        91
          11       0.43      0.42      0.42       177
          12       0.00      0.00      0.00        13
          13       0.28      0.61      0.38       114
          14       0.24      0.03      0.05       278
          15       0.00      0.00      0.00        45
          16

### Можно пользоваться более Lightweight моделями, также подойти к задаче с помощью **arima, prophet, lstm, sequence-based nn** для работы с time-series