In [535]:
import pandas as pd
import numpy as np

context = pd.read_csv("../data/context_df.csv")
pq = pd.read_parquet("../data/test_task.parquet")
percent_columns = [
	"inflation", "key_rate", "deposit_1", "deposit_3", "deposit_6", "deposit_12",
	"fa_delta", "usd_delta", "IMOEX_delta", "RGBI_delta"
    ]

In [536]:
# Кол-во классов
pq.cus_class.value_counts()

cus_class
5.0      5492
4.0      3585
10.0     2667
1.0      2666
106.0    1758
103.0    1168
105.0     968
102.0     543
101.0     432
8.0       348
107.0     292
109.0     219
108.0     218
6.0       157
7.0       148
104.0     120
2.0        63
100.0      44
Name: count, dtype: int64

In [537]:
# Интервалы дат
pq.iloc[:, 0].min(), pq.iloc[:, 0].max()

(Timestamp('2009-04-17 00:00:00'), Timestamp('2024-05-14 00:00:00'))

In [538]:
# NaN'ы
context.isna().sum()

context_data_from    0
context_data_to      0
quarter              0
inflation            1
key_rate             1
deposit_1            0
deposit_3            0
deposit_6            0
deposit_12           0
fa_delta             1
usd_delta            1
IMOEX_delta          1
RGBI_delta           1
dtype: int64

In [539]:
from typing import List


def preprocess_parquet(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = ["date", *df.columns[1:]]
    df["date"] = pd.to_datetime(df["date"])
    return df


def preprocess_context(df: pd.DataFrame, columns: List[str] = percent_columns) -> pd.DataFrame:
    df = df.copy()
    df["context_data_from"] = pd.to_datetime(df["context_data_from"])
    df["context_data_to"] = pd.to_datetime(df["context_data_to"])
    df.dropna(inplace=True)

    df[columns] = df[columns].apply(lambda x: x.str.rstrip('%').astype(float))
    
    return df

In [540]:
def merge_frames(pq: pd.DataFrame, context: pd.DataFrame) -> pd.DataFrame:
    pq = preprocess_parquet(pq)
    context = preprocess_context(context)

    intervals = pd.IntervalIndex.from_arrays(
        context["context_data_from"],
        context["context_data_to"],
        closed="both"
    )

    pq = pq[pq["date"] >= context["context_data_from"].min()].copy()
    pq["quarter_idx"] = intervals.get_indexer(pq["date"])
    pq = pq[pq["quarter_idx"] != -1]

    context = context.reset_index(drop=True)
    context["quarter_idx"] = context.index

    merged = pq.merge(context, on="quarter_idx", how="left")
    return merged

In [541]:
df = merge_frames(pq, context)

In [542]:
from sklearn.preprocessing import LabelEncoder

df["cus_class"] = df["cus_class"].astype(int)
label_encoder = LabelEncoder()
df["class_idx"] = label_encoder.fit_transform(df["cus_class"])


In [543]:
X = df[percent_columns].values
y = df["class_idx"].values


In [544]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [545]:
from sklearn.model_selection import train_test_split


X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)


In [546]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=128, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_tensor, y_val_tensor), batch_size=128)

class MLPClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),

            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.model(x)

model = MLPClassifier(input_dim=X_train.shape[1], hidden_dim=64, output_dim=len(label_encoder.classes_))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [547]:
from sklearn.metrics import f1_score
import mlflow
import mlflow.pytorch
from mlflow.models import infer_signature

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


mlflow.set_experiment("classification_experiment")
with mlflow.start_run():
    for epoch in range(1, 11):
        print(f"Starting Epoch {epoch}")  # Уникальный вывод для каждой эпохи
        model.train()
        total_loss = 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)

        # Валидация
        model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(device)
                outputs = model(X_batch)
                preds = outputs.argmax(dim=1).cpu().numpy()
                y_pred.extend(preds)
                y_true.extend(y_batch.numpy())

        f1_macro = f1_score(y_true, y_pred, average="macro")
        f1_micro = f1_score(y_true, y_pred, average="micro")

        print(f"Epoch {epoch}: loss={avg_train_loss:.4f}, F1_macro={f1_macro:.4f}, F1_micro={f1_micro:.4f}")

        # Логирование
        mlflow.log_metric("train_loss", avg_train_loss, step=epoch)
        mlflow.log_metric("f1_macro", f1_macro, step=epoch)
        mlflow.log_metric("f1_micro", f1_micro, step=epoch)

    input_example = X_train[:5].astype(np.float32)
    outputs = model(torch.tensor(input_example, dtype=torch.float32).to(device))
    outputs = outputs.cpu().detach().numpy()
    
    # Сохраняем модель
    signature = infer_signature(input_example, outputs)
    mlflow.pytorch.log_model(
        model,
        name="mlp_model",
        signature=signature,
        input_example=input_example
    )
    mlflow.log_params({
        "model": "MLP",
        "hidden_dim": 64,
        "lr": 1e-3,
        "batch_size": 128,
        "epochs": 10
    })

Starting Epoch 1
Epoch 1: loss=2.2762, F1_macro=0.0796, F1_micro=0.3743
Starting Epoch 2
Epoch 2: loss=1.8800, F1_macro=0.0837, F1_micro=0.3884
Starting Epoch 3
Epoch 3: loss=1.8025, F1_macro=0.1159, F1_micro=0.4044
Starting Epoch 4
Epoch 4: loss=1.7662, F1_macro=0.1150, F1_micro=0.4037
Starting Epoch 5
Epoch 5: loss=1.7437, F1_macro=0.1233, F1_micro=0.4005
Starting Epoch 6
Epoch 6: loss=1.7294, F1_macro=0.1172, F1_micro=0.4044
Starting Epoch 7
Epoch 7: loss=1.7189, F1_macro=0.1172, F1_micro=0.4044
Starting Epoch 8
Epoch 8: loss=1.7106, F1_macro=0.1212, F1_micro=0.4106
Starting Epoch 9
Epoch 9: loss=1.7051, F1_macro=0.1209, F1_micro=0.4057
Starting Epoch 10
Epoch 10: loss=1.7005, F1_macro=0.1232, F1_micro=0.4076
