In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

### Загружаем данные

In [7]:
df = pd.read_csv("../data/train.csv")
df = df.iloc[:120000]
df = df.sort_values("local_timestamp").reset_index(drop=True)
df.head()

Unnamed: 0,local_timestamp,bid_price_1,bid_qty_1,bid_price_2,bid_qty_2,bid_price_3,bid_qty_3,bid_price_4,bid_qty_4,bid_price_5,...,ask_qty_16,ask_price_17,ask_qty_17,ask_price_18,ask_qty_18,ask_price_19,ask_qty_19,ask_price_20,ask_qty_20,y
0,1749513606160,2681.58,22.2349,2681.51,1.4442,2681.5,2.407,2681.4,2.407,2681.34,...,0.4987,2682.14,1.14,2682.15,0.003,2682.23,0.0028,2682.25,4.5718,0
1,1749513606260,2681.58,22.2349,2681.51,1.4442,2681.5,2.407,2681.4,2.407,2681.34,...,0.4987,2682.14,1.14,2682.15,0.003,2682.23,0.0028,2682.25,4.5718,0
2,1749513606360,2681.79,36.4176,2681.7,2.407,2681.6,2.407,2681.59,3.8292,2681.58,...,0.9872,2685.38,2.9744,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1749513606460,2681.88,37.2793,2681.87,0.4,2681.8,11.0509,2681.71,0.7221,2681.7,...,1.1566,2682.78,1.14,2683.21,7.4592,2683.5,0.7554,2683.57,0.1845,2
4,1749513606560,2681.88,35.4149,2681.87,0.4,2681.81,3.3153,2681.8,2.407,2681.78,...,1.0736,2682.69,1.1566,2682.78,1.14,2682.84,3.9072,2683.2,3.7271,2


### Базовые фичи стакана (spread, mid, delta-mid)

In [8]:
df["spread"] = df["ask_price_1"] - df["bid_price_1"]
df["mid"] = (df["ask_price_1"] + df["bid_price_1"]) / 2
df["mid_delta"] = df["mid"].diff().fillna(0)

bid_qty_cols = [f"bid_qty_{i}" for i in range(1, 21)]
ask_qty_cols = [f"ask_qty_{i}" for i in range(1, 21)]
bid_price_cols = [f"bid_price_{i}" for i in range(1, 21)]
ask_price_cols = [f"ask_price_{i}" for i in range(1, 21)]

# суммарная ликвидность
df["bid_liq"] = df[bid_qty_cols].sum(axis=1)
df["ask_liq"] = df[ask_qty_cols].sum(axis=1)
df["imbalance"] = df["bid_liq"] / (df["bid_liq"] + df["ask_liq"] + 1e-9)

# новые признаки: разности цен соседних уровней
for i in range(1, 20):
    df[f"bid_price_diff_{i}"] = df[f"bid_price_{i}"] - df[f"bid_price_{i+1}"]
    df[f"ask_price_diff_{i}"] = df[f"ask_price_{i+1}"] - df[f"ask_price_{i}"]

# доля каждой ликвидности
for i in range(1, 21):
    df[f"bid_qty_ratio_{i}"] = df[f"bid_qty_{i}"] / (df["bid_liq"] + 1e-9)
    df[f"ask_qty_ratio_{i}"] = df[f"ask_qty_{i}"] / (df["ask_liq"] + 1e-9)

# delta spread и imbalance
df["spread_delta"] = df["spread"].diff().fillna(0)
df["imbalance_delta"] = df["imbalance"].diff().fillna(0)

# top-5 ликвидность и соотношение
df["bid_liq_top5"] = df[[f"bid_qty_{i}" for i in range(1,6)]].sum(axis=1)
df["ask_liq_top5"] = df[[f"ask_qty_{i}" for i in range(1,6)]].sum(axis=1)
df["liq_top5_ratio"] = df["bid_liq_top5"] / (df["bid_liq_top5"] + df["ask_liq_top5"] + 1e-9)

# логарифмические признаки
df["log_bid_liq"] = np.log1p(df["bid_liq"])
df["log_ask_liq"] = np.log1p(df["ask_liq"])
df["log_spread"] = np.log1p(df["spread"].abs())


### Лаги и роллинги

In [9]:
lag_features = ["mid", "spread", "bid_liq", "ask_liq", "imbalance"]
rolling_windows = [5, 10, 20]

for feat in lag_features:
    for lag in [1, 2, 3]:
        df[f"{feat}_lag{lag}"] = df[feat].shift(lag)
    
    for window in rolling_windows:
        df[f"{feat}_roll_mean_{window}"] = df[feat].rolling(window).mean()
        df[f"{feat}_roll_std_{window}"] = df[feat].rolling(window).std()
        df[f"{feat}_roll_norm_{window}"] = (df[feat] - df[f"{feat}_roll_mean_{window}"]) / (df[f"{feat}_roll_std_{window}"] + 1e-9)

df.fillna(0, inplace=True) 

  df[f"{feat}_roll_mean_{window}"] = df[feat].rolling(window).mean()
  df[f"{feat}_roll_std_{window}"] = df[feat].rolling(window).std()
  df[f"{feat}_roll_norm_{window}"] = (df[feat] - df[f"{feat}_roll_mean_{window}"]) / (df[f"{feat}_roll_std_{window}"] + 1e-9)
  df[f"{feat}_roll_mean_{window}"] = df[feat].rolling(window).mean()
  df[f"{feat}_roll_std_{window}"] = df[feat].rolling(window).std()
  df[f"{feat}_roll_norm_{window}"] = (df[feat] - df[f"{feat}_roll_mean_{window}"]) / (df[f"{feat}_roll_std_{window}"] + 1e-9)
  df[f"{feat}_lag{lag}"] = df[feat].shift(lag)
  df[f"{feat}_lag{lag}"] = df[feat].shift(lag)
  df[f"{feat}_lag{lag}"] = df[feat].shift(lag)
  df[f"{feat}_roll_mean_{window}"] = df[feat].rolling(window).mean()
  df[f"{feat}_roll_std_{window}"] = df[feat].rolling(window).std()
  df[f"{feat}_roll_norm_{window}"] = (df[feat] - df[f"{feat}_roll_mean_{window}"]) / (df[f"{feat}_roll_std_{window}"] + 1e-9)
  df[f"{feat}_roll_mean_{window}"] = df[feat].rolling(window).mean()
  df

### Формирование train/validation

In [10]:
X = df.drop(["y", "local_timestamp"], axis=1)
y = df["y"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, shuffle=False)

### Использование RandomizedSearchCV

In [11]:
'''param_dist = {
    "n_estimators": [200, 400, 600],
    "max_depth": [4, 6, 8],
    "learning_rate": [0.03, 0.1],
    "subsample": [0.7, 1.0],
    "colsample_bytree": [0.7, 1.0],
}
'''

'param_dist = {\n    "n_estimators": [200, 400, 600],\n    "max_depth": [4, 6, 8],\n    "learning_rate": [0.03, 0.1],\n    "subsample": [0.7, 1.0],\n    "colsample_bytree": [0.7, 1.0],\n}\n'

In [12]:
#cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [13]:
'''random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    scoring=f1_scorer,
    n_iter=10,
    cv=cv,
    verbose=2,
    n_jobs=1,
    random_state=42
)

random_search.fit(X_train, y_train)
'''

'random_search = RandomizedSearchCV(\n    estimator=model,\n    param_distributions=param_dist,\n    scoring=f1_scorer,\n    n_iter=10,\n    cv=cv,\n    verbose=2,\n    n_jobs=1,\n    random_state=42\n)\n\nrandom_search.fit(X_train, y_train)\n'

### Формирование весов классов

In [14]:
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
weights_dict = dict(zip(classes, class_weights))
weights_array = y_train.map(weights_dict)


### Обучаем модель на результатах подбора гиперпараметров
Лучшие параметры: {'subsample': 0.7, 'n_estimators': 600, 'max_depth': 8, 'learning_rate': 0.1, 'colsample_bytree': 0.7}

In [15]:
num_classes = y_train.nunique()
model = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=num_classes,
    subsample=0.7,
    n_estimators=600,
    max_depth=8,
    learning_rate=0.1,
    colsample_bytree=0.7,
    tree_method="hist",
    n_jobs=-1,
    eval_metric="mlogloss"
)

model.fit(
    X_train,
    y_train,
    sample_weight=weights_array,
    eval_set=[(X_val, y_val)],
    verbose=50
)


[0]	validation_0-mlogloss:1.56289
[50]	validation_0-mlogloss:0.97111
[100]	validation_0-mlogloss:0.85133
[150]	validation_0-mlogloss:0.78280
[200]	validation_0-mlogloss:0.73988
[250]	validation_0-mlogloss:0.70205
[300]	validation_0-mlogloss:0.68273
[350]	validation_0-mlogloss:0.66747
[400]	validation_0-mlogloss:0.65213
[450]	validation_0-mlogloss:0.64241
[500]	validation_0-mlogloss:0.63574
[550]	validation_0-mlogloss:0.63291
[599]	validation_0-mlogloss:0.63123


### Итоговая оценка модели

In [16]:
pred = model.predict(X_val)
f1_val = f1_score(y_val, pred, average="macro", zero_division=0)

print("F1 на валидации:", f1_val)

F1 на валидации: 0.37424421387339113
