In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from itertools import product

#import pywt

### Загружаем данные

In [37]:
df = pd.read_csv("/content/train.csv")
df = df.sort_values("local_timestamp").reset_index(drop=True)
df.head()

Unnamed: 0,local_timestamp,bid_price_1,bid_qty_1,bid_price_2,bid_qty_2,bid_price_3,bid_qty_3,bid_price_4,bid_qty_4,bid_price_5,...,ask_qty_16,ask_price_17,ask_qty_17,ask_price_18,ask_qty_18,ask_price_19,ask_qty_19,ask_price_20,ask_qty_20,y
0,1749513606160,2681.58,22.2349,2681.51,1.4442,2681.5,2.407,2681.4,2.407,2681.34,...,0.4987,2682.14,1.14,2682.15,0.003,2682.23,0.0028,2682.25,4.5718,0
1,1749513606260,2681.58,22.2349,2681.51,1.4442,2681.5,2.407,2681.4,2.407,2681.34,...,0.4987,2682.14,1.14,2682.15,0.003,2682.23,0.0028,2682.25,4.5718,0
2,1749513606360,2681.79,36.4176,2681.7,2.407,2681.6,2.407,2681.59,3.8292,2681.58,...,0.9872,2685.38,2.9744,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1749513606460,2681.88,37.2793,2681.87,0.4,2681.8,11.0509,2681.71,0.7221,2681.7,...,1.1566,2682.78,1.14,2683.21,7.4592,2683.5,0.7554,2683.57,0.1845,2
4,1749513606560,2681.88,35.4149,2681.87,0.4,2681.81,3.3153,2681.8,2.407,2681.78,...,1.0736,2682.69,1.1566,2682.78,1.14,2682.84,3.9072,2683.2,3.7271,2


### Базовые фичи стакана (spread, mid, delta-mid)

In [38]:
df["spread"] = df["ask_price_1"] - df["bid_price_1"]
df["mid"] = (df["ask_price_1"] + df["bid_price_1"]) / 2
df["mid_delta"] = df["mid"].diff().fillna(0)

bid_qty_cols = [f"bid_qty_{i}" for i in range(1, 21)]
ask_qty_cols = [f"ask_qty_{i}" for i in range(1, 21)]
bid_price_cols = [f"bid_price_{i}" for i in range(1, 21)]
ask_price_cols = [f"ask_price_{i}" for i in range(1, 21)]

# суммарная ликвидность
df["bid_liq"] = df[bid_qty_cols].sum(axis=1)
df["ask_liq"] = df[ask_qty_cols].sum(axis=1)
df["imbalance"] = df["bid_liq"] / (df["bid_liq"] + df["ask_liq"] + 1e-9)

df["bid_price_mean20"] = df[bid_price_cols].mean(axis=1)
df["ask_price_mean20"] = df[ask_price_cols].mean(axis=1)
df["bid_price_std20"] = df[bid_price_cols].std(axis=1)
df["ask_price_std20"] = df[ask_price_cols].std(axis=1)
df["bid_price_max20"] = df[bid_price_cols].max(axis=1)
df["ask_price_max20"] = df[ask_price_cols].max(axis=1)
df["bid_price_min20"] = df[bid_price_cols].min(axis=1)
df["ask_price_min20"] = df[ask_price_cols].min(axis=1)
df["bid_price_range20"] = df["bid_price_max20"] - df["bid_price_min20"]
df["ask_price_range20"] = df["ask_price_max20"] - df["ask_price_min20"]

df["bid_qty_sum20"] = df[bid_qty_cols].sum(axis=1)
df["ask_qty_sum20"] = df[ask_qty_cols].sum(axis=1)
df["bid_qty_mean20"] = df[bid_qty_cols].mean(axis=1)
df["ask_qty_mean20"] = df[ask_qty_cols].mean(axis=1)
df["bid_qty_std20"] = df[bid_qty_cols].std(axis=1)
df["ask_qty_std20"] = df[ask_qty_cols].std(axis=1)

df["bid_price_top5_bottom5_diff"] = df[[f"bid_price_{i}" for i in range(1,6)]].mean(axis=1) - df[[f"bid_price_{i}" for i in range(16,21)]].mean(axis=1)
df["ask_price_top5_bottom5_diff"] = df[[f"ask_price_{i}" for i in range(1,6)]].mean(axis=1) - df[[f"ask_price_{i}" for i in range(16,21)]].mean(axis=1)

df["bid_qty_top5_bottom5_ratio"] = df[[f"bid_qty_{i}" for i in range(1,6)]].sum(axis=1) / (df[[f"bid_qty_{i}" for i in range(16,21)]].sum(axis=1) + 1e-9)
df["ask_qty_top5_bottom5_ratio"] = df[[f"ask_qty_{i}" for i in range(1,6)]].sum(axis=1) / (df[[f"ask_qty_{i}" for i in range(16,21)]].sum(axis=1) + 1e-9)


# новые признаки: разности цен соседних уровней
for i in range(1, 20):
    df[f"bid_price_diff_{i}"] = df[f"bid_price_{i}"] - df[f"bid_price_{i+1}"]
    df[f"ask_price_diff_{i}"] = df[f"ask_price_{i+1}"] - df[f"ask_price_{i}"]

df["bid_ask_qty_ratio"] = df["bid_qty_sum20"] / (df["bid_qty_sum20"] + df["ask_qty_sum20"] + 1e-9)
df["spread_over_mid"] = df["spread"] / (df["mid"] + 1e-9)
df["imbalance_over_liq"] = df["imbalance"] / (df["bid_qty_sum20"] + df["ask_qty_sum20"] + 1e-9)
df["liq_top5_ratio"] = df[[f"bid_qty_{i}" for i in range(1,6)]].sum(axis=1) / (df[[f"ask_qty_{i}" for i in range(1,6)]].sum(axis=1) + 1e-9)


# delta spread и imbalance
df["spread_delta"] = df["spread"].diff().fillna(0)
df["imbalance_delta"] = df["imbalance"].diff().fillna(0)
df["mid_delta"] = df["mid"].diff().fillna(0)
df["bid_liq_delta"] = df["bid_qty_sum20"].diff().fillna(0)
df["ask_liq_delta"] = df["ask_qty_sum20"].diff().fillna(0)

# top-5 ликвидность и соотношение
df["bid_liq_top5"] = df[[f"bid_qty_{i}" for i in range(1,6)]].sum(axis=1)
df["ask_liq_top5"] = df[[f"ask_qty_{i}" for i in range(1,6)]].sum(axis=1)
df["liq_top5_ratio"] = df["bid_liq_top5"] / (df["bid_liq_top5"] + df["ask_liq_top5"] + 1e-9)

# логарифмические признаки
df["log_bid_qty_sum20"] = np.log1p(df["bid_qty_sum20"])
df["log_ask_qty_sum20"] = np.log1p(df["ask_qty_sum20"])
df["log_spread"] = np.log1p(df["spread"].abs())

for i in range(1, 21):
    df[f"bid_qty_ratio_{i}"] = df[f"bid_qty_{i}"] / (df["bid_liq"] + 1e-9)
    df[f"ask_qty_ratio_{i}"] = df[f"ask_qty_{i}"] / (df["ask_liq"] + 1e-9)



  df[f"ask_qty_ratio_{i}"] = df[f"ask_qty_{i}"] / (df["ask_liq"] + 1e-9)
  df[f"bid_qty_ratio_{i}"] = df[f"bid_qty_{i}"] / (df["bid_liq"] + 1e-9)
  df[f"ask_qty_ratio_{i}"] = df[f"ask_qty_{i}"] / (df["ask_liq"] + 1e-9)
  df[f"bid_qty_ratio_{i}"] = df[f"bid_qty_{i}"] / (df["bid_liq"] + 1e-9)
  df[f"ask_qty_ratio_{i}"] = df[f"ask_qty_{i}"] / (df["ask_liq"] + 1e-9)
  df[f"bid_qty_ratio_{i}"] = df[f"bid_qty_{i}"] / (df["bid_liq"] + 1e-9)
  df[f"ask_qty_ratio_{i}"] = df[f"ask_qty_{i}"] / (df["ask_liq"] + 1e-9)
  df[f"bid_qty_ratio_{i}"] = df[f"bid_qty_{i}"] / (df["bid_liq"] + 1e-9)
  df[f"ask_qty_ratio_{i}"] = df[f"ask_qty_{i}"] / (df["ask_liq"] + 1e-9)
  df[f"bid_qty_ratio_{i}"] = df[f"bid_qty_{i}"] / (df["bid_liq"] + 1e-9)
  df[f"ask_qty_ratio_{i}"] = df[f"ask_qty_{i}"] / (df["ask_liq"] + 1e-9)
  df[f"bid_qty_ratio_{i}"] = df[f"bid_qty_{i}"] / (df["bid_liq"] + 1e-9)
  df[f"ask_qty_ratio_{i}"] = df[f"ask_qty_{i}"] / (df["ask_liq"] + 1e-9)
  df[f"bid_qty_ratio_{i}"] = df[f"bid_qty_{i}"] / (

### Лаги и роллинги

In [None]:
lag_features = ["mid", "spread", "bid_liq", "ask_liq", "imbalance"]
lags = [1,2,3,5,10,20]
rolling_windows = [5,10,20,50,100,200]

lagged = {
    f"{f}_lag{l}": df[f].shift(l)
    for f in lag_features
    for l in lags
}

roll_means = {
    f"{f}_roll_mean_{w}": df[f].rolling(w).mean()
    for f in lag_features
    for w in rolling_windows
}

roll_stds = {
    f"{f}_roll_std_{w}": df[f].rolling(w).std()
    for f in lag_features
    for w in rolling_windows
}

df = pd.concat(
    [df, pd.DataFrame(lagged), pd.DataFrame(roll_means), pd.DataFrame(roll_stds)],
    axis=1
)

df.fillna(0, inplace=True)


### SLOPE-фичи

In [None]:
slope_windows = [5, 10, 20, 50, 100]

slopes = {
    f"mid_slope_{w}": (df["mid"] - df["mid"].shift(w)) / w
    for w in slope_windows
}

spread_slopes = {
    f"spread_slope_{w}": (df["spread"] - df["spread"].shift(w)) / w
    for w in slope_windows
}

df = pd.concat(
    [df, pd.DataFrame(slopes), pd.DataFrame(spread_slopes)],
    axis=1
)

df.fillna(0, inplace=True)


# DIFF FEATURES (дельты признаков)


In [None]:
# ====== SAFE DIFF FEATURES ======
diff_features = ["mid", "spread", "bid_liq", "ask_liq", "imbalance"]

# базовые короткие дифы
basic_diffs = {
    f"{feat}_diff": df[feat].diff()
    for feat in diff_features
}

# многошаговые короткие дифы (шаги 1,2,3,5)
diff_steps = [1, 2, 3, 5]
multi_diffs = {
    f"{feat}_diff_{step}": df[feat].diff(step)
    for feat in diff_features
    for step in diff_steps
}

# дельты по верхним уровням стакана (1-5)
top_levels = range(1, 6)

price_diffs = {
    f"bid_price_{i}_diff": df[f"bid_price_{i}"].diff()
    for i in top_levels
    if f"bid_price_{i}" in df.columns
}
price_diffs.update({
    f"ask_price_{i}_diff": df[f"ask_price_{i}"].diff()
    for i in top_levels
    if f"ask_price_{i}" in df.columns
})

qty_diffs = {
    f"bid_qty_{i}_diff": df[f"bid_qty_{i}"].diff()
    for i in top_levels
    if f"bid_qty_{i}" in df.columns
}
qty_diffs.update({
    f"ask_qty_{i}_diff": df[f"ask_qty_{i}"].diff()
    for i in top_levels
    if f"ask_qty_{i}" in df.columns
})

# ratio diffs для верхних уровней
ratio_cols = [f"bid_qty_ratio_{i}" for i in top_levels] + [f"ask_qty_ratio_{i}" for i in top_levels]
ratio_diffs = {
    f"{col}_diff": df[col].diff()
    for col in ratio_cols
    if col in df.columns
}

# lag diffs с короткими лагами 1,2,3,5
lags = [1, 2, 3, 5]
lag_diff_features = {
    f"{feat}_minus_{feat}_lag{lag}": df[feat] - df[f"{feat}_lag{lag}"]
    for feat in diff_features
    for lag in lags
    if f"{feat}_lag{lag}" in df.columns
}

# ====== CONCAT ======
df = pd.concat(
    [
        df,
        pd.DataFrame(basic_diffs),
        pd.DataFrame(multi_diffs),
        pd.DataFrame(price_diffs),
        pd.DataFrame(qty_diffs),
        pd.DataFrame(ratio_diffs),
        pd.DataFrame(lag_diff_features),
    ],
    axis=1
)

df.fillna(0, inplace=True)


# Новые фичи
(WAVELET FEATURES,
MICROPRICE, OFI, PRICE IMPACT,
CROSS-INTERACTIONS)

In [None]:
# ========== MICROPRICE ==========
df["microprice"] = (
    df["ask_price_1"] * df["bid_qty_1"] +
    df["bid_price_1"] * df["ask_qty_1"]
) / (df["bid_qty_1"] + df["ask_qty_1"] + 1e-9)

# ========== OFI ==========
df["OFI"] = (
    df["bid_qty_1"].diff().fillna(0) -
    df["ask_qty_1"].diff().fillna(0)
)

# ========== PRICE IMPACT ==========
df["price_impact"] = (
    df["mid"].diff().fillna(0) /
    (df["bid_liq"] + df["ask_liq"] + 1e-9)
)

# ========== CROSS-INTERACTIONS ==========
base = ["mid", "spread", "imbalance", "bid_liq", "ask_liq"]

cross = {}

for a in base:
    for b in base:
        if a == b:
            continue
        cross[f"{a}_x_{b}"] = df[a] * df[b]
        cross[f"{a}_div_{b}"] = df[a] / (df[b] + 1e-9)
        cross[f"{a}_minus_{b}"] = df[a] - df[b]

# взаимодействия с дифами
for feat in base:
    if f"{feat}_diff" in df:
        cross[f"{feat}_x_diff"] = df[feat] * df[f"{feat}_diff"]

df = pd.concat([
    df,
    pd.DataFrame(cross),
], axis=1)

df.fillna(0, inplace=True)

### Формирование таргетов

### Формирование train/validation

In [None]:

X = df.drop(columns=["local_timestamp", "y"])
y = df["y"]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

num_classes = y_train.nunique()

In [None]:
def focal_loss_multi(preds, dtrain, gamma=2.0):
    y_true = dtrain.get_label().astype(int)

    preds = preds.reshape(-1, num_classes)
    preds = np.clip(preds, 1e-9, 1 - 1e-9)

    onehot = np.zeros_like(preds)
    onehot[np.arange(len(y_true)), y_true] = 1

    grad = preds - onehot
    hess = preds * (1 - preds)

    pt = np.sum(onehot * preds, axis=1, keepdims=True)
    grad *= (1 - pt) ** gamma
    hess *= (1 - pt) ** gamma

    return grad.ravel(), hess.ravel()


In [None]:
'''classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train
)

weights_dict = dict(zip(classes, class_weights))
weights_array = y_train.map(weights_dict)'''

In [None]:
'''f1_scorer = make_scorer(
    f1_score,
    average="macro"
)'''

In [None]:
'''base_model = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=y_train.nunique(),
    tree_method="hist",
    eval_metric="mlogloss",
    n_jobs=-1,
    n_estimators=300,
    learning_rate=0.01
)'''


### Использование RandomizedSearchCV

In [None]:
'''param_dist = {
    "max_depth": [4, 6, 8, 10],
    "min_child_weight": [1, 3, 5, 10],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 0.5, 1, 2]
}'''


In [None]:
#cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [None]:
'''random_search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    scoring=f1_scorer,
    n_iter=10,
    cv=cv,
    verbose=2,
    n_jobs=1,
    random_state=42
)

random_search.fit(X_train, y_train, sample_weight=weights_array)'''

In [None]:
#print("Best CV F1:", random_search.best_score_)
#print("Best params:", random_search.best_params_)

### Обучаем модель на результатах подбора гиперпараметров
Лучшие параметры с randomsearchcv без весов: {'subsample': 0.7, 'n_estimators': 600, 'max_depth': 8, 'learning_rate': 0.1, 'colsample_bytree': 0.7}

Лучшие параметры с randomsearchcv с распределением весов: {'subsample': 1.0, 'min_child_weight': 3, 'max_depth': 8, 'gamma': 0, 'colsample_bytree': 0.6}

In [None]:

xgb_clf = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=num_classes,
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.6,
    tree_method="hist",
    n_jobs=-1,
    eval_metric="mlogloss"
)

xgb_clf.fit(X_train, y_train)



In [None]:
import lightgbm as lgb
lgb_clf = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=num_classes,
    n_estimators=400,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.6,
    n_jobs=-1
)

lgb_clf.fit(X_train, y_train)



In [None]:
num_classes = y_train.nunique()

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

params = {
    "num_class": num_classes,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.9,
    "colsample_bytree": 0.5,
    "tree_method": "hist",
    "eval_metric": "mlogloss"
}

model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=800,
    obj=lambda preds, dtrain: focal_loss_multi(preds, dtrain, gamma=2.0),
    evals=[(dval, "val")],
    verbose_eval=50
)



In [None]:
xgb_proba = xgb_clf.predict_proba(X_val)
lgb_proba = lgb_clf.predict_proba(X_val)

focal_raw = model.predict(xgb.DMatrix(X_val))
focal_proba = focal_raw.reshape(len(X_val), num_classes)



In [None]:
blend_proba = (
    0.4 * xgb_proba +
    0.4 * lgb_proba +
    0.2 * focal_proba
)



In [None]:
thresholds = np.zeros(num_classes)

for c in range(num_classes):
    best_f1 = 0
    best_t = 0.5

    for t in np.linspace(0.1, 0.9, 40):
        pred = (blend_proba[:, c] >= t).astype(int)
        true = (y_val == c).astype(int)
        f1 = f1_score(true, pred)

        if f1 > best_f1:
            best_f1 = f1
            best_t = t

    thresholds[c] = best_t

print("Thresholds:", thresholds)


In [None]:
final_pred = np.full(len(X_val), -1)

for i in range(len(X_val)):
    for c in range(num_classes):
        if blend_proba[i, c] >= thresholds[c]:
            final_pred[i] = c
            break

# fallback
mask = final_pred == -1
final_pred[mask] = blend_proba[mask].argmax(axis=1)

f1 = f1_score(y_val, final_pred, average="macro")
print("Ensemble F1 macro:", f1)



### Итоговая оценка модели

In [None]:
'''import numpy as np
from sklearn.metrics import f1_score
import xgboost as xgb

# --- DMatrix для валидации ---
dval = xgb.DMatrix(X_val, label=y_val)

# --- raw logits (output_margin=True для focal loss) ---
raw_preds = model.predict(dval, output_margin=True)
raw_preds = raw_preds.reshape(-1, num_classes)

# --- Softmax ---
def softmax(x):
    x = x - x.max(axis=1, keepdims=True)
    exp_x = np.exp(x)
    return exp_x / exp_x.sum(axis=1, keepdims=True)

pred_proba = softmax(raw_preds)

# --- Инициализация порогов ---
thresholds = np.array([0.5] * num_classes)  # стартовое guess

# --- Подбор порогов per-class (greedy) ---
best_thresholds = thresholds.copy()
best_f1 = 0

for cls in range(num_classes):
    for t in np.linspace(0.1, 0.9, 17):  # 0.1,0.15,...0.9
        temp_thresholds = best_thresholds.copy()
        temp_thresholds[cls] = t

        # predict с порогами
        preds = np.full(pred_proba.shape[0], -1, dtype=int)
        for i in range(pred_proba.shape[0]):
            # все классы, где вероятность > порог
            above = np.where(pred_proba[i] >= temp_thresholds)[0]
            if len(above) == 0:
                preds[i] = pred_proba[i].argmax()  # fallback
            else:
                preds[i] = above[0]  # если несколько → берем первый
        f1 = f1_score(y_val, preds, average="macro", zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_thresholds[cls] = t

# --- Финальное предсказание с оптимальными порогами ---
preds = np.full(pred_proba.shape[0], -1, dtype=int)
for i in range(pred_proba.shape[0]):
    above = np.where(pred_proba[i] >= best_thresholds)[0]
    if len(above) == 0:
        preds[i] = pred_proba[i].argmax()
    else:
        preds[i] = above[0]

# --- F1 ---
f1 = f1_score(y_val, preds, average="macro", zero_division=0)
print("F1 (macro) на валидации с per-class thresholds:", f1)
print("Оптимальные пороги по классам:", best_thresholds)'''


