In [None]:
import pandas as pd
import pathlib
import psutil
import pickle
import numpy as np
import os
import gc
    

from tqdm.auto import tqdm
from typing import List
from datetime import datetime 

import matplotlib.pyplot as plt

from ml_utils import vimba_level, vimba_up_level, book_depth, weighted_midprice, spread, count_trades

# Notebook-wide pandas display settings: show up to 500 rows/columns, use a
# 1000-character wide console, and print floats with five decimals.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('display.float_format', lambda x: '%.5f' % x),
):
    pd.set_option(_option_name, _option_value)

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. ml_utils) are re-imported before each cell runs and edits to them
# take effect without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
# Directory the market-data parquet files are read from.
# Set the MD_PATH environment variable to point at the data on another machine;
# when it is unset the original author's local folder is used.
MD_PATH = pathlib.Path(
    os.environ.get("MD_PATH", r"C:\Users\Xiaomi\Desktop\cmf new bot\New folder\md2")
)

In [None]:
# Load the four training tables from MD_PATH, in this order:
# BBO frame, trades, order-book frame, and the regression target.
bbos, trades, lobs, target = (
    pd.read_parquet(MD_PATH / parquet_name)
    for parquet_name in (
        'bbo_train.parquet',
        'trades_train.parquet',
        'book_train.parquet',
        'target_train.parquet',
    )
)

In [None]:
def calc_my_features(
    bbos: pd.DataFrame,
    lobs: pd.DataFrame,
    trades: pd.DataFrame
) -> pd.DataFrame:
    """Assemble the model's feature matrix, sampled at the rows of ``bbos``.

    The outputs of the ``ml_utils`` helpers are concatenated column-wise in
    this order: ``spread``, ``weighted_midprice``, ``count_trades`` (with
    ``up_second=0.5``), ``vimba_up_level`` for ``up_level`` 3, 5 and 9,
    ``vimba_level`` for levels 0-3, and a single ``book_depth_diff_15_btc``
    column holding ``book_depth(side='ask')`` minus ``book_depth(side='bid')``
    at ``size=15``.

    Args:
        bbos: Frame whose index defines the rows of the result.
        lobs: Frame passed to every book-based helper; its index is reused
            for the depth-difference column.
        trades: Frame passed to ``count_trades``.

    Returns:
        The concatenated features looked up with ``DataFrame.asof`` at
        ``bbos.index``.
    """
    # NOTE(review): earlier local names described these as "1 sec" and
    # "10 level" features, but the arguments are up_second=0.5 and up_level=9;
    # confirm which values are intended.
    feature_blocks = [
        spread(lobs),
        weighted_midprice(lobs),
        count_trades(trades, up_second=0.5),
    ]
    feature_blocks.extend(
        vimba_up_level(lobs, up_level=max_level) for max_level in (3, 5, 9)
    )
    feature_blocks.append(vimba_level(lobs, levels=[0, 1, 2, 3]))

    # Ask-side minus bid-side book depth, compared as raw arrays and then
    # re-attached to the book index.
    ask_depth = book_depth(lobs, size=15, side='ask').values
    bid_depth = book_depth(lobs, size=15, side='bid').values
    feature_blocks.append(
        pd.DataFrame({'book_depth_diff_15_btc': ask_depth - bid_depth}, index=lobs.index)
    )

    # assumes the concatenated index is sorted, as DataFrame.asof requires — TODO confirm
    combined = pd.concat(feature_blocks, axis=1)
    return combined.asof(bbos.index)

In [None]:
# Build the feature matrix for the training period (one row per bbos row).
features_df = calc_my_features(
    bbos=bbos,
    lobs=lobs,
    trades=trades,
)

In [None]:
# Drop the first feature row.
# NOTE(review): presumably the first BBO row has no usable book history, or the
# target starts one row later — confirm against target_train.
# calc_my_features returns exactly one row per bbos row, so the length check
# makes this cell idempotent: re-running it cannot drop further rows.
if len(features_df) == len(bbos):
    features_df = features_df.iloc[1:]

In [None]:
# Preview the first 100 feature rows as a visual sanity check.
features_df.head(100)

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from catboost.utils import get_gpu_device_count

In [None]:
# Train on GPU when CatBoost reports at least one GPU device, otherwise on CPU.
if get_gpu_device_count() > 0:
    device = 'GPU'
else:
    device = 'CPU'

In [None]:
# Show which task type ('GPU' or 'CPU') was selected.
device

In [None]:
# Keyword arguments shared by the CatBoostRegressor instances built below.
model_params = {
    'depth': 5,
    'iterations': 1200,
    'thread_count': 13,  # NOTE(review): fixed thread count, presumably tuned to the author's machine — confirm
    'learning_rate': 0.01,
    'l2_leaf_reg': 50,

    'task_type': device,  # 'GPU' or 'CPU', selected in an earlier cell
    'eval_metric': 'R2',

    'use_best_model': True,  # only meaningful when fit() is given an eval_set
    'silent': True,
}

#### Estimate model quality

In [None]:
model = CatBoostRegressor(**model_params)

# Chronological three-way split. shuffle=False keeps the rows in time order
# (random_state has no effect without shuffling, so it is not passed):
#   - X_train / y_train: oldest rows, used to fit the trees;
#   - X_val   / y_val:   next slice, used as eval_set for best-iteration selection;
#   - X_test  / y_test:  most recent 20%, never seen during fitting or selection.
# Scoring on the same set that use_best_model selects on would give an
# optimistically biased R2, hence the separate validation slice.
X_dev, X_test, y_dev, y_test = train_test_split(
    features_df, target, test_size=0.2, shuffle=False
)
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, test_size=0.2, shuffle=False
)

model.fit(X=X_train, y=y_train, eval_set=(X_val, y_val), plot=True)

# R2 on the untouched hold-out.
score = r2_score(y_test, model.predict(X_test))
print("best_score:", score)

#### Fit the final model on the full training set


In [None]:
# Fit the final model on all available rows. No eval_set is passed here, so
# best-iteration selection is switched off.
# The override lives in a derived dict instead of mutating model_params, so
# re-running the evaluation cell afterwards still sees use_best_model=True.
final_model_params = {**model_params, 'use_best_model': False}

model = CatBoostRegressor(**final_model_params)
model.fit(X=features_df, y=target, plot=True)

In [None]:
# Persist the fitted final model to the working directory.
model.save_model('model_baseline.cbm')

In [None]:
# Reload the saved file into a fresh regressor to check that the artefact is usable.
loaded_model = CatBoostRegressor()
loaded_model.load_model(fname='model_baseline.cbm')

In [None]:
# Display the reloaded model's predictions on the training features.
pd.Series(data=loaded_model.predict(features_df))

In [None]:
# Per-feature importance of the final model, using CatBoost's
# 'PredictionValuesChange' importance type.
feature_importances = model.get_feature_importance(
    type='PredictionValuesChange',
)

In [None]:
# Horizontal bar chart of feature importances, one bar per feature column.
# NOTE(review): labels come from X_train.columns, so the evaluation cell must
# have been run before this one.
fig, ax = plt.subplots(figsize=(10, 6))
bar_positions = range(len(X_train.columns))
ax.barh(bar_positions, feature_importances, color='skyblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(X_train.columns)
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance')
plt.show()