In [274]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
import pandas as pd


# Load every recorded sample and report the row count before any filtering.
df = pd.read_parquet('./data')
print(f'total dataset: {len(df)} rows')

## filters
# All bounds are exclusive; the four conditions are combined into one mask,
# which selects the same rows as applying them one after another.
spread = df['stabilized_spread']
delta_size = abs(df['last_price_delta_since_stabilized'])
keep = (
    (spread < 10)           # spread not too large
    & (spread > 0.2)        # spread not too small
    & (delta_size > 0.5)    # target move large enough (absolute value)
    & (delta_size < 12)     # target move not extreme (absolute value)
)
df = df[keep]
# NOTE(review): two of the filters use the target column and are applied
# before the split, so the validation rows are target-filtered too.

# Optional extra filters, currently disabled:
## df = df[df['stabilized_amount_mean'] < 0.4]  # trades not too large
## df = df[df['stabilized_nr_trades'] < 300]  # not too many trades

# Sort the columns by label so the feature order is fixed.
df = df.sort_index(axis=1)

# Separate the regression target from the feature columns.
X = df.drop('last_price_delta_since_stabilized', axis=1)
y = df['last_price_delta_since_stabilized']

# 80/20 train/validation split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
X_train

total dataset: 38177 rows


Unnamed: 0_level_0,0_amount_mean,0_nr_trades,0_price_delta,0_spread,1_amount_mean,1_nr_trades,1_price_delta,1_spread,2_amount_mean,2_nr_trades,...,3_spread,4_amount_mean,4_nr_trades,4_price_delta,4_spread,stabilized_amount_mean,stabilized_at_ms,stabilized_nr_trades,stabilized_spread,wave_direction
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0526,108.0,0.91,3.20,0.0526,109.0,0.91,3.20,0.0522,110.0,...,3.20,0.0518,123.0,0.76,3.35,0.0531,159,132.0,4.11,max
0,0.1226,20.0,0.50,1.08,0.1179,21.0,0.50,1.08,0.1126,22.0,...,1.08,0.1005,25.0,0.49,1.09,0.0641,309,46.0,1.58,max
0,0.0047,19.0,0.03,0.56,0.0046,20.0,0.03,0.56,0.0044,21.0,...,0.56,0.0060,23.0,0.00,0.59,0.0061,800,25.0,0.59,min
0,0.0009,16.0,0.20,0.27,0.0009,17.0,0.20,0.28,0.0009,18.0,...,0.30,0.0070,20.0,0.00,0.50,0.0067,310,21.0,0.50,min
0,0.0236,18.0,0.22,1.08,0.0259,19.0,0.22,1.08,0.0247,20.0,...,1.08,0.0228,22.0,0.22,1.08,0.0210,340,25.0,1.30,max
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.0294,6.0,0.88,0.21,0.0302,7.0,0.85,0.24,0.0375,16.0,...,0.47,0.0360,26.0,0.03,1.06,0.0363,41,27.0,1.09,min
0,0.0734,3.0,0.46,0.17,0.0484,5.0,0.46,0.17,0.0405,6.0,...,0.56,0.0307,8.0,0.07,0.56,0.0274,490,9.0,0.63,max
0,0.0077,3.0,0.81,0.36,0.0262,5.0,0.41,0.76,0.0303,6.0,...,0.76,0.0213,9.0,0.41,0.76,0.0192,161,10.0,1.17,min
0,0.0197,25.0,0.71,1.19,0.0225,32.0,0.71,1.19,0.0231,33.0,...,1.20,0.0230,35.0,0.69,1.21,0.0202,777,55.0,1.90,max


In [275]:
from spreadsurfer.price_engine import FeatureEngineer
from catboost import CatBoostRegressor
# from sklearn.ensemble import RandomForestRegressor

# Gradient-boosted regressor; hyper-parameters are fixed here and seeded.
model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.007,
    depth=7,
    loss_function='RMSE',
    random_state=0,
    verbose=False,
)

# Feature engineering followed by the regressor.
# Alternative estimator kept for reference:
#     ('model', RandomForestRegressor(n_estimators=50, random_state=0))
pipeline = Pipeline(
    steps=[
        ('preprocessor', FeatureEngineer()),
        ('model', model),
    ]
)

# Fit on the training split, then show the pipeline's score on the held-out split.
pipeline.fit(X_train, y_train)
score = pipeline.score(X_valid, y_valid)
display(score)


fitting on 10780 records


0.6199711496289153

In [276]:
from spreadsurfer import now_isoformat

# Manual switch: set to True to write the fitted CatBoost model to ./models.
# Only `model` is saved here, not the surrounding `pipeline`.
save = False
if save:
    # File name carries the current timestamp and the validation score.
    model_path = f'./models/{now_isoformat()}-score-{1000 * round(score, 3)}.cat'
    model.save_model(model_path)


In [277]:
# One hand-written row, including its known target value, used to spot-check
# a single prediction and to measure how long one prediction takes.
sample = pd.DataFrame([
    {
        '0_amount_mean': 0.094411, '0_nr_trades': 75.0, '0_price_delta': 0.61, '0_spread': 1.44,
        '1_amount_mean': 0.093298, '1_nr_trades': 76.0, '1_price_delta': 0.61, '1_spread': 1.44,
        '2_amount_mean': 0.092106, '2_nr_trades': 77.0, '2_price_delta': 0.61, '2_spread': 1.44,
        '3_amount_mean': 0.084998, '3_nr_trades': 85.0, '3_price_delta': 0.11, '3_spread': 1.94,
        '4_amount_mean': 0.084199, '4_nr_trades': 86.0, '4_price_delta': 0.04, '4_spread': 2.01,
        'last_price_delta_since_stabilized': 1.97,
        'stabilized_amount_mean': 0.08325, 'stabilized_at_ms': 123,
        'stabilized_nr_trades': 87.0, 'stabilized_spread': 2.05,
        'wave_direction': 'min',
    }
])

# Show the known target, then remove it so only feature columns reach the pipeline.
print('expected: ', sample.last_price_delta_since_stabilized.mean())
sample = sample.drop('last_price_delta_since_stabilized', axis=1)

from timeit import timeit

# timeit() returns seconds; convert to milliseconds and print it. Previously
# the value was computed but never shown, because only the last expression of
# a cell is displayed.
latency_ms = timeit(lambda: pipeline.predict(sample), number=1) * 1000
print(f'prediction latency: {latency_ms:.2f} ms')

# Predicted value for the sample row (displayed as the cell result).
pipeline.predict(sample)[0]


expected:  1.97


1.5980001775973667

In [278]:
from glob import glob
# Locate a saved CatBoost model in the current working directory.
# glob() gives no ordering guarantee, so sort the matches to make the choice
# reproducible, and fail with a clear message instead of a bare IndexError
# when no model file is present.
cat_files = sorted(glob('*.cat'))
if not cat_files:
    raise FileNotFoundError("no '*.cat' model file found in the current working directory")
cat_filename = cat_files[0]

# Rebinds `model` to a freshly constructed regressor loaded from disk.
model = CatBoostRegressor()
model.load_model(fname=cat_filename)
print(f'catboost model loaded from {cat_filename}')


catboost model loaded from 2023-01-21T20:37:02-score-773.0.cat
