In [107]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
import pandas as pd

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y):
        print(f'fitting on {len(X)} records')
        return self

    def transform(self, X, y=None):
        df = X.copy()
        df.loc[df['wave_direction'] == 'min', 'wave_direction'] = 1
        df.loc[df['wave_direction'] == 'max'] = -1
        df = df.astype({"wave_direction": 'float64'})
        return df

df = pd.read_parquet('./data')
print(f'total dataset: {len(df)} rows')
df = df[abs(df['last_price_delta_since_stabilized']) < 8]
df = df[abs(df['last_price_delta_since_stabilized']) > 1]

if '0_price' in df.columns:
    df.drop('0_price', axis=1, inplace=True)
    df.drop('1_price', axis=1, inplace=True)
    df.drop('2_price', axis=1, inplace=True)
    df.drop('3_price', axis=1, inplace=True)
    df.drop('4_price', axis=1, inplace=True)

df = df[df['stabilized_spread'] < 10] # nem tul nagy spread
# df = df[df['stabilized_amount_mean'] < 0.4]  # nem tul nagy tradek
# df = df[df['stabilized_nr_trades'] < 300]  # nem tul sok trade


# for col in df.columns:
#     if 'nr_trades' in col or 'price_delta' in col and not col.startswith('last'):
#         df.drop(col, axis=1, inplace=True)

df.sort_index(axis=1, inplace=True)

y = df['last_price_delta_since_stabilized']
X = df.drop('last_price_delta_since_stabilized', axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=0)
X_train

total dataset: 20857 rows


Unnamed: 0_level_0,0_amount_mean,0_nr_trades,0_price_delta,0_spread,1_amount_mean,1_nr_trades,1_price_delta,1_spread,2_amount_mean,2_nr_trades,...,3_spread,4_amount_mean,4_nr_trades,4_price_delta,4_spread,stabilized_amount_mean,stabilized_at_ms,stabilized_nr_trades,stabilized_spread,wave_direction
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0206,11.0,0.03,1.37,0.0190,12.0,0.03,1.37,0.0176,13.0,...,1.37,0.0157,15.0,0.03,1.37,0.0156,135,17.0,1.40,min
0,0.0538,3.0,0.20,0.23,0.0407,4.0,0.20,0.23,0.0339,5.0,...,0.23,0.0330,7.0,0.00,0.43,0.0296,108,8.0,0.43,min
0,0.0041,3.0,0.56,0.56,0.0131,4.0,0.23,0.89,0.1375,8.0,...,1.12,0.2326,10.0,0.00,1.12,0.3702,75,11.0,1.12,min
0,0.0037,4.0,0.88,0.18,0.0124,5.0,0.88,0.18,0.0115,6.0,...,1.06,0.0091,8.0,0.00,1.06,0.0119,200,9.0,1.06,max
0,0.0412,3.0,0.37,0.99,0.0646,4.0,0.36,1.00,0.0558,5.0,...,1.23,0.0601,7.0,0.04,1.32,0.0527,49,8.0,1.36,max
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.0101,5.0,1.09,0.74,0.0116,22.0,0.12,2.35,0.0116,23.0,...,2.46,0.0130,27.0,0.00,2.47,0.0125,72,28.0,2.47,min
0,0.0047,15.0,0.13,1.51,0.0069,16.0,0.13,1.51,0.0077,17.0,...,1.51,0.0105,19.0,0.13,1.51,0.0113,351,20.0,1.64,min
0,0.0267,11.0,0.59,0.44,0.0251,13.0,0.59,0.47,0.0330,16.0,...,1.06,0.1183,18.0,0.00,1.06,0.1090,63,20.0,1.06,max
0,0.0030,4.0,0.36,0.00,0.0030,5.0,0.36,0.00,0.0039,6.0,...,0.00,0.0049,9.0,0.00,0.36,0.0044,160,10.0,0.36,max


In [121]:
from catboost import CatBoostRegressor
# from sklearn.ensemle import RandomForestRegressor

model = CatBoostRegressor(learning_rate=0.007, depth=7, loss_function='RMSE', random_state=0, verbose=False, iterations=1500)

pipeline = Pipeline(steps=[
    ('preprocessor', FeatureEngineer()),
    # ('model', RandomForestRegressor(n_estimators=50, random_state=0))
    ('model',  model)
])

pipeline.fit(X_train, y_train)
score = pipeline.score(X_valid, y_valid)
display(score)


fitting on 5736 records


0.7683819996926021

In [125]:
from spreadsurfer import now_isoformat

model.save_model(f'./models/{now_isoformat()}-score-{1000 * round(score, 3)}.cat')

In [116]:
sample = pd.DataFrame([
    {'0_amount_mean': 0.033154, '0_nr_trades': 10.0, '0_price_delta': 0.12, '0_spread': 0.2, '1_amount_mean': 0.030374, '1_nr_trades': 11.0, '1_price_delta': 0.12, '1_spread': 0.3, '2_amount_mean': 0.034509, '2_nr_trades': 12.0, '2_price_delta': 0.0, '2_spread': 0.42, '3_amount_mean': 0.038008, '3_nr_trades': 13.0, '3_price_delta': 0.0, '3_spread': 0.42, '4_amount_mean': 0.039321, '4_nr_trades': 14.0, '4_price_delta': 0.0, '4_spread': 0.42, 'last_price_delta_since_stabilized': 0.31, 'stabilized_amount_mean': 0.038274, 'stabilized_at_ms': 231, 'stabilized_nr_trades': 15.0, 'stabilized_spread': 0.42, 'wave_direction': 'min'}
])

print('expected: ', sample.last_price_delta_since_stabilized.mean())
sample.drop('last_price_delta_since_stabilized', axis=1, inplace=True)

from timeit import timeit
# timeit(lambda: pipeline.predict(sample), number=1) * 1000
pipeline.predict(sample)



expected:  0.31


array([2.17286802])