In [251]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
import pandas as pd

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y):
        print(f'fitting on {len(X)} records')
        return self

    def transform(self, X, y=None):
        df = X.copy()
        df.loc[df['wave_direction'] == 'min', 'wave_direction'] = 1
        df.loc[df['wave_direction'] == 'max'] = -1
        df = df.astype({"wave_direction": 'float64'})
        return df

df = pd.read_parquet('./data')
print(f'total dataset: {len(df)} rows')

if '0_price' in df.columns:
    df.drop('0_price', axis=1, inplace=True)
    df.drop('1_price', axis=1, inplace=True)
    df.drop('2_price', axis=1, inplace=True)
    df.drop('3_price', axis=1, inplace=True)
    df.drop('4_price', axis=1, inplace=True)

# filters
df = df[df['stabilized_spread'] < 10] # nem tul nagy spread
df = df[df['stabilized_spread'] > 0.2] # nem tul nagy spread
df = df[abs(df['last_price_delta_since_stabilized']) > 1]
df = df[abs(df['last_price_delta_since_stabilized']) < 8]

## df = df[df['stabilized_amount_mean'] < 0.4]  # nem tul nagy tradek
## df = df[df['stabilized_nr_trades'] < 300]  # nem tul sok trade


df.sort_index(axis=1, inplace=True)

y = df['last_price_delta_since_stabilized']
X = df.drop('last_price_delta_since_stabilized', axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
X_train

total dataset: 20857 rows


Unnamed: 0_level_0,0_amount_mean,0_nr_trades,0_price_delta,0_spread,1_amount_mean,1_nr_trades,1_price_delta,1_spread,2_amount_mean,2_nr_trades,...,3_spread,4_amount_mean,4_nr_trades,4_price_delta,4_spread,stabilized_amount_mean,stabilized_at_ms,stabilized_nr_trades,stabilized_spread,wave_direction
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0380,3.0,0.08,0.55,0.0295,4.0,0.08,0.55,0.0231,7.0,...,0.60,0.0222,9.0,0.02,0.61,0.0734,228,16.0,0.63,max
0,0.0036,6.0,0.17,0.07,0.0044,8.0,0.17,0.22,0.0042,10.0,...,0.39,0.0041,12.0,0.00,0.39,0.0045,277,14.0,0.39,max
0,0.0260,16.0,1.64,1.29,0.0166,38.0,0.34,2.59,0.0165,39.0,...,2.75,0.0176,41.0,0.05,2.88,0.0183,44,42.0,2.93,max
0,0.0049,5.0,0.26,0.14,0.0042,6.0,0.13,0.27,0.0039,7.0,...,0.40,0.0133,9.0,0.00,0.40,0.0121,72,10.0,0.40,max
0,0.0061,4.0,0.24,0.81,0.0050,5.0,0.19,0.86,0.0044,6.0,...,0.89,0.0101,9.0,0.00,1.05,0.0094,299,10.0,1.05,min
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.0081,4.0,1.01,0.97,0.0062,7.0,0.99,0.99,0.0104,8.0,...,1.76,0.0142,10.0,0.22,1.76,0.0169,230,11.0,1.98,min
0,0.0623,10.0,0.89,1.46,0.0567,11.0,0.89,1.46,0.0541,12.0,...,1.46,0.0465,14.0,0.34,2.01,0.0437,106,15.0,2.35,max
0,0.0049,3.0,1.52,1.57,0.0252,4.0,1.52,1.57,0.0193,6.0,...,2.36,0.0324,19.0,0.71,2.38,0.0955,167,33.0,3.09,max
0,0.0189,8.0,0.03,1.28,0.0170,9.0,0.03,1.28,0.0160,10.0,...,1.28,0.0170,13.0,0.03,1.28,0.0159,286,14.0,1.31,max


In [231]:
from catboost import CatBoostRegressor
# from sklearn.ensemle import RandomForestRegressor

model = CatBoostRegressor(learning_rate=0.007, depth=7, loss_function='RMSE', random_state=0, verbose=False, iterations=1500)

pipeline = Pipeline(steps=[
    ('preprocessor', FeatureEngineer()),
    # ('model', RandomForestRegressor(n_estimators=50, random_state=0))
    ('model',  model)
])

pipeline.fit(X_train, y_train)
score = pipeline.score(X_valid, y_valid)
display(score)


fitting on 6455 records


0.772864681037414

In [234]:
from spreadsurfer import now_isoformat

save = False
if save:
    model.save_model(f'./models/{now_isoformat()}-score-{1000 * round(score, 3)}.cat')


In [241]:
sample = pd.DataFrame([
    {'0_amount_mean': 0.094411, '0_nr_trades': 75.0, '0_price_delta': 0.61, '0_spread': 1.44, '1_amount_mean': 0.093298, '1_nr_trades': 76.0, '1_price_delta': 0.61, '1_spread': 1.44, '2_amount_mean': 0.092106, '2_nr_trades': 77.0, '2_price_delta': 0.61, '2_spread': 1.44, '3_amount_mean': 0.084998, '3_nr_trades': 85.0, '3_price_delta': 0.11, '3_spread': 1.94, '4_amount_mean': 0.084199, '4_nr_trades': 86.0, '4_price_delta': 0.04, '4_spread': 2.01, 'last_price_delta_since_stabilized': 1.97, 'stabilized_amount_mean': 0.08325, 'stabilized_at_ms': 123, 'stabilized_nr_trades': 87.0, 'stabilized_spread': 2.05, 'wave_direction': 'min'}
])

print('expected: ', sample.last_price_delta_since_stabilized.mean())
sample.drop('last_price_delta_since_stabilized', axis=1, inplace=True)

from timeit import timeit
timeit(lambda: pipeline.predict(sample), number=1) * 1000
pipeline.predict(sample)


expected:  1.97


array([1.96460944])

In [244]:
from glob import glob
cat_filename = glob('*.cat')[0]
cat_filename


'2023-01-21T20:37:02-score-773.0.cat'