In [533]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Name of the regression target; every other column is used as a feature.
TARGET = 'last_price_delta_since_stabilized'

# Load the collected dataset from the local parquet location.
df = pd.read_parquet('./data')
print(f'total dataset: {len(df)} rows')

## filters (all currently disabled)
# df = df[df['stabilized_spread'] < 10]  # spread not too large
# df = df[df['stabilized_spread'] > 0.2]  # spread not too small
# df = df[abs(df['last_price_delta_since_stabilized']) > 0.1]
# df = df[abs(df['last_price_delta_since_stabilized']) < 8]

# df = df[df['stabilized_amount_mean'] < 0.4]  # trades not too large
# df = df[df['stabilized_nr_trades'] < 300]  # not too many trades

# Order the columns alphabetically so the feature layout is deterministic.
df = df.sort_index(axis=1)

# Separate the features from the target column.
X = df.drop(columns=[TARGET])
y = df[TARGET]

# Fixed-seed 70/30 split into training and validation sets.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)
X_train

total dataset: 63000 rows


index
0    0.00
0   -0.20
0    0.00
0    0.00
0    0.00
     ... 
0   -1.34
0    0.00
0    0.00
0   -0.46
0    0.00
Name: past_final_price_9, Length: 63000, dtype: float64

In [526]:
from spreadsurfer.price_engine import FeatureEngineer
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
# from sklearn.ensemble import RandomForestRegressor

# Gradient-boosted regressor; kept under its own name because later cells
# save it separately from the pipeline.
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.015,
    depth=7,
    loss_function='RMSE',
    random_state=0,
    verbose=False,
)

# Feature engineering -> standardisation -> regressor.
pipeline = Pipeline(
    steps=[
        ('preprocessor', FeatureEngineer()),
        ('scaler', StandardScaler()),
        # ('model', RandomForestRegressor(n_estimators=50, random_state=0))
        ('model', model),
    ]
)
pipeline.fit(X_train, y_train)

# Score on the held-out validation split using the final estimator's
# default metric (presumably R^2 for a regressor -- TODO confirm).
score = pipeline.score(X_valid, y_valid)
display(score)


fitting on 44100 records


0.5298602653383727

In [517]:
import catboost
# Directory (including its sub-directories) handed to the CatBoost metric widget.
metrics_dir = './models'
w = catboost.MetricVisualizer(metrics_dir, subdirs=True)
w.start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

boo


In [427]:
from sklearn.model_selection import GridSearchCV

# Flip to True to run the (slow) hyper-parameter search below.
do_grid_search = False

if do_grid_search:
    # Use dedicated names here. Overwriting the notebook-level X / X_train /
    # X_valid / y_train / y_valid would leave already feature-engineered data
    # behind, so re-running the pipeline cell (which applies FeatureEngineer
    # itself) would transform the features twice, and re-running this cell
    # would transform X again.
    # NOTE(review): the search runs on FeatureEngineer output without the
    # pipeline's StandardScaler step -- confirm that is intended.
    X_engineered = FeatureEngineer().transform(X)
    X_gs_train, X_gs_valid, y_gs_train, y_gs_valid = train_test_split(
        X_engineered, y, train_size=0.8, test_size=0.2, random_state=0
    )

    # Candidate values explored for the CatBoost regressor.
    parameters = {'depth': [7, 8],
                  'learning_rate': [0.015, 0.03, 0.04],
                  'iterations': [100, 1000, 1500]
                  }
    # 2-fold cross-validated search over the training part, using all cores.
    grid = GridSearchCV(estimator=model, param_grid=parameters, cv=2, n_jobs=-1)
    grid.fit(X_gs_train, y_gs_train)

    print(" Results from Grid Search ")
    print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
    print("\n The best score across ALL searched params:\n", grid.best_score_)
    print("\n The best parameters across ALL searched params:\n", grid.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 <catboost.core.CatBoostRegressor object at 0x7ff96cec73d0>

 The best score across ALL searched params:
 0.5933380986132425

 The best parameters across ALL searched params:
 {'depth': 7, 'iterations': 1500, 'learning_rate': 0.015}


In [530]:
from spreadsurfer import now_isoformat

# Flip to False to skip writing the model file.
save = True
if save:
    # Encode the validation score in the file name as an integer number of
    # thousandths. The previous `1000 * round(score, 3)` produced a float, so
    # names ended in e.g. "score-773.0.cat" and could pick up float noise.
    score_tag = round(1000 * score)
    # NOTE: only the CatBoost model is written; the pipeline's 'preprocessor'
    # and 'scaler' steps are not part of this file, so whoever loads it has
    # to apply the same preprocessing before predicting.
    model.save_model(f'./models/{now_isoformat()}-score-{score_tag}.cat')


In [529]:
input_from_log = """
2023-01-24 02:05:44.030 | data     | spreadsurfer.datacollect:start:91 - wave collected: {'0_amount_mean': 0.0311, '0_nr_trades': 39.0, '0_price_delta': 0.65, '0_spread': 1.97, '1_amount_mean': 0.0306, '1_nr_trades': 40.0, '1_price_delta': 0.65, '1_spread': 1.97, '2_amount_mean': 0.0304, '2_nr_trades': 41.0, '2_price_delta': 0.65, '2_spread': 1.97, '3_amount_mean': 0.0286, '3_nr_trades': 45.0, '3_price_delta': 0.65, '3_spread': 1.97, '4_amount_mean': 0.028, '4_nr_trades': 46.0, '4_price_delta': 0.61, '4_spread': 2.01, 'last_price_delta_since_stabilized': 0.57, 'past_final_price_0': 1.639999999999418, 'past_final_price_1': 0.680000000000291, 'past_final_price_2': -1.0300000000024738, 'past_final_price_3': 2.9499999999970896, 'past_final_price_4': 2.099999999998545, 'past_final_price_5': -0.8400000000001455, 'past_final_price_6': 2.4399999999986903, 'past_final_price_7': 5.369999999998981, 'past_final_price_8': 1.0399999999972351, 'past_final_price_9': 0.0, 'stabilized_amount_mean': 0.0338, 'stabilized_at_ms': 419, 'stabilized_nr_trades': 72.0, 'stabilized_spread': 2.62, 'wave_direction': 'min'}
2023-01-24 02:05:44.268 | data     | spreadsurfer.datacollect:start:91 - wave collected: {'0_amount_mean': 0.0017, '0_nr_trades': 3.0, '0_price_delta': 0.05, '0_spread': 0.0, '1_amount_mean': 0.0042, '1_nr_trades': 4.0, '1_price_delta': 0.02, '1_spread': 0.03, '2_amount_mean': 0.0225, '2_nr_trades': 5.0, '2_price_delta': 0.0, '2_spread': 0.05, '3_amount_mean': 0.0189, '3_nr_trades': 6.0, '3_price_delta': 0.0, '3_spread': 1.27, '4_amount_mean': 0.02, '4_nr_trades': 7.0, '4_price_delta': 0.0, '4_spread': 1.27, 'last_price_delta_since_stabilized': -0.04, 'past_final_price_0': -0.7099999999991269, 'past_final_price_1': -1.6699999999982538, 'past_final_price_2': -3.3800000000010186, 'past_final_price_3': 0.5999999999985448, 'past_final_price_4': -0.25, 'past_final_price_5': -3.1899999999986903, 'past_final_price_6': 0.09000000000014552, 'past_final_price_7': 3.0200000000004366, 'past_final_price_8': -1.3100000000013097, 'past_final_price_9': -2.349999999998545, 'stabilized_amount_mean': 0.019, 'stabilized_at_ms': 63, 'stabilized_nr_trades': 8.0, 'stabilized_spread': 1.27, 'wave_direction': None}
2023-01-24 02:05:45.029 | data     | spreadsurfer.datacollect:start:91 - wave collected: {'0_amount_mean': 0.02, '0_nr_trades': 7.0, '0_price_delta': 0.04, '0_spread': 1.27, '1_amount_mean': 0.019, '1_nr_trades': 8.0, '1_price_delta': 0.04, '1_spread': 1.27, '2_amount_mean': 0.017, '2_nr_trades': 9.0, '2_price_delta': 0.04, '2_spread': 1.27, '3_amount_mean': 0.016, '3_nr_trades': 11.0, '3_price_delta': 0.03, '3_spread': 1.28, '4_amount_mean': 0.0147, '4_nr_trades': 12.0, '4_price_delta': 0.0, '4_spread': 1.31, 'last_price_delta_since_stabilized': -0.63, 'past_final_price_0': -2.2999999999992724, 'past_final_price_1': -4.010000000002037, 'past_final_price_2': -0.030000000002473826, 'past_final_price_3': -0.8800000000010186, 'past_final_price_4': -3.819999999999709, 'past_final_price_5': -0.5400000000008731, 'past_final_price_6': 2.389999999999418, 'past_final_price_7': -1.9400000000023283, 'past_final_price_8': -2.9799999999995634, 'past_final_price_9': 0.0, 'stabilized_amount_mean': 0.0138, 'stabilized_at_ms': 151, 'stabilized_nr_trades': 13.0, 'stabilized_spread': 1.31, 'wave_direction': 'max'}
2023-01-24 02:05:46.019 | data     | spreadsurfer.datacollect:start:91 - wave collected: {'0_amount_mean': 0.0053, '0_nr_trades': 4.0, '0_price_delta': 1.35, '0_spread': 0.07, '1_amount_mean': 0.0062, '1_nr_trades': 5.0, '1_price_delta': 0.4, '1_spread': 1.02, '2_amount_mean': 0.0053, '2_nr_trades': 6.0, '2_price_delta': 0.39, '2_spread': 1.03, '3_amount_mean': 0.0069, '3_nr_trades': 7.0, '3_price_delta': 0.18, '3_spread': 1.24, '4_amount_mean': 0.0074, '4_nr_trades': 8.0, '4_price_delta': 0.17, '4_spread': 1.25, 'last_price_delta_since_stabilized': 0.17, 'past_final_price_0': -0.3600000000005821, 'past_final_price_1': 3.6199999999989814, 'past_final_price_2': 2.7700000000004366, 'past_final_price_3': -0.16999999999825377, 'past_final_price_4': 3.110000000000582, 'past_final_price_5': 6.040000000000873, 'past_final_price_6': 1.7099999999991269, 'past_final_price_7': 0.6700000000018917, 'past_final_price_8': 3.650000000001455, 'past_final_price_9': 0.0, 'stabilized_amount_mean': 0.0067, 'stabilized_at_ms': 231, 'stabilized_nr_trades': 9.0, 'stabilized_spread': 1.42, 'wave_direction': 'min'}
2023-01-24 02:05:47.028 | data     | spreadsurfer.datacollect:start:91 - wave collected: {'0_amount_mean': 0.0008, '0_nr_trades': 3.0, '0_price_delta': 0.1, '0_spread': 0.08, '1_amount_mean': 0.001, '1_nr_trades': 4.0, '1_price_delta': 0.07, '1_spread': 0.11, '2_amount_mean': 0.001, '2_nr_trades': 5.0, '2_price_delta': 0.03, '2_spread': 0.15, '3_amount_mean': 0.0009, '3_nr_trades': 6.0, '3_price_delta': 0.0, '3_spread': 0.18, '4_amount_mean': 0.0009, '4_nr_trades': 7.0, '4_price_delta': 0.0, '4_spread': 0.18, 'last_price_delta_since_stabilized': -3.18, 'past_final_price_0': -0.8700000000026193, 'past_final_price_1': -1.7200000000011642, 'past_final_price_2': -4.6599999999998545, 'past_final_price_3': -1.3800000000010186, 'past_final_price_4': 1.5499999999992724, 'past_final_price_5': -2.780000000002474, 'past_final_price_6': -3.819999999999709, 'past_final_price_7': -0.8400000000001455, 'past_final_price_8': -4.490000000001601, 'past_final_price_9': 0.0, 'stabilized_amount_mean': 0.0009, 'stabilized_at_ms': 256, 'stabilized_nr_trades': 8.0, 'stabilized_spread': 0.18, 'wave_direction': 'max'}
2023-01-24 02:05:48.009 | data     | spreadsurfer.datacollect:start:91 - wave collected: {'0_amount_mean': 0.0432, '0_nr_trades': 3.0, '0_price_delta': 5.53, '0_spread': 0.01, '1_amount_mean': 0.0396, '1_nr_trades': 4.0, '1_price_delta': 5.52, '1_spread': 0.02, '2_amount_mean': 0.0772, '2_nr_trades': 62.0, '2_price_delta': 2.16, '2_spread': 3.38, '3_amount_mean': 0.098, '3_nr_trades': 125.0, '3_price_delta': 0.22, '3_spread': 5.32, '4_amount_mean': 0.109, '4_nr_trades': 142.0, '4_price_delta': 0.0, '4_spread': 5.54, 'last_price_delta_since_stabilized': 2.25, 'past_final_price_0': 7.1599999999998545, 'past_final_price_1': 4.220000000001164, 'past_final_price_2': 7.5, 'past_final_price_3': 10.430000000000291, 'past_final_price_4': 6.099999999998545, 'past_final_price_5': 5.06000000000131, 'past_final_price_6': 8.040000000000873, 'past_final_price_7': 4.389999999999418, 'past_final_price_8': 8.880000000001019, 'past_final_price_9': 0.0, 'stabilized_amount_mean': 0.1084, 'stabilized_at_ms': 61, 'stabilized_nr_trades': 143.0, 'stabilized_spread': 5.54, 'wave_direction': 'min'}
2023-01-24 02:05:48.223 | data     | spreadsurfer.datacollect:start:91 - wave collected: {'0_amount_mean': 0.0572, '0_nr_trades': 10.0, '0_price_delta': 0.0, '0_spread': 1.27, '1_amount_mean': 0.0531, '1_nr_trades': 11.0, '1_price_delta': 0.0, '1_spread': 1.3, '2_amount_mean': 0.052, '2_nr_trades': 12.0, '2_price_delta': 0.0, '2_spread': 1.38, '3_amount_mean': 0.0491, '3_nr_trades': 13.0, '3_price_delta': 0.0, '3_spread': 1.38, '4_amount_mean': 0.0457, '4_nr_trades': 14.0, '4_price_delta': 0.0, '4_spread': 1.38, 'last_price_delta_since_stabilized': -0.21, 'past_final_price_0': 4.8400000000001455, 'past_final_price_1': 1.9000000000014552, 'past_final_price_2': 5.180000000000291, 'past_final_price_3': 8.110000000000582, 'past_final_price_4': 3.779999999998836, 'past_final_price_5': 2.7400000000016007, 'past_final_price_6': 5.720000000001164, 'past_final_price_7': 2.069999999999709, 'past_final_price_8': 6.56000000000131, 'past_final_price_9': -2.319999999999709, 'stabilized_amount_mean': 0.0458, 'stabilized_at_ms': 77, 'stabilized_nr_trades': 15.0, 'stabilized_spread': 1.38, 'wave_direction': None}
2023-01-24 02:05:49.037 | data     | spreadsurfer.datacollect:start:91 - wave collected: {'0_amount_mean': 0.052, '0_nr_trades': 12.0, '0_price_delta': 0.22, '0_spread': 1.38, '1_amount_mean': 0.0491, '1_nr_trades': 13.0, '1_price_delta': 0.22, '1_spread': 1.38, '2_amount_mean': 0.0457, '2_nr_trades': 14.0, '2_price_delta': 0.22, '2_spread': 1.38, '3_amount_mean': 0.0458, '3_nr_trades': 15.0, '3_price_delta': 0.22, '3_spread': 1.38, '4_amount_mean': 0.0449, '4_nr_trades': 17.0, '4_price_delta': 0.01, '4_spread': 1.59, 'last_price_delta_since_stabilized': -3.02, 'past_final_price_0': -1.1299999999973807, 'past_final_price_1': 2.150000000001455, 'past_final_price_2': 5.080000000001746, 'past_final_price_3': 0.75, 'past_final_price_4': -0.28999999999723514, 'past_final_price_5': 2.6900000000023283, 'past_final_price_6': -0.9599999999991269, 'past_final_price_7': 3.530000000002474, 'past_final_price_8': -5.349999999998545, 'past_final_price_9': 0.0, 'stabilized_amount_mean': 0.0477, 'stabilized_at_ms': 95, 'stabilized_nr_trades': 18.0, 'stabilized_spread': 1.6, 'wave_direction': 'max'}
2023-01-24 02:05:49.310 | data     | spreadsurfer.datacollect:start:91 - wave collected: {'0_amount_mean': 0.0003, '0_nr_trades': 3.0, '0_price_delta': 0.0, '0_spread': 0.29, '1_amount_mean': 0.0005, '1_nr_trades': 4.0, '1_price_delta': 0.0, '1_spread': 0.3, '2_amount_mean': 0.0061, '2_nr_trades': 5.0, '2_price_delta': 0.0, '2_spread': 0.31, '3_amount_mean': 0.0052, '3_nr_trades': 6.0, '3_price_delta': 0.0, '3_spread': 0.31, '4_amount_mean': 0.0045, '4_nr_trades': 7.0, '4_price_delta': 0.0, '4_spread': 0.31, 'last_price_delta_since_stabilized': -0.53, 'past_final_price_0': -0.5599999999976717, 'past_final_price_1': 2.720000000001164, 'past_final_price_2': 5.650000000001455, 'past_final_price_3': 1.319999999999709, 'past_final_price_4': 0.2800000000024738, 'past_final_price_5': 3.2600000000020373, 'past_final_price_6': -0.3899999999994179, 'past_final_price_7': 4.100000000002183, 'past_final_price_8': -4.779999999998836, 'past_final_price_9': 0.569999999999709, 'stabilized_amount_mean': 0.0041, 'stabilized_at_ms': 89, 'stabilized_nr_trades': 8.0, 'stabilized_spread': 0.74, 'wave_direction': None}

"""
from timeit import timeit

import ast

# Replay the "wave collected" log lines through the fitted pipeline and
# report the mean absolute error between logged and predicted deltas.
mae = 0  # running sum of absolute errors; divided by `count` at the end
count = 0  # number of log lines that were actually evaluated
for line in input_from_log.splitlines():
    # Everything from the first '{' onwards is the logged dict; blank or
    # whitespace-only lines and lines without a dict payload are skipped.
    _, brace, payload = line.partition('{')
    if not brace:
        continue
    count += 1

    # literal_eval accepts only Python literals, so log text can never be
    # executed as code (the previous eval() would run anything in the line).
    record = ast.literal_eval('{' + payload)

    # Take the ground truth out of the record so it is not fed to the model.
    real = record.pop('last_price_delta_since_stabilized')
    print('expected: ', real)
    sample = pd.DataFrame([record])

    # Time one prediction; the measurement used to be computed and discarded.
    elapsed_ms = timeit(lambda: pipeline.predict(sample), number=1) * 1000
    guess = pipeline.predict(sample)[0]
    print('predict: ', guess)
    print(f'predict took: {elapsed_ms:.1f} ms')
    mae += abs(real - guess)
    print()

# Guard against an input without any usable line (would divide by zero).
display(f'MAE: {mae / count}' if count else 'MAE: n/a (no samples parsed)')

expected:  0.57
predict:  0.6671444955702863

expected:  -0.04
predict:  -0.3085850859949722

expected:  -0.63
predict:  -0.8915986439711182

expected:  0.17
predict:  1.4576489305049165

expected:  -3.18
predict:  -0.8915986439711182

expected:  2.25
predict:  1.8277499692065142

expected:  -0.21
predict:  -0.42969295805257013

expected:  -3.02
predict:  -0.8915986439711182

expected:  -0.53
predict:  -0.3596457092489567



'MAE: 0.7937863497440173'

In [438]:
# One hand-picked wave record, used to spot-check the fitted pipeline.
sample = pd.DataFrame([
    {'0_amount_mean': 0.007, '0_nr_trades': 12.0, '0_price_delta': 0.91, '0_spread': 1.15, '1_amount_mean': 0.007, '1_nr_trades': 13.0, '1_price_delta': 0.91, '1_spread': 1.52, '2_amount_mean': 0.006, '2_nr_trades': 14.0,
     '2_price_delta': 0.91, '2_spread': 1.52, '3_amount_mean': 0.006, '3_nr_trades': 16.0, '3_price_delta': 0.69, '3_spread': 1.74, '4_amount_mean': 0.005, '4_nr_trades': 17.0, '4_price_delta': 0.69, '4_spread': 1.74,
     'last_price_delta_since_stabilized': -0.87, 'past_final_price_0': -0.11999999999898137, 'past_final_price_1': 0.25, 'past_final_price_2': -0.020000000000436557, 'past_final_price_3': -1.7400000000016007,
     'past_final_price_4': 0.06999999999970896, 'past_final_price_5': -2.7299999999995634, 'past_final_price_6': -3.9099999999998545, 'past_final_price_7': 3.360000000000582, 'past_final_price_8': -0.06000000000130967,
     'past_final_price_9': 0.0, 'stabilized_amount_mean': 0.007, 'stabilized_at_ms': 369, 'stabilized_nr_trades': 18.0, 'stabilized_spread': 2.43, 'wave_direction': 'max'}
])

# Read the ground truth positionally. The previous `.real()` call is not a
# callable Series accessor and failed before anything was printed.
print('expected: ', sample.last_price_delta_since_stabilized.iloc[0])
# Remove the target so only feature columns are passed to the model.
sample.drop('last_price_delta_since_stabilized', axis=1, inplace=True)

from timeit import timeit

# Time one prediction and report it; the measurement used to be discarded.
elapsed_ms = timeit(lambda: pipeline.predict(sample), number=1) * 1000
print('predict: ', pipeline.predict(sample)[0])
print(f'predict took: {elapsed_ms:.1f} ms')


expected:  -0.87
predict:  -0.9450021282696802


In [278]:
from glob import glob

# Collect candidate model files from the current working directory.
# glob() returns matches in arbitrary order, so sort to make the choice
# deterministic. File names written by the save cell start with an ISO
# timestamp, so the last name is presumably the newest model.
# NOTE(review): the save cell writes under './models/' while this cell looks
# in the working directory -- confirm which location is intended.
cat_files = sorted(glob('*.cat'))
if not cat_files:
    # Fail with a clear message instead of a bare IndexError.
    raise FileNotFoundError("no '*.cat' model file found in the current working directory")
cat_filename = cat_files[-1]

# Load the stored CatBoost model into a fresh regressor instance.
model = CatBoostRegressor()
model.load_model(fname=cat_filename)
print(f'catboost model loaded from {cat_filename}')


catboost model loaded from 2023-01-21T20:37:02-score-773.0.cat
