In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Read the complete sample set from the parquet store.
df = pd.read_parquet('./data')
print(f'total dataset: {len(df)} rows')

## filters
# Disabled upper bound (spread not too large):
#   df = df[df['stabilized_spread'] < 10]
# Active lower bound: the spread must not be too small.
df = df.loc[df['stabilized_spread'] > 0.3]

# Placeholder loop for column-pruning experiments. Every rule below is
# switched off, so the loop currently leaves the frame untouched.
for col in df.columns:
    # if 'nr_trades' in col or 'price_delta' in col and not col.startswith('last'):
    #     df.drop(col, axis=1, inplace=True)
    # if 'nr_trades' in col:
    #     df.drop(col, axis=1, inplace=True)
    # if 'past' in col:
    #     df.drop(col, axis=1, inplace=True)
    # if '_spread' == col[1:]:
    #     df.drop(col, axis=1, inplace=True)
    pass

# One independent frame per wave direction. The direction column is the
# selection key, so it is dropped from each resulting frame.
df_min, df_max = (
    df.loc[df['wave_direction'] == direction].drop(columns='wave_direction')
    for direction in ('min', 'max')
)
df = None  # drop the reference to the combined frame
print(f'min dataset: {len(df_min)}')
print(f'max dataset: {len(df_max)}')

# Order the columns by label in both frames.
df_min = df_min.sort_index(axis=1)
df_max = df_max.sort_index(axis=1)

# Flip the sign of the target column for the 'max' frame.
df_max['last_price_delta_since_stabilized'] = df_max['last_price_delta_since_stabilized'].mul(-1)


total dataset: 130438 rows
min dataset: 45737
max dataset: 46146


index
0    0.78
0    1.84
0    0.74
0    0.89
0   -0.00
     ... 
0   -0.00
0    0.89
0    1.62
0    2.46
0    5.52
Name: last_price_delta_since_stabilized, Length: 46146, dtype: float64

In [14]:
# Correlation of every column with the target, scaled by 1000 so small
# coefficients are easier to compare; listed from highest to lowest.
# 'min' waves:
a = df_min.corr()['last_price_delta_since_stabilized'].mul(1000)
display(a.sort_values(ascending=False))

# 'max' waves:
a = df_max.corr()['last_price_delta_since_stabilized'].mul(1000)
display(a.sort_values(ascending=False))


last_price_delta_since_stabilized    1000.000000
stabilized_spread                     121.255248
4_price_delta                          87.125520
stabilized_amount_mean                 70.170515
0_price_delta                          63.571120
1_price_delta                          61.073696
4_amount_mean                          60.769982
3_amount_mean                          58.992283
2_price_delta                          57.510423
3_price_delta                          54.599929
2_amount_mean                          54.372355
1_amount_mean                          52.494637
0_amount_mean                          50.198359
stabilized_gasp                        10.982964
0_nr_trades                           -35.906691
1_nr_trades                           -40.612537
2_nr_trades                           -46.274581
3_nr_trades                           -51.991202
stabilized_nr_trades                  -55.008424
4_nr_trades                           -63.127651
stabilized_at_ms    

last_price_delta_since_stabilized    1000.000000
stabilized_at_ms                      288.071634
4_nr_trades                            56.124143
stabilized_nr_trades                   45.255247
3_nr_trades                            45.155037
2_nr_trades                            38.702827
1_nr_trades                            31.744374
0_nr_trades                            27.550893
stabilized_gasp                         3.268431
0_amount_mean                         -43.589269
2_amount_mean                         -46.663917
1_price_delta                         -46.974294
1_amount_mean                         -47.050284
3_amount_mean                         -52.446433
2_price_delta                         -52.733642
0_price_delta                         -52.778477
4_amount_mean                         -55.756021
3_price_delta                         -61.113229
stabilized_amount_mean                -69.204663
4_price_delta                         -91.336248
stabilized_spread   

In [50]:
from spreadsurfer.price_engine import FeatureEngineer
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler


def train(df, loss_function, learning_rate=0.15, depth=6, random_state=0, **catboost_params):
    """Fit a FeatureEngineer + CatBoostRegressor pipeline and score it on a hold-out split.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples including the target column 'last_price_delta_since_stabilized'.
        The frame itself is only read, never modified.
    loss_function : str
        Loss function name handed to CatBoostRegressor, e.g. 'Poisson', 'RMSE'
        or 'Huber:delta=0.4'.
    learning_rate : float, default 0.15
        Learning rate handed to CatBoostRegressor.
    depth : int, default 6
        Tree depth handed to CatBoostRegressor.
    random_state : int, default 0
        Seed used for both the model and the train/validation split.
    **catboost_params
        Any further CatBoostRegressor keyword arguments (for example
        ``iterations=1500``). They take precedence over the values above,
        including the default ``verbose=False``.

    Returns
    -------
    tuple
        ``(model, score)``: the CatBoostRegressor that was fitted as the last
        pipeline step (the FeatureEngineer step is not returned) and the value
        of ``pipeline.score`` on the 30 % validation split. The score is also
        displayed.
    """
    model_params = dict(
        learning_rate=learning_rate,
        depth=depth,
        loss_function=loss_function,
        random_state=random_state,
        verbose=False,
    )
    # Caller-supplied settings win, so experiments no longer require editing this function.
    model_params.update(catboost_params)
    model = CatBoostRegressor(**model_params)
    pipeline = Pipeline(steps=[
        ('preprocessor', FeatureEngineer()),
        ('model', model)
    ])

    # `drop` returns a new frame, so no defensive copy of `df` is needed.
    target = 'last_price_delta_since_stabilized'
    y = df[target]
    X = df.drop(columns=target)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.7, test_size=0.3, random_state=random_state
    )

    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_valid, y_valid)
    display(score)
    return model, score


# Loss functions tried for both directions so far (swap the string below to repeat one):
#   'MAPE'
#   'RMSE'
#   'Huber:delta=0.4'
#   'Quantile:alpha=0.7'
# Currently active: 'Poisson' (noted scores: 0.12 for min, 0.14 for max).
min_model, min_score = train(df=df_min, loss_function='Poisson')
max_model, max_score = train(df=df_max, loss_function='Poisson')



0.12562247115078307

0.14840873378065234

In [51]:
from spreadsurfer import now_isoformat

# Set to False to re-run the notebook without writing model files.
save = True
if save:
    # The score is embedded in the file name as thousandths with one fixed decimal.
    # Formatting with ':.1f' yields the same text as before (e.g. '126.0') while
    # guaranteeing that binary floating-point noise from `1000 * round(score, 3)`
    # can never leak a long tail of digits into the file name.
    min_model.save_model(f'./models/{now_isoformat()}-min_poisson-slower-score-{1000 * round(min_score, 3):.1f}.cat')
    max_model.save_model(f'./models/{now_isoformat()}-max_poisson-slower-inv-score-{1000 * round(max_score, 3):.1f}.cat')
