In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Load the full sample set.
df = pd.read_parquet('./data')
print(f'total dataset: {len(df)} rows')

## filters
# Upper bound on the spread (currently disabled):
# df = df[df['stabilized_spread'] < 10]
# Lower bound on the spread: keep only rows whose stabilized spread exceeds 0.3.
df = df.loc[df['stabilized_spread'] > 0.3]

# Column-pruning experiments, all currently disabled (kept for reference):
# for col in df.columns:
#     if 'nr_trades' in col or 'price_delta' in col and not col.startswith('last'):
#         df.drop(col, axis=1, inplace=True)
#     if 'nr_trades' in col:
#         df.drop(col, axis=1, inplace=True)
#     if 'past' in col:
#         df.drop(col, axis=1, inplace=True)
#     if '_spread' == col[1:]:
#         df.drop(col, axis=1, inplace=True)

# One frame per wave direction; the direction column is removed from each
# because it is constant within its half.
df_min = df[df['wave_direction'] == 'min'].drop(columns='wave_direction')
df_max = df[df['wave_direction'] == 'max'].drop(columns='wave_direction')
df = None  # drop the reference to the combined frame
print(f'min dataset: {len(df_min)}')
print(f'max dataset: {len(df_max)}')

# Sort the columns of both frames by name.
df_min = df_min.sort_index(axis=1)
df_max = df_max.sort_index(axis=1)


total dataset: 129438 rows
min dataset: 45432
max dataset: 45848


In [14]:
# For each wave direction, show how every column correlates with the target
# (Pearson correlation scaled by 1000 for readability), strongest positive first.
for direction_df in (df_min, df_max):
    a = direction_df.corr()['last_price_delta_since_stabilized'] * 1000
    display(a.sort_values(ascending=False))


last_price_delta_since_stabilized    1000.000000
stabilized_spread                     121.255248
4_price_delta                          87.125520
stabilized_amount_mean                 70.170515
0_price_delta                          63.571120
1_price_delta                          61.073696
4_amount_mean                          60.769982
3_amount_mean                          58.992283
2_price_delta                          57.510423
3_price_delta                          54.599929
2_amount_mean                          54.372355
1_amount_mean                          52.494637
0_amount_mean                          50.198359
stabilized_gasp                        10.982964
0_nr_trades                           -35.906691
1_nr_trades                           -40.612537
2_nr_trades                           -46.274581
3_nr_trades                           -51.991202
stabilized_nr_trades                  -55.008424
4_nr_trades                           -63.127651
stabilized_at_ms    

last_price_delta_since_stabilized    1000.000000
stabilized_at_ms                      288.071634
4_nr_trades                            56.124143
stabilized_nr_trades                   45.255247
3_nr_trades                            45.155037
2_nr_trades                            38.702827
1_nr_trades                            31.744374
0_nr_trades                            27.550893
stabilized_gasp                         3.268431
0_amount_mean                         -43.589269
2_amount_mean                         -46.663917
1_price_delta                         -46.974294
1_amount_mean                         -47.050284
3_amount_mean                         -52.446433
2_price_delta                         -52.733642
0_price_delta                         -52.778477
4_amount_mean                         -55.756021
3_price_delta                         -61.113229
stabilized_amount_mean                -69.204663
4_price_delta                         -91.336248
stabilized_spread   

In [20]:
from spreadsurfer.price_engine import FeatureEngineer
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler


def train(df, mape=True):
    """Fit a FeatureEngineer + CatBoost pipeline on ``df`` and score it on a hold-out split.

    Args:
        df: frame containing the feature columns plus the target column
            ``last_price_delta_since_stabilized``. It is copied, not modified.
        mape: when true the regressor is trained with the ``MAPE`` loss,
            otherwise with ``RMSE``. All other hyper-parameters are identical.

    Returns:
        ``(model, score)`` where ``model`` is the fitted ``CatBoostRegressor``
        itself (the final pipeline step, *not* the pipeline), and ``score`` is
        the value of ``pipeline.score`` on the 30% validation split
        (presumably R^2 -- confirm against the estimator's ``score`` method).

    Note:
        Because only the bare model is returned, calling ``model.predict``
        directly bypasses the ``FeatureEngineer`` preprocessing step.
    """
    target = 'last_price_delta_since_stabilized'
    loss = 'MAPE' if mape else 'RMSE'

    model = CatBoostRegressor(
        learning_rate=0.01,
        depth=7,
        loss_function=loss,
        random_state=0,
        verbose=False,
        iterations=1500,
    )
    pipeline = Pipeline(steps=[
        ('preprocessor', FeatureEngineer()),
        ('model', model),
    ])

    frame = df.copy()
    y = frame[target]
    X = frame.drop(columns=target)
    # Fixed seed so that every call sees the same 70/30 train/validation split.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.7, test_size=0.3, random_state=0
    )

    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_valid, y_valid)
    display(score)
    return model, score


# MAPE-loss models, one per wave direction.
min_mape, min_mape_score = train(df_min, mape=True)
max_mape, max_mape_score = train(df_max, mape=True)
# RMSE-loss models. Their validation scores are kept under explicit names
# instead of being assigned to `_`: in a notebook `_` is IPython's
# "last output" variable, and rebinding it stops IPython from updating it.
min_rmse, min_rmse_score = train(df_min, mape=False)
max_rmse, max_rmse_score = train(df_max, mape=False)


0.1641255596101746

-0.010437577754366378

-0.02271522594168429

0.1886481407478401

0.1736075473432439

In [25]:
from spreadsurfer import now_isoformat

import os

# Toggle: set to False to re-run the notebook without writing model files.
save = True
if save:
    # Make sure the target directory exists before CatBoost tries to write into it.
    os.makedirs('./models', exist_ok=True)
    # Take the timestamp once so the min/max pair from one run shares a prefix.
    stamp = now_isoformat()
    # NOTE(review): the score embedded in each filename comes from the
    # MAPE-trained run, while the file holds the RMSE-trained model --
    # confirm which score is meant to label these files.
    min_rmse.save_model(f'./models/{stamp}-min_rmse-score-{1000 * round(min_mape_score, 3)}.cat')
    max_rmse.save_model(f'./models/{stamp}-max_rmse-score-{1000 * round(max_mape_score, 3)}.cat')


## TEST LAB


In [24]:
# Keep the last 100 log lines that mention 'collected'; each retains its
# trailing newline, so joining them yields newline-separated text.
with open("console.log", "r") as f:
    collected_lines = [log_line for log_line in f if 'collected' in log_line]
input_from_log = ''.join(collected_lines[-100:])

def predict_samples():
    """Replay the samples in ``input_from_log`` through the trained models.

    Each non-empty line is expected to contain a Python dict literal starting
    at its first ``{``. The dict is turned into a one-row frame, the matching
    model pair (min or max, chosen by ``wave_direction``) predicts
    ``last_price_delta_since_stabilized``, and the expected and predicted
    values are printed. At the end the mean absolute error of the MAPE-loss
    model (``MAE``) and of the RMSE-loss model (``MAE2``) is displayed.

    Lines without a ``{`` and samples whose ``wave_direction`` is neither
    ``'min'`` nor ``'max'`` are skipped and do not count towards the averages.
    """
    import ast

    abs_err_mape = 0
    abs_err_rmse = 0
    count = 0
    for line in input_from_log.split('\n'):
        # Skip blank lines and lines that carry no dict payload.
        if not line or '{' not in line:
            continue

        # Everything from the first '{' onwards is the payload.
        payload = line[line.index('{'):]
        # `nan` is not a valid literal; treat missing values as 0.
        # NOTE(review): this replaces every 'nan' substring, including inside
        # keys or string values -- fine only while no such names exist.
        payload = payload.replace('nan', '0')
        # The log is external text: parse it as a literal instead of eval()
        # so that it can never execute arbitrary code.
        sample = pd.DataFrame([ast.literal_eval(payload)])

        direction = sample.wave_direction[0]
        if direction not in ['min', 'max']:
            continue
        # Count only the samples that are actually scored, so the MAE
        # denominator matches the number of accumulated errors.
        count += 1

        real = sample.last_price_delta_since_stabilized[0]
        # if real < 0.1: continue
        print('expected: ', real)

        if direction == 'min':
            print('using MIN mape model')
            model = min_mape
            model2 = min_rmse
        else:
            print('using MAX mape model')
            model = max_mape
            model2 = max_rmse

        # NOTE(review): these are the bare CatBoost models, so the
        # FeatureEngineer step used during fitting is not applied here --
        # confirm the raw log columns are what the models expect.
        features = sample.drop(columns=['last_price_delta_since_stabilized', 'wave_direction'])

        guess = model.predict(features)[0]
        guess2 = model2.predict(features)[0]
        print('predict: ', guess)
        print('predict2: ', guess2)
        abs_err_mape += abs(real - guess)
        abs_err_rmse += abs(real - guess2)
        print()

    if count == 0:
        # Nothing was scored; avoid dividing by zero.
        display('MAE: n/a (no min/max samples found)')
        return
    display(f'MAE: {abs_err_mape / count}')
    display(f'MAE2: {abs_err_rmse / count}')

predict_samples()


expected:  0.0
using MIN mape model
predict:  0.004881361281631941
predict2:  0.3809028355687599

expected:  -2.53
using MAX mape model
predict:  -0.3572492208713202
predict2:  -0.7256203240337229

expected:  1.01
using MIN mape model
predict:  0.6117831174004376
predict2:  1.5551748061844024

expected:  -0.66
using MAX mape model
predict:  -0.12397609608909793
predict2:  -0.28073461911158976

expected:  0.02
using MIN mape model
predict:  0.4838251319581675
predict2:  0.8543858724183178

expected:  0.5
using MIN mape model
predict:  0.11248173337018426
predict2:  0.20044478000377047

expected:  0.6
using MIN mape model
predict:  0.04597115142569544
predict2:  0.35053212056146416

expected:  -1.19
using MAX mape model
predict:  -0.33859611314454224
predict2:  -0.7144218210163709

expected:  -0.08
using MAX mape model
predict:  -0.2525868283872966
predict2:  -0.45021114369048826

expected:  -1.56
using MAX mape model
predict:  -0.44848328413658756
predict2:  -0.9151000630447021

expecte

'MAE: 0.510872859891832'

'MAE2: 0.47430157981071164'