In [71]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import numpy as np

df = pd.read_parquet('./data')

## filters
# df = df[df['stabilized_spread'] < 10]  # nem tul nagy spread
# df = df[df['stabilized_spread'] > 0.3]  # nem tul kis spread

print(f'total dataset: {len(df)} rows')

for col in df.columns:
    if 'nr_trades' in col or 'price_delta' in col and not col.startswith('last'):
        df.drop(col, axis=1, inplace=True)
    # if 'nr_trades' in col:
    #     df.drop(col, axis=1, inplace=True)
    if 'past' in col:
        df.drop(col, axis=1, inplace=True)
    if '_spread' == col[1:]:
        df.drop(col, axis=1, inplace=True)
    pass

df_min = df[df.wave_direction == 'min'].copy()
df_max = df[df.wave_direction == 'max'].copy()

df.drop('wave_direction', axis=1, inplace=True)
df_min.drop('wave_direction', axis=1, inplace=True)
df_max.drop('wave_direction', axis=1, inplace=True)
print(f'min dataset: {len(df_min)}')
print(f'max dataset: {len(df_max)}')

df_min.sort_index(axis=1, inplace=True)
df_max.sort_index(axis=1, inplace=True)

display(df_min.last_price_delta_since_stabilized.describe())
display(df_max.last_price_delta_since_stabilized.describe())

# display(df_max)

## df_max.last_price_delta_since_stabilized = df_max.last_price_delta_since_stabilized * -1

total dataset: 201839 rows
min dataset: 69536
max dataset: 70401


count    69536.000000
mean         0.599530
std          1.025531
min          0.000000
25%          0.000000
50%          0.220000
75%          0.760000
max         30.220000
Name: last_price_delta_since_stabilized, dtype: float64

count    70401.000000
mean        -0.583757
std          0.997632
min        -20.960000
25%         -0.740000
50%         -0.210000
75%          0.000000
max          0.000000
Name: last_price_delta_since_stabilized, dtype: float64

In [72]:
a = df.corr().last_price_delta_since_stabilized * 1000
display(a.sort_values(axis=0, ascending=False))

# a = df_min.corr().last_price_delta_since_stabilized * 1000
# display(a.sort_values(axis=0, ascending=False))
#
# a = df_max.corr().last_price_delta_since_stabilized * 1000
# display(a.sort_values(axis=0, ascending=False))


last_price_delta_since_stabilized    1000.000000
stabilized_gasp                        40.140500
stabilized_at_ms                       33.182801
stabilized_spread                      10.520160
2_amount_mean                           1.640186
1_amount_mean                           1.035427
3_amount_mean                           0.664238
0_amount_mean                          -1.284374
4_amount_mean                          -1.629724
stabilized_amount_mean                 -2.212604
Name: last_price_delta_since_stabilized, dtype: float64

In [73]:
from spreadsurfer.price_engine import FeatureEngineer
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler


def train(df, loss_function):
    # model = CatBoostRegressor(learning_rate=0.01, depth=7, loss_function=loss_function, random_state=0, verbose=False, iterations=1500)
    model = CatBoostRegressor(learning_rate=0.15, depth=6, loss_function=loss_function, random_state=0, verbose=False)
    pipeline = Pipeline(steps=[
        ('preprocessor', FeatureEngineer()),
        ('model', model)
    ])

    df = df.copy()
    y = df['last_price_delta_since_stabilized']
    X = df.drop('last_price_delta_since_stabilized', axis=1)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=0)

    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_valid, y_valid)
    display(f'{loss_function}: {score}')
    return model, score


min_model, min_score = train(df, loss_function='MAPE')
# max_model, max_score = train(df_max, loss_function='MAPE')
# min_model, min_score = train(df, loss_function='RMSE')
# max_model, max_score = train(df_max, loss_function='RMSE')

# min_model, min_score = train(df_min, loss_function='Poisson') # 0.12
# max_model, max_score = train(df, loss_function='Poisson') # 0.14

# min_model, min_score = train(df_min, loss_function='Huber:delta=0.4')
# max_model, max_score = train(df, loss_function='Huber:delta=0.4')

# max_model, max_score = train(df, loss_function='Quantile:alpha=0.8')
# max_model, max_score = train(df, loss_function='Quantile:alpha=0.7')
# max_model, max_score = train(df, loss_function='Quantile:alpha=0.6')
# max_model, max_score = train(df, loss_function='Quantile:alpha=0.5')
# max_model, max_score = train(df, loss_function='Quantile:alpha=0.4')
# max_model, max_score = train(df, loss_function='Quantile:alpha=0.3')
# max_model, max_score = train(df, loss_function='Quantile:alpha=0.2')

# max_model, max_score = train(df_max, loss_function='Quantile:alpha=0.7')



'MAPE: 0.12600981684257384'

In [61]:
min_model, min_score = train(df, loss_function='Quantile:alpha=0.811')
max_model, max_score = train(df, loss_function='Quantile:alpha=0.178') # 0.0038


'Quantile:alpha=0.811: 0.017096799918225103'

'Quantile:alpha=0.178: 0.003792171935041111'

In [75]:
from spreadsurfer import now_isoformat

save = True
if save:
    min_model.save_model(f'./models/{now_isoformat()}-mape-score-{1000 * round(min_score, 3)}.cat')
    # max_model.save_model(f'./models/{now_isoformat()}-quantile0178-score-{1000 * round(max_score, 3)}.cat')


In [74]:
model = min_model

with open("neverseen.log","r") as f:
    input_from_log = ''.join([x for x in f.readlines() if 'collected' in x][-100:])

pipeline = Pipeline(steps=[
    ('preprocessor', FeatureEngineer()),
    ('model', model)
])

mae = 0
count = 0
for line in input_from_log.split('\n'):
    if not line: continue
    count += 1
    json = '{' + line.split('{')[1]
    json = json.replace('nan', '0')
    sample = pd.DataFrame([eval(json)])
    if sample.wave_direction[0] not in ['min', 'max']: continue

    real = sample.last_price_delta_since_stabilized[0]
    if abs(real) < 0.2: continue
    sample.drop('last_price_delta_since_stabilized', axis=1, inplace=True)

    sample.loc[sample['wave_direction'] == 'min', 'wave_direction'] = 1
    sample.loc[sample['wave_direction'] == 'max', 'wave_direction'] = -1
    sample = sample.astype({"wave_direction": 'float64'})

    # if sample['wave_direction'][0] != (1 if min else -1):
    #     continue

    print('expected: ', real)
    guess = model.predict(sample)[0]
    guess *= 3.5
    print('predict: ', guess)
    mae += abs(real - guess)
    print()
display(f'MAE: {mae / count}')


expected:  -0.25
predict:  -0.33381900802663705

expected:  0.64
predict:  0.3336713274390162

expected:  0.55
predict:  0.4827536632059254

expected:  -0.5
predict:  -0.31529032645272353

expected:  -0.23
predict:  -0.5161163142562414

expected:  -1.59
predict:  -0.3551902269828745

expected:  1.14
predict:  1.1380419517450282

expected:  0.94
predict:  0.8185788298481572

expected:  -0.54
predict:  -0.6295173418783049

expected:  -0.21
predict:  -0.38243273635497005

expected:  -0.33
predict:  -0.21370283745824653

expected:  0.21
predict:  1.4007142361503633

expected:  -0.43
predict:  -0.48145900839721256

expected:  -1.01
predict:  -0.43318889911389374

expected:  -0.2
predict:  -0.5021814621222135

expected:  3.18
predict:  0.1477227994364838

expected:  -0.3
predict:  -0.22451048839569404

expected:  0.92
predict:  0.009160082248520402

expected:  0.27
predict:  0.017790864704376037

expected:  1.73
predict:  -0.4107362426920435

expected:  -0.39
predict:  -0.22475383992859557



'MAE: 0.15387618899734767'