In [1]:
import sys
sys.path.append('..')

In [2]:
import Simulator.simulator as simulator
import Strategies.Stoikov as Stoikov
import Simulator.get_info as get_info
import Simulator.load_data as load_data

from importlib import reload

import pandas as pd
import numpy as np
import datetime
from matplotlib import pyplot as plt
from tqdm import tqdm
tqdm.pandas()

path_to_file = '../data/data_eth/'
run_time = pd.Timedelta(hours=12).value

## Get data

In [3]:
trades = pd.read_csv(path_to_file + 'trades.csv')
trades.index = list(trades.index)

trades['receive_ts'] = pd.to_datetime(trades['receive_ts'])
trades['supporting: size_cumsum'] = trades['size'].cumsum()
trades['supporting: 1_sec_before'] = trades['receive_ts'].searchsorted(trades['receive_ts'] - datetime.timedelta(seconds=1))
dct = dict(zip(trades.index, trades['supporting: size_cumsum']))
trades['supporting: cs_1_sec_before'] = trades['supporting: 1_sec_before'].map(dct)
trades['order_intensity'] = trades['supporting: size_cumsum'] - trades['supporting: cs_1_sec_before']

In [4]:
lobs = pd.read_csv(path_to_file + 'lobs.csv')
lobs = lobs.rename(columns={
    ' exchange_ts': 'exchange_ts'
})

features = lobs.columns
lobs.index = list(lobs.index)
lobs['receive_ts'] = pd.to_datetime(lobs['receive_ts'])

In [5]:
lobs['supporting: 1_sec_before'] = lobs['receive_ts'].searchsorted(lobs['receive_ts'] - datetime.timedelta(seconds=1))
lobs['supporting: 1_sec_after'] = lobs['receive_ts'].searchsorted(lobs['receive_ts'] + datetime.timedelta(seconds=1))
lobs['supporting: 500_ms_after'] = lobs['receive_ts'].searchsorted(lobs['receive_ts'] + datetime.timedelta(milliseconds=1))

lobs['midprice'] = (lobs['ethusdt:Binance:LinearPerpetual_ask_price_0'] + lobs['ethusdt:Binance:LinearPerpetual_bid_price_0']) / 2
lobs['future: midprice'] = lobs['supporting: 500_ms_after'].map(dict(zip(lobs.index, lobs['midprice'])))
lobs['future: return'] = np.log(lobs['future: midprice']/lobs['midprice'])
lobs['i'] = lobs.index

In [6]:
lobs['volatility'] = lobs.progress_apply(lambda x : lobs['future: return'].iloc[x['supporting: 1_sec_before'] : x['i'] + 1].std(), axis=1)

100%|██████████| 2539699/2539699 [05:31<00:00, 7664.00it/s]


In [7]:
lobs['future: volatility'] = lobs['supporting: 1_sec_after'].map(dict(zip(lobs.index, lobs['volatility'])))
lobs['order_intensity'] = trades['order_intensity'][trades['receive_ts'].searchsorted(lobs['receive_ts']).clip(0, trades.shape[0] - 1)].reset_index(drop=True)
lobs['future: order_intensity'] = trades['order_intensity'][trades['receive_ts'].searchsorted(lobs['receive_ts'] + datetime.timedelta(seconds=1)).clip(0, trades.shape[0] - 1)].reset_index(drop=True)

In [8]:
lobs = lobs.iloc[1000:-1000]

In [9]:
targets = {
    'return': lobs['future: return'].copy(deep=True),
    'volatility': lobs['future: volatility'].copy(deep=True),
    'order_intensity': lobs['future: order_intensity'].copy(deep=True)
}

In [10]:
returns = pd.DataFrame({'return': targets['return']})
volatility = pd.DataFrame({'volatility': targets['volatility']})
order_intensity = pd.DataFrame({'order_intensity': targets['order_intensity']})

In [11]:
returns.to_pickle('../data/data_eth/target/return.pickle')
volatility.to_pickle('../data/data_eth/target/volatility.pickle')
order_intensity.to_pickle('../data/data_eth/target/order_intensity.pickle')

## Predict

In [12]:
from lightgbm import LGBMRegressor
import joblib
from sklearn.metrics import r2_score

In [13]:
import re
lobs = lobs[features]
lobs = lobs.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '-', x))

In [14]:
params = {'objective': 'regression', 'learning_rate': 0.1, 'num_leaves': 8, 'early_stopping_round': 10,
          'verbose': -1, 'linear_tree': True}

model = LGBMRegressor(**params)
X_train = lobs[:1_800_000].drop(columns=['receive_ts', 'exchange_ts'])
y_train = targets['return'][:1800000]

X_es = lobs[1800000:2100000].drop(columns=['receive_ts', 'exchange_ts'])
y_es = targets['return'][1800000:2100000]

X_val = lobs[2100000:].drop(columns=['receive_ts', 'exchange_ts'])
y_val = targets['return'][2100000:]

model.fit(X_train, y_train, eval_set=[(X_es, y_es)])

joblib.dump(model, 'return_predictor.pkl')

preds = model.predict(X_val)

np.corrcoef(y_val, preds)

array([[1.       , 0.0561553],
       [0.0561553, 1.       ]])

In [15]:
r2_score(y_val, preds)

-0.20908657162249566

In [16]:
params = {'objective': 'regression', 'learning_rate': 0.1, 'num_leaves': 8, 'early_stopping_round': 10,
          'verbose': -1, 'linear_tree': True}

model = LGBMRegressor(**params)
X_train = lobs[:1800000].drop(columns=['receive_ts', 'exchange_ts'])
y_train = targets['volatility'][:1800000]

X_es = lobs[1800000:2_100_000].drop(columns=['receive_ts', 'exchange_ts'])
y_es = targets['volatility'][1800000:2100000]

X_val = lobs[2100000:].drop(columns=['receive_ts', 'exchange_ts'])
y_val = targets['volatility'][2100000:]

model.fit(X_train, y_train, eval_set=[(X_es, y_es)])

joblib.dump(model, 'volatility_predictor.pkl')

preds = model.predict(X_val)

np.corrcoef(y_val, preds)

array([[1.        , 0.47677794],
       [0.47677794, 1.        ]])

In [17]:
r2_score(y_val, preds)

-0.0890721344855383

In [18]:
params = {'objective': 'regression', 'learning_rate': 0.1, 'num_leaves': 8, 'early_stopping_round': 10,
          'verbose': -1, 'linear_tree': True}

model = LGBMRegressor()
X_train = lobs[:1800000].drop(columns=['receive_ts', 'exchange_ts'])
y_train = targets['order_intensity'][:1800000]

X_es = lobs[1800000:2100000].drop(columns=['receive_ts', 'exchange_ts'])
y_es = targets['order_intensity'][1800000:2100000]

X_val = lobs[2100000:].drop(columns=['receive_ts', 'exchange_ts'])
y_val = targets['order_intensity'][2100000:]

model.fit(X_train, y_train)

joblib.dump(model, 'order_intensity_predictor.pkl')

preds = model.predict(X_val)

np.corrcoef(y_val, preds)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.120222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10200
[LightGBM] [Info] Number of data points in the train set: 1800000, number of used features: 40
[LightGBM] [Info] Start training from score 61.710788


array([[1.      , 0.091568],
       [0.091568, 1.      ]])

In [19]:
r2_score(y_val, preds)

-0.5171460055374275