# Description

The notebook simulates the performance of a "pegged-at-mid" trading algorithm.

- Load the 1sec bid/ask data
- Conduct a sanity check
- Calculate the midpoint limit buy/sell prices
- Aggregate to 5-minute bars (`5T`) and check the success of trades in historical data

```
dataset_signature=periodic.airflow.downloaded_EOD.parquet.bid_ask.futures.v3.cryptochassis.binance.v1_0_0
```

In [None]:
%load_ext autoreload
%autoreload 2
import logging

import pandas as pd

import core.finance as cofinanc
import dataflow.core as dtfcore
import dataflow.system as dtfsys
import dataflow.universe as dtfuniver
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hprint as hprint
import im_v2.crypto_chassis.data.client as iccdc
import market_data as mdata
import core.config as cconfig

In [None]:
# Initialize logging for the notebook at INFO verbosity.
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

# Log the first element of the value returned by `get_system_signature()`.
_LOG.info("%s", henv.get_system_signature()[0])

# NOTE(review): presumably applies notebook display defaults — confirm in `hprint`.
hprint.config_notebook()

# Load CryptoChassis data.

- Latest universe (v3)
- Resampled to 1sec
- For 1 asset and 1 day
- Using DataFlow `read_data` node

## Initialize a config for `read_data` node

In [None]:
# Notebook parameters: the time window to load (2022-12-14 00:00 UTC to
# 2022-12-15 00:00 UTC) and the universe, a single full symbol.
dict_ = {
    "load_data": {
        "start_ts": pd.Timestamp("2022-12-14 00:00:00+00:00"),
        "end_ts": pd.Timestamp("2022-12-15 00:00:00+00:00"),
    },
    "universe": {
        "full_symbols": ["binance::ADA_USDT"],
    }
}
config = cconfig.Config.from_dict(dict_)

In [None]:
# Set up the parameters for initialization of the IM Client.
# NOTE(review): the original note here was cut off mid-sentence ("these
# parameters are defined separately since they are ..."); presumably they are
# kept outside `config` on purpose — confirm and complete the rationale.
universe_version = "v3"
resample_1min = False
contract_type = "futures"
tag = "downloaded_1sec"
client = iccdc.get_CryptoChassisHistoricalPqByTileClient_example2(
    universe_version, resample_1min, contract_type, tag
)

In [None]:
# Read the time window from the config (marking both keys as used) and wrap it
# into a list holding a single (start, end) interval.
start_ts = config.get_and_mark_as_used(("load_data", "start_ts"))
end_ts = config.get_and_mark_as_used(("load_data", "end_ts"))

intervals = [(start_ts, end_ts)]

In [None]:
# Check that every configured symbol belongs to the `crypto_chassis_v3-all`
# universe.
universe_full_symbols = dtfuniver.get_universe("crypto_chassis_v3-all")
config_full_symbols = config.get_and_mark_as_used(
    ("universe", "full_symbols")
)
hdbg.dassert_is_subset(config_full_symbols, universe_full_symbols)
# Map the symbols to asset ids, store them in the config and read them back
# (which also marks the new key as used).
config["universe"]["asset_ids"] = client.get_asset_ids_from_full_symbols(
    config_full_symbols
)
asset_ids = config.get_and_mark_as_used(("universe", "asset_ids"))

In [None]:
# Initialize market data.
# `columns` and `columns_remap` are passed as None.
columns = None
columns_remap = None
# NOTE(review): `wall_clock_time` is not passed to the call below and is not
# referenced in the rest of the visible notebook — confirm it is still needed.
wall_clock_time = pd.Timestamp("2100-01-01T00:00:00+00:00")
# The next four settings are not used in this cell; they are assigned again
# in the cell that builds the `HistoricalDataSource` node.
stage = "read_data"
ts_col_name = "end_ts"
multiindex_output = True
col_names_to_remove = []
market_data = mdata.get_HistoricalImClientMarketData_example1(
    client,
    asset_ids,
    columns,
    columns_remap,
)

## Initialize DAG

In [None]:
def _run_dag_node(dag):
    """
    Fit `dag` over the notebook-level `intervals` and return the result df.

    :param dag: DAG to run through a `FitPredictDagRunner`
    :return: `result_df` of the result bundle produced by `fit()`
    """
    runner = dtfcore.FitPredictDagRunner(dag)
    # The fit intervals come from the notebook global `intervals`.
    runner.set_fit_intervals(intervals)
    return runner.fit().result_df

In [None]:
# Create an empty DAG.
dag = dtfcore.DAG(mode="strict")
# Draw the DAG (no nodes have been added yet).
dtfcore.draw(dag)

In [None]:
# TODO(gp): @danya, see if we have also close or trade.
# Settings for the data source node.
stage = "read_data"
ts_col_name = "end_ts"
multiindex_output = True
col_names_to_remove = []
# Build a historical data source on top of `market_data`.
node = dtfsys.HistoricalDataSource(
    stage,
    market_data,
    ts_col_name,
    multiindex_output,
    col_names_to_remove=col_names_to_remove,
)
# Make the node the head of the DAG and draw the updated graph.
dag.insert_at_head(node)
dtfcore.draw(dag)

## Read data

In [None]:
# Run the DAG over the configured interval.
df1 = _run_dag_node(dag)
# A bare `df1.shape` in the middle of a cell is evaluated and discarded (only
# the last expression of a cell is displayed), so print it explicitly.
print(df1.shape)
df1.head(5)

In [None]:
# For fewer than two assets, drop column level 1 to make the frame easier to
# read; otherwise keep an unchanged copy.
df2 = df1.droplevel(1, axis=1) if len(asset_ids) < 2 else df1.copy()

In [None]:
df2.head()

## Sanity check

A quick sanity-check for the following:
- What percentage of 1 sec bars are missing?
- How often is bid_size = 0, ask_size = 0, volume=0?
- How often is bid >= ask (i.e., the bid is not strictly below the ask)?


In [None]:
# Check for missing data: count the NaN values in each column.
df1.isna().sum()

In [None]:
# Check for zeroes: count the cells equal to 0 per row, then total them over
# the whole dataframe.
(df1 == 0).astype(int).sum(axis=1).sum()

In [None]:
# Check whether the bid price is ever not strictly below the ask price:
# True means at least one observation has bid >= ask.
(df1["bid_price"] >= df1["ask_price"]).any().any()

In [None]:
df1.head()

In [None]:
df2.head()

In [None]:
# NOTE(review): this always raises ZeroDivisionError — presumably a deliberate
# stop so that "Run All" halts after the sanity checks; remove it to run the
# rest of the notebook in one go.
2 / 0

In [None]:
# TODO(gp): There are some missing data, e.g., 19:00:02. Let's compute some quick stats.

### Commentary

Since no NaNs or zeroes were found with a simple general check, there is no need for an in-depth look.

## Augment data with new features

In [None]:
# Append `mid` data.
# mid = (bid + ask) / 2.
bid_col = "bid_price"
ask_col = "ask_price"
bid_volume_col = "bid_size"
ask_volume_col = "ask_size"
# Only the `mid` feature is requested.
requested_cols = ["mid"]
# NOTE(review): presumably keeps the input columns next to the new `mid`
# column in the output — confirm against `process_bid_ask`.
join_output_with_input = True
# Note that `df2` is re-assigned here from `df1`, replacing the earlier
# human-readable copy.
df2 = cofinanc.process_bid_ask(
    df1,
    bid_col,
    ask_col,
    bid_volume_col,
    ask_volume_col,
    requested_cols=requested_cols,
    join_output_with_input=join_output_with_input,
)
df2.head(10)

In [None]:
# TODO(gp): Let's assign dfs to different vars so that each cell is idempotent.
print(df2.shape)
print(df2.index.min())
print(df2.index.max())

In [None]:
# NOTE(review): presumably the asset id of `binance::ADA_USDT`; it is not
# referenced in the rest of the visible notebook — confirm or remove.
asset_id = 3303714233

In [None]:
df2.droplevel(1, axis=1)

In [None]:
# Swap the two column index levels.
# NOTE(review): the following cells access `df3["ask_price"]`, `df3["mid"]`,
# etc. at the top column level — confirm that the level order after the swap
# is the intended one.
df3 = df2.swaplevel(axis=1)
df3.head()

In [None]:
# Value quoted on each side of the book: price * size.
df3["ask_value"] = df3["ask_price"] * df3["ask_size"]
df3["bid_value"] = df3["bid_price"] * df3["bid_size"]

# Plot the hourly sums of both values.
# This is really high. 100m USD per hour on top of the book.
df3[["bid_value", "ask_value"]].resample("1H").sum().plot()

In [None]:
# TODO(gp): @danya this is add_limit_order_prices()

print(df3.shape)
mid_price = df3["mid"]
# Not used by the absolute-offset prices below; an earlier variant scaled the
# lagged 1T mean mid by (1 -/+ passivity_factor) instead.
passivity_factor = 0.01

# TODO(gp): This should be tuned as function of the rolling std dev.
abs_spread = 0.0001

# Reference price: the 1T mean of the mid, lagged by one bar.
lagged_mid_1min = mid_price.resample("1T").mean().shift(1)

# We are trading at the top of the book.
# TODO(gp): Crossing the spread means setting limit_buy_price = ... + abs_spread (and vice versa for sell).
df_limit_price = pd.DataFrame()
df_limit_price["limit_buy_price"] = lagged_mid_1min - abs_spread
df_limit_price["limit_sell_price"] = lagged_mid_1min + abs_spread

# Outer-join the 1T limit prices onto the original rows by index, then
# forward-fill them so every row carries the latest limit prices.
df4 = df3.merge(
    df_limit_price, how="outer", left_index=True, right_index=True
)
df4["limit_buy_price"] = df4["limit_buy_price"].ffill()
df4["limit_sell_price"] = df4["limit_sell_price"].ffill()
print(df4.shape)

In [None]:
# Flag the rows where a limit order would execute: a buy when the ask is at or
# below the buy limit, a sell when the bid is at or above the sell limit.
df4["is_buy"] = df4["ask_price"].le(df4["limit_buy_price"])
df4["is_sell"] = df4["bid_price"].ge(df4["limit_sell_price"])

In [None]:
# TODO(gp): Not sure this is working as expected. I don't see the seconds.
# Assigning df columns with a df series with a different time index might subsample.
# I would do an outer merge and then ffill.

# Let's always check the output of the df to make sure things are sane.
df4

In [None]:
# TODO(gp): @Danya
# def perform_spread_analysys(...)

In [None]:
# Bid-ask spread in price units.
spread = df4["ask_price"] - df4["bid_price"]
# The same spread relative to the mid, scaled by 1e4 (basis points).
spread_in_bps = spread / df4["mid"] * 1e4

In [None]:
spread.hist(bins=101)

In [None]:
spread.plot()

In [None]:
spread_in_bps.plot()

In [None]:
# TODO(gp): Create a function:
# plot_limit_orders(df, start_timestamp: Optional[pd.Timestamp] = None, end_timestamp: Optional[pd.Timestamp] = None)
# Plot the quotes and the limit prices for the first 1000 rows.
df4[["mid", "ask_price", "bid_price", "limit_buy_price", "limit_sell_price"]].head(1000).plot()

# Plot the execution flags for the same rows; multiplying by 1.0 turns the
# booleans into floats for plotting.
(df4[["is_buy", "is_sell"]] * 1.0).head(1000).plot()

## Resample to T_reprice

In [None]:
print(df4.shape)

In [None]:
# TODO(gp): @danya compute_repricing_df(df, report_stats):

# Executed buy price: the ask price on rows where the buy limit is hit, NaN
# elsewhere.
# `Series.mask` is used instead of the chained assignment
# `df4[col][mask] = np.nan`: chained assignment is not guaranteed to write
# back into `df4`, and `np` was not yet imported at this point of the
# notebook, so a fresh top-to-bottom run failed with a NameError.
# TODO(gp): ask_price -> buy_limit?
mask = ~df4["is_buy"]
df4["exec_buy_price"] = df4["ask_price"].mask(mask)

# TODO(gp): Not sure this does what we want. We want to average only the values that are not nan.
# NOTE(review): pandas resample/groupby means exclude NaN by default, so
# `df4["exec_buy_price"].resample("5T").mean()` likely already averages only
# the executed prices — confirm.

# Executed sell price: the bid price on rows where the sell limit is hit, NaN
# elsewhere.
mask = ~df4["is_sell"]
df4["exec_sell_price"] = df4["bid_price"].mask(mask)

In [None]:
# `hprint` is already imported at the top of the notebook; `np` is first
# imported in this cell.
import helpers.hprint as hprint

import numpy as np

# if report_stats:
# Fraction of rows on which the buy / sell limit is hit.
print("buy percentage at repricing freq: ", df4["is_buy"].sum() / len(df4))
print(df4["is_sell"].sum() / len(df4))

# Number of rows without an executed buy / sell price, relative to all rows.
print(hprint.perc(df4["exec_buy_price"].isna().sum(), len(df4)))
print(hprint.perc(df4["exec_sell_price"].isna().sum(), len(df4)))

## Resample to T_exec

In [None]:
# TODO(gp): @danya ->
# def compute_execution_df(df, t_exec: str, report_stats)

df5 = pd.DataFrame()

# Buy side.
# Number of executions and mean executed price in each 5T interval.
df5["exec_buy_num"] = df4["is_buy"].resample("5T").sum()
df5["exec_buy_price"] = df4["exec_buy_price"].resample("5T").mean()
# An interval counts as executed when it has at least one execution.
df5["exec_is_buy"] = df5["exec_buy_num"].gt(0)
print(hprint.perc(df5["exec_is_buy"].sum(), len(df5["exec_is_buy"])))
# Estimate the executed volume: ask size * ask price on executed rows, summed
# per interval.
df5["exec_buy_volume"] = (
    (df4["ask_size"] * df4["ask_price"] * df4["is_buy"]).resample("5T").sum()
)
print("million USD per 5T=", df5["exec_buy_volume"].mean() / 1e6)
# An alternative price estimate (sum of executed prices over the interval
# divided by the execution count) was sketched here but is not implemented.

# Sell side: same quantities, using the bid instead of the ask.
df5["exec_sell_num"] = df4["is_sell"].resample("5T").sum()
df5["exec_sell_price"] = df4["exec_sell_price"].resample("5T").mean()
df5["exec_is_sell"] = df5["exec_sell_num"].gt(0)
print(hprint.perc(df5["exec_is_sell"].sum(), len(df5["exec_is_sell"])))
# Estimate the executed volume.
df5["exec_sell_volume"] = (
    (df4["bid_size"] * df4["bid_price"] * df4["is_sell"]).resample("5T").sum()
)
print("million USD per 5T=", df5["exec_sell_volume"].mean() / 1e6)

## Compare to benchmark price. 

In [None]:
# TODO(gp): @danya 
# def compute_benchmark_stats(...)

In [None]:
# This is the benchmark: the mean of `mid` over each 5T interval.
df5["twap_mid_price"] = df4["mid"].resample("5T").mean()

# Plot the benchmark against the mean executed sell and buy prices.
df5[["twap_mid_price", "exec_sell_price", "exec_buy_price"]].head(1000).plot()

In [None]:
# Collect the benchmark and the executed prices in a separate dataframe.
# `.copy()` makes `slippage` independent of `df5`, so that adding a column
# below does not trigger pandas' SettingWithCopyWarning.
slippage = df5[["twap_mid_price", "exec_sell_price", "exec_buy_price"]].copy()

# Sell slippage: executed sell price relative to the TWAP mid benchmark,
# scaled by 1e4 (basis points).
slippage["sell_slippage_bps"] = (
    (df5["exec_sell_price"] - df5["twap_mid_price"])
    / df5["twap_mid_price"]
    * 1e4
)

slippage["sell_slippage_bps"].hist(bins=21)

print("sell_slippage_bps.mean=", slippage["sell_slippage_bps"].mean())
print("sell_slippage_bps.median=", slippage["sell_slippage_bps"].median())

In [None]:
df5.head()

### Commentary

A quick look at the rate of successful trades indicates that, for the given asset (`ADA/USDT`) and date, a "buy" order is filled 16% of the time, while a "sell" order is never filled.