# Imports

In [1]:
import logging

import pandas as pd
import requests

import core.finance.resampling as cfinresa
import helpers.hdatetime as hdateti
import helpers.hdbg as hdbg
import helpers.hprint as hprint
import im_v2.ccxt.data.extract.exchange_class as imvcdeexcl

In [2]:
# Initialize logging for the notebook session at the `INFO` verbosity level.
hdbg.init_logger(verbosity=logging.INFO)

# Logger used by this notebook.
_LOG = logging.getLogger(__name__)

# Apply the project's notebook configuration.
# NOTE(review): presumably sets display / plotting options - confirm in `hprint`.
hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-bcd87e13-bfa4-4ea2-8a36-f727556486f6.json'


# Bid-ask data snippet (current implementation)

In [3]:
# Specify params: the exchange to download the order book from.
exchange_id = "binance"
# Initiate the CCXT-based client for the chosen exchange; it is reused by the
# cells below.
bid_ask_client = imvcdeexcl.CcxtExchange(exchange_id)

In [4]:
# Load the data snippet for BTC.
currency_pair = "BTC_USDT"
# NOTE: despite the `_df` suffix, the displayed result is a plain dict with
# `symbol` and `bids` keys, not a DataFrame.
ba_df = bid_ask_client.download_order_book(currency_pair)

In [5]:
# Show the raw order book snapshot returned by the client.
ba_df

{'symbol': 'BTC/USDT',
 'bids': [[39429.75, 0.09976],
  [39429.27, 0.00027],
  [39428.5, 0.20008],
  [39426.49, 0.06],
  [39425.51, 0.00037],
  [39423.66, 0.00364],
  [39422.77, 0.045],
  [39422.55, 0.06],
  [39422.13, 0.00027],
  [39422.12, 0.00266],
  [39421.8, 0.0471],
  [39421.56, 0.0009],
  [39421.48, 0.31755],
  [39421.47, 0.14729],
  [39421.09, 0.44316],
  [39420.88, 0.00037],
  [39420.81, 0.04717],
  [39420.75, 0.38033],
  [39420.61, 0.8],
  [39420.11, 0.01105],
  [39420.0, 0.05074],
  [39419.9, 0.1268],
  [39419.85, 0.38034],
  [39419.82, 0.04675],
  [39419.8, 0.924],
  [39419.42, 0.001],
  [39419.17, 0.0005],
  [39418.83, 0.0194],
  [39418.16, 0.0003],
  [39418.1, 0.0095],
  [39417.79, 0.1],
  [39417.35, 0.01775],
  [39417.19, 0.00037],
  [39417.02, 0.60143],
  [39416.77, 0.94299],
  [39416.26, 0.15109],
  [39416.21, 0.48],
  [39416.08, 0.00486],
  [39416.0, 0.00166],
  [39415.92, 0.2],
  [39415.86, 0.14086],
  [39415.57, 0.0006],
  [39415.54, 0.0633],
  [39415.01, 0.01369],


As one can see, the current implementation of the bid-ask data loader only shows the order book at the exact moment the request is made.

# Bid-ask data extraction (proposed solution)

Thanks to the research done in #193, we know that bid-ask data can be downloaded from open sources, specifically from _crypto-chassis_.
For more details, see https://github.com/cryptokaizen/cmamp/issues/193#issuecomment-974822385

Few words about the data:
- API page: https://github.com/crypto-chassis/cryptochassis-data-api-docs#information
   - Specifically, `Market Depth` section
- Each GET request allows downloading one day of 1-second snapshot data on market depth (aka order books or Level 2 data) up to a depth of 10

## Example of a raw data

For the example, I am taking the data with the following characteristics:
- `full_symbol` = binance::BTC_USDT
- depth = 1 (default option)

In [6]:
# Request one day of market-depth data for the example date.
example_date = "2022-01-01"
r = requests.get(
    f"https://api.cryptochassis.com/v1/market-depth/binance/btc-usdt?startTime={example_date}",
    # Do not block the kernel forever if the API stops responding.
    timeout=60,
)
# Fail with a clear HTTP error instead of an obscure `KeyError` when parsing
# the JSON body of an error response.
r.raise_for_status()
# The response contains a link to a gzipped CSV file with the actual data.
example_data = pd.read_csv(r.json()["urls"][0]["url"], compression="gzip")

In [7]:
# Preview the first rows of the raw data.
example_data.head()

Unnamed: 0,time_seconds,bid_price_bid_size,ask_price_ask_size
0,1640995200,46214.01_1.08286,46214.02_0.24793
1,1640995201,46216.92_0.04341,46216.93_1.72798
2,1640995202,46220.82_0.86682,46220.83_1.10668
3,1640995203,46220.82_1.05178,46220.83_0.785
4,1640995204,46220.82_1.85231,46220.83_1.06233


## Get historical data

Each request is strictly limited to one day of bid-ask data. That's why I propose a solution that builds a DataFrame for any desired time range of historical data.

### Functions that convert data to the C-K format

In [8]:
def clean_up_raw_bid_ask_data(df, full_symbol="binance::BTC_USDT"):
    """
    Convert raw crypto-chassis market-depth data to the C-K format.

    The input is not modified: all the transformations are applied to a copy.

    :param df: raw data with the columns `time_seconds` (Unix epoch in
        seconds), `bid_price_bid_size` and `ask_price_ask_size` (strings of
        the form `<price>_<size>`)
    :param full_symbol: value stored in the `full_symbol` column; defaults to
        the symbol used throughout this notebook
    :return: data indexed by a UTC `timestamp`, with all the remaining
        columns (`bid_price`, `bid_size`, `ask_price`, `ask_size`) converted
        to float and an extra `full_symbol` column
    """
    # Work on a copy so that the caller's raw DataFrame is left untouched.
    df = df.copy()
    # Split the columns to differentiate between `price` and `size`.
    df[["bid_price", "bid_size"]] = df["bid_price_bid_size"].str.split(
        "_", expand=True
    )
    df[["ask_price", "ask_size"]] = df["ask_price_ask_size"].str.split(
        "_", expand=True
    )
    df = df.drop(columns=["bid_price_bid_size", "ask_price_ask_size"])
    # Convert `timestamps` to the usual format (vectorized, timezone-aware UTC).
    df = df.rename(columns={"time_seconds": "timestamp"})
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
    df = df.set_index("timestamp")
    # Convert to `float`: the split above produces strings.
    df = df.astype(float)
    # Add `full_symbol` after the float conversion, since it is a string.
    df["full_symbol"] = full_symbol
    return df

In [9]:
def resample_bid_ask(df, resampling_rule):
    """
    Aggregate second-frequency bid-ask data to a coarser frequency.

    The raw data comes at the `seconds` frequency. To get minutely (or other)
    bars, each column is aggregated over the resampling period as follows:

    - sizes (`bid_size`, `ask_size`) are summed
    - prices (`bid_price`, `ask_price`) are averaged
    - `full_symbol` takes the last value of the period

    :param df: bid-ask data with the columns listed above
    :param resampling_rule: resampling frequency forwarded to
        `cfinresa.resample()` as `rule`, e.g. "1T"
    :return: resampled bid-ask data
    """
    # Aggregation applied to each column within a resampling period.
    aggregation_by_column = {
        "bid_price": "mean",
        "bid_size": "sum",
        "ask_price": "mean",
        "ask_size": "sum",
        "full_symbol": "last",
    }
    resampler = cfinresa.resample(df, rule=resampling_rule)
    resampled_df = resampler.agg(aggregation_by_column)
    return resampled_df

In [10]:
def process_bid_ask_data(df, resampling_rule="1T"):
    """
    Clean up raw bid-ask data and resample it to the given frequency.

    :param df: raw bid-ask data as accepted by `clean_up_raw_bid_ask_data()`
    :param resampling_rule: resampling frequency forwarded to
        `resample_bid_ask()`; defaults to 1 minute ("1T")
    :return: cleaned-up and resampled bid-ask data
    """
    # Convert the data to the right format.
    converted_df = clean_up_raw_bid_ask_data(df)
    # Resample.
    converted_resampled_df = resample_bid_ask(converted_df, resampling_rule)
    return converted_resampled_df

### Load historical data

For the example, I am taking the data with the following characteristics:
- `full_symbol` = binance::BTC_USDT
- depth = 1 (default option)
- start_ts = "2022-01-01"
- end_ts = "2022-01-30" (30 days in total)

In [11]:
# Build the list of all dates in the range as `YYYY-MM-DD` strings: one entry
# per daily request to the API.
datelist = (
    pd.date_range("2022-01-01", periods=30).strftime("%Y-%m-%d").tolist()
)

In [12]:
# Using the variables from `datelist` the multiple requests can be sent to the API.
# Each request returns a single day of data, so the days are collected in a
# list and concatenated once at the end.
result = []
for date in datelist:
    # Interaction with the API.
    r = requests.get(
        f"https://api.cryptochassis.com/v1/market-depth/binance/btc-usdt?startTime={date}",
        # Do not block the kernel forever if the API stops responding.
        timeout=60,
    )
    # Fail with a clear HTTP error instead of an obscure `KeyError` when
    # parsing the JSON body of an error response.
    r.raise_for_status()
    # The response contains a link to a gzipped CSV file with the actual data.
    data = pd.read_csv(r.json()["urls"][0]["url"], compression="gzip")
    # Attaching it day-by-day to the final DataFrame.
    result.append(data)
bid_ask_df = pd.concat(result)

In [13]:
# Transforming the data: clean up the raw columns and resample to 1-minute
# bars (the "1T" rule used inside `process_bid_ask_data()`).
processed_data = process_bid_ask_data(bid_ask_df)

In [14]:
# Show the data: first the (rows, columns) shape, then the DataFrame itself.
display(processed_data.shape)
display(processed_data)

(43201, 5)

Unnamed: 0_level_0,bid_price,bid_size,ask_price,ask_size,full_symbol
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-01 00:00:00+00:00,46214.010000,1.08286,46214.020000,0.24793,binance::BTC_USDT
2022-01-01 00:01:00+00:00,46247.429167,29.78377,46247.692500,36.79080,binance::BTC_USDT
2022-01-01 00:02:00+00:00,46262.766167,26.08551,46263.049833,35.87070,binance::BTC_USDT
2022-01-01 00:03:00+00:00,46345.883167,45.60706,46346.071000,39.95809,binance::BTC_USDT
2022-01-01 00:04:00+00:00,46352.099667,49.22233,46352.251667,44.05137,binance::BTC_USDT
...,...,...,...,...,...
2022-01-30 23:56:00+00:00,37930.353559,23.68421,37930.674237,28.78032,binance::BTC_USDT
2022-01-30 23:57:00+00:00,37899.581333,18.37171,37899.765167,39.71293,binance::BTC_USDT
2022-01-30 23:58:00+00:00,37893.692373,25.83638,37893.762542,19.31441,binance::BTC_USDT
2022-01-30 23:59:00+00:00,37888.562000,8.64161,37888.833500,16.46435,binance::BTC_USDT


Now this data is in a format that is compatible with CCXT OHLCV data.

It takes ~1.5 minutes to load and process the data for 1 month (30 days), so loading big chunks of historical data shouldn't take much time.