# Description

The notebook experiments with custom resampling on data from `HistoricalDataSource`.

# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import logging
from typing import Any, Dict, List, Optional

import pandas as pd

import core.finance as cofinanc
import dataflow.core as dtfcore
import dataflow.system as dtfsys
import dataflow_amp.system.Cx as dtfamsysc
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hprint as hprint
import im_v2.common.universe as ivcu

  from tqdm.autonotebook import tqdm


In [3]:
# Initialize logging at INFO verbosity.
hdbg.init_logger(verbosity=logging.INFO)

# Module-level logger used by the functions defined below.
_LOG = logging.getLogger(__name__)

# Log the first element of the system signature.
_LOG.info("%s", henv.get_system_signature()[0])

# Apply the notebook configuration from `hprint`.
hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-428634ca-bf7e-4d4e-bd7f-e9ba5089fa2b.json'
[31m-----------------------------------------------------------------------------
This code is not in sync with the container:
code_version='1.10.0' != container_version='1.12.0'
-----------------------------------------------------------------------------
You need to:
- merge origin/master into your branch with `invoke git_merge_master`
- pull the latest container with `invoke docker_pull`[0m
INFO  # Git
  branch_name='CmTask6180_Add_a_notebook_to_research_custom_price_resampling_v02'
  hash='bfd455b2b'
  # Last commits:
    *   bfd455b2b Dan      Merge branch 'master' into CmTask6180_Add_a_notebook_to_research_custom_price_resampling_v02 (12 minutes ago) Thu Nov 30 19:24:50 2023  (HEAD -> CmTask6180_Add_a_notebook_to_research_custom_price_resampling_v02, origin/CmTask6180_Add_a_notebook_to_research_custom_price_res

# Functions

In [4]:
# Copied from `oms/notebooks/CmTask4275_check_ohlcv_timing_issue.ipynb`.
def _get_prod_market_data(universe_version: str):
    """
    Build the prod real-time `MarketData` for a fixed pair of asset ids.

    :param universe_version: version of the CCXT trade universe to query
    :return: object returned by
        `dtfamsysc.get_Cx_RealTimeMarketData_prod_instance1()`
    """
    # Query the trading universe as full symbols. The result is not used
    # below since the asset ids are hard-coded.
    universe_full_symbols = ivcu.get_vendor_universe(
        "CCXT",
        "trade",
        version=universe_version,
        as_full_symbol=True,
    )
    # Load data for 2 asset ids only, for simplicity, from the prod DB stage.
    return dtfamsysc.get_Cx_RealTimeMarketData_prod_instance1(
        [1467591036, 1464553467], "prod"
    )


def resample_ohlcv_bars(
    df_ohlcv: pd.DataFrame, bar_duration: str
) -> pd.DataFrame:
    """
    Downsample 1-minute `open`, `close` and `volume` data to `bar_duration`.

    :param df_ohlcv: OHLCV data to resample
    :param bar_duration: resampling rule passed to `cofinanc.resample_bars()`
    :return: `df_out` produced by fitting the resampling node
    """
    # Aggregation per column: last close, first open, summed volume.
    aggregations = [
        ({"close": "close"}, "last", {}),
        ({"open": "open"}, "first", {}),
        ({"volume": "volume"}, "sum", {"min_count": 1}),
    ]
    func_kwargs = {
        "rule": bar_duration,
        "resampling_groups": aggregations,
        "vwap_groups": [],
    }
    # Build the node that applies the resampling to each column group.
    node = dtfcore.GroupedColDfToDfTransformer(
        "resample",
        transformer_func=cofinanc.resample_bars,
        in_col_groups=[("open",), ("close",), ("volume",)],
        out_col_group=(),
        transformer_kwargs=func_kwargs,
        reindex_like_input=False,
        join_output_with_input=False,
    )
    return node.fit(df_ohlcv)["df_out"]


# Copied from `oms/notebooks/CmTask4275_check_ohlcv_timing_issue.ipynb`.
def load_ohlcv_data_from_historical_datasource(
    start_timestamp: pd.Timestamp,
    end_timestamp: pd.Timestamp,
    universe_version: str,
    apply_timing_fix: bool,
) -> pd.DataFrame:
    """
    Read OHLCV bars through the prod-like `HistoricalDataSource`.

    :param start_timestamp: start of the interval to load
    :param end_timestamp: end of the interval to load
    :param universe_version: universe version
    :param apply_timing_fix: if True, query an interval shifted 1 minute back
        and shift the loaded timestamps 1 minute forward; otherwise read the
        data as-is
    :return: `df_out` produced by fitting the data source
    """
    one_minute = pd.Timedelta(minutes=1)
    # Build the data source node on top of the prod `MarketData`.
    prod_market_data = _get_prod_market_data(universe_version)
    timestamp_col = "end_timestamp"
    use_multiindex = True
    data_source = dtfsys.HistoricalDataSource(
        "read_data",
        prod_market_data,
        timestamp_col,
        use_multiindex,
        col_names_to_remove=None,
    )
    if apply_timing_fix:
        # The timestamp marks the start of the bar rather than its end, so
        # the requested interval is moved 1 minute back.
        start_timestamp -= one_minute
        end_timestamp -= one_minute
    # Express the interval in the `dataflow` format and load the data.
    intervals = [(start_timestamp, end_timestamp)]
    _LOG.info("fit_intervals=%s", intervals)
    data_source.set_fit_intervals(intervals)
    df_ohlcv = data_source.fit()["df_out"]
    if apply_timing_fix:
        # Move the loaded timestamps 1 minute forward again.
        df_ohlcv.index = df_ohlcv.index + one_minute
        df_ohlcv["start_timestamp"] = df_ohlcv["start_timestamp"] + one_minute
    return df_ohlcv

# Load OHLCV data via `HistoricalDataSource`

In [5]:
# NOTE(review): `system_log_dir` is not referenced in the visible cells --
# confirm it is still needed.
system_log_dir = "/shared_data/ecs/preprod/twap_experiment/system_reconciliation/C3a/20230419/system_log_dir.scheduled.20230419_041000.20230419_100500"
bar_duration = "5T"
universe_version = "v7.1"
# Interval to load: defined in UTC, then converted to the ET timezone.
start_timestamp_UTC = pd.Timestamp("2023-04-19 04:06:00", tz="UTC")
end_timestamp_UTC = pd.Timestamp("2023-04-19 08:05:00", tz="UTC")
start_timestamp_ET = start_timestamp_UTC.tz_convert("America/New_York")
end_timestamp_ET = end_timestamp_UTC.tz_convert("America/New_York")

In [6]:
# Load the OHLCV data for the ET interval with the timing fix enabled.
apply_timing_fix = True
data = load_ohlcv_data_from_historical_datasource(
    start_timestamp_ET, end_timestamp_ET, universe_version, apply_timing_fix
)
data.head(10)

INFO  Unable to fetch DB credentials from environment variables: 
	'POSTGRES_HOST'
	Attempting env file method.
INFO  Unable to fetch DB credentials from env file: 
	
################################################################################
* Failed assertion *
File '/app/amp/im_v2/devops/env/prod.im_db_config.env' doesn't exist
################################################################################

	Attempting AWS SecretsManager method.
INFO  Fetching secret: prod.im_data_db.read_only
INFO  Created prod DB connection: 
 None


  df = pd.read_sql_query(query, connection)


INFO  fit_intervals=[(Timestamp('2023-04-19 00:05:00-0400', tz='America/New_York'), Timestamp('2023-04-19 04:04:00-0400', tz='America/New_York'))]


Unnamed: 0_level_0,close,close,end_download_timestamp,end_download_timestamp,full_symbol,full_symbol,high,high,id,id,knowledge_timestamp,knowledge_timestamp,low,low,open,open,start_timestamp,start_timestamp,volume,volume
Unnamed: 0_level_1,1464553467,1467591036,1464553467,1467591036,1464553467,1467591036,1464553467,1467591036,1464553467,1467591036,1464553467,1467591036,1464553467,1467591036,1464553467,1467591036,1464553467,1467591036,1464553467,1467591036
end_timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
2023-04-19 00:06:00-04:00,2088.78,30165.8,2023-04-19 04:05:10.111087+00:00,2023-04-19 04:05:10.111717+00:00,binance::ETH_USDT,binance::BTC_USDT,2089.73,30174.3,131531966,131532029,2023-04-19 04:05:10.207040+00:00,2023-04-19 04:05:10.207040+00:00,2088.78,30164.4,2089.73,30174.3,2023-04-19 00:05:00-04:00,2023-04-19 00:05:00-04:00,1354.043,225.226
2023-04-19 00:07:00-04:00,2089.71,30184.7,2023-04-19 04:06:10.003449+00:00,2023-04-19 04:06:10.003572+00:00,binance::ETH_USDT,binance::BTC_USDT,2089.71,30184.7,131533600,131533604,2023-04-19 04:06:10.044583+00:00,2023-04-19 04:06:10.044583+00:00,2088.67,30164.4,2088.78,30165.8,2023-04-19 00:06:00-04:00,2023-04-19 00:06:00-04:00,1414.499,178.821
2023-04-19 00:08:00-04:00,2089.15,30183.1,2023-04-19 04:07:10.004714+00:00,2023-04-19 04:07:10.004846+00:00,binance::ETH_USDT,binance::BTC_USDT,2089.71,30184.7,131533835,131533840,2023-04-19 04:07:10.019647+00:00,2023-04-19 04:07:10.019647+00:00,2089.14,30176.4,2089.71,30184.7,2023-04-19 00:07:00-04:00,2023-04-19 00:07:00-04:00,663.771,82.462
2023-04-19 00:09:00-04:00,2086.43,30168.1,2023-04-19 04:08:10.006170+00:00,2023-04-19 04:08:10.006288+00:00,binance::ETH_USDT,binance::BTC_USDT,2089.15,30183.2,131534097,131534103,2023-04-19 04:08:10.020405+00:00,2023-04-19 04:08:10.020405+00:00,2086.03,30168.1,2089.14,30183.2,2023-04-19 00:08:00-04:00,2023-04-19 00:08:00-04:00,6263.982,124.072
2023-04-19 00:10:00-04:00,2088.28,30182.6,2023-04-19 04:09:10.007499+00:00,2023-04-19 04:09:10.007625+00:00,binance::ETH_USDT,binance::BTC_USDT,2088.28,30182.6,131534386,131534393,2023-04-19 04:09:10.024145+00:00,2023-04-19 04:09:10.024145+00:00,2084.59,30160.0,2086.42,30168.1,2023-04-19 00:09:00-04:00,2023-04-19 00:09:00-04:00,5430.551,189.581
2023-04-19 00:11:00-04:00,2087.9,30180.5,2023-04-19 04:10:10.008289+00:00,2023-04-19 04:10:10.008421+00:00,binance::ETH_USDT,binance::BTC_USDT,2088.56,30184.6,131534702,131534710,2023-04-19 04:10:10.021175+00:00,2023-04-19 04:10:10.021175+00:00,2087.3,30172.3,2088.28,30182.6,2023-04-19 00:10:00-04:00,2023-04-19 00:10:00-04:00,1493.508,95.792
2023-04-19 00:12:00-04:00,2087.76,30177.8,2023-04-19 04:11:10.009075+00:00,2023-04-19 04:11:10.009211+00:00,binance::ETH_USDT,binance::BTC_USDT,2088.16,30180.6,131535045,131535054,2023-04-19 04:11:10.027227+00:00,2023-04-19 04:11:10.027227+00:00,2086.76,30167.1,2087.9,30180.5,2023-04-19 00:11:00-04:00,2023-04-19 00:11:00-04:00,1075.886,68.877
2023-04-19 00:13:00-04:00,2087.15,30169.9,2023-04-19 04:12:10.010451+00:00,2023-04-19 04:12:10.010597+00:00,binance::ETH_USDT,binance::BTC_USDT,2087.77,30177.9,131535415,131535425,2023-04-19 04:12:10.027226+00:00,2023-04-19 04:12:10.027226+00:00,2086.85,30166.0,2087.76,30177.9,2023-04-19 00:12:00-04:00,2023-04-19 00:12:00-04:00,757.77,71.874
2023-04-19 00:14:00-04:00,2085.88,30165.4,2023-04-19 04:13:10.011555+00:00,2023-04-19 04:13:10.011709+00:00,binance::ETH_USDT,binance::BTC_USDT,2087.16,30170.0,131535812,131535823,2023-04-19 04:13:10.027366+00:00,2023-04-19 04:13:10.027366+00:00,2084.58,30160.3,2087.15,30169.9,2023-04-19 00:13:00-04:00,2023-04-19 00:13:00-04:00,4626.933,96.584
2023-04-19 00:15:00-04:00,2085.89,30160.0,2023-04-19 04:14:10.012594+00:00,2023-04-19 04:14:10.012779+00:00,binance::ETH_USDT,binance::BTC_USDT,2085.89,30165.4,131536236,131536248,2023-04-19 04:14:10.040848+00:00,2023-04-19 04:14:10.040848+00:00,2085.6,30158.1,2085.88,30165.3,2023-04-19 00:14:00-04:00,2023-04-19 00:14:00-04:00,736.096,98.683


# Apply weighted resampling

In [7]:
# Reuse the bar duration defined with the other load parameters instead of
# repeating the "5T" literal.
rule = bar_duration
# Weights passed to `resample_with_weights()`.
# NOTE(review): presumably one weight per 1-minute row of a bar, so the list
# length has to match `rule` -- confirm.
weights = [0.1, 0.2, 0.3, 0.4, 0.5]

## Single-column data

In [8]:
# Keep only the `close` prices, one column per asset id.
price_data = data["close"]
# Set the index frequency to the one inferred from the timestamps.
# NOTE(review): this assigns `freq` on the index object in place; if that
# object is shared with `data`, later cells may depend on this side effect --
# confirm.
price_data.index.freq = pd.infer_freq(price_data.index)
price_data.head(10)

Unnamed: 0_level_0,1464553467,1467591036
end_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-04-19 00:06:00-04:00,2088.78,30165.8
2023-04-19 00:07:00-04:00,2089.71,30184.7
2023-04-19 00:08:00-04:00,2089.15,30183.1
2023-04-19 00:09:00-04:00,2086.43,30168.1
2023-04-19 00:10:00-04:00,2088.28,30182.6
2023-04-19 00:11:00-04:00,2087.9,30180.5
2023-04-19 00:12:00-04:00,2087.76,30177.8
2023-04-19 00:13:00-04:00,2087.15,30169.9
2023-04-19 00:14:00-04:00,2085.88,30165.4
2023-04-19 00:15:00-04:00,2085.89,30160.0


In [14]:
# Asset id of the column to resample; the loaded data above shows it as
# `binance::ETH_USDT`.
col = 1464553467
resampled_price_data = cofinanc.resample_with_weights(
    price_data, rule, col, weights
)
resampled_price_data.head()

Unnamed: 0_level_0,1464553467
end_timestamp,Unnamed: 1_level_1
2023-04-19 00:10:00-04:00,2088.184667
2023-04-19 00:15:00-04:00,2086.522667
2023-04-19 00:20:00-04:00,2086.188667
2023-04-19 00:25:00-04:00,2085.420667
2023-04-19 00:30:00-04:00,2087.529333


## `dataflow` data format

In [15]:
def resample_with_weights_ohlcv_bars(
    df_ohlcv: pd.DataFrame, bar_duration: str, weights: List[float]
) -> pd.DataFrame:
    """
    Downsample the 1-minute `close` column to `bar_duration` using `weights`.

    :param df_ohlcv: OHLCV data to resample
    :param bar_duration: resampling rule passed to
        `cofinanc.resample_with_weights()`
    :param weights: weights passed to `cofinanc.resample_with_weights()`
    :return: `df_out` produced by fitting the resampling node
    """
    func_kwargs = {
        "rule": bar_duration,
        "col": "close",
        "weights": weights,
    }
    # Build the node that applies the weighted resampling to the `close` group.
    node = dtfcore.GroupedColDfToDfTransformer(
        "resample",
        transformer_func=cofinanc.resample_with_weights,
        in_col_groups=[("close",)],
        out_col_group=(),
        transformer_kwargs=func_kwargs,
        reindex_like_input=False,
        join_output_with_input=False,
    )
    return node.fit(df_ohlcv)["df_out"]

In [17]:
# Apply the weighted resampling to the full `dataflow`-format data.
resampled_with_weights_data = resample_with_weights_ohlcv_bars(
    data, rule, weights
)
resampled_with_weights_data.head()

Unnamed: 0_level_0,close,close
Unnamed: 0_level_1,1464553467,1467591036
end_timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2
2023-04-19 00:10:00-04:00,2088.184667,30177.993333
2023-04-19 00:15:00-04:00,2086.522667,30167.16
2023-04-19 00:20:00-04:00,2086.188667,30169.053333
2023-04-19 00:25:00-04:00,2085.420667,30163.366667
2023-04-19 00:30:00-04:00,2087.529333,30207.193333
