# Description

Analyze research backtest results.

# Imports

In [1]:
# Reload edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline

In [2]:
import datetime
import logging
import os
from typing import Dict

import pandas as pd

import core.config as cconfig
import core.plotting as coplotti
import dataflow.model as dtfmod
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hgit as hgit
import helpers.hparquet as hparque
import helpers.hprint as hprint

  import tqdm.autonotebook as tauton


In [3]:
# Configure logging for the notebook at INFO verbosity.
hdbg.init_logger(verbosity=logging.INFO)

# Notebook-wide logger used by the cells below.
_LOG = logging.getLogger(__name__)

# Log the first element of the system signature (Git / machine info) so the
# saved notebook records the environment it was run in.
_LOG.info("%s", henv.get_system_signature()[0])

# Apply the project's notebook configuration helper.
hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-13877e7d-7c08-4cb4-bca9-5fdca1e5ff75.json'
INFO  # Git
  branch_name='CmTask6096_Allow_running_backtest_analyzer_notebook_for_multiple_price_cols'
  hash='2f36dffa1'
  # Last commits:
    * 2f36dffa1 dan      checkpoint                                                        (48 minutes ago) Tue Nov 14 18:26:47 2023  (HEAD -> CmTask6096_Allow_running_backtest_analyzer_notebook_for_multiple_price_cols)
    *   17f8a0579 Dan      Merge branch 'master' into CmTask6096_Allow_running_backtest_analyzer_notebook_for_multiple_price_cols (   3 hours ago) Tue Nov 14 16:00:33 2023  (origin/CmTask6096_Allow_running_backtest_analyzer_notebook_for_multiple_price_cols)
    |\  
    | * 757074fd5 Dan      Cm task6143 remove crv usdt from the current trade universe (#6148) (   3 hours ago) Tue Nov 14 15:58:55 2023  (origin/master, origin/HEAD, master)
# Machine info
  system=Lin

# Functions

In [4]:
# TODO(Dan): Move to a lib.
def build_research_backtest_analyzer_config_dict(
    default_config: cconfig.Config,
) -> Dict[str, cconfig.Config]:
    """
    Expand a default config into a dict of configs for the backtest analysis.

    If `default_config` has no "sweep_param" entry, the result contains only
    the input config, stored under the key "default_config". Otherwise one
    copy of the config is created per value in
    `default_config["sweep_param", "values"]`, with the (nested) key given by
    `default_config["sweep_param", "keys"]` overwritten by that value.

    :param default_config: config to analyze; it may contain a "sweep_param"
        sub-config holding tuple-valued "keys" and "values"
    :return: configs keyed by "<key1>:<key2>:... = <value>", or a single
        entry keyed by "default_config" when there is nothing to sweep
    """
    # Nothing to sweep: wrap the single input config in a dict.
    if "sweep_param" not in default_config:
        return {"default_config": default_config}
    hdbg.dassert_isinstance(default_config["sweep_param"], cconfig.Config)
    # Nested config key to overwrite and the values to assign to it.
    key_path = default_config["sweep_param", "keys"]
    hdbg.dassert_isinstance(key_path, tuple)
    values_to_sweep = default_config["sweep_param", "values"]
    hdbg.dassert_isinstance(values_to_sweep, tuple)
    # Build one config per swept value.
    swept_configs = {}
    for sweep_value in values_to_sweep:
        swept_config = default_config.copy()
        # Allow overwriting only while the swept key is being set.
        swept_config.update_mode = "overwrite"
        swept_config[key_path] = sweep_value
        swept_config.update_mode = "assert_on_overwrite"
        # Label the config with the swept key path and the value used.
        label = f"{':'.join(key_path)} = {sweep_value!s}"
        swept_configs[label] = swept_config
    return swept_configs

# Build the config dict

In [5]:
# Get config from env when running the notebook via the `run_notebook.py` script.
default_config = cconfig.get_config_from_env()
if default_config:
    _LOG.info("Using config from env vars")
else:
    # No config was passed through the environment: fall back to a hardwired
    # config that points at test input data stored under the amp directory.
    _LOG.info("Using hardwired config")
    # Build default config.
    amp_dir = hgit.get_amp_abs_path()
    dir_name = os.path.join(
        amp_dir,
        "dataflow/model/test/outcomes/Test_run_master_research_backtest_analyzer/input/tiled_results",
    )
    default_config_dict = {
        # Location of the tiled results and the date range to analyze.
        "dir_name": dir_name,
        "start_date": datetime.date(2000, 1, 1),
        "end_date": datetime.date(2000, 1, 31),
        # Name of the column holding asset ids.
        "asset_id_col": "asset_id",
        # Frequency passed to the portfolio stats plot and stats computation.
        "pnl_resampling_frequency": "15T",
        # Keyword arguments forwarded to the forecast annotation calls.
        "annotate_forecasts_kwargs": {
            "style": "longitudinal",
            "quantization": 30,
            "liquidate_at_end_of_day": False,
            "initialize_beginning_of_day_trades_to_zero": False,
            "burn_in_bars": 3,
            "compute_extended_stats": True,
            "target_dollar_risk_per_name": 1e2,
            "modulate_using_prediction_magnitude": True,
        },
        # Price / volatility / prediction columns used to evaluate forecasts.
        "column_names": {
            "price_col": "vwap",
            "volatility_col": "vwap.ret_0.vol",
            "prediction_col": "prediction",
        },
        # Keyword arguments for the binned portfolio stats plot.
        "bin_annotated_portfolio_df_kwargs": {
            "proportion_of_data_per_bin": 0.2,
            "normalize_prediction_col_values": False,
        },
        # If True, load all assets and annotate the in-memory tile; otherwise
        # load a single asset for inspection and annotate tile by tile.
        "load_all_tiles_in_memory": False,
    }
    default_config = cconfig.Config().from_dict(default_config_dict)
# Show the config that the rest of the notebook uses.
print(default_config)

INFO  Using hardwired config
dir_name: /app/amp/dataflow/model/test/outcomes/Test_run_master_research_backtest_analyzer/input/tiled_results
start_date: 2000-01-01
end_date: 2000-01-31
asset_id_col: asset_id
pnl_resampling_frequency: 15T
annotate_forecasts_kwargs: 
  style: longitudinal
  quantization: 30
  liquidate_at_end_of_day: False
  initialize_beginning_of_day_trades_to_zero: False
  burn_in_bars: 3
  compute_extended_stats: True
  target_dollar_risk_per_name: 100.0
  modulate_using_prediction_magnitude: True
column_names: 
  price_col: vwap
  volatility_col: vwap.ret_0.vol
  prediction_col: prediction
bin_annotated_portfolio_df_kwargs: 
  proportion_of_data_per_bin: 0.2
  normalize_prediction_col_values: False
load_all_tiles_in_memory: False


In [6]:
# Expand the default config into one config per analysis run (a single entry
# unless the config defines a "sweep_param") and show the resulting keys.
config_dict = build_research_backtest_analyzer_config_dict(default_config)
print(config_dict.keys())

dict_keys(['default_config'])


# Load tiled results

## Report tile stats

In [7]:
# Collect metadata about the Parquet tiles stored under the configured
# results directory; the next cells summarize it.
parquet_tile_analyzer = dtfmod.ParquetTileAnalyzer()
parquet_tile_metadata = parquet_tile_analyzer.collate_parquet_tile_metadata(
    default_config["dir_name"]
)

In [8]:
# Summarize the tile metadata per asset id (years, months, files, size).
parquet_tile_analyzer.compute_metadata_stats_by_asset_id(parquet_tile_metadata)

Unnamed: 0_level_0,n_years,n_unique_months,n_files,size
asset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1467591036,1,1,1,12.4 KB
3303714233,1,1,1,12.4 KB


In [9]:
# Summarize the number of asset ids and the data size per year / month.
parquet_tile_analyzer.compute_universe_size_by_time(parquet_tile_metadata)

Unnamed: 0_level_0,Unnamed: 1_level_0,n_asset_ids,size
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,1,2,24.8 KB


In [10]:
# Take the values of the first index level of the tile metadata as the list of
# asset ids.
asset_ids = parquet_tile_metadata.index.levels[0].to_list()
display(asset_ids)

[1467591036, 3303714233]

## Load tile data

In [11]:
# Load every asset when requested, otherwise only the first one as a sample.
asset_ids_to_load = (
    asset_ids if default_config["load_all_tiles_in_memory"] else asset_ids[:1]
)
# Use a batch size equal to the number of requested assets; presumably this
# makes all of them arrive in one batch - confirm against
# `yield_parquet_tiles_by_assets()`.
asset_batch_size = len(asset_ids_to_load)
# `None` is passed as the column selection.
cols = None
# Only the first tile yielded by the generator is used.
tile_generator = hparque.yield_parquet_tiles_by_assets(
    default_config["dir_name"],
    asset_ids_to_load,
    default_config["asset_id_col"],
    asset_batch_size,
    cols,
)
asset_tile = next(tile_generator)

  0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
# Post-process the tile read from Parquet, passing the asset id column name;
# the following cells inspect the resulting multi-level columns.
tile_df = dtfmod.process_parquet_read_df(
    asset_tile, default_config["asset_id_col"]
)

In [13]:
# List the values of the first column level of the tile dataframe.
tile_df.columns.levels[0].to_list()

['close',
 'close.ret_0',
 'feature1',
 'month',
 'prediction',
 'twap',
 'twap.ret_0',
 'vwap',
 'vwap.ret_0',
 'vwap.ret_0.vol',
 'vwap.ret_0.vol_adj',
 'vwap.ret_0.vol_adj.c',
 'vwap.ret_0.vol_adj.c.lag0',
 'vwap.ret_0.vol_adj.c.lag1',
 'vwap.ret_0.vol_adj.c.lag2',
 'vwap.ret_0.vol_adj.c.lag3',
 'year']

In [14]:
# Preview the first three rows of the tile.
tile_df.head(3)

Unnamed: 0_level_0,close,close.ret_0,feature1,month,prediction,twap,twap.ret_0,vwap,vwap.ret_0,vwap.ret_0.vol,vwap.ret_0.vol_adj,vwap.ret_0.vol_adj.c,vwap.ret_0.vol_adj.c.lag0,vwap.ret_0.vol_adj.c.lag1,vwap.ret_0.vol_adj.c.lag2,vwap.ret_0.vol_adj.c.lag3,year
asset_id,1467591036,1467591036,1467591036,1467591036,1467591036,1467591036,1467591036,1467591036,1467591036,1467591036,1467591036,1467591036,1467591036,1467591036,1467591036,1467591036,1467591036
end_ts,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
2000-01-01 14:35:00+00:00,101.0,,1.0,1,,101.0,,101.0,,,,,,,,,2000
2000-01-01 14:40:00+00:00,100.0,-0.00995,-1.0,1,,100.0,-0.00995,100.0,-0.00995,,,,,,,,2000
2000-01-01 14:45:00+00:00,101.0,0.00995,1.0,1,,101.0,0.00995,101.0,0.00995,,,,,,,,2000


# Compute portfolio bar metrics

In [15]:
# Annotated portfolios and bar metrics, keyed like `config_dict`.
portfolio_df_dict = {}
bar_metrics_dict = {}
for key, config in config_dict.items():
    if not config["load_all_tiles_in_memory"]:
        # Annotate forecasts tile by tile from the results directory.
        column_names_config = config["column_names"]
        annotate_kwargs = config["annotate_forecasts_kwargs"].to_dict()
        portfolio_df, bar_metrics = dtfmod.annotate_forecasts_by_tile(
            config["dir_name"],
            config["start_date"],
            config["end_date"],
            config["asset_id_col"],
            column_names_config["price_col"],
            column_names_config["volatility_col"],
            column_names_config["prediction_col"],
            asset_ids=None,
            annotate_forecasts_kwargs=annotate_kwargs,
            return_portfolio_df=True,
        )
    else:
        # Annotate forecasts on the tile already loaded into `tile_df`.
        column_kwargs = config["column_names"].to_dict()
        fep = dtfmod.ForecastEvaluatorFromPrices(**column_kwargs)
        annotate_kwargs = config["annotate_forecasts_kwargs"].to_dict()
        portfolio_df, bar_metrics = fep.annotate_forecasts(
            tile_df, **annotate_kwargs
        )
    portfolio_df_dict[key] = portfolio_df
    bar_metrics_dict[key] = bar_metrics
# Put the bar metrics of all configs side by side, keyed by config name.
portfolio_stats_df = pd.concat(bar_metrics_dict, axis=1)

  0%|          | 0/1 [00:00<?, ?it/s]

INFO  spread is `None`; imputing spread_lower_bound=0.000100


In [None]:
# Plot the bar metrics of all configs, using the configured PnL resampling
# frequency.
coplotti.plot_portfolio_stats(
    portfolio_stats_df, freq=default_config["pnl_resampling_frequency"]
)

In [None]:
# Plot binned stats for the annotated portfolios of all configs.
# The binning kwargs are read from `default_config`, as the other plotting and
# stats cells do, instead of from the `config` variable left over from the
# loop that computes the bar metrics; that variable is just the last config of
# the loop and is undefined if the loop body never ran. `.to_dict()` matches
# how the other sub-configs are unpacked into keyword arguments.
coplotti.plot_portfolio_binned_stats(
    portfolio_df_dict,
    **default_config["bin_annotated_portfolio_df_kwargs"].to_dict(),
)

# Compute aggregate portfolio stats

In [None]:
# Object used in the next cell to compute the aggregate portfolio stats.
stats_computer = dtfmod.StatsComputer()

In [None]:
# Compute aggregate portfolio stats from the bar metrics of all configs at the
# configured PnL resampling frequency, and display the stats table.
portfolio_stats, daily_metrics = stats_computer.compute_portfolio_stats(
    portfolio_stats_df,
    default_config["pnl_resampling_frequency"],
)
display(portfolio_stats)