# Description

The notebook stitches together portfolios for multiple daily prod system runs and plots the resulting PnL curves.

# Imports

In [5]:
# Enable the `autoreload` extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [6]:
import logging
from typing import Any, Dict, Tuple

import pandas as pd
import tqdm.autonotebook as tauton

import core.config as cconfig
import core.plotting as coplotti
import dataflow.model as dtfmod
import helpers.hdatetime as hdateti
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hpandas as hpandas
import helpers.hprint as hprint
import oms.reconciliation as omreconc


In [7]:
# Initialize the logger with INFO verbosity.
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

# Log the first element of the system signature; per the cell output it
# includes the Git branch, hash and the last commits.
_LOG.info("%s", henv.get_system_signature()[0])

# NOTE(review): presumably applies the project's notebook display settings --
# confirm against `helpers/hprint.py`.
hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-766c99b7-160d-4d53-bb1d-f60896c4b0e5.json'
(CVXPY) Oct 12 03:22:02 PM: Encountered unexpected exception importing solver GLPK:
ImportError("cannot import name 'glpk' from 'cvxopt' (/venv/lib/python3.9/site-packages/cvxopt/__init__.py)")
(CVXPY) Oct 12 03:22:02 PM: Encountered unexpected exception importing solver GLPK_MI:
ImportError("cannot import name 'glpk' from 'cvxopt' (/venv/lib/python3.9/site-packages/cvxopt/__init__.py)")
INFO  # Git
  branch_name='SorrTask584_Use_str_to_timestamp_instead_of_timestamp_as_str_to_timestamp'
  hash='4fff2e34d'
  # Last commits:
    * 4fff2e34d jainamp14 Replaced timestamp_as_str_to_timestamp() with str_to_timestamp()  (  22 hours ago) Wed Oct 11 17:00:51 2023  (HEAD -> SorrTask584_Use_str_to_timestamp_instead_of_timestamp_as_str_to_timestamp, origin/SorrTask584_Use_str_to_timestamp_instead_of_timestamp_as_str_to_timestamp)

# Build config

In [8]:
# When the notebook is executed via the `run_notebook.py` script (e.g., in the
# system reconciliation flow) the config is passed through env vars.
config = cconfig.get_config_from_env()
if not config:
    _LOG.info("Using hardwired config")
    # Manual run: the config is specified directly. The values below are just
    # an example.
    dst_root_dir = "/shared_data/ecs/preprod/prod_reconciliation"
    dag_builder_name = "C3a"
    run_mode = "paper_trading"
    start_timestamp_as_str = "20230716_000000"
    end_timestamp_as_str = "20230723_000000"
    # The builder result is indexable; only its first element is used.
    config = omreconc.build_multiday_system_reconciliation_config(
        dst_root_dir,
        dag_builder_name,
        run_mode,
        start_timestamp_as_str,
        end_timestamp_as_str,
    )[0]
else:
    _LOG.info("Using config from env vars")
print(config)

INFO  Using hardwired config
dst_root_dir: /shared_data/ecs/preprod/prod_reconciliation
dag_builder_name: C3a
run_mode: paper_trading
start_timestamp: 2023-07-15 20:00:00-04:00
end_timestamp: 2023-07-22 20:00:00-04:00
pnl_resampling_frequency: 5T


# Functions

In [9]:
# TODO(Grisha): move all functions under `oms/reconciliation.py`.

In [10]:
# TODO(Grisha): can we use this idiom in the other system reconciliation
# notebooks?
def get_prod_dag_output_for_last_node(
    system_log_path_dict: Dict[str, str],
) -> pd.DataFrame:
    """
    Load the prod DAG output for the last DAG node.

    :param system_log_path_dict: system log dirs paths for different
        experiments; must resolve to a "prod" DAG data path that exists
    :return: whatever `omreconc.load_dag_outputs()` returns for the prod DAG
        dir and the last node name
    """
    # Resolve the DAG data dirs and make sure that the prod one is usable.
    dag_path_dict = omreconc.get_system_log_paths(system_log_path_dict, "dag_data")
    hdbg.dassert_in("prod", dag_path_dict.keys())
    prod_dag_path = dag_path_dict["prod"]
    hdbg.dassert_path_exists(prod_dag_path)
    # The node of interest is the last one in the list of the DAG node names.
    last_node_name = omreconc.get_dag_node_names(prod_dag_path)[-1]
    return omreconc.load_dag_outputs(prod_dag_path, last_node_name)


def compute_research_portfolio(
    dag_df_prod: pd.DataFrame,
    forecast_evaluator_from_prices_dict: Dict[str, Dict[str, Any]],
    start_timestamp: pd.Timestamp,
    end_timestamp: pd.Timestamp,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Compute research portfolio and align the indices with the system run start
    and end timestamps.

    :param dag_df_prod: DAG output that is passed to
        `ForecastEvaluatorFromPrices.annotate_forecasts()`
    :param forecast_evaluator_from_prices_dict: params for
        `ForecastEvaluatorFromPrices`, a mapping with the keys:
        - "init": kwargs for the constructor
        - "annotate_forecasts_kwargs": kwargs for `annotate_forecasts()`,
          either a plain dict or an object exposing `to_dict()`
    :param start_timestamp: first index label to keep (inclusive)
    :param end_timestamp: last index label to keep (inclusive)
    :return: research portfolio and research portfolio stats, both with the
        columns sorted and the rows restricted to
        `[start_timestamp, end_timestamp]`
    """
    fep = dtfmod.ForecastEvaluatorFromPrices(
        **forecast_evaluator_from_prices_dict["init"]
    )
    annotate_forecasts_kwargs = forecast_evaluator_from_prices_dict[
        "annotate_forecasts_kwargs"
    ]
    # The signature advertises a plain dict, while callers in this notebook
    # pass a config-like object: convert only when a conversion is available.
    if hasattr(annotate_forecasts_kwargs, "to_dict"):
        annotate_forecasts_kwargs = annotate_forecasts_kwargs.to_dict()
    research_portfolio_df, research_portfolio_stats_df = fep.annotate_forecasts(
        dag_df_prod,
        **annotate_forecasts_kwargs,
    )
    # TODO(Grisha): remove columns sorting if it is not needed.
    research_portfolio_df = research_portfolio_df.sort_index(axis=1)
    research_portfolio_stats_df = research_portfolio_stats_df.sort_index(axis=1)
    # Align index with prod and sim portfolios.
    # TODO(Grisha): remove timestamps filtering if it is not needed.
    research_portfolio_df = research_portfolio_df.loc[
        start_timestamp:end_timestamp
    ]
    research_portfolio_stats_df = research_portfolio_stats_df.loc[
        start_timestamp:end_timestamp
    ]
    return research_portfolio_df, research_portfolio_stats_df

# Load portfolio stats

In [11]:
# Get the parameters of the system runs for the configured root dir, DAG
# builder, run mode and time interval. Each item is unpacked by the stitching
# loop as `(start_timestamp_as_str, end_timestamp_as_str, mode)`.
# NOTE(review): per the captured output the call asserts that
# `<dst_root_dir>/<dag_builder_name>/<run_mode>` exists -- confirm.
system_run_params = omreconc.get_system_run_parameters(
    config["dst_root_dir"],
    config["dag_builder_name"],
    config["run_mode"],
    config["start_timestamp"],
    config["end_timestamp"],
)
system_run_params

AssertionError: 
################################################################################
* Failed assertion *
Dir '/shared_data/ecs/preprod/prod_reconciliation/C3a/paper_trading' doesn't exist
################################################################################


In [12]:
# Portfolio stats of every system run, in the order of `system_run_params`.
portfolio_stats = []
# Bar duration of the last processed run; it is read again by the stats cell
# further down.
bar_duration = None
# The type of the system log data to load is the same for every run.
data_type = "portfolio"
for start_timestamp_as_str, end_timestamp_as_str, mode in system_run_params:
    # Build system reconciliation config.
    config_list = omreconc.build_reconciliation_configs(
        config["dst_root_dir"],
        config["dag_builder_name"],
        start_timestamp_as_str,
        end_timestamp_as_str,
        config["run_mode"],
        mode,
    )
    reconciliation_config = config_list[0]
    system_log_path_dict = reconciliation_config["system_log_path"].to_dict()
    bar_duration = reconciliation_config["meta"]["bar_duration"]
    # Load prod and sim portfolios.
    portfolio_path_dict = omreconc.get_system_log_paths(
        system_log_path_dict, data_type
    )
    portfolio_dfs, portfolio_stats_dfs = omreconc.load_portfolio_dfs(
        portfolio_path_dict,
        bar_duration,
    )
    # Compute research portfolio.
    dag_df_prod = get_prod_dag_output_for_last_node(system_log_path_dict)
    start_timestamp = hdateti.str_to_timestamp(start_timestamp_as_str)
    end_timestamp = hdateti.str_to_timestamp(end_timestamp_as_str)
    forecast_evaluator_from_prices_dict = reconciliation_config[
        "research_forecast_evaluator_from_prices"
    ]
    (
        research_portfolio_df,
        research_portfolio_stats_df,
    ) = compute_research_portfolio(
        dag_df_prod,
        forecast_evaluator_from_prices_dict,
        start_timestamp,
        end_timestamp,
    )
    # Concatenate prod, sim and research portfolios of the current run
    # column-wise, keyed by the portfolio name.
    portfolio_stats_dfs["research"] = research_portfolio_stats_df
    daily_portfolio_stats_df = pd.concat(portfolio_stats_dfs, axis=1)
    portfolio_stats.append(daily_portfolio_stats_df)
# Fail with an explicit message instead of the opaque `pd.concat()` error when
# there is nothing to stitch.
if not portfolio_stats:
    raise ValueError(
        "No system runs to stitch: `system_run_params` is empty"
    )
# Concatenate multiple daily portfolios.
portfolio_stats_df = pd.concat(portfolio_stats, axis=0)
hpandas.df_to_str(portfolio_stats_df, num_rows=5, log_level=logging.INFO)

NameError: name 'system_run_params' is not defined

In [13]:
# Number of leading rows of the stitched stats that are dropped before
# plotting and computing the summary stats.
bars_to_burn = 1
# Plot the stitched portfolio stats using the PnL resampling frequency from the
# config.
coplotti.plot_portfolio_stats(
    portfolio_stats_df.iloc[bars_to_burn:],
    freq=config["pnl_resampling_frequency"],
)

NameError: name 'portfolio_stats_df' is not defined

In [14]:
# Compute the summary stats on the stitched portfolio stats with the first
# `bars_to_burn` rows dropped. `bar_duration` holds the value of the last
# processed system run.
stats_computer = dtfmod.StatsComputer()
# Only the first element of the returned pair is displayed.
stats_sxs, _ = stats_computer.compute_portfolio_stats(
    portfolio_stats_df.iloc[bars_to_burn:], bar_duration
)
display(stats_sxs)

NameError: name 'portfolio_stats_df' is not defined

In [15]:
# Correlate PnLs.
# Pairwise correlation of the prod, sim and research PnL columns.
# NOTE(review): unlike the plotting and stats cells, the first `bars_to_burn`
# rows are not dropped here -- confirm this is intended.
portfolio_stats_df[[("prod", "pnl"), ("sim", "pnl"), ("research", "pnl")]].corr()

NameError: name 'portfolio_stats_df' is not defined