## 0. Config

Configuration and settings for the project: paths, source credentials, random seeds, and the materialized `CFG` dict used by later cells.


In [None]:
# Notebook Config & Settings (NDD first cell)
from __future__ import annotations

import os
import sys
import random
import warnings
from pathlib import Path
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from loguru import logger
import yaml

# Jupyter niceties (safe if not in IPython)
# Outside IPython `get_ipython` is undefined; the resulting NameError is
# swallowed, as is any error raised by the magics themselves.
try:
    _shell = get_ipython()
    for _magic, _arg in (("load_ext", "autoreload"), ("autoreload", "2")):
        _shell.run_line_magic(_magic, _arg)
except Exception:
    pass

# -----------------------------------------------------------------------------
# Project root, env, config
# -----------------------------------------------------------------------------
def find_project_root(start: Path) -> Path:
    """Return the closest directory at or above ``start`` holding ``pyproject.toml``.

    Args:
        start: Directory where the upward search begins.

    Returns:
        ``start`` or the nearest of its parents that contains a
        ``pyproject.toml`` file; ``start`` itself when none of them does.
    """
    lineage = (start, *start.parents)
    marked = (folder for folder in lineage if (folder / "pyproject.toml").exists())
    return next(marked, start)

# Resolve the project root from the current working directory and put it at
# the front of sys.path (once) so that `src.*` imports resolve.
ROOT = find_project_root(Path.cwd())
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

# Local utils from repo
from src.utils.config import (
    load_env,
    load_data_config,
    get_paths,
    get_source_env,
    ensure_folders,
)

# Load .env (non-overriding) and YAML configs
load_env()
_data_cfg_path = ROOT / "config" / "data.yml"
DATA_CFG = load_data_config(_data_cfg_path)
# Derive the storage paths from the data config, then hand them to
# ensure_folders.
PATHS = get_paths(DATA_CFG)
ensure_folders(PATHS)

# Source credentials (CryptoDataDownload)
CCD = get_source_env(DATA_CFG, "ccd")
# Each entry pairs a credential field with the warning emitted when that
# field is missing or empty. Missing values only warn; they do not abort.
_ccd_checks = (
    (
        "base_url",
        "CCD base URL is not set. Define CCD_API_BASE_URL in your .env. "
        "Example: CCD_API_BASE_URL=https://api.cryptodatadownload.com/v1",
    ),
    (
        "api_key",
        "CCD API key not found. Define CCD_API_KEY in your .env to avoid rate limits/errors.",
    ),
)
for _ccd_field, _ccd_warning in _ccd_checks:
    if not CCD.get(_ccd_field):
        logger.warning(_ccd_warning)

# Optional data-quality/report thresholds
# report.yml is optional: a missing file, or a document that loads as a
# falsy value (e.g. an empty file), leaves REPORT_CFG as an empty dict.
REPORT_CFG = {}
_report_yml = ROOT / "config" / "report.yml"
try:
    _report_text = _report_yml.read_text(encoding="utf-8")
except FileNotFoundError:
    pass
else:
    REPORT_CFG = yaml.safe_load(_report_text) or {}

# -----------------------------------------------------------------------------
# Runtime basics & notebook UX
# -----------------------------------------------------------------------------
# Global seed: taken from the SEED env var (default 42) and applied to the
# stdlib and NumPy generators.
SEED = int(os.getenv("SEED", "42"))
random.seed(SEED)
np.random.seed(SEED)

# torch is optional. A missing install is silently fine, but any other
# failure is reported instead of discarded: it means torch was not (fully)
# seeded/configured and torch results may not be reproducible. Nothing here
# aborts the cell.
try:
    import torch  # optional
except ImportError:
    pass
except Exception as _torch_err:  # installed but unusable
    logger.warning("torch import failed; torch seeding skipped: {!r}", _torch_err)
else:
    try:
        torch.manual_seed(SEED)
        torch.use_deterministic_algorithms(True)
        if torch.backends.cudnn.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
    except Exception as _torch_err:
        logger.warning("torch determinism setup incomplete: {!r}", _torch_err)

TZ = "UTC"
# NOTE(review): if data.yml is parsed with PyYAML, an unquoted HH:MM scalar
# can load as a base-60 integer, and str() would then yield e.g. "750"
# instead of "12:30" — confirm the value is quoted in the config.
_cut_off_raw = DATA_CFG.get("cut_off_utc", "00:00")
CUT_OFF_UTC = str(_cut_off_raw)

# Display & plotting
for _option, _value in (
    ("display.width", 140),
    ("display.max_columns", 60),
    ("display.max_rows", 200),
):
    pd.set_option(_option, _value)

# Blanket filter: suppress all warnings.
warnings.filterwarnings("ignore")

# seaborn/matplotlib are optional; if either import or the theme call fails,
# plotting is simply left unconfigured.
try:
    import seaborn as sns
    import matplotlib.pyplot as plt
    sns.set_theme(context="notebook", style="whitegrid")
except Exception:
    pass

# Loguru minimal config
# Replace all existing loguru handlers with a single stderr sink whose level
# comes from LOG_LEVEL (default INFO). Loguru level names are case-sensitive
# and numeric severities must be ints, so the env value is normalised first;
# LOG_LEVEL=debug or LOG_LEVEL=10 would otherwise raise inside add().
_log_level = os.getenv("LOG_LEVEL", "INFO").strip().upper() or "INFO"
logger.remove()
logger.add(sys.stderr, level=int(_log_level) if _log_level.isdigit() else _log_level)

# -----------------------------------------------------------------------------
# Data scope & IO
# -----------------------------------------------------------------------------
# A config section can be present but empty in YAML (it then loads as None),
# so every nested lookup falls back to an empty mapping instead of chaining
# .get() on None.
_defaults_cfg = DATA_CFG.get("defaults") or {}
_io_cfg = DATA_CFG.get("io") or {}
_parquet_cfg = _io_cfg.get("parquet") or {}
_storage_cfg = DATA_CFG.get("storage") or {}

# Missing/None -> ["BTC"]; a bare string -> a single asset (list() on a
# string would split it into characters); any other iterable -> list copy.
_assets_cfg = DATA_CFG.get("assets")
if _assets_cfg is None:
    ASSETS = ["BTC"]
elif isinstance(_assets_cfg, str):
    ASSETS = [_assets_cfg]
else:
    ASSETS = list(_assets_cfg)

# Environment overrides win over the config; an empty env value (e.g. a bare
# `END_DATE=` line in .env) is treated as unset rather than as "".
ASSET = os.getenv("ASSET") or (ASSETS[0] if ASSETS else "BTC")
START_DATE = os.getenv("START_DATE") or _defaults_cfg.get("start", "2019-01-01")
END_DATE = os.getenv("END_DATE") or _defaults_cfg.get("end")  # None => latest

# Storage format settings, with defaults for anything the config omits.
IO = {
    "format": _io_cfg.get("format", "parquet"),
    "parquet": {
        "engine": _parquet_cfg.get("engine", "pyarrow"),
        "compression": _parquet_cfg.get("compression", "zstd"),
    },
    "partitioning": _storage_cfg.get("partitioning", "asset={asset}/date={date}"),
}

# Derived paths and run metadata
# RUN_ID is the current UTC time at second resolution; each run gets its own
# artifacts/ and reports/ subdirectory, created immediately.
_run_started = datetime.now(timezone.utc)
RUN_ID = _run_started.strftime("%Y%m%d_%H%M%S")
ARTIFACTS_DIR = ROOT / "artifacts" / RUN_ID
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
REPORTS_DIR = ROOT / "reports" / RUN_ID
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# -----------------------------------------------------------------------------
# Feature / Model / Backtest placeholders (align with s02/s03)
# -----------------------------------------------------------------------------
# Placeholder feature-engineering settings; values are static literals.
FEATURES = {
    # Window sizes and lookback.
    "rv_windows": [1, 5, 22],
    "har": {
        "use": True,
    },
    "lookback_days": 90,
    # Normalisation / outlier handling.
    "zscore": {
        "window": 5,
        "method": "robust",
    },
    "winsorize_sigma": 6,
    # Optional feature groups.
    "use_jump_proxy": True,
    "use_funding_onchain": True,
}

# Placeholder model settings. The "gbm" and "lstm" sections both reuse the
# global SEED.
MODEL = {
    # Baseline model settings.
    "baselines": {"lasso_alpha": 0.001, "ridge_alpha": 1.0},
    # Gradient-boosting settings ("library" names the backend).
    "gbm": {
        "library": "lightgbm",
        "num_leaves": 31,
        "learning_rate": 0.05,
        "n_estimators": 500,
        "min_data_in_leaf": 50,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "seed": SEED,
    },
    # LSTM settings.
    "lstm": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.1,
        "lr": 1e-3,
        "batch_size": 64,
        "epochs": 50,
        "early_stopping_patience": 8,
        "aux_classification_head": False,
        "seed": SEED,
    },
    # Walk-forward evaluation settings.
    "walkforward": {
        "val_window_days": 365,
        "test_window_days": 365,
        "expand_train": True,
        "refit_each_fold": True,
    },
    # Loss settings.
    "loss": {
        "regression": "huber",
        "cls_weight": 0.2,
    },
}

# Placeholder backtest settings. Report output goes to this run's
# REPORTS_DIR (stored as a string).
BACKTEST = {
    "thresholds": {"t1d": 0.0, "t1w": 0.0, "t2w": 0.0},
    "costs": {"fee_bps": 1.0, "slippage_bps": 1.0, "funding_apr": 0.0},
    "risk": {"max_contracts": 1, "kill_switch_drawdown_pct": 20},
    "report": {
        "metrics": [
            "RMSE",
            "MAE",
            "Spearman",
            "HitRate",
            "Sharpe",
            "MDD",
        ],
        "plots": {
            "equity": True,
            "pred_vs_actual": True,
            "feature_importance": True,
        },
        "output_dir": str(REPORTS_DIR),
    },
}

# DQ is an alias for REPORT_CFG: both names refer to the same object (no
# copy), so in-place changes through one are visible through the other.
DQ = REPORT_CFG

# -----------------------------------------------------------------------------
# Runtime toggles
# -----------------------------------------------------------------------------
def _env_flag(name: str, default: str) -> bool:
    """Return True only when env var ``name`` (or ``default``) is exactly "1"."""
    return os.getenv(name, default) == "1"


# Boolean switches read from the environment; only the literal "1" enables one.
DRY_RUN = _env_flag("DRY_RUN", "0")
SAVE_ARTIFACTS = _env_flag("SAVE_ARTIFACTS", "1")
USE_CACHE = _env_flag("USE_CACHE", "1")

# Materialized final config (inspect/use in later cells)
# A "sources" or "ccd" section that is present but empty loads as None, so
# fall back to an empty mapping at each level instead of chaining on None.
_ccd_source_cfg = (DATA_CFG.get("sources") or {}).get("ccd") or {}

CFG = {
    "runtime": {
        "root": str(ROOT),
        "seed": SEED,
        "tz": TZ,
        "cut_off_utc": CUT_OFF_UTC,
        "run_id": RUN_ID,
        "dry_run": DRY_RUN,
        "save_artifacts": SAVE_ARTIFACTS,
        "use_cache": USE_CACHE,
    },
    # Paths are stored as strings.
    "paths": {
        "raw_root": str(PATHS.raw_root),
        "staging_root": str(PATHS.staging_root),
        "features_root": str(PATHS.features_root),
        "artifacts": str(ARTIFACTS_DIR),
        "reports": str(REPORTS_DIR),
        "partitioning": IO["partitioning"],
    },
    "data": {
        "assets": ASSETS,
        "asset": ASSET,
        "start": START_DATE,
        "end": END_DATE,
        "io": IO,
        # Only whether an API key exists is recorded, never the key itself.
        "sources": {"ccd": {"base_url": CCD.get("base_url"), "has_api_key": bool(CCD.get("api_key"))}},
        "endpoints": _ccd_source_cfg.get("endpoints") or {},
    },
    # The sections below reference the module-level dicts directly (no copies).
    "features": FEATURES,
    "model": MODEL,
    "backtest": BACKTEST,
    "dq": DQ,
}

logger.info("CFG loaded: asset={}, start={}, end={}, raw_root={}", ASSET, START_DATE, END_DATE, PATHS.raw_root)
