In [None]:
import os
import time
from datetime import datetime
from typing import List, Dict

import pandas as pd
import yfinance as yf

from modules.tools import get_tqdm


In [None]:
def _fetch_batch(batch: List[str], period: str, interval: str) -> pd.DataFrame:
    """Download one batch of tickers; return an empty frame if the request fails."""
    try:
        raw_df = yf.download(
            tickers=batch,
            period=period,
            interval=interval,
            group_by="ticker",
            progress=False,
            threads=True,
            ignore_tz=True,
            auto_adjust=False,
        )
    except Exception as e:
        # Best-effort: a single failing batch must not abort the whole run.
        print(f"Error downloading batch {batch}: {e}")
        return pd.DataFrame()

    # Treat a missing result the same way as an empty one.
    return raw_df if raw_df is not None else pd.DataFrame()


def _split_by_symbol(raw_df: pd.DataFrame, batch: List[str]) -> Dict[str, pd.DataFrame]:
    """Split a downloaded batch frame into one flat OHLCV frame per symbol."""
    # With group_by="ticker" the columns are expected to be (symbol, field)
    # pairs; a flat column index is attributed to the first symbol of the batch.
    grouped = isinstance(raw_df.columns, pd.MultiIndex)
    available = set(raw_df.columns.get_level_values(0)) if grouped else set()

    frames: Dict[str, pd.DataFrame] = {}
    for symbol in batch:
        if grouped:
            if symbol not in available:
                continue
            # Exact lookup on the ticker level, so a symbol can never pick up
            # the columns of another symbol that merely shares its prefix.
            ohlcv = raw_df[symbol]
        elif symbol == batch[0]:
            ohlcv = raw_df
        else:
            continue

        fields = [str(field) for field in ohlcv.columns]

        # The timestamp index may be named "Date", "Datetime" or nothing at
        # all; after reset_index it is always the first column, so it is
        # renamed positionally instead of by name.
        df_symbol = ohlcv.reset_index(drop=False)
        df_symbol.columns = ["Datetime"] + fields
        df_symbol["Symbol"] = symbol

        # Skip symbols for which no usable price data came back.
        if df_symbol.empty or "Open" not in fields or df_symbol["Open"].isna().all():
            continue

        frames[symbol] = df_symbol

    return frames


def download_data(
    symbols: List[str],
    period: str,
    interval: str,
    batch_size: int = 100,
    delay: float = 0.1
) -> Dict[str, pd.DataFrame]:
    """
    Download OHLCV data for multiple symbols using Yahoo Finance
    and save all downloaded data as one .pkl file in data/historical_data.

    Symbols are requested in batches of ``batch_size``. A batch that fails or
    returns no data is skipped, and symbols without any "Open" values are
    left out of the result. The resulting dictionary is pickled to a
    timestamped file, even when it is empty.

    Args:
        symbols (List[str]): List of ticker symbols to download.
        period (str): Data period (e.g., "1d", "5d", "1mo", "1y").
        interval (str): Candlestick interval (e.g., "1m", "15m", "1h", "1d").
        batch_size (int, optional): Number of symbols per request batch. Defaults to 100.
        delay (float, optional): Delay (in seconds) between batches to avoid rate limiting. Defaults to 0.1.

    Returns:
        Dict[str, pd.DataFrame]: Mapping of symbol → OHLCV DataFrame. Each frame
        has a "Datetime" column, the downloaded price fields and a "Symbol" column.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    tqdm = get_tqdm()

    # === Ensure output directory exists ===
    try:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # __file__ is undefined in interactive sessions (notebook / REPL);
        # fall back to the current working directory.
        # NOTE(review): confirm this is the desired location for notebook runs.
        base_dir = os.getcwd()
    data_dir = os.path.join(base_dir, "../../data/historical_data")
    os.makedirs(data_dir, exist_ok=True)

    results: Dict[str, pd.DataFrame] = {}
    num_batches = (len(symbols) - 1) // batch_size + 1

    for start in tqdm(range(0, len(symbols), batch_size), desc="Downloading batches", total=num_batches):
        batch = symbols[start:start + batch_size]

        raw_df = _fetch_batch(batch, period, interval)
        if not raw_df.empty:
            results.update(_split_by_symbol(raw_df, batch))

        # Pause after every batch, successful or not, to avoid rate limiting.
        time.sleep(delay)

    # === Save entire dictionary as a single pickle file ===
    filename = f"historical_data_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.pkl"
    save_path = os.path.join(data_dir, filename)
    pd.to_pickle(results, save_path)
    print(f"\n✅ Saved all downloaded data to {save_path}")

    return results