In [1]:
import pickle as pkl
import pandas as pd
import yfinance as yf
import time

# Scrape the S&P 500 membership table from Wikipedia.
sp500_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
table = pd.read_html(sp500_url)
sp500_df = table[0]  # the constituents list is the first table on the page

# Yahoo writes share classes with a dash where the source table uses a dot
# (BRK.B -> BRK-B, BF.B -> BF-B), so normalise every symbol in one pass.
tickers = [symbol.replace(".", "-") for symbol in sp500_df["Symbol"].tolist()]


def fetch_history(
    tickers,
    start="2020-01-01",
    end=None,
    interval="1d",
    chunk_size=50,
    pause=1.5,
):
    """Download price history for ``tickers`` from Yahoo Finance in batches.

    Parameters
    ----------
    tickers : sequence of str
        Symbols to request. Must support ``len()`` and slicing.
    start, end, interval
        Forwarded unchanged to ``yf.download``.
    chunk_size : int, default 50
        Number of symbols sent per ``yf.download`` call (the original author's
        note was that values <= 100 are safer).
    pause : float, default 1.5
        Seconds to sleep *between* consecutive batches to avoid being
        throttled. No sleep happens after the final batch; a falsy or
        non-positive value disables sleeping altogether.

    Returns
    -------
    list
        One ``yf.download`` result per batch, in request order. Empty when
        ``tickers`` is empty.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # range() with a negative step would silently yield nothing and a
        # zero step raises an unrelated message, so fail early and clearly.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    all_data = []
    total = len(tickers)

    for i in range(0, total, chunk_size):
        batch = tickers[i : i + chunk_size]
        print(f"Fetching {batch[0]}...{batch[-1]}")

        df = yf.download(
            tickers=batch,
            start=start,
            end=end,
            interval=interval,
            group_by="ticker",
            auto_adjust=False,
            threads=True,
        )
        all_data.append(df)

        # Only wait when another request follows; sleeping after the last
        # batch just delays the caller.
        if pause and pause > 0 and i + chunk_size < total:
            time.sleep(pause)

    return all_data


# Load the cached download instead of hitting Yahoo Finance on every run.
# Presumably the cache was produced by the call below — confirm before
# relying on it, and re-run it to refresh the data:
#   history_batches = fetch_history(tickers, start="2020-01-01")
# NOTE(review): unpickling can execute arbitrary code, so only load a
# ./sp500.pkl that you created yourself.
with open("./sp500.pkl", "rb") as cache_file:  # closes the handle after loading
    history_batches = pkl.load(cache_file)

In [2]:
# Stitch the per-batch frames together side by side (one column group per
# ticker, as requested with group_by="ticker").
history = pd.concat(history_batches, axis=1)

# Slice each price/volume field out of the second column level, giving one
# wide frame per field with a column per ticker.
close_df, open_df, high_df, low_df, vol_df = (
    history.xs(field, axis=1, level=1)
    for field in ("Close", "Open", "High", "Low", "Volume")
)

In [3]:
# Keep the row labels of the merged history frame as a raw array; a later
# cell re-attaches them as the "date" column of the polars close-price frame.
dates = history.index.values

In [4]:
# Convert to polars for your pipeline
import polars as pl
from backtest_lib.market.polars_impl import Axis

# Convert the wide close-price frame to polars. NOTE(review): from_pandas is
# called without include_index, so the pandas index (the dates) is presumably
# not carried over — a later cell adds it back as a "date" column.
close_pl = pl.from_pandas(close_df)
# Column labels of the converted frame, i.e. one entry per ticker column.
securities = close_pl.columns
# NOTE(review): Axis comes from backtest_lib; presumably this builds an axis
# keyed by the ticker labels — confirm against the library.
axis = Axis.from_names(securities)

In [5]:
# Window parameters: how many period columns to show and where to start.
window_size = 4
stagger = 0

# Flip the frame so each row is a security and each column a period, then
# relabel the auto-generated "column_<i>" headers as "period_<i>".
pastview = close_pl.transpose()
pastview = pastview.rename(
    {name: name.replace("column_", "period_") for name in pastview.columns}
)

# Show the selected window of periods.
pastview.select(pastview.columns[stagger : stagger + window_size])

period_0,period_1,period_2,period_3
f64,f64,f64,f64
87.639999,87.239998,87.550003,90.199997
49.099998,48.599998,48.389999,48.25
12.795,12.553125,12.67875,12.803125
187.830002,184.949997,187.119995,187.5
68.433998,68.075996,69.890503,69.755501
…,…,…,…
46.119999,46.490002,46.389999,45.82
137.509995,137.020004,137.169998,135.160004
259.140015,256.049988,258.01001,256.470001
144.85437,144.475723,143.640778,143.514557


In [6]:
# now with the lib
from backtest_lib.market.polars_impl import PolarsPastView
from backtest_lib.market import PastView

# Re-attach the dates taken from the pandas index as a "date" column so the
# library receives the period labels alongside the per-ticker prices.
close_prices_df = close_pl.with_columns(pl.Series("date", dates))
# NOTE(review): the name says "cost" but the frame holds close prices —
# possibly a typo for "close"; left unchanged because later cells use it.
past_cost_prices = PolarsPastView.from_data_frame(close_prices_df)
# Check that the concrete polars view satisfies the PastView interface.
print(isinstance(past_cost_prices, PastView))
# NOTE(review): in the recorded run this line raised
# "AttributeError: 'numpy.ndarray' object has no attribute 'cast'"; the cause
# is not visible in this notebook — investigate the by_period lookup.
print(past_cost_prices.by_period[-1]["AAPL"])

True


AttributeError: 'numpy.ndarray' object has no attribute 'cast'

In [7]:
# Look up one security's full history by ticker.
# NOTE(review): the recorded output is the object's entire repr (values plus
# every period label); consider displaying a short slice instead.
print(past_cost_prices.by_security["AAPL"])

PolarsTimeseries(_vec=array([ 75.08750153,  74.35749817,  74.94999695, ..., 233.33000183,
       232.77999878, 231.58999634], shape=(1413,)), _axis=PeriodAxis(dt64=array(['2020-01-02T00:00:00.000000000', '2020-01-03T00:00:00.000000000',
       '2020-01-06T00:00:00.000000000', ...,
       '2025-08-13T00:00:00.000000000', '2025-08-14T00:00:00.000000000',
       '2025-08-15T00:00:00.000000000'],
      shape=(1413,), dtype='datetime64[ns]'), labels=('2020-01-02', '2020-01-03', '2020-01-06', '2020-01-07', '2020-01-08', '2020-01-09', '2020-01-10', '2020-01-13', '2020-01-14', '2020-01-15', '2020-01-16', '2020-01-17', '2020-01-21', '2020-01-22', '2020-01-23', '2020-01-24', '2020-01-27', '2020-01-28', '2020-01-29', '2020-01-30', '2020-01-31', '2020-02-03', '2020-02-04', '2020-02-05', '2020-02-06', '2020-02-07', '2020-02-10', '2020-02-11', '2020-02-12', '2020-02-13', '2020-02-14', '2020-02-18', '2020-02-19', '2020-02-20', '2020-02-21', '2020-02-24', '2020-02-25', '2020-02-26', '2020-02-27', '202

In [8]:
from backtest_lib.market import BySecurity, ByPeriod

# Check that each accessor satisfies its matching interface; prints one
# boolean per accessor, by_security first.
print(
    *(
        isinstance(accessor, interface)
        for accessor, interface in (
            (past_cost_prices.by_security, BySecurity),
            (past_cost_prices.by_period, ByPeriod),
        )
    )
)

True True


In [None]:
import polars as pl

# Names of the securities present in the most recent period.
universe = past_cost_prices.by_period[-1].names

# One series per security, collected into a single frame.
df = pl.DataFrame(past_cost_prices.by_security[name].as_series() for name in universe)

# One index per period. `dates` supplied the "date" column the view was built
# from, so its length is the number of periods. (The previous
# range(len(universe)) counted securities instead.)
periods = range(len(dates))

print(past_cost_prices.by_period[0])
# Use the loop variable so every period contributes its own series; the
# previous version always read by_period[0].
# NOTE(review): the recorded run of this cell ended with "RuntimeError: copy
# not allowed: cannot convert to a NumPy array without copying data"; that
# appears to originate inside the library and is not addressed here.
df = pl.DataFrame(past_cost_prices.by_period[period].as_series() for period in periods)

PolarsPastView(by_period=shape: (503, 1)
┌───────────────────────────────┐
│ 2020-01-02 00:00:00.000000000 │
│ ---                           │
│ f64                           │
╞═══════════════════════════════╡
│ 87.639999                     │
│ 49.099998                     │
│ 12.795                        │
│ 187.830002                    │
│ 68.433998                     │
│ …                             │
│ 46.119999                     │
│ 137.509995                    │
│ 259.140015                    │
│ 144.85437                     │
│ 134.139999                    │
└───────────────────────────────┘, by_security=shape: (1, 503)
┌───────────┬───────────┬────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ AKAM      ┆ AMD       ┆ ANET   ┆ ADSK       ┆ … ┆ TRV        ┆ ZBRA      ┆ ZBH       ┆ ZTS       │
│ ---       ┆ ---       ┆ ---    ┆ ---        ┆   ┆ ---        ┆ ---       ┆ ---       ┆ ---       │
│ f64       ┆ f64       ┆ f64    ┆ f64        ┆  

RuntimeError: copy not allowed: cannot convert to a NumPy array without copying data