In [None]:
import numpy as np

def hanzo_df_array(symbols, df, hist=35, future=7, max_nan_ratio=0.1):
    """Build sliding-window change features and forward-change targets.

    For every row index ``idx`` that has enough history and look-ahead, the
    features are the ``hist`` consecutive row-over-row relative changes of
    ``Close`` and ``Volume`` ending at row ``idx - 1``, and the target is the
    relative change of ``Close`` between row ``idx`` and row ``idx + future``.
    A window is kept only if every feature and target value is finite, so
    windows touching a NaN *or* a zero denominator (e.g. a zero-volume row,
    which yields ``inf``) are skipped.

    Args:
        symbols: Column labels to screen for missing data. Each must be a
            column of both ``df["Close"]`` and ``df["Volume"]``.
        df: DataFrame whose ``"Close"`` and ``"Volume"`` selections are frames
            with one column per instrument.
        hist: Number of consecutive changes per window (default 35).
        future: Look-ahead in rows used for the target (default 7).
        max_nan_ratio: A screened symbol is dropped when more than this
            fraction of its Close or Volume values is NaN (default 0.1).

    Returns:
        dict with two arrays:
            ``"x"``: shape ``(n_windows, n_columns, hist, 2)``; the last axis
            holds the close change (index 0) and volume change (index 1).
            ``"y"``: shape ``(n_windows, n_columns, 1)``.
        ``n_columns`` counts the columns left after dropping.

    Raises:
        ValueError: if ``hist`` or ``future`` is not positive, or if no window
            survives the filtering (too few rows or too many gaps).
        KeyError: if a symbol is not a column of the Close/Volume frames.
    """
    if hist < 1 or future < 1:
        raise ValueError("hist and future must be positive")

    close = df["Close"]
    vol = df["Volume"]

    # Check NaN ratios. Symbols are collected first and dropped in one go so a
    # symbol listed twice is reported and removed only once.
    dropped = []
    for sym in symbols:
        sym_close = close[sym].to_numpy()
        sym_vol = vol[sym].to_numpy()
        nan_ratio_close = np.isnan(sym_close).sum() / len(sym_close)
        nan_ratio_vol = np.isnan(sym_vol).sum() / len(sym_vol)
        nan_ratio = max(nan_ratio_close, nan_ratio_vol)
        if nan_ratio > max_nan_ratio and sym not in dropped:
            print(f"{sym} nan ratio: {nan_ratio:.4f} - dropping")
            dropped.append(sym)
    close = close.drop(columns=dropped)
    vol = vol.drop(columns=dropped)

    # Convert to np array
    close_np, vol_np = close.to_numpy(), vol.to_numpy()
    x, y = [], []

    # Zero denominators are expected (they produce inf/NaN) and the affected
    # windows are filtered out below, so the warnings carry no information.
    with np.errstate(divide="ignore", invalid="ignore"):
        # Computed once instead of per window: *_chg[j] is the relative change
        # from row j to row j + 1.
        close_chg = close_np[1:] / close_np[:-1] - 1
        vol_chg = vol_np[1:] / vol_np[:-1] - 1

        for idx in range(hist + 1, len(close_np) - future):
            # The hist changes whose end rows are idx - hist .. idx - 1.
            window = slice(idx - hist - 1, idx - 1)
            x_idx = np.stack((close_chg[window].T, vol_chg[window].T), axis=2)

            y0 = close_np[idx]
            y1 = close_np[idx + future]
            y_idx = np.array([y1 / y0 - 1]).T

            # isfinite rejects inf as well as NaN; isnan alone let inf through.
            if np.isfinite(x_idx).all() and np.isfinite(y_idx).all():
                x.append(x_idx)
                y.append(y_idx)

    if not x:
        raise ValueError(
            f"no usable windows: need at least {hist + future + 2} rows with "
            f"finite Close/Volume changes, got {len(close_np)} rows"
        )

    x_np = np.stack(x, axis=0)
    y_np = np.stack(y, axis=0)

    return {"x": x_np, "y": y_np}

  v_idx = v1 / v0 - 1


In [44]:
# Inspect the shape of the stacked feature array.
# NOTE(review): in the visible cells `x_np` is only assigned inside
# hanzo_df_array, so this presumably relies on a leftover kernel global — confirm.
x_np.shape

(1628, 101, 35, 2)

In [30]:
# Scratch replay of one iteration of the windowing loop in hanzo_df_array,
# using the first index that loop visits (hist + 1).
# NOTE(review): hist, future, close_np and vol_np are not assigned at top level
# in any visible cell — presumably leftover kernel globals; confirm before re-running.
idx = hist + 1
# Row-over-row relative change of close for the hist rows ending at idx - 1.
p0 = close_np[idx - hist - 1:idx - 1]
p1 = close_np[idx - hist:idx]
p_idx = p1 / p0 - 1

# Same construction for volume.
v0 = vol_np[idx - hist - 1:idx - 1]
v1 = vol_np[idx - hist:idx]
v_idx = v1 / v0 - 1

# Relative change of close from row idx to row idx + future, turned into a
# column via the wrap-in-list + transpose.
y0 = close_np[idx]
y1 = close_np[idx + future]
y_idx = np.array(([y1 / y0 - 1])).T
y_idx.shape



(101, 1)

In [9]:
# NOTE(review): `x` is not assigned at top level in any visible cell (only
# inside hanzo_df_array) — presumably a leftover kernel global holding the
# per-window list; confirm.
len(x)

1705

In [6]:
# Build the feature/target arrays, then show index 0 of the last axis of "x"
# (the close-derived values stacked first by hanzo_df_array).
hanzo_data = hanzo_df_array(symbols=symbols, df=df)
hanzo_data["x"][..., 0]

array([[[181.88000488, 175.58500671, 112.98000336, ...,  49.125     ,
          56.0233345 , 102.30500031],
        [181.99000549, 175.52000427, 112.73999786, ...,  48.90999985,
          56.18666458, 102.52999878],
        [182.0249939 , 175.13000488, 113.30999756, ...,  51.29000092,
          56.22333145, 103.07499695],
        ...,
        [184.94000244, 177.19999695, 119.12999725, ...,  53.65000153,
          58.42256546, 105.31999969],
        [184.55999756, 176.76100159, 118.86000061, ...,  53.60499954,
          58.43666458, 104.95999908],
        [184.36999512, 176.72999573, 119.        , ...,  53.375     ,
          58.4600029 , 104.75      ]],

       [[181.99000549, 175.52000427, 112.73999786, ...,  48.90999985,
          56.18666458, 102.52999878],
        [182.0249939 , 175.13000488, 113.30999756, ...,  51.29000092,
          56.22333145, 103.07499695],
        [182.05000305, 175.57000732, 113.86000061, ...,  51.75      ,
          56.41666794, 103.25      ],
        ...,


In [2]:
import constants
import yfinance as yf

# Symbols are the second tab-separated field of each line of constants.sa_str;
# only the first 100 are kept.
lines = constants.sa_str.splitlines()
symbols = [line.split("\t")[1] for line in lines]
symbols = symbols[:100]

# Fetch the selected symbols plus SPY in a single yfinance request.
df = yf.download(
    [*symbols, "SPY"],
    interval="1h",
    period="1y",
    ignore_tz=True,
)

[*********************100%***********************]  101 of 101 completed
