In [2]:
# --- SETUP ---
from pathlib import Path
import pandas as pd, numpy as np, yfinance as yf

# Folders
DATA_RAW  = Path("../data/raw");  DATA_RAW.mkdir(parents=True, exist_ok=True)
DATA_PROC = Path("../data/processed"); DATA_PROC.mkdir(parents=True, exist_ok=True)

# Date
TODAY = pd.Timestamp.today().normalize()
TODAY_STR = TODAY.strftime("%Y-%m-%d")

print("Environment ready", TODAY_STR)


Environment ready 2025-10-13


In [3]:
# --- FETCH SPY OPTION CHAINS & BUILD CALL-PUT PAIRS ---
def fetch_pairs_exact(symbol="SPY", max_exp=10):
    tk = yf.Ticker(symbol)
    exps = sorted(tk.options)[:max_exp]
    today = pd.Timestamp.today().normalize()
    all_pairs = []

    # Spot price
    hist = yf.download(symbol, period="5d", auto_adjust=True, progress=False)
    close = hist["Close"][symbol] if isinstance(hist.columns, pd.MultiIndex) else hist["Close"]
    S = float(close.iloc[-1])

    for ex in exps:
        ch = tk.option_chain(ex)
        calls, puts = ch.calls.copy(), ch.puts.copy()
        calls["option_type"], puts["option_type"] = "call", "put"
        calls["expiry"] = puts["expiry"] = pd.to_datetime(ex)

        # Clean & calculate mids
        for df in [calls, puts]:
            df.rename(columns={"lastPrice":"last","openInterest":"oi","impliedVolatility":"iv"}, inplace=True)
            df["mid"] = (df["bid"].fillna(0)+df["ask"].fillna(0))/2
            df["spread"] = (df["ask"]-df["bid"]).abs()

        # Match exact strikes
        common = np.intersect1d(calls["strike"], puts["strike"])
        calls = calls[calls["strike"].isin(common)]
        puts = puts[puts["strike"].isin(common)]
        pairs = pd.merge(calls, puts, on=["expiry","strike"], suffixes=("_c","_p"))
        if len(pairs)==0: continue

        pairs["S"], pairs["trade_date"] = S, today
        pairs["T_days"] = (pairs["expiry"] - today).dt.days.clip(lower=0)
        pairs["T"] = pairs["T_days"]/365.0
        pairs = pairs[pairs["T_days"]>=3]
        all_pairs.append(pairs)

    return pd.concat(all_pairs, ignore_index=True) if all_pairs else pd.DataFrame()

pairs = fetch_pairs_exact("SPY", max_exp=10)
print("Pairs found:", len(pairs))
pairs[["expiry","strike","mid_c","mid_p","spread_c","spread_p"]].head(10)


Pairs found: 564


Unnamed: 0,expiry,strike,mid_c,mid_p,spread_c,spread_p
0,2025-10-16,550.0,112.375,0.035,2.63,0.01
1,2025-10-16,570.0,92.455,0.055,2.55,0.01
2,2025-10-16,575.0,87.345,0.055,2.77,0.01
3,2025-10-16,601.0,61.575,0.115,2.47,0.01
4,2025-10-16,602.0,60.545,0.12,2.57,0.0
5,2025-10-16,603.0,59.435,0.125,2.79,0.01
6,2025-10-16,604.0,58.39,0.125,2.86,0.01
7,2025-10-16,605.0,57.54,0.125,2.6,0.01
8,2025-10-16,606.0,56.535,0.125,2.59,0.01
9,2025-10-16,607.0,55.56,0.135,2.58,0.01


In [4]:
pairs.to_parquet("../data/processed/pairs_snapshot_SPY.parquet", index=False, engine="pyarrow")
print("Saved pairs snapshot")


Saved pairs snapshot


In [5]:
#check
print("Unique expiries:", pairs["expiry"].nunique())
print("Strike range:", pairs["strike"].min(), "to", pairs["strike"].max())
pairs.sample(5)


Unique expiries: 7
Strike range: 325.0 to 850.0


Unnamed: 0,contractSymbol_c,lastTradeDate_c,strike,last_c,bid_c,ask_c,change_c,percentChange_c,volume_c,oi_c,...,inTheMoney_p,contractSize_p,currency_p,option_type_p,mid_p,spread_p,S,trade_date,T_days,T
562,SPY251024C00701000,2025-10-13 15:09:39+00:00,701.0,0.03,0.03,0.04,0.0,0.0,2.0,250.0,...,True,REGULAR,USD,put,38.455,2.53,663.039978,2025-10-13,11,0.030137
13,SPY251016C00611000,2025-10-10 14:58:23+00:00,611.0,60.36,50.65,52.86,0.0,0.0,,2.0,...,False,REGULAR,USD,put,0.155,0.01,663.039978,2025-10-13,3,0.008219
401,SPY251022C00689000,2025-10-10 19:04:46+00:00,689.0,0.07,0.07,0.08,0.0,0.0,,89.0,...,True,REGULAR,USD,put,26.325,2.73,663.039978,2025-10-13,9,0.024658
280,SPY251020C00660000,2025-10-13 15:27:43+00:00,660.0,7.75,7.6,7.64,3.1,66.666664,1324.0,1523.0,...,False,REGULAR,USD,put,4.66,0.04,663.039978,2025-10-13,7,0.019178
521,SPY251024C00657000,2025-10-13 15:21:46+00:00,657.0,11.99,11.9,11.96,3.849999,47.297287,85.0,546.0,...,False,REGULAR,USD,put,5.385,0.05,663.039978,2025-10-13,11,0.030137


In [6]:
# --- Fix underlying spot price if needed ---
spot_df = yf.download("SPY", period="1d", progress=False)
spot_price = float(spot_df["Close"].iloc[-1])
print("Correct SPY spot price:", spot_price)

# Replace old S values with the real spot
pairs["S"] = spot_price


  spot_df = yf.download("SPY", period="1d", progress=False)


Correct SPY spot price: 663.1099853515625


  spot_price = float(spot_df["Close"].iloc[-1])


In [7]:
# --- CLEAN LIQUIDITY FILTER ---
pairs_clean = pairs[
    (pairs["spread_c"] < 1.0) &  # calls spread under $1
    (pairs["spread_p"] < 1.0) &  # puts spread under $1
    (pairs["mid_c"] > 0) &
    (pairs["mid_p"] > 0)
].copy()

print("Before filter:", len(pairs))
print("After filter:", len(pairs_clean))
pairs_clean[["expiry","strike","mid_c","mid_p","spread_c","spread_p"]].head(10)


Before filter: 564
After filter: 413


Unnamed: 0,expiry,strike,mid_c,mid_p,spread_c,spread_p
16,2025-10-16,614.0,48.875,0.165,0.19,0.01
17,2025-10-16,615.0,47.9,0.175,0.18,0.01
18,2025-10-16,616.0,46.86,0.18,0.18,0.02
19,2025-10-16,617.0,45.985,0.185,0.17,0.01
20,2025-10-16,618.0,44.865,0.195,0.19,0.01
21,2025-10-16,620.0,42.92,0.2,0.18,0.02
22,2025-10-16,622.0,40.935,0.22,0.19,0.02
23,2025-10-16,623.0,39.945,0.215,0.19,0.01
24,2025-10-16,624.0,38.925,0.24,0.19,0.02
25,2025-10-16,625.0,37.96,0.245,0.18,0.01
