In [2]:
from datetime import datetime, date
import numpy as np
import pandas as pd
import yfinance as yf
from pathlib import Path

# project paths
DATA_RAW = Path("data/raw")
DATA_PROC = Path("data/processed")
DATA_RAW.mkdir(parents=True, exist_ok=True)
DATA_PROC.mkdir(parents=True, exist_ok=True)

TODAY_STR = pd.Timestamp.today().strftime("%Y-%m-%d")
TODAY = pd.Timestamp.today().normalize()
TRADING_DAYS_PER_YEAR = 252  # for later use



In [3]:
candidates = ["^FTSE", "ISF.L", "EWU"]  # index, UK ETF (London), UK ETF (US)
picked = None
for sym in candidates:
    try:
        tk = yf.Ticker(sym)
        exps = tk.options
        if exps and len(exps) > 0:
            picked = sym
            break
    except Exception as e:
        pass

if picked is None:
    raise RuntimeError("No option chains found on Yahoo for the FTSE candidates.")

picked, yf.Ticker(picked).options[:5]  # show first few expiries


('EWU', ('2025-10-17', '2025-11-21', '2026-01-16', '2026-04-17'))

In [4]:
tk = yf.Ticker(picked)
expiries = tk.options
expiries = sorted(expiries)[:4]  # take first ~4 near-dated expiries

def fetch_chain(symbol, expiry):
    chain = tk.option_chain(expiry)
    calls = chain.calls.copy()
    puts  = chain.puts.copy()
    calls["option_type"] = "call"
    puts["option_type"]  = "put"
    calls["expiry"] = pd.to_datetime(expiry)
    puts["expiry"]  = pd.to_datetime(expiry)
    return calls, puts

frames = []
for ex in expiries:
    c, p = fetch_chain(picked, ex)
    frames.append(("calls", c))
    frames.append(("puts", p))

len(frames), expiries


(8, ['2025-10-17', '2025-11-21', '2026-01-16', '2026-04-17'])

In [5]:
def tidy(df, symbol):
    keep = ["contractSymbol","lastPrice","bid","ask","change","percentChange",
            "volume","openInterest","impliedVolatility","strike","expiry","option_type"]
    df = df[keep].copy()
    df.rename(columns={
        "lastPrice":"last",
        "openInterest":"oi",
        "impliedVolatility":"iv"
    }, inplace=True)
    df["mid"] = (df["bid"].fillna(0) + df["ask"].fillna(0)) / 2
    df["spread"] = (df["ask"] - df["bid"]).abs()
    df["symbol"] = symbol
    df["trade_date"] = TODAY
    return df

tidied = []
for kind, df in frames:
    tidied.append(tidy(df, picked))

opt = pd.concat(tidied, ignore_index=True)
opt.head()


Unnamed: 0,contractSymbol,last,bid,ask,change,percentChange,volume,oi,iv,strike,expiry,option_type,mid,spread,symbol,trade_date
0,EWU251017C00036000,4.36,5.2,6.2,0.0,0.0,5.0,7,1.255863,36.0,2025-10-17,call,5.7,1.0,EWU,2025-10-13
1,EWU251017C00037000,5.0,0.0,0.0,0.0,0.0,5.0,84,1e-05,37.0,2025-10-17,call,0.0,0.0,EWU,2025-10-13
2,EWU251017C00038000,4.68,0.0,0.0,0.0,0.0,2.0,21,1e-05,38.0,2025-10-17,call,0.0,0.0,EWU,2025-10-13
3,EWU251017C00039000,3.58,0.0,0.0,0.0,0.0,2.0,75,1e-05,39.0,2025-10-17,call,0.0,0.0,EWU,2025-10-13
4,EWU251017C00040000,2.45,0.0,0.0,0.0,0.0,4.0,1530,1e-05,40.0,2025-10-17,call,0.0,0.0,EWU,2025-10-13


In [6]:
# Robust spot fetch for single or multi-index columns
hist = yf.download(picked, period="5d", auto_adjust=True, progress=False)

if isinstance(hist.columns, pd.MultiIndex):
    # level 0 like: ('Close', 'High', ...)  level 1: ticker
    close = hist['Close']
    if picked in close.columns:
        spot = close[picked].iloc[-1]
    else:
        # sometimes the column name differs slightly; squeeze to a Series
        spot = close.iloc[-1].squeeze()
else:
    # normal single-index columns
    spot = hist['Close'].iloc[-1]

spot = float(spot)   # ensure plain Python float
opt['S'] = spot


# Time to maturity in years (ACT/365)
opt["T_days"] = (opt["expiry"].dt.normalize() - TODAY).dt.days.clip(lower=0)
opt["T"] = opt["T_days"] / 365.0

# Basic filters (liquidity & sanity)
opt = opt[(opt["bid"].notna()) & (opt["ask"].notna())]
opt = opt[(opt["oi"].fillna(0) >= 10)]                 # tweak threshold if thin
opt = opt[(opt["spread"] >= 0)]                        # no negatives
opt = opt[(opt["mid"] > 0)]                            # price must be positive
opt = opt[(opt["T_days"] >= 3)]                        # avoid very near expiry noise

len(opt), opt.head(3)


(21,
         contractSymbol  last   bid   ask  change  percentChange  volume  oi  \
 9   EWU251017P00027000  0.53  0.05  0.20     0.0            0.0     NaN  20   
 10  EWU251017P00028000  0.30  0.00  0.15     0.0            0.0     1.0  21   
 11  EWU251017P00030000  0.05  0.00  0.75     0.0            0.0     2.0  40   
 
           iv  strike     expiry option_type    mid  spread symbol trade_date  \
 9   2.097661    27.0 2025-10-17         put  0.125    0.15    EWU 2025-10-13   
 10  1.781251    28.0 2025-10-17         put  0.075    0.15    EWU 2025-10-13   
 11  2.123052    30.0 2025-10-17         put  0.375    0.75    EWU 2025-10-13   
 
         S  T_days         T  
 9   41.73       4  0.010959  
 10  41.73       4  0.010959  
 11  41.73       4  0.010959  )

In [7]:
opt["moneyness"] = opt["strike"] / opt["S"]
opt["ln_moneyness"] = np.log(opt["moneyness"])

calls = opt[opt["option_type"]=="call"].copy()
puts  = opt[opt["option_type"]=="put"].copy()

pairs = pd.merge(
    calls, 
    puts, 
    on=["symbol","expiry","strike","trade_date","S","T","T_days"],
    suffixes=("_c","_p"),
    how="inner"
)

len(pairs), pairs.head(3)


(0,
 Empty DataFrame
 Columns: [contractSymbol_c, last_c, bid_c, ask_c, change_c, percentChange_c, volume_c, oi_c, iv_c, strike, expiry, option_type_c, mid_c, spread_c, symbol, trade_date, S, T_days, T, moneyness_c, ln_moneyness_c, contractSymbol_p, last_p, bid_p, ask_p, change_p, percentChange_p, volume_p, oi_p, iv_p, option_type_p, mid_p, spread_p, moneyness_p, ln_moneyness_p]
 Index: []
 
 [0 rows x 35 columns])

In [10]:
%pip install pyarrow


Collecting pyarrow
  Downloading pyarrow-21.0.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Downloading pyarrow-21.0.0-cp311-cp311-win_amd64.whl (26.2 MB)
   ---------------------------------------- 0.0/26.2 MB ? eta -:--:--
   - -------------------------------------- 0.8/26.2 MB 4.8 MB/s eta 0:00:06
   -- ------------------------------------- 1.6/26.2 MB 4.4 MB/s eta 0:00:06
   --- ------------------------------------ 2.6/26.2 MB 4.4 MB/s eta 0:00:06
   ----- ---------------------------------- 3.4/26.2 MB 4.3 MB/s eta 0:00:06
   ------ --------------------------------- 4.2/26.2 MB 4.3 MB/s eta 0:00:06
   ------- -------------------------------- 5.2/26.2 MB 4.3 MB/s eta 0:00:05
   --------- ------------------------------ 6.0/26.2 MB 4.3 MB/s eta 0:00:05
   ---------- ----------------------------- 6.8/26.2 MB 4.3 MB/s eta 0:00:05
   ----------- ---------------------------- 7.9/26.2 MB 4.3 MB/s eta 0:00:05
   ------------- -------------------------- 8.7/26.2 MB 4.3 MB/s eta 0:00:05
   --

In [8]:
raw_path = DATA_RAW / f"option_chain_{picked}_{TODAY_STR}.parquet"
opt.to_parquet(raw_path, index=False, engine="pyarrow")

proc_path = DATA_PROC / f"pairs_{picked}_{TODAY_STR}.parquet"
pairs.to_parquet(proc_path, index=False, engine="pyarrow")

print("Raw saved:", raw_path.exists(), "| Proc saved:", proc_path.exists())


Raw saved: True | Proc saved: True


In [9]:
# checks

In [10]:
import pandas as pd, numpy as np
from pathlib import Path

raw_path  = Path("data/raw").glob("option_chain_*").__iter__().__next__()
proc_path = Path("data/processed").glob("pairs_*").__iter__().__next__()

opt   = pd.read_parquet(raw_path, engine="pyarrow")
pairs = pd.read_parquet(proc_path, engine="pyarrow")

print("rows raw:", len(opt), "| rows pairs:", len(pairs))
assert len(opt)>0 and len(pairs)>0, "No data!"


rows raw: 21 | rows pairs: 0


AssertionError: No data!

In [11]:
pairs.head()

Unnamed: 0,contractSymbol_c,last_c,bid_c,ask_c,change_c,percentChange_c,volume_c,oi_c,iv_c,strike,...,change_p,percentChange_p,volume_p,oi_p,iv_p,option_type_p,mid_p,spread_p,moneyness_p,ln_moneyness_p


In [12]:
print("Picked:", picked)
display(opt["option_type"].value_counts())
display(opt.groupby(["expiry","option_type"]).size().unstack(fill_value=0))


Picked: EWU


option_type
put     13
call     8
Name: count, dtype: int64

option_type,call,put
expiry,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-10-17,0,9
2026-01-16,8,4


In [13]:
expiry_check = pd.Timestamp("2026-01-16")
subset = opt[opt["expiry"] == expiry_check]

print("Unique strikes by type:")
print("CALLS:", sorted(subset.loc[subset["option_type"]=="call", "strike"].unique()))
print("PUTS :", sorted(subset.loc[subset["option_type"]=="put", "strike"].unique()))


Unique strikes by type:
CALLS: [np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(36.0), np.float64(38.0), np.float64(46.0)]
PUTS : [np.float64(22.0), np.float64(26.0), np.float64(29.0), np.float64(30.0)]


In [15]:
pairs_list = []
tol = 1.0  # strike difference tolerance
for _, c in calls.iterrows():
    # Find the put with the closest strike
    diffs = (puts["strike"] - c["strike"]).abs()
    best_idx = diffs.idxmin()
    best_put = puts.loc[best_idx]
    if abs(best_put["strike"] - c["strike"]) <= tol:
        merged = c.to_dict()
        merged.update({f"{col}_p": best_put[col] for col in puts.columns})
        pairs_list.append(merged)

pairs = pd.DataFrame(pairs_list)
print("Pairs within ±1 strike:", len(pairs))
pairs[["strike", "strike_p", "mid", "mid_p"]].head()


Pairs within ±1 strike: 1


Unnamed: 0,strike,strike_p,mid,mid_p
0,31.0,30.0,4.15,0.5


In [16]:
import pandas as pd, numpy as np, yfinance as yf
from pathlib import Path

def fetch_pairs_exact(symbol, max_exp=20):
    tk = yf.Ticker(symbol)
    exps = sorted(tk.options)[:max_exp]
    all_pairs = []

    # spot (robust to MultiIndex)
    hist = yf.download(symbol, period="5d", auto_adjust=True, progress=False)
    close = hist["Close"][symbol] if isinstance(hist.columns, pd.MultiIndex) else hist["Close"]
    S = float(close.iloc[-1])
    today = pd.Timestamp.today().normalize()

    for ex in exps:
        ch = tk.option_chain(ex)
        for df, typ in [(ch.calls, "call"), (ch.puts, "put")]:
            df["option_type"] = typ
            df["expiry"] = pd.to_datetime(ex)

        calls = ch.calls[["contractSymbol","lastPrice","bid","ask","openInterest","impliedVolatility","strike"]].copy()
        puts  = ch.puts[ ["contractSymbol","lastPrice","bid","ask","openInterest","impliedVolatility","strike"]].copy()
        calls.rename(columns={"lastPrice":"last","openInterest":"oi","impliedVolatility":"iv"}, inplace=True)
        puts.rename (columns={"lastPrice":"last","openInterest":"oi","impliedVolatility":"iv"}, inplace=True)
        calls["expiry"] = pd.to_datetime(ex); puts["expiry"] = pd.to_datetime(ex)

        # exact common strikes only
        common = np.intersect1d(calls["strike"].values, puts["strike"].values)
        if common.size == 0: 
            continue
        calls_ = calls[calls["strike"].isin(common)].copy()
        puts_  = puts [puts ["strike"].isin(common)].copy()

        # mids & basic filters
        for df in (calls_, puts_):
            df["mid"] = (df["bid"].fillna(0) + df["ask"].fillna(0))/2
            df["spread"] = (df["ask"] - df["bid"]).abs()
        calls_ = calls_[(calls_["mid"]>0) & (calls_["bid"].notna()) & (calls_["ask"].notna())]
        puts_  = puts_ [(puts_ ["mid"]>0) & (puts_ ["bid"].notna()) & (puts_ ["ask"].notna())]

        if len(calls_)==0 or len(puts_)==0: 
            continue

        pairs = pd.merge(calls_, puts_, on=["expiry","strike"], suffixes=("_c","_p"), how="inner")
        if len(pairs)==0: 
            continue

        pairs["S"] = S
        pairs["trade_date"] = today
        pairs["T_days"] = (pairs["expiry"] - today).dt.days.clip(lower=0)
        pairs["T"] = pairs["T_days"]/365.0

        all_pairs.append(pairs)

    if not all_pairs:
        return pd.DataFrame()
    out = pd.concat(all_pairs, ignore_index=True)

    # quality screen (loose first; tighten later in Part C)
    out = out[(out["T_days"]>=3)]
    return out

candidates = ["ISF.L","EWU"]
best_symbol, best_pairs = None, pd.DataFrame()
for sym in candidates:
    p = fetch_pairs_exact(sym, max_exp=20)
    print(sym, "→ pairs:", len(p))
    if len(p) > len(best_pairs):
        best_symbol, best_pairs = sym, p

picked = best_symbol
pairs  = best_pairs
print("USING:", picked, "| total exact-match pairs:", len(pairs))
pairs[["expiry","strike","mid_c","mid_p","spread_c","spread_p"]].head(10)


ISF.L → pairs: 0
EWU → pairs: 0
USING: None | total exact-match pairs: 0


KeyError: "None of [Index(['expiry', 'strike', 'mid_c', 'mid_p', 'spread_c', 'spread_p'], dtype='object')] are in the [columns]"

In [17]:
from pathlib import Path
import pandas as pd

save_dir = Path("data/processed")
save_dir.mkdir(parents=True, exist_ok=True)

opt_path = save_dir / f"opt_snapshot_{picked}.parquet"
pairs_path = save_dir / f"pairs_snapshot_{picked}.parquet"

opt.to_parquet(opt_path, index=False, engine="pyarrow")
pairs.to_parquet(pairs_path, index=False, engine="pyarrow")

print("Saved to:")
print(opt_path)
print(pairs_path)


Saved to:
data\processed\opt_snapshot_None.parquet
data\processed\pairs_snapshot_None.parquet


In [18]:
!pip freeze > requirements.txt
print("Environment saved to requirements.txt")


Environment saved to requirements.txt
