In [None]:
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.optimize import brentq
from datetime import time

In [None]:
#Global Variables

risk_free_rate=0.05
timeframe= '30min'
trading_sessions= 14

In [None]:
#Math Logic

# Black-Scholes formula
def black_scholes_price(S, K, T, r, sigma, option_type="C"):
    d1 = (np.log(S / K) + (r + 0.5 * sigma ** 2) * T) / (sigma * np.sqrt(T))
    d2 = d1 - sigma * np.sqrt(T)

    if option_type == "C":
        return S * norm.cdf(d1) - K * np.exp(-r * T) * norm.cdf(d2)
    elif option_type == "P":
        return K * np.exp(-r * T) * norm.cdf(-d2) - S * norm.cdf(-d1)

# Implied Volatility using Newton Raphson
def calculate_iv(option_price, S, K, T, r, option_type="C"):
    def bs_error(sigma):
        return black_scholes_price(S, K, T, r, sigma, option_type) - option_price

    try:
        return brentq(bs_error, 1e-6, 3)  
    except ValueError:
        return np.nan  

#Find z-score for mean reversion
def calculate_zscore(data, lookback):
    mean = data.rolling(window=lookback).mean()
    std = data.rolling(window=lookback).std()
    zscore = (data - mean) / std
    return zscore

#Calculates sharpe for a pnl series
def calculate_sharpe_ratio(returns, risk_free_rate=0):
    excess_returns = returns - risk_free_rate
    sharpe_ratio= excess_returns.mean() / excess_returns.std()
    return sharpe_ratio

#Max drawdown of a pnl series
def calculate_drawdown(returns):
  cumulative_returns = (1 + returns).cumprod()
  peak = cumulative_returns.cummax()
  drawdown = (cumulative_returns - peak) / peak
  return drawdown

In [136]:
# Data operations

#Load Data
def load_data(file_path):
    data = pd.read_csv(file_path, parse_dates=['start_ts'], usecols=['symbol','start_ts','open','high','low','close','volume','vwap'])
    data = data.dropna()
    return data

def strip_extra_trading_hours(data):
    
    data=data.reset_index()

    data = data.drop(data[data['start_ts'].dt.time< time(9,30,0)].index)
    data = data.drop(data[data['start_ts'].dt.time>= time(16,0,0)].index)

    return data

'''Filter the Data
    1. Convert to 30min timeframe
    2. Strip off extra trading hours
'''
def convert_eqt_data(eqt,new_timeframe):  

    eqt = eqt.set_index('start_ts')

    eqt_data = eqt.resample(new_timeframe).agg(    
        close= ('close', 'last'),
        Weighed_Price_Volume= ('vwap', lambda x:(x* eqt.loc[x.index, 'volume']).sum()),
        volume= ('volume','sum'),
        symbol= ('symbol','last')
    )

    eqt_data['vwap'] = eqt_data['Weighed_Price_Volume']/eqt_data['volume']
    eqt_data=eqt_data.drop(columns=['Weighed_Price_Volume'])

    #Strip off extra trading hours
    eqt_data=strip_extra_trading_hours(eqt_data)

    eqt_data= eqt_data.reset_index()
    eqt_data = eqt_data.dropna()
    return eqt_data

#See the data is alredy sorted
def data_filtering(option):

    #Add columns 
    option_characteristics=pd.DataFrame((x.split('_') for x in option['symbol']), columns=['ticker','expiry_date','strike_price','option_type'])
    option_characteristics['expiry_date']=pd.to_datetime(option_characteristics['expiry_date'])
    option_characteristics['strike_price']= option_characteristics['strike_price'].astype(float)
    option=pd.concat([option,option_characteristics],axis=1)

    option=option.drop(option[option['option_type']=='P'].index)

    option=option.reset_index()

    return option


'''Computationally cumbersome- Assumptions
        1. Selected the strike just greater than spot may not be closest but should be fairly liquid
        2. TTE != 0DTE
        3. Trading only on option closes
'''
def convert_option_data(option,new_timeframe):  

    option_close= option.pivot(index='start_ts', columns='symbol', values='close')
    option_close= option_close.resample(new_timeframe).last()
    option_close= strip_extra_trading_hours(option_close)    

    # option_volume= option.pivot(index='start_ts', columns='symbol', values='volume')
    # option_volume= option_volume.resample(new_timeframe).sum()
    # option_volume= strip_extra_trading_hours(option_volume)         

    return option_close

def precompute_spread(eqt, option_close, option_all):

    spread=pd.DataFrame()

    option_close=option_close.set_index('start_ts')

    for _,rows in eqt.iterrows():
        symbol=rows['symbol']
        start_ts=rows['start_ts']
        spot_price=rows['close']
        
        valid_expires = option_all[option_all['expiry_date'] > rows['start_ts']]
        closest_expires= valid_expires['expiry_date'].unique()[:2]
        near_expiry= str(closest_expires[0].date().strftime("%Y%m%d"))
        far_expiry= str(closest_expires[1].date().strftime("%Y%m%d"))

        otm_strikes= option_all[option_all['strike_price'] >= spot_price]
        nearest_atm_strike= int(otm_strikes.iloc[0]['strike_price'])

        option1=f"{symbol}_{near_expiry}_{nearest_atm_strike}_C"
        option2=f"{symbol}_{far_expiry}_{nearest_atm_strike}_C"

        option1_price= option_close.loc[start_ts,option1]
        option2_price= option_close.loc[start_ts,option2]

        # print(start_ts, option1, option1_price, option2, option2_price)

        tte1=(closest_expires[0]- start_ts)/pd.Timedelta(days=365)
        tte2=(closest_expires[1]- start_ts)/pd.Timedelta(days=365)

        temp={}
        temp['start_ts']= start_ts
        temp['price']= option2_price -option1_price
        temp['iv']= calculate_iv(option2_price, spot_price, nearest_atm_strike, tte2 ,risk_free_rate, 'C')- calculate_iv(option1_price, spot_price, nearest_atm_strike, tte1 ,risk_free_rate, 'C')

        spread=pd.concat([spread, pd.DataFrame([temp])])

    return spread

    

In [124]:
# Feature Engineering
def generate_features(data, window=60):

    # IV and price spreads
    data['IV_Spread'] = data['IV_Exp1'] - data['IV_Exp2']
    data['Price_Spread'] = data['Price_Exp1'] - data['Price_Exp2']
    
    # Rolling statistics
    data['IV_Mean'] = data['IV_Spread'].rolling(window).mean()
    data['IV_Std'] = data['IV_Spread'].rolling(window).std()
    data['Price_Mean'] = data['Price_Spread'].rolling(window).mean()
    data['Price_Std'] = data['Price_Spread'].rolling(window).std()
    
    # Z-scores
    data['IV_Z_Score'] = (data['IV_Spread'] - data['IV_Mean']) / data['IV_Std']
    data['Price_Z_Score'] = (data['Price_Spread'] - data['Price_Mean']) / data['Price_Std']
    
    # Lagged features
    for lag in range(1, 4):
        data[f'IV_Spread_Lag{lag}'] = data['IV_Spread'].shift(lag)
        data[f'Price_Spread_Lag{lag}'] = data['Price_Spread'].shift(lag)
        data[f'IV_Z_Score_Lag{lag}'] = data['IV_Z_Score'].shift(lag)
    
    # Target: Binary classification (Profit/Loss based on future spread return)
    data['Target'] = (data['Price_Spread'].shift(-1) - data['Price_Spread']) > 0
    data['Target'] = data['Target'].astype(int)
    
    # print(data.head)
    return data.dropna()

In [None]:
'''Equity trading hours from 4am to 8pm'''
eqt_path = 'CBOE Data/testeqt.csv'
eqt = load_data(eqt_path)

eqt_data = convert_eqt_data(eqt,timeframe)
print(eqt_data.to_string())

In [None]:
'''Options trading hours from 9.30am to 4pm = 14 Trading sessions'''
option_path = 'CBOE Data/testopt.csv'
option = load_data(option_path)

option_all= data_filtering(option)

#Convert to 30min Timeframe
option_close = convert_option_data(option,timeframe)
print(option_close.to_string())

In [137]:
spread= precompute_spread(eqt_data,option_close,option_all)
print(spread)

             start_ts  price        iv
0 2023-01-03 09:30:00   1.46 -0.118540
0 2023-01-03 10:00:00   1.35 -0.143631
0 2023-01-03 10:30:00   1.45 -0.118076
0 2023-01-03 11:00:00   1.45 -0.114945
0 2023-01-03 11:30:00   1.49 -0.117617
0 2023-01-03 12:00:00   1.47 -0.117927
0 2023-01-03 12:30:00   1.50 -0.115089
0 2023-01-03 13:00:00   1.50 -0.122156
0 2023-01-03 13:30:00   1.49 -0.120786
0 2023-01-03 14:00:00   1.44 -0.131022
0 2023-01-03 14:30:00   1.47 -0.129440
0 2023-01-03 15:00:00   1.43 -0.128920
0 2023-01-03 15:30:00   1.50 -0.114424
0 2023-01-04 09:30:00   1.50 -0.160269
0 2023-01-04 10:00:00   1.46 -0.168187
0 2023-01-04 10:30:00   1.39 -0.165763
0 2023-01-04 11:00:00   1.47 -0.149953
0 2023-01-04 11:30:00   1.54 -0.139541
0 2023-01-04 12:00:00   1.47 -0.141188
0 2023-01-04 12:30:00   1.50 -0.153102
0 2023-01-04 13:00:00   1.45 -0.161549
0 2023-01-04 13:30:00   1.53 -0.138948
0 2023-01-04 14:00:00   1.61 -0.147354
0 2023-01-04 14:30:00   1.50 -0.151272
0 2023-01-04 15:00:00   1