In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import os

# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides warnings raised by pandas and the
# other libraries used below — TODO confirm that this is intended.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Option-chain column names of interest (quote/expiry timestamps, days to expiry,
# call/put quotes, underlying price and strike information).
# NOTE(review): not referenced by any other cell visible in this notebook.
COLUMN_OF_INTEREST = [
    'QUOTE_UNIXTIME', 'QUOTE_DATE', 'EXPIRE_DATE', 'EXPIRE_UNIX',
    'DTE', 'C_BID', 'C_ASK', 'P_BID', 'P_ASK',
    'UNDERLYING_LAST', 'STRIKE', 'STRIKE_DISTANCE',
]

In [3]:
# Load the SPY option quotes and clean them in a single pipeline:
#   1. parse QUOTE_DATE / EXPIRE_DATE into datetimes
#   2. drop the redundant unix-timestamp columns
#   3. turn empty / whitespace-only strings into NaN and drop incomplete rows
#   4. cast the call/put bid and ask quotes to float
df_options_spy = (
    pd.read_csv('data/2020-2022_SPY_30days.csv')
    .assign(
        QUOTE_DATE=lambda quotes: pd.to_datetime(quotes['QUOTE_DATE']),
        EXPIRE_DATE=lambda quotes: pd.to_datetime(quotes['EXPIRE_DATE']),
    )
    .drop(columns=['QUOTE_UNIXTIME', 'EXPIRE_UNIX'])
    .replace(r'^\s*$', np.nan, regex=True)
    .dropna()
    .astype({quote_col: float for quote_col in ('C_BID', 'C_ASK', 'P_BID', 'P_ASK')})
)

In [4]:
# Check the column dtypes after cleaning (dates as datetime, quotes as float)
df_options_spy.dtypes

QUOTE_DATE         datetime64[ns]
EXPIRE_DATE        datetime64[ns]
DTE                       float64
C_BID                     float64
C_ASK                     float64
P_BID                     float64
P_ASK                     float64
UNDERLYING_LAST           float64
STRIKE                    float64
STRIKE_DISTANCE           float64
dtype: object

In [5]:
# Preview the first rows of the cleaned option quotes
df_options_spy.head()

Unnamed: 0,QUOTE_DATE,EXPIRE_DATE,DTE,C_BID,C_ASK,P_BID,P_ASK,UNDERLYING_LAST,STRIKE,STRIKE_DISTANCE
0,2020-01-03,2020-01-03,0.0,62.43,63.04,0.0,0.01,322.43,260.0,62.4
1,2020-01-03,2020-01-03,0.0,57.43,58.07,0.0,0.01,322.43,265.0,57.4
2,2020-01-03,2020-01-03,0.0,52.34,53.06,0.0,0.01,322.43,270.0,52.4
3,2020-01-03,2020-01-03,0.0,49.91,50.54,0.0,0.01,322.43,272.5,49.9
4,2020-01-03,2020-01-03,0.0,47.35,48.04,0.0,0.0,322.43,275.0,47.4


In [46]:
# Calculate the rolling volatility of the underlying price
import yfinance as yf
from pypfopt.risk_models import CovarianceShrinkage

TICKER = ['SPY']

# the download must start at least one full rolling window (see rolling_period below)
# before 2020-01-01 so that every date in the backtest has a volatility estimate
start_date = '2017-01-01'
end_date = '2023-01-01' # end date is exclusive, the data will be downloaded until 2022-12-31

df = yf.download(TICKER, start=start_date, end=end_date)

# length of the rolling window in trading days (252 trading days per year, i.e. 1 year)
# rolling_period = 252*2   # alternative: 2-year window
rolling_period = int(252)
# dt = 1/252

# VOLATILITY METHOD 1: calculate the volatility using log return and take the standard deviation as is
# df['log_return'] = np.log(df['Adj Close'] / df['Adj Close'].shift(1))
# df['volatility'] = df['log_return'].rolling(rolling_period).std() * np.sqrt(252)

# VOLATILITY METHOD 2: calculate the volatility using log return and CovarianceShrinkage
df_spy = df['Adj Close']
df_spy.index.name = None
df['volatility'] = np.nan

# Apply CovarianceShrinkage to the rolling_period prices that precede each date
# (the window excludes row i itself, so the estimate only uses past prices)
for i in range(rolling_period, len(df_spy)):
    cov_matrix = CovarianceShrinkage(df_spy.iloc[i-rolling_period:i], log_returns=True).ledoit_wolf()

    # calculate the volatility. cov_matrix is annualized, so no need to multiply by sqrt(252)
    # Assign through a single df.loc call: the chained form df['volatility'].iloc[i] = ...
    # writes into an intermediate Series and does not update df under pandas Copy-on-Write.
    df.loc[df.index[i], 'volatility'] = np.sqrt(cov_matrix.iloc[0,0])

# we only need the data from 2020 to 2022
df = df.loc['2020-01-01':'2022-12-31']

[*********************100%***********************]  1 of 1 completed


In [47]:
# assumed constant risk-free rate for 2020 to 2022: around 2.5% per annum
risk_free_rate = 0.025

# assumed dividend yield for SPY: roughly 1.5% per annum
dividend_yield = 0.015

In [48]:
def binomial_model(S0, K, T, r, sigma, N, 
                   option='call', div=0, 
                   european=True, method='crr'):
    ''' Binomial model for option pricing.
    Parameters
    ==========
    S0 : float
        initial stock/index level
    K : float
        strike price
    T : float
        maturity (in year fractions), must be >= 0;
        T == 0 returns the intrinsic value without building a tree
    r : float
        constant risk-free short rate
    sigma : float
        volatility factor in diffusion term
        (must be non-zero, otherwise the up and down factors coincide)
    N : int
        number of time intervals, must be a whole number >= 1 when T > 0
    option : string
        type of the option to be valued ('call' or 'put')
    div : float
        annualized continuous dividend yield
    european : boolean
        True for European option valuation
        False for American option valuation
    method : string
        'crr' for Cox, Ross and Rubinstein
        'jr' for Jarrow and Rudd/equal probabilities 
    Returns
    =======
    value : float
        present value of the option (European or American, as selected
        by `european`)
    Raises
    ======
    ValueError
        if `option` or `method` is not recognized, if T is negative,
        or if N is not a whole number >= 1
    '''
    # Reject unknown option types instead of silently pricing them as puts.
    if option not in ('call', 'put'):
        raise ValueError('option not recognized')
    if method not in ('crr', 'jr'):
        raise ValueError('method not recognized')
    if T < 0:
        raise ValueError('T must be non-negative')

    # +1 for a call payoff (S - K), -1 for a put payoff (K - S)
    sign = 1.0 if option == 'call' else -1.0

    # At expiry there is nothing to discount: the option is worth its intrinsic value.
    if T == 0:
        return np.maximum(sign * (S0 - K), 0.0)

    if N < 1 or N != int(N):
        raise ValueError('N must be a positive integer')
    N = int(N)

    dt = T / N

    # up/down factors and the risk-neutral probability of an up move
    if method == 'crr':
        u = np.exp(sigma * np.sqrt(dt))
        d = 1 / u
        p = (np.exp((r - div) * dt) - d) / (u - d)
    else:  # 'jr'
        drift = (r - div - 0.5 * sigma ** 2) * dt
        u = np.exp(drift + sigma * np.sqrt(dt))
        d = np.exp(drift - sigma * np.sqrt(dt))
        p = 0.5

    q = 1 - p
    discount = np.exp(-r * dt)

    def node_prices(step):
        # Prices of the step+1 nodes at a given time step, ordered from
        # zero down moves (index 0) to `step` down moves (last index).
        downs = np.arange(step + 1)
        return S0 * u ** (step - downs) * d ** downs

    # option values at maturity
    values = np.maximum(sign * (node_prices(N) - K), 0.0)

    # backward induction: values[j] pairs the up child (j) with the down child (j + 1)
    for step in range(N - 1, -1, -1):
        values = discount * (p * values[:-1] + q * values[1:])
        if not european:
            # American exercise: take the better of continuing and exercising now
            values = np.maximum(values, sign * (node_prices(step) - K))

    return values[0]

In [49]:
# Sanity check: price one deep in-the-money European call with both tree
# constructions (CRR and JR); the two prices are printed on separate lines.
S0, K, T = 3246.23, 2000, 21/252
r, div = 0.025, 0.015
sigma = 0.1498878078948466
N = 21
print(
    binomial_model(S0, K, T, r, sigma, N, option='call', div=div, european=True, method='crr'),
    binomial_model(S0, K, T, r, sigma, N, option='call', div=div, european=True, method='jr'),
    sep='\n',
)

1246.3370769621977
1246.3370318672155


In [50]:
# Attach the rolling volatility to every option quote by matching QUOTE_DATE
# against the date index of df, then keep only the quotes with exactly 30 days to expiry
df_model_input = (
    df_options_spy
    .merge(df['volatility'], left_on='QUOTE_DATE', right_on=df.index)
    .loc[lambda merged: merged['DTE'] == 30]
)

In [51]:
# Preview the 30-DTE quotes joined with the rolling volatility
df_model_input.head()

Unnamed: 0,QUOTE_DATE,EXPIRE_DATE,DTE,C_BID,C_ASK,P_BID,P_ASK,UNDERLYING_LAST,STRIKE,STRIKE_DISTANCE,volatility
178,2020-01-06,2020-02-05,30.0,34.14,34.43,0.3,0.33,323.68,290.0,33.7,0.119088
179,2020-01-06,2020-02-05,30.0,29.38,29.67,0.42,0.45,323.68,295.0,28.7,0.119088
180,2020-01-06,2020-02-05,30.0,24.57,24.84,0.64,0.64,323.68,300.0,23.7,0.119088
181,2020-01-06,2020-02-05,30.0,19.85,20.12,0.93,0.93,323.68,305.0,18.7,0.119088
182,2020-01-06,2020-02-05,30.0,18.97,19.21,1.0,1.02,323.68,306.0,17.7,0.119088


In [52]:
# Backtest the CRR tree: price every quote as an American call.
# tqdm().pandas() enables progress_apply so the run reports its progress.
tqdm().pandas()
N_PLUS=0  # extra tree steps on top of one step per day to expiry
df_model_input['binomial_model'] = df_model_input.progress_apply(
    lambda row: binomial_model(
        S0=row['UNDERLYING_LAST'],
        K=row['STRIKE'],
        T=row['DTE']/365,
        r=risk_free_rate,
        sigma=row['volatility'],
        N=int(row['DTE'])+N_PLUS,
        option='call',
        div=dividend_yield,
        european=False,
        method='crr',
    ),
    axis=1,
)

0it [00:00, ?it/s]

  0%|          | 0/29298 [00:00<?, ?it/s]

In [53]:
# Same backtest as for the CRR tree, using the Jarrow-Rudd (equal probability) tree
df_model_input['binomial_model_jr'] = df_model_input.progress_apply(
    lambda row: binomial_model(
        S0=row['UNDERLYING_LAST'],
        K=row['STRIKE'],
        T=row['DTE']/365,
        r=risk_free_rate,
        sigma=row['volatility'],
        N=int(row['DTE'])+N_PLUS,
        option='call',
        div=dividend_yield,
        european=False,
        method='jr',
    ),
    axis=1,
)


  0%|          | 0/29298 [00:00<?, ?it/s]

In [54]:
# Preview the quotes with both model prices (CRR and JR) attached
df_model_input.head()

Unnamed: 0,QUOTE_DATE,EXPIRE_DATE,DTE,C_BID,C_ASK,P_BID,P_ASK,UNDERLYING_LAST,STRIKE,STRIKE_DISTANCE,volatility,binomial_model,binomial_model_jr
178,2020-01-06,2020-02-05,30.0,34.14,34.43,0.3,0.33,323.68,290.0,33.7,0.119088,33.877621,33.8776
179,2020-01-06,2020-02-05,30.0,29.38,29.67,0.42,0.45,323.68,295.0,28.7,0.119088,28.89516,28.895169
180,2020-01-06,2020-02-05,30.0,24.57,24.84,0.64,0.64,323.68,300.0,23.7,0.119088,23.93618,23.936404
181,2020-01-06,2020-02-05,30.0,19.85,20.12,0.93,0.93,323.68,305.0,18.7,0.119088,19.071043,19.070296
182,2020-01-06,2020-02-05,30.0,18.97,19.21,1.0,1.02,323.68,306.0,17.7,0.119088,18.121653,18.121616


In [55]:
# Evaluation frame: model prices next to the quoted call bid/ask, plus the
# bid/ask mid price (C_PRICE) as the market reference, indexed by quote date
df_eval = (
    df_model_input[['QUOTE_DATE', 'binomial_model', 'binomial_model_jr', 'C_BID','C_ASK','STRIKE_DISTANCE']]
    .assign(C_PRICE=lambda quotes: (quotes['C_BID'] + quotes['C_ASK'])/2)
    .set_index('QUOTE_DATE')
)
df_eval.head(50)

Unnamed: 0_level_0,binomial_model,binomial_model_jr,C_BID,C_ASK,STRIKE_DISTANCE,C_PRICE
QUOTE_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-06,33.877621,33.8776,34.14,34.43,33.7,34.285
2020-01-06,28.89516,28.895169,29.38,29.67,28.7,29.525
2020-01-06,23.93618,23.936404,24.57,24.84,23.7,24.705
2020-01-06,19.071043,19.070296,19.85,20.12,18.7,19.985
2020-01-06,18.121653,18.121616,18.97,19.21,17.7,19.09
2020-01-06,17.172263,17.172936,18.15,18.37,16.7,18.26
2020-01-06,16.226144,16.224256,17.11,17.39,15.7,17.25
2020-01-06,15.327007,15.325913,16.22,16.5,14.7,16.36
2020-01-06,14.42787,14.428004,15.45,15.65,13.7,15.55
2020-01-06,13.528733,13.530095,14.56,14.7,12.7,14.63


In [56]:
# flag (1/0) the rows where the model price lies strictly between the call bid and ask
df_eval['binomial_model_between_bid_ask'] = np.where(
    df_eval['binomial_model'].gt(df_eval['C_BID']) & df_eval['binomial_model'].lt(df_eval['C_ASK']),
    1, 0)
df_eval['binomial_model_jr_between_bid_ask'] = np.where(
    df_eval['binomial_model_jr'].gt(df_eval['C_BID']) & df_eval['binomial_model_jr'].lt(df_eval['C_ASK']),
    1, 0)

# share of rows inside the spread: CRR model first, then JR model
print(
    df_eval['binomial_model_between_bid_ask'].mean(),
    df_eval['binomial_model_jr_between_bid_ask'].mean(),
    sep='\n',
)



0.24189364461738003
0.24151819236807973


In [57]:
# Show the quotes whose CRR price falls strictly inside the bid-ask spread
df_eval[df_eval['binomial_model_between_bid_ask'] == 1]

Unnamed: 0_level_0,binomial_model,binomial_model_jr,C_BID,C_ASK,STRIKE_DISTANCE,C_PRICE,binomial_model_between_bid_ask,binomial_model_jr_between_bid_ask
QUOTE_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-08,4.231997,4.227924,4.13,4.26,0.6,4.195,1,1
2020-01-15,6.348696,6.345365,6.34,6.37,3.2,6.355,1,1
2020-01-15,0.019416,0.019699,0.00,0.02,29.3,0.010,1,1
2020-01-15,0.004134,0.004096,0.00,0.02,34.3,0.010,1,1
2020-01-22,4.725785,4.730791,4.71,4.77,0.4,4.740,1,1
...,...,...,...,...,...,...,...,...
2022-12-28,0.024577,0.024549,0.02,0.03,78.3,0.025,1,1
2022-12-28,0.012420,0.011519,0.01,0.02,83.3,0.015,1,1
2022-12-28,0.000250,0.000252,0.00,0.01,113.3,0.005,1,1
2022-12-28,0.000115,0.000104,0.00,0.01,118.3,0.005,1,1


In [58]:
# signed pricing error of each model against the mid price (model minus market)
df_eval['binomial_model_error'] = df_eval['binomial_model'] - df_eval['C_PRICE']
df_eval['binomial_model_jr_error'] = df_eval['binomial_model_jr'] - df_eval['C_PRICE']

# root mean squared error
print('Root Mean Squared Error of CRR model:',np.sqrt(df_eval['binomial_model_error'].pow(2).mean()))
print('Root Mean Squared Error of JR model:',np.sqrt(df_eval['binomial_model_jr_error'].pow(2).mean()))

# mean absolute error
print('Mean Absolute Error of CRR model:',df_eval['binomial_model_error'].abs().mean())
print('Mean Absolute Error of JR model:',df_eval['binomial_model_jr_error'].abs().mean())

# mean absolute percentage error; rows with a zero mid price are excluded
# because the relative error is undefined there
temp = df_eval[df_eval['C_PRICE']!=0]
print('Percentage Mean Absolute Error of CRR model:', (temp['binomial_model_error']/temp['C_PRICE']).abs().mean()*100,"%")
print('Percentage Mean Absolute Error of JR model:', (temp['binomial_model_jr_error']/temp['C_PRICE']).abs().mean()*100,"%")


Root Mean Squared Error of CRR model: 2.2014125639111435
Root Mean Squared Error of JR model: 2.201063362103447
Mean Absolute Error of CRR model: 1.3305752133602538
Mean Absolute Error of JR model: 1.3303507821992744
Percentage Mean Absolute Error of CRR model: 152.2148535037009 %
Percentage Mean Absolute Error of JR model: 151.9153260183108 %


In [59]:
# row(s) where the CRR price is furthest, in absolute terms, from the mid price
df_eval[df_eval['binomial_model_error'].abs() == df_eval['binomial_model_error'].abs().max()]

Unnamed: 0_level_0,binomial_model,binomial_model_jr,C_BID,C_ASK,STRIKE_DISTANCE,C_PRICE,binomial_model_between_bid_ask,binomial_model_jr_between_bid_ask,binomial_model_error,binomial_model_jr_error
QUOTE_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-03-16,6.221368,6.24122,20.15,20.53,0.2,20.34,0,0,-14.118632,-14.09878


In [60]:
# METHOD 1: using std as is
# Root Mean Squared Error of CRR model:  23.86935498966928
# Root Mean Squared Error of JR model:  25.103188203551444
# Mean Absolute Error of CRR model:  14.540254685604587
# Mean Absolute Error of JR model:  16.29314434180136
# Percentage of Error of CRR model:  3.081296435831501
# Percentage of Error of JR model:  3.4527598501135346


# METHOD 2: using COVARIANCE_SHRINKAGE
# Root Mean Squared Error of CRR model:  23.920759846458463
# Root Mean Squared Error of JR model:  25.15368471620861
# Mean Absolute Error of CRR model:  14.557521681224893
# Mean Absolute Error of JR model:  16.30927181380145
# Percentage of Error of CRR model:  3.0849555692657344
# Percentage of Error of JR model:  3.456177501527993