In [1]:
import wrds
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import os
import sys
from pathlib import Path

# add the src directory to the path in order to import config
current_directory = Path.cwd()
src_path = current_directory.parent / "src"
sys.path.insert(0, str(src_path))

import load_option_data_01 as l1
import filter_option_data_01 as f1
import bsm_pricer as bsm
import config
WRDS_USERNAME = Path(config.WRDS_USERNAME)
DATA_DIR = Path(config.DATA_DIR)

In [3]:
def pull_option_price(wrds_username, year=2012): 
	db = wrds.Connection(wrds_username=wrds_username)

	sql_query = f"""
					SELECT 
						*
					FROM
						optionm_all.opprcd{year} AS a
					WHERE
						a.secid=108105;
				"""
		
	optm_df = db.raw_sql(sql_query, date_cols = ["date"])

	db.close()

	return optm_df

def pull_all_option_price(wrds_username, last_year=2012): 
	db = wrds.Connection(wrds_username=wrds_username)

	optm_df = []

	for year in range(1996,last_year+1):
		sql_query = f"""
						SELECT 
							a.date, a.exdate, a.cp_flag, a.strike_price
						FROM
							optionm_all.opprcd{year} AS a
						WHERE
							a.secid=108105;
					"""
		
		optm = db.raw_sql(sql_query, date_cols = ["date"])

		optm_df.append(optm)
		
	db.close()

	optm_df = pd.concat(optm_df)
	
	return optm_df


def pull_all_req_data(wrds_username, start_year=1996, end_year=2012): 
	db = wrds.Connection(wrds_username=wrds_username)

	optm_df = []

	for year in range(start_year, end_year+1):
		sql_query = f"""
						SELECT 
							a.date, a.exdate, a.cp_flag, a.strike_price, a.best_bid, a.best_offer,a.volume,
							b.open, b.close, a.impl_volatility, c.tb_m3
						FROM
							optionm_all.opprcd{year} AS a
						JOIN 
							optionm_all.secprd{year} AS b ON a.date = b.date AND a.secid = b.secid
						JOIN 
							frb_all.rates_daily AS c ON c.date = a.date 
						WHERE
							a.secid=108105 AND a.date <= '2012-01-31';
					"""
		
		optm = db.raw_sql(sql_query, date_cols = ["date"])

		optm_df.append(optm)
		
	db.close()

	optm_df = pd.concat(optm_df)
	
	return optm_df

def pull_Opt_Sec_info_WRDS(wrds_username = WRDS_USERNAME, start = '1996-01-04', end = '2012-01-31'): 
	#https://wrds-www.wharton.upenn.edu/pages/get-data/option-suite-wrds/us-option-level-output/
	sql_query = f"""
		SELECT  
			a.*, c.tb_m3
		FROM
			beta.wrdsapps_optionsig  AS a
		JOIN 
			frb_all.rates_daily AS c ON c.date = a.date 
		WHERE
			(a.secid = 108105) 
		AND 
			(a.date <= \'{end}\') 
		AND 
			(a.date >= \'{start}\')
		LIMIT 1000
	""" 
	#LIMIT 1000
	db = wrds.Connection(wrds_username=wrds_username)
	optm = db.raw_sql(sql_query, date_cols = ["date", "exdate"])
	db.close()
	return optm

In [2]:
optm_df = l1.load_all_optm_data()

In [3]:
print(optm_df.shape)
print(optm_df['cp_flag'].value_counts().to_dict())

(3410580, 14)
{'P': 1706360, 'C': 1704220}


In [4]:
optm_df['date'] = pd.to_datetime(optm_df['date'])
optm_df['option_price'] = (optm_df['best_bid'] + optm_df['best_offer'])/2
optm_df['index_price'] = (optm_df['open'] + optm_df['close'])/2
optm_df['strike_price'] = optm_df['strike_price']/1000
optm_df['tb_m3'] = optm_df['tb_m3']/100

### Level 1 filters

#### Identical Filter:
The OptionMetrics data set contain duplicate observations,
defined as two or more quotes with identical option type, strike, expiration
date, and price. In each such case, we eliminate all but one of the quotes.

In [5]:
# Assuming 'df' is your DataFrame

def delete_identical_filter(df):
    columns_to_check = ['cp_flag', 'strike_price','date', 'exdate', 'best_offer']

    # Drop duplicates based on specified columns
    df = df.drop_duplicates(subset=columns_to_check, keep='first')

    return df

In [6]:
optm_l1_df = delete_identical_filter(optm_df)
print(optm_l1_df.shape)

(3410580, 16)


In [31]:
optm_l1_df = pd.read_parquet(DATA_DIR / 'data_1996_2012_appendixB.parquet')
optm_l1_df.shape

(3410580, 15)

#### Identical Except Price Filter:
There are a few sets of quotes with identical
terms (type, strike, and maturity) but different prices. When this occurs, we
keep the quote whose T-bill-based implied volatility is closest to that of its
moneyness neighbors, and delete the others.

NEXT STEPS - determine moneyness neighbors >> only 18 duplicates so 9 deleted versus 11 in paper

In [32]:
def delete_identical_xprice_filter(df):
    columns_to_check = ['cp_flag', 'strike_price','date', 'exdate']

    # Drop duplicates based on specified columns
    df = df.drop_duplicates(subset=columns_to_check, keep='first')

    return df

In [33]:
columns_to_check = ['cp_flag', 'strike_price','date', 'exdate']

identical_df = optm_l1_df[optm_l1_df.duplicated(columns_to_check,keep=False)].sort_values(columns_to_check)

identical_df.shape

(20, 15)

In [34]:
identical_df = optm_l1_df[optm_l1_df.duplicated(columns_to_check,keep='first')].sort_values(columns_to_check)
identical_df.shape

(10, 15)

In [35]:
optm_l1_df = delete_identical_xprice_filter(optm_l1_df)
print(optm_l1_df.shape)

(3410570, 15)


In [36]:
def delete_zero_bid(df):
    df=df.loc[df['best_bid'] > 0]
    return df

In [37]:
optm_l1_df = delete_zero_bid(optm_l1_df)

In [38]:
optm_df.shape[0] - optm_l1_df.shape[0]

272088

In [39]:
optm_l1_df.shape[0]

3138492

### Level 2 filters
* “Days to Maturity <7 or >180” Filter
* “IV<5% or >100%” Filter
* “Moneyness <0.8 or >1.2” Filter
* “Implied Interest Rate <0” Filter

In [43]:
def clear_module_cache(module_name):
    if module_name in sys.modules:
        del sys.modules[module_name]

In [84]:
clear_module_cache('filter_option_data_02')

In [85]:
import filter_option_data_02 as f2

In [165]:
optm_l2_df = optm_l1_df.copy()
optm_l2_df.drop(columns=['secid'], inplace=True)
optm_l2_df['tb_m3'] = optm_l2_df['tb_m3']/100
optm_l2_df['tb_m3'].ffill(inplace=True)

In [166]:
optm_l2_df.shape

(3138492, 14)

In [168]:
optm_l2_df = f2.filter_time_to_maturity(optm_l2_df)
optm_l2_df.shape

(1840763, 16)

In [169]:
optm_l2_df.loc[(optm_l2_df['impl_volatility']>=0.05) & (optm_l2_df['impl_volatility']<=1)].shape

(1637279, 16)

In [140]:
test = optm_l2_df.loc[19]
test

date                    1996-01-04 00:00:00
open                                 621.32
close                                 617.7
cp_flag                                   P
exdate                  1996-03-16 00:00:00
impl_volatility                    0.181884
tb_m3                                0.0504
volume                                  0.0
best_bid                             1.1875
best_offer                           1.3125
strike_price                          550.0
contract_size                         100.0
sec_price                             617.7
mnyns                                0.8904
time_to_maturity                         72
time_to_matility_yrs                0.19726
Name: 19, dtype: object

In [175]:
clear_module_cache('bsm_pricer')
import bsm_pricer as bsm

In [178]:
bsm.calc_implied_volatility(test['best_bid'], test['sec_price'], test['strike_price'], test['time_to_matility_yrs'], test['tb_m3'], test['cp_flag'],initial_guess=0.18)

0.18163934515105062

In [170]:
def calc_implied_volatility_wrapper(row):
    try:
        implied_volatility = bsm.calc_implied_volatility(row['best_bid'], row['sec_price'], row['strike_price'], row['time_to_matility_yrs'], row['tb_m3'], row['cp_flag'], initial_guess=row['tb_m3'])
        print(f"Implied Volatility {row.name}: {implied_volatility}")
        return implied_volatility
    except ValueError as e:
        # Handle the error as per your requirement, you can print the error or return a default value
        print(f"Error for row {row.name}: {e}")
        return 0

In [160]:
optm_l2_df['impl_vol_bsm'] = optm_l2_df.apply(calc_implied_volatility_wrapper, axis=1)

Implied Volatility 0: 0.2
Implied Volatility 1: 0.2
Error for row 2: Optimization was not successful. Try different bounds or initial guess.
Implied Volatility 3: 1e-15
Implied Volatility 4: 0.2
Implied Volatility 5: 0.2
Implied Volatility 6: 0.2
Implied Volatility 7: 1e-15
Error for row 8: Optimization was not successful. Try different bounds or initial guess.
Implied Volatility 9: 0.2
Implied Volatility 10: 0.2
Implied Volatility 11: 0.2
Implied Volatility 12: 1e-15
Implied Volatility 13: 1e-15
Error for row 14: Optimization was not successful. Try different bounds or initial guess.
Implied Volatility 15: 0.2
Implied Volatility 16: 0.2
Error for row 17: Optimization was not successful. Try different bounds or initial guess.
Implied Volatility 18: 1e-15
Implied Volatility 19: 0.18082110239002566
Implied Volatility 20: 1e-15
Implied Volatility 21: 1e-15
Implied Volatility 22: 1e-15
Implied Volatility 23: 1e-15
Implied Volatility 24: 0.18666989988906618
Implied Volatility 25: 0.12120008

KeyboardInterrupt: 

In [29]:
len(optm_l2_df.loc[optm_l2_df['impl_volatility'].isna()])

187052

In [32]:
optm_l2_df.loc[(optm_l2_df['impl_volatility']>=0.05) & (optm_l2_df['impl_volatility']<=1.00)].shape[0]

1637279

In [28]:
optm_l2_df.shape[0]

1840763

In [34]:
conn.get_table(library="wrdsapps", table="opcrsphist", obs=5)

Unnamed: 0,secid,sdate,edate,permno,score
0,5001.0,1996-01-02,1996-03-13,10074.0,1.0
1,5002.0,1996-01-01,1996-02-22,10154.0,1.0
2,5003.0,,,,6.0
3,5004.0,1996-01-01,2000-01-27,80071.0,1.0
4,5005.0,1996-01-01,1997-08-12,85041.0,1.0


In [41]:
stdop = conn.get_table(library="optionm_all", table="stdopd1996")

In [42]:
stdop = stdop.loc[stdop['secid']==108105]

In [None]:
df = pull_Option_info("hholt")

In [None]:
opinfd.loc[opinfd['secid']==108105]

In [None]:
df.head()

In [None]:
dup_cols = ['cp_flag','strike_price']

duplicates = df[df.duplicated(subset=dup_cols, keep=False)]


In [None]:
duplicates.head()