In [1]:
import wrds
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import os
import sys
from pathlib import Path

# add the src directory to the path in order to import config
current_directory = Path.cwd()
src_path = current_directory.parent / "src"
sys.path.insert(0, str(src_path))

import load_option_data_01 as l1
import filter_option_data_01 as f1
import config
WRDS_USERNAME = Path(config.WRDS_USERNAME)
DATA_DIR = Path(config.DATA_DIR)

In [2]:
import bsm_pricer

In [4]:
def pull_option_price(wrds_username, year=2012): 
	db = wrds.Connection(wrds_username=wrds_username)

	sql_query = f"""
					SELECT 
						*
					FROM
						optionm_all.opprcd{year} AS a
					WHERE
						a.secid=108105;
				"""
		
	optm_df = db.raw_sql(sql_query, date_cols = ["date"])

	db.close()

	return optm_df

def pull_all_option_price(wrds_username, last_year=2012): 
	db = wrds.Connection(wrds_username=wrds_username)

	optm_df = []

	for year in range(1996,last_year+1):
		sql_query = f"""
						SELECT 
							a.date, a.exdate, a.cp_flag, a.strike_price
						FROM
							optionm_all.opprcd{year} AS a
						WHERE
							a.secid=108105;
					"""
		
		optm = db.raw_sql(sql_query, date_cols = ["date"])

		optm_df.append(optm)
		
	db.close()

	optm_df = pd.concat(optm_df)
	
	return optm_df


def pull_all_req_data(wrds_username, start_year=1996, end_year=2012): 
	db = wrds.Connection(wrds_username=wrds_username)

	optm_df = []

	for year in range(start_year, end_year+1):
		sql_query = f"""
						SELECT 
							a.date, a.exdate, a.cp_flag, a.strike_price, a.best_bid, a.best_offer,a.volume,
							b.open, b.close, a.impl_volatility, c.tb_m3
						FROM
							optionm_all.opprcd{year} AS a
						JOIN 
							optionm_all.secprd{year} AS b ON a.date = b.date AND a.secid = b.secid
						JOIN 
							frb_all.rates_daily AS c ON c.date = a.date 
						WHERE
							a.secid=108105 AND a.date <= '2012-01-31';
					"""
		
		optm = db.raw_sql(sql_query, date_cols = ["date"])

		optm_df.append(optm)
		
	db.close()

	optm_df = pd.concat(optm_df)
	
	return optm_df

def pull_Opt_Sec_info_WRDS(wrds_username = WRDS_USERNAME, start = '1996-01-04', end = '2012-01-31'): 
	#https://wrds-www.wharton.upenn.edu/pages/get-data/option-suite-wrds/us-option-level-output/
	sql_query = f"""
		SELECT  
			a.*, c.tb_m3
		FROM
			beta.wrdsapps_optionsig  AS a
		JOIN 
			frb_all.rates_daily AS c ON c.date = a.date 
		WHERE
			(a.secid = 108105) 
		AND 
			(a.date <= \'{end}\') 
		AND 
			(a.date >= \'{start}\')
		LIMIT 1000
	""" 
	#LIMIT 1000
	db = wrds.Connection(wrds_username=wrds_username)
	optm = db.raw_sql(sql_query, date_cols = ["date", "exdate"])
	db.close()
	return optm

In [3]:
optm_df = l1.load_all_optm_data()

In [4]:
print(optm_df.shape)
print(optm_df['cp_flag'].value_counts().to_dict())

(3410580, 12)
{'P': 1706360, 'C': 1704220}


In [5]:
optm_df['date'] = pd.to_datetime(optm_df['date'])
optm_df['option_price'] = (optm_df['best_bid'] + optm_df['best_offer'])/2
optm_df['index_price'] = (optm_df['open'] + optm_df['close'])/2
optm_df['strike_price'] = optm_df['strike_price']/1000
optm_df['tb_m3'] = optm_df['tb_m3']/100

### Level 1 filters

#### Identical Filter:
The OptionMetrics data set contain duplicate observations,
defined as two or more quotes with identical option type, strike, expiration
date, and price. In each such case, we eliminate all but one of the quotes.

In [6]:
# Assuming 'df' is your DataFrame

def delete_identical_filter(df):
    columns_to_check = ['cp_flag', 'strike_price','date', 'exdate', 'best_offer']

    # Drop duplicates based on specified columns
    df = df.drop_duplicates(subset=columns_to_check, keep='first')

    return df

In [7]:
optm_l1_df = delete_identical_filter(optm_df)
print(optm_l1_df.shape)

(3410580, 14)


#### Identical Except Price Filter:
There are a few sets of quotes with identical
terms (type, strike, and maturity) but different prices. When this occurs, we
keep the quote whose T-bill-based implied volatility is closest to that of its
moneyness neighbors, and delete the others.

NEXT STEPS - determine moneyness neighbors >> only 18 duplicates so 9 deleted versus 11 in paper

In [8]:
def delete_identical_xprice_filter(df):
    columns_to_check = ['cp_flag', 'strike_price','date', 'exdate']

    # Drop duplicates based on specified columns
    df = df.drop_duplicates(subset=columns_to_check, keep='first')

    return df

In [9]:
columns_to_check = ['cp_flag', 'strike_price','date', 'exdate']

identical_df = optm_l1_df[optm_l1_df.duplicated(columns_to_check,keep=False)].sort_values(columns_to_check)

identical_df.shape

(20, 14)

In [10]:
identical_df = optm_l1_df[optm_l1_df.duplicated(columns_to_check,keep='first')].sort_values(columns_to_check)
identical_df.shape

(10, 14)

In [11]:
optm_l1_df = delete_identical_xprice_filter(optm_l1_df)
print(optm_l1_df.shape)

(3410570, 14)


In [12]:
def delete_zero_bid(df):
    df=df.loc[df['best_bid'] > 0]
    return df

In [13]:
optm_l1_df = delete_zero_bid(optm_l1_df)

In [14]:
optm_df.shape[0] - optm_l1_df.shape[0]

272088

### Level 2 filters
* “Days to Maturity <7 or >180” Filter
* “IV<5% or >100%” Filter
* “Moneyness <0.8 or >1.2” Filter
* “Implied Interest Rate <0” Filter

In [15]:
opmt_l2_df = optm_l1_df.copy()
opmt_l2_df.drop(columns=['secid'], inplace=True)

In [16]:
# Days to maturity <7 or >180
opmt_l2_df['time_to_maturity'] = (opmt_l2_df['exdate'] - opmt_l2_df['date']).dt.days

opmt_l2_df = opmt_l2_df.loc[(opmt_l2_df['time_to_maturity'] >= 7) & (opmt_l2_df['time_to_maturity'] <= 180)]

In [17]:
opmt_l2_df['time_to_maturity_yrs'] = opmt_l2_df['time_to_maturity']/365

In [18]:
test = opmt_l2_df.iloc[41964,:]

In [19]:
test.to_frame()

Unnamed: 0,66621
date,1996-11-12 00:00:00
open,731.87
close,729.56
cp_flag,P
exdate,1997-03-22 00:00:00
impl_volatility,0.155791
tb_m3,0.0504
volume,9.0
best_bid,18.25
best_offer,19.0


In [20]:
bsm_pricer.calc_implied_volatility(18.625, 730.715, 720, 0.36, .0504, 'P')

0.16820384210428072

In [23]:
def calc_implied_volatility_wrapper(row):
    try:
        implied_volatility = bsm_pricer.calc_implied_volatility(row['option_price'], row['index_price'], row['strike_price'], row['time_to_maturity_yrs'], row['tb_m3'], row['cp_flag'])
        return implied_volatility
    except ValueError as e:
        # Handle the error as per your requirement, you can print the error or return a default value
        print(f"Error for row {row.name}: {e}")
        return 0

In [24]:
opmt_l2_df['impl_vol_bsm'] = opmt_l2_df.apply(calc_implied_volatility_wrapper, axis=1)

Error for row 31: Optimization was not successful. Try different bounds or initial guess.
Error for row 55: Optimization was not successful. Try different bounds or initial guess.
Error for row 75: Optimization was not successful. Try different bounds or initial guess.
Error for row 85: Optimization was not successful. Try different bounds or initial guess.
Error for row 92: Optimization was not successful. Try different bounds or initial guess.
Error for row 110: Optimization was not successful. Try different bounds or initial guess.
Error for row 125: Optimization was not successful. Try different bounds or initial guess.
Error for row 131: Optimization was not successful. Try different bounds or initial guess.
Error for row 174: Optimization was not successful. Try different bounds or initial guess.
Error for row 197: Optimization was not successful. Try different bounds or initial guess.
Error for row 208: Optimization was not successful. Try different bounds or initial guess.
Erro

KeyboardInterrupt: 

In [29]:
len(opmt_l2_df.loc[opmt_l2_df['impl_volatility'].isna()])

187052

In [32]:
opmt_l2_df.loc[(opmt_l2_df['impl_volatility']>=0.05) & (opmt_l2_df['impl_volatility']<=1.00)].shape[0]

1637279

In [28]:
opmt_l2_df.shape[0]

1840763

In [34]:
conn.get_table(library="wrdsapps", table="opcrsphist", obs=5)

Unnamed: 0,secid,sdate,edate,permno,score
0,5001.0,1996-01-02,1996-03-13,10074.0,1.0
1,5002.0,1996-01-01,1996-02-22,10154.0,1.0
2,5003.0,,,,6.0
3,5004.0,1996-01-01,2000-01-27,80071.0,1.0
4,5005.0,1996-01-01,1997-08-12,85041.0,1.0


In [41]:
stdop = conn.get_table(library="optionm_all", table="stdopd1996")

In [42]:
stdop = stdop.loc[stdop['secid']==108105]

In [None]:
df = pull_Option_info("hholt")

In [None]:
opinfd.loc[opinfd['secid']==108105]

In [None]:
df.head()

In [None]:
dup_cols = ['cp_flag','strike_price']

duplicates = df[df.duplicated(subset=dup_cols, keep=False)]


In [None]:
duplicates.head()