In [5]:
import wrds
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import os
import sys
from pathlib import Path

# Add the src directory to the import path so that `config` can be imported.
# This relies on the notebook's working directory being a sibling of `src`.
current_directory = Path.cwd()
src_path = current_directory.parent / "src"
# Guard against stacking duplicate entries when this cell is re-run.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

import config
# The WRDS username is a login name, not a filesystem path, so keep it as a
# plain string instead of wrapping it in Path.
WRDS_USERNAME = str(config.WRDS_USERNAME)

In [50]:
def pull_option_price(wrds_username, year=2012, secid=108105):
    """Pull every column of one yearly OptionMetrics option price table.

    Parameters
    ----------
    wrds_username : str
        Account name handed to ``wrds.Connection``.
    year : int, default 2012
        Year suffix of the ``optionm_all.opprcd{year}`` table to query.
    secid : int, default 108105
        Security id used in the ``WHERE a.secid = ...`` filter.

    Returns
    -------
    pandas.DataFrame
        Whatever ``db.raw_sql`` returns for the query, requested with
        ``date_cols=["date"]``.

    Raises
    ------
    ValueError, TypeError
        If ``year`` or ``secid`` cannot be converted to ``int``.
    """
    # Table names cannot be bound as query parameters, so force both values to
    # integers: only digits can ever be interpolated into the SQL text.
    year = int(year)
    secid = int(secid)

    sql_query = f"""
        SELECT
            *
        FROM
            optionm_all.opprcd{year} AS a
        WHERE
            a.secid={secid};
    """

    db = wrds.Connection(wrds_username=wrds_username)
    try:
        optm_df = db.raw_sql(sql_query, date_cols=["date"])
    finally:
        # Release the connection even when the query raises.
        db.close()

    return optm_df

def pull_all_option_price(wrds_username, last_year=2012, first_year=1996, secid=108105):
    """Pull the option contract identifiers for every year in a range.

    One query per year is issued against ``optionm_all.opprcd{year}``,
    selecting ``date``, ``exdate``, ``cp_flag`` and ``strike_price`` for the
    requested ``secid``. The yearly results are stacked into a single frame.

    Parameters
    ----------
    wrds_username : str
        Account name handed to ``wrds.Connection``.
    last_year : int, default 2012
        Last year (inclusive) to query.
    first_year : int, default 1996
        First year (inclusive) to query.
    secid : int, default 108105
        Security id used in the ``WHERE a.secid = ...`` filter.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the yearly query results with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``first_year`` is later than ``last_year`` (nothing to query).
    """
    # Only integers may be interpolated into the SQL text below.
    first_year, last_year, secid = int(first_year), int(last_year), int(secid)
    if first_year > last_year:
        # Fail before opening a connection; pd.concat would raise on an empty list anyway.
        raise ValueError(
            f"first_year ({first_year}) must not be later than last_year ({last_year})"
        )

    yearly_frames = []
    db = wrds.Connection(wrds_username=wrds_username)
    try:
        for year in range(first_year, last_year + 1):
            sql_query = f"""
                SELECT
                    a.date, a.exdate, a.cp_flag, a.strike_price
                FROM
                    optionm_all.opprcd{year} AS a
                WHERE
                    a.secid={secid};
            """
            yearly_frames.append(db.raw_sql(sql_query, date_cols=["date"]))
    finally:
        # Release the connection even when one of the yearly queries raises.
        db.close()

    # Each yearly frame carries its own 0-based index; renumber so row labels
    # are unique in the combined frame.
    return pd.concat(yearly_frames, ignore_index=True)


def pull_all_req_data(wrds_username, start_year=1996, end_year=2012,
                      end_date="2012-01-31", secid=108105):
    """Pull option quotes joined with security prices and the 3-month T-bill rate.

    For each year in ``start_year..end_year`` the yearly option price table
    (``optionm_all.opprcd{year}``) is inner-joined with the yearly security
    price table (``optionm_all.secprd{year}``) on date and secid, and with
    ``frb_all.rates_daily`` on date. The yearly results are stacked.

    Parameters
    ----------
    wrds_username : str
        Account name handed to ``wrds.Connection``.
    start_year, end_year : int, default 1996 and 2012
        Inclusive range of yearly tables to query.
    end_date : str, datetime-like or None, default "2012-01-31"
        Only rows with ``a.date <= end_date`` are returned. Pass ``None`` to
        disable the cutoff. Previously this date was hard-coded in the SQL.
    secid : int, default 108105
        Security id used in the ``WHERE a.secid = ...`` filter.

    Returns
    -------
    pandas.DataFrame
        Columns ``date, exdate, cp_flag, strike_price, best_bid, best_offer,
        volume, open, close, impl_volatility, tb_m3`` with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``start_year`` is later than ``end_year``, or ``end_date`` cannot be
        parsed as a date.
    """
    # Only integers may be interpolated into the SQL text below.
    start_year, end_year, secid = int(start_year), int(end_year), int(secid)
    if start_year > end_year:
        # Fail before opening a connection; pd.concat would raise on an empty list anyway.
        raise ValueError(
            f"start_year ({start_year}) must not be later than end_year ({end_year})"
        )

    if end_date is None:
        date_clause = ""
    else:
        # Normalising through Timestamp guarantees a plain YYYY-MM-DD literal.
        cutoff = pd.Timestamp(end_date).strftime("%Y-%m-%d")
        date_clause = f" AND a.date <= '{cutoff}'"

    yearly_frames = []
    db = wrds.Connection(wrds_username=wrds_username)
    try:
        for year in range(start_year, end_year + 1):
            sql_query = f"""
                SELECT
                    a.date, a.exdate, a.cp_flag, a.strike_price, a.best_bid, a.best_offer, a.volume,
                    b.open, b.close, a.impl_volatility, c.tb_m3
                FROM
                    optionm_all.opprcd{year} AS a
                JOIN
                    optionm_all.secprd{year} AS b ON a.date = b.date AND a.secid = b.secid
                JOIN
                    frb_all.rates_daily AS c ON c.date = a.date
                WHERE
                    a.secid={secid}{date_clause};
            """
            yearly_frames.append(db.raw_sql(sql_query, date_cols=["date"]))
    finally:
        # Release the connection even when one of the yearly queries raises.
        db.close()

    # Each yearly frame carries its own 0-based index; renumber so row labels
    # are unique in the combined frame.
    return pd.concat(yearly_frames, ignore_index=True)

In [51]:
# Download the joined option / security price / T-bill data using the
# function's default year range.
optm_df = pull_all_req_data(wrds_username=WRDS_USERNAME)

Loading library list...
Done


In [53]:
# Sanity check on the pull: frame dimensions, then the count per cp_flag value.
print(
    optm_df.shape,
    optm_df['cp_flag'].value_counts().to_dict(),
    sep="\n",
)

(3410580, 11)
{'P': 1706360, 'C': 1704220}


In [54]:
# Derive the analysis columns from the raw pull. The guard makes this cell safe
# to re-run: without it, every extra run would divide strike_price by 1000 again.
if 'option_price' not in optm_df.columns:
    optm_df['date'] = pd.to_datetime(optm_df['date'])
    # Midpoint of the best bid and best offer
    optm_df['option_price'] = (optm_df['best_bid'] + optm_df['best_offer'])/2
    # Midpoint of the open and close columns from the security price table
    optm_df['index_price'] = (optm_df['open'] + optm_df['close'])/2
    # Rescale the strike (presumably stored as strike x 1000 — TODO confirm)
    optm_df['strike_price'] = optm_df['strike_price']/1000

### Level 1 filters

#### Identical Filter:
The OptionMetrics data set contains duplicate observations,
defined as two or more quotes with identical option type, strike, expiration
date, and price. In each such case, we eliminate all but one of the quotes.

In [84]:
def delete_identical_filter(df):
    """Keep one row per (cp_flag, strike_price, date, exdate, best_offer) group.

    Rows that repeat an earlier row on all five columns are removed; the first
    occurrence is kept. The input frame is not modified.
    """
    identity_keys = ['cp_flag', 'strike_price', 'date', 'exdate', 'best_offer']

    # Flag every repeat after the first occurrence, then keep the rest.
    is_repeat = df.duplicated(subset=identity_keys, keep='first')
    return df[~is_repeat]

In [85]:
# Level 1, step 1: drop exact duplicate quotes and report the resulting size.
optm_l1_df = optm_df.pipe(delete_identical_filter)
print(optm_l1_df.shape)

(3410580, 13)


#### Identical Except Price Filter:
There are a few sets of quotes with identical
terms (type, strike, and maturity) but different prices. When this occurs, we
keep the quote whose T-bill-based implied volatility is closest to that of its
moneyness neighbors, and delete the others.

NEXT STEPS - determine moneyness neighbors; for now the first quote in each group is kept >> the cells below show 20 rows sharing identical terms, so 10 are deleted versus 11 in the paper

In [86]:
def delete_identical_xprice_filter(df):
    """Keep one row per (cp_flag, strike_price, date, exdate) group.

    When several rows share the same four columns, only the first occurrence
    is kept, whatever its price or implied volatility. The input frame is
    not modified.
    """
    contract_keys = ['cp_flag', 'strike_price', 'date', 'exdate']

    # Flag every repeat after the first occurrence, then keep the rest.
    is_repeat = df.duplicated(subset=contract_keys, keep='first')
    return df[~is_repeat]

In [87]:
columns_to_check = ['cp_flag', 'strike_price', 'date', 'exdate']

# Every row whose contract terms occur more than once (all members of each
# group are kept), ordered so that group members sit next to each other.
identical_df = (
    optm_l1_df
    .loc[optm_l1_df.duplicated(subset=columns_to_check, keep=False)]
    .sort_values(by=columns_to_check)
)

identical_df.shape

(20, 13)

In [88]:
# Same lookup, but flagging only the repeats after the first occurrence,
# i.e. the rows a keep='first' de-duplication would remove.
identical_df = (
    optm_l1_df
    .loc[optm_l1_df.duplicated(subset=columns_to_check, keep='first')]
    .sort_values(by=columns_to_check)
)
identical_df.shape

(10, 13)

In [89]:
# Level 1, step 2: collapse quotes with identical terms and report the size.
optm_l1_df = optm_l1_df.pipe(delete_identical_xprice_filter)
print(optm_l1_df.shape)

(3410570, 13)


In [90]:
def delete_zero_bid(df):
    """Return only the rows whose best_bid is strictly greater than zero.

    Rows with a zero, negative or missing best_bid are left out. The input
    frame is not modified.
    """
    has_positive_bid = df['best_bid'].gt(0)
    return df[has_positive_bid]

In [91]:
# Level 1, step 3: drop quotes without a positive best bid.
optm_l1_df = optm_l1_df.pipe(delete_zero_bid)

In [92]:
# Total number of rows removed by the filters applied so far.
len(optm_df) - len(optm_l1_df)

272088

In [32]:
# Open a WRDS connection for ad-hoc exploration and list the tables in the
# "wrdsapps" library. `conn` is reused by the cells below.
# NOTE(review): no conn.close() appears in the visible cells — confirm the
# connection is closed once exploration is finished.
conn = wrds.Connection(wrds_username=WRDS_USERNAME)
conn.list_tables(library="wrdsapps")

Loading library list...
Done


['_eq_2_bm_',
 'bdxcrspcomplink',
 'bdxinslink',
 'boardex_trinsider',
 'boardex_trinsider_link',
 'bondcrsp_link',
 'bondret',
 'chars',
 'compeushortlink',
 'country',
 'crspm',
 'ds2ws_linktable',
 'dswslink',
 'dwcountryreturns',
 'eushort',
 'exec_boardex',
 'exec_boardex_link',
 'exec_ciq',
 'exec_ciq_link',
 'firm_ratio',
 'firm_ratio_ccm',
 'firm_ratio_ibes',
 'firm_ratio_ibes_ccm',
 'fscrsplink',
 'ibcrsphist',
 'id',
 'id_ccm',
 'id_ibes',
 'id_ibes_ccm',
 'mastertable',
 'motherfile',
 'mwcountryreturns',
 'opcrsphist',
 'price',
 'price_202005',
 'price_202006',
 'price_202007',
 'price_202008',
 'price_202009',
 'price_202010',
 'price_202011',
 'price_202012',
 'price_pre',
 'promo',
 'promo_20200501',
 'promo_20200502',
 'promo_20200503',
 'promo_20200504',
 'promo_20200505',
 'promo_20200506',
 'promo_20200507',
 'promo_20200508',
 'promo_20200509',
 'promo_20200510',
 'promo_20200511',
 'promo_20200512',
 'promo_20200513',
 'promo_20200514',
 'promo_20200515',
 'promo_

In [34]:
# Preview the wrdsapps.opcrsphist table, limited to 5 observations (obs=5).
conn.get_table(library="wrdsapps", table="opcrsphist", obs=5)

Unnamed: 0,secid,sdate,edate,permno,score
0,5001.0,1996-01-02,1996-03-13,10074.0,1.0
1,5002.0,1996-01-01,1996-02-22,10154.0,1.0
2,5003.0,,,,6.0
3,5004.0,1996-01-01,2000-01-27,80071.0,1.0
4,5005.0,1996-01-01,1997-08-12,85041.0,1.0


In [41]:
# Load optionm_all.stdopd1996 into `stdop`.
# NOTE(review): no `obs` limit is passed here, unlike the preview above, so
# presumably the whole table is downloaded before the secid filter in the
# next cell — confirm this is intended.
stdop = conn.get_table(library="optionm_all", table="stdopd1996")

In [42]:
# Keep only the rows for secid 108105, the same security the pulls above use.
stdop = stdop[stdop['secid'].eq(108105)]

In [None]:
# Use the configured WRDS username rather than a hard-coded personal login,
# consistent with the other WRDS calls in this notebook.
# NOTE(review): pull_Option_info is not defined in the visible part of this
# notebook — confirm where it comes from before running this cell.
df = pull_Option_info(WRDS_USERNAME)

In [None]:
# Show the rows of `opinfd` for secid 108105.
# NOTE(review): `opinfd` is not defined in the visible part of this notebook —
# confirm it exists on a fresh kernel before relying on this cell.
opinfd.loc[opinfd['secid']==108105]

In [None]:
# Preview the first rows of `df`.
df.head()

In [None]:
dup_cols = ['cp_flag', 'strike_price']

# Every row of `df` whose (cp_flag, strike_price) pair occurs more than once;
# keep=False retains all members of each group.
duplicates = df.loc[df.duplicated(subset=dup_cols, keep=False)]


In [None]:
# Preview the first rows that share a (cp_flag, strike_price) pair.
duplicates.head()