# OptionsFilters Summary

## Imports

In [None]:
import numpy as np
import pandas as pd
import random
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import os

import sys

sys.path.insert(1, "./../src/")

import time

import pull_option_data

from pathlib import Path

import bsm_pricer as bsm
from settings import config
import datetime
import level_1_filters as f1
import level_2_filters as f2
import level_3_filters as f3
import pull_option_data
import pull_option_data as l1


import time
import warnings
import wrds

In [None]:
import importlib

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# Use "plotly_white" as the default template for plotly figures
pio.templates.default = "plotly_white"
# Suppress all Python warnings for the rest of the session
warnings.filterwarnings("ignore")

In [None]:
# Paths and credentials come from the project settings module.
OUTPUT_DIR = Path(config("OUTPUT_DIR"))
# Option data files in this notebook are read from / written to the
# "options" subfolder of the configured data directory.
DATA_DIR = Path(config("DATA_DIR")) / "options"
WRDS_USERNAME = config("WRDS_USERNAME")

# Sample-period boundaries are defined once, in pull_option_data
from pull_option_data import (
    START_DATE_01,
    END_DATE_01,
    START_DATE_02,
    END_DATE_02,
)

NOTE_START, NOTE_END = START_DATE_01, END_DATE_01

In [None]:
# "YYYY-MM_YYYY-MM" label spanning the start of the first pull period to the
# end of the second; used below in the names of the saved parquet files.
DATE_RANGE = "_".join(
    pd.Timestamp(bound).strftime("%Y-%m") for bound in (START_DATE_01, END_DATE_02)
)

## Function Definitions

In [None]:
def compare_filtered_data_to_orig(filtered_df, orig_df, filter_name="Identical Filter"):
    """Print how many rows a filter removed relative to its input.

    Only the ``shape`` attribute of the two frames is read.

    Parameters
    ----------
    filtered_df : data after the filter was applied.
    orig_df : data before the filter was applied.
    filter_name : label printed on the header line.

    Returns
    -------
    None. The report is written to standard output.
    """
    n_orig = orig_df.shape[0]
    n_kept, n_cols = filtered_df.shape[0], filtered_df.shape[1]
    n_removed = n_orig - n_kept
    # An empty input has no meaningful removal rate; report 0% instead of
    # raising ZeroDivisionError.
    share_removed = n_removed / n_orig if n_orig else 0.0

    print("|", filter_name, ":")
    print(
        f">> Records removed: {n_removed:,.0f} out of {n_orig:,.0f} ({share_removed:,.2%})"
    )
    print(f">> Filtered data shape: {n_kept:,.0f} rows // {n_cols:,.0f} columns")

## Data Note

- The original CJS 2013 paper used data from 1986 through 2012 (26 years of data).
- Due to the unavailability of SPX option data from 1985 to 1995, we replicated the **54 CJS portfolios** using data from **January 1996 to December 2019** (24 years).
- Our dataset (from 1996 to 2019) comprises over 19.2 million rows of SPX options data.
- The original effectiveness of the data filters was examined in *The Puzzle of Filtering Index Options (Desai, Hammock, Holt; 2024)*. For reasons similar to those outlined in that work (loss of data filter elegance when transposed across timeframes), we expect that the data filter parameters (and thus the portfolios constructed) will not yield results identical to those of the original published work, and the user should not expect them to.

*The spirit of this project is to replicate, with the highest practical fidelity, the **process** of data filtration and portfolio construction in the original CJS and HKM papers, without commenting on the effectiveness or appropriateness of the process and parameters. We leave that analysis to a future study.*

In [None]:
# Combined raw option file; created on first run from the two period files.
file_path = DATA_DIR / "data_1996-01_2019-12.parquet"

if file_path.exists():
    print(">> File already exists. Loading data from file...")
    raw_option_data = pd.read_parquet(file_path)
else:
    # Nothing is pulled from WRDS here: the two period files must already be
    # on disk in DATA_DIR.
    print(">> File does not exist. Building it from the two period files...")
    # NOTE(review): concat keeps each file's own index, so labels may repeat
    # in the combined frame -- confirm the downstream filters do not rely on
    # unique index labels.
    raw_option_data = pd.concat(
        [
            pd.read_parquet(DATA_DIR / "data_1996-01_2012-01.parquet"),
            pd.read_parquet(DATA_DIR / "data_2012-02_2019-12.parquet"),
        ]
    )
    # Cache the combined frame so later runs take the fast path above
    raw_option_data.to_parquet(file_path)

# add the mid price (average of best bid and best offer)
raw_option_data["mid_price"] = (
    raw_option_data["best_bid"] + raw_option_data["best_offer"]
) / 2
# adjust strike price (presumably stored as strike x 1000 in the source data -- confirm)
raw_option_data["strike_price"] /= 1000
# calc moneyness
raw_option_data = f1.calc_moneyness(raw_option_data)
# rename IV column
raw_option_data.rename(columns={"impl_volatility": "IV"}, inplace=True)

raw_option_data

In [None]:
# Histogram parameters
# Histogram of observation dates; keep the bar patches so they can be recolored
counts, bins, patches = plt.hist(raw_option_data["date"], bins=30)

# Color each bar by its relative height using the reversed plasma colormap.
# pyplot.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap("plasma_r", len(patches))
count_span = counts.max() - counts.min()
# Guard against a zero span (all bins equally full), which would produce NaN
normed = (counts - counts.min()) / count_span if count_span else np.zeros_like(counts)
for patch, norm in zip(patches, normed):
    patch.set_facecolor(cmap(norm))

# Labels and aesthetics
plt.xlabel("Date")
plt.yscale("log")
plt.grid()
plt.ylabel("Number of Observations (log scale)")
plt.title("Raw Data Over Time: Significant Increase in Observations")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 1. Data Filtration

In order to minimize possible quoting errors, CJS filtered the raw options data through 3 levels of filters. The filters are applied to the trade-in (buy) side to make sure the portfolios are buying into reliable quotes. When positions are exited, if there is no quote in the filtered data, the raw data is searched. These filters are detailed in *Appendix B* of CJS. 

## Level 1 Filters

* **Identical Filter:** Retain only one instance of quotes with the same **option type**, **strike price**, **expiration date/maturity**, and **price**. 

* **Identical Except Price Filter:** There are a few sets of quotes with identical terms (**type**, **strike**, and **maturity**) but different prices. Keep the quote whose **T-bill-based implied volatility** is closest to that of its **moneyness neighbors**, and delete the others.  

* **Bid = 0 Filter:** Drop quotes with a **bid price** of zero, thereby avoiding low-valued options. Also, a zero bid may indicate censoring as negative bids cannot be recorded.

* **Volume = 0 Filter:** Drop quotes with zero volume. *Note: Appendix B of CJS does not explicitly detail this filter, but we consider it here since it is listed in **Table B.1. Filters** of CJS.*



### Identical Filter

We drop records with identical **option type, strike, expiration date, and price**. In each such case, we eliminate all but one of the quotes. 

In [None]:
# Level 1, step 1: apply the identical-quote filter to the raw data
spx_filtered = f1.identical_filter(raw_option_data)
spx_filtered  # last expression in the cell: shown as the cell output

In [None]:
# Print how many rows the filter removed relative to the raw data
compare_filtered_data_to_orig(spx_filtered, raw_option_data, "Identical Filter")

### Identical Except Price Filter

We drop records with **identical terms (type, strike, and maturity) but different prices**. We retain the quote whose T-bill-based implied volatility is closest to that of its moneyness neighbors and delete the others.

In [None]:
# Level 1, step 2: apply the identical-except-price filter to the step 1 output
spx_filtered_2 = f1.identical_but_price_filter(spx_filtered)
spx_filtered_2  # last expression in the cell: shown as the cell output

In [None]:
# Print how many rows the filter removed. The label carries no trailing colon
# because compare_filtered_data_to_orig appends " :" to it when printing.
compare_filtered_data_to_orig(
    spx_filtered_2, spx_filtered, "Identical Except Price Filter"
)

### Filter Options with Bid = 0

We drop quotes with bids of zero (implying little to no market interest), thereby avoiding low-valued options.

In [None]:
# Level 1, step 3: apply the zero-bid filter, then print how many rows it removed
spx_filtered_3 = f1.delete_zero_bid_filter(spx_filtered_2)
compare_filtered_data_to_orig(spx_filtered_3, spx_filtered_2, "Delete Zero Bid Filter")

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# One panel per data set: best_bid before (spx_filtered_2) and after
# (spx_filtered_3) the zero-bid filter.
panels = [
    (spx_filtered_2, "Distribution of Best Bid - Pre-filter", {}),
    (spx_filtered_3, "Distribution of Best Bid - Post-filter", {"color": "darkred"}),
]
for ax, (frame, panel_title, hist_kwargs) in zip(axes, panels):
    ax.hist(frame["best_bid"], bins=30, **hist_kwargs)
    ax.set_xlabel("Best Bid")
    ax.set_ylabel("Frequency")
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()
# TODO: replace these histograms with more informative plots.

### Filter Options with Vol = 0

Table B.1 of *CJS 2013* appears to signal the inclusion of a Volume = 0 filter, however, Appendix B does not describe this filter. We note that there are a significant number of quotes with zero volume in our dataset, and the application of this filter would dramatically skew the results from the original dataset (with 70% of remaining records dropped using this criterion). Given that Appendix B of *CJS 2013* did not describe this filter, we assume its inclusion in Table B.1 was an error.

In [None]:
# Diagnostic only: measure what the Volume = 0 filter would remove. The result
# is kept under its own name so it cannot leak into the filter pipeline, which
# continues from spx_filtered_3.
zero_volume_preview = f1.delete_zero_volume_filter(spx_filtered_3)
compare_filtered_data_to_orig(
    zero_volume_preview, spx_filtered_3, "Delete Zero Volume Filter"
)

In [None]:
# The Volume = 0 filter is not applied to the pipeline: step 4 is an
# independent copy of the zero-bid-filtered data.
spx_filtered_4 = spx_filtered_3.copy(deep=True)

In [None]:
# Dates of the zero-volume quotes among the rows that passed the zero-bid filter
is_zero_volume = spx_filtered_3["volume"] == 0
zero_vol_rows = spx_filtered_3[is_zero_volume]

plt.hist(zero_vol_rows["date"], bins=30)
plt.title("Distribution of Options with Zero Volume")
plt.xlabel("Date")
plt.ylabel("Count")
plt.show()

In [None]:
# Persist the Level 1 output (written without the index)
l1_savefile = DATA_DIR / f"L1_filtered_{DATE_RANGE}.parquet"
spx_l1_filtered = spx_filtered_4.copy()  # copy() is deep by default
spx_l1_filtered.to_parquet(l1_savefile, index=False)

---

## Level 2 Filters


* **Days to Maturity <7 or >180 Filter:** Drop options with fewer than seven or more than 180 calendar days to expiration. 


* **IV<5% or >100% Filter:** We remove all option quotes with implied volatilities lower than 5% or higher than 100%, computed using T-bill interest rates.

* **Moneyness <0.8 or >1.2 Filter:** We remove all option quotes with moneyness, the ratio of strike price to index price, below 0.8 or above 1.2. These options have little value beyond their intrinsic value and are also very thinly traded.

* **Implied Interest Rate <0 Filter:** When filtering outliers, we use T-bill interest rates to compute implied volatilities. T-bill interest rates are obtained from the Federal Reserve’s H.15 release. We assign a T-bill rate to each observation by assuming that we can use the next shortest rate if the time to expiration of the option is shorter than the shortest constant maturity rate.
Our goal is to obtain an interest rate that is as close as possible to the one faced by investors in the options market. It appears that the T-bill rates are
not the relevant ones when pricing these options. Specifically, when the T-bill rates are used, put and call implied volatilities do not line up very well; for
example, the T-bill rate tends to be too high for short maturity options, perhaps because no T-bill has maturity of less than a month. To address these
issues, we compute a put-call parity-implied interest rate. Since we believe that put-call parity holds reasonably well in this deep and liquid European options
market, we use the put-call parity-implied interest rate as our interest rate in the remainder of the paper and for further filters.
To construct this rate, we take all put-call pairs of a given maturity and impose put-call parity using the bid-ask midpoint as the price, and allowing the interest rate to adjust. We remove 89,563 pairs with a negative implied interest rate. We then take the median-implied interest rate across all remaining pairs of the same maturity with moneyness between 0.95 and 1.05 and assign it to all quotes with that maturity. We are able to directly assign an implied interest rate to 93% of our sample in this way. We fill in the gaps by interpolating across maturities and if necessary, across days. Our implied interest rate is on average 54 bps above the T-bill rate.

* **Unable to Compute IV Filter:** We remove quotes that imply negative time
value.


### Days to Maturity <7 or >180 Filter

We drop records with days to maturity less than 7 and greater than 180 days. The short maturity options tend to move erratically close to expiration and the long maturity options lack volume and open interest.

In [None]:
# Level 2, step 1: days-to-maturity filter with bounds of 7 and 180 days,
# applied to the Level 1 output
spx_filtered_5 = f2.days_to_maturity_filter(spx_filtered_4, min_days=7, max_days=180)
spx_filtered_5  # last expression in the cell: shown as the cell output

In [None]:
# Print how many rows the filter removed relative to its input
compare_filtered_data_to_orig(spx_filtered_5, spx_filtered_4, "Days to Maturity Filter")

### IV<5% or >100% Filter

We drop quotes with **implied volatilities lower than 5% or higher than 100%**, computed using T-bill interest rates of the nearest matching maturity. Such extreme IV values are likely a quotation problem or attached to low-value options.

In [None]:
# Level 2, step 2: implied-volatility range filter with bounds 0.05 and 1.0
# (i.e. 5% and 100%)
spx_filtered_6 = f2.iv_range_filter(spx_filtered_5, min_iv=0.05, max_iv=1.0)
spx_filtered_6  # last expression in the cell: shown as the cell output

In [None]:
# Print how many rows the filter removed relative to its input
compare_filtered_data_to_orig(spx_filtered_6, spx_filtered_5, "IV Range Filter")

### Moneyness <0.8 or >1.2 Filter

We remove all option quotes with moneyness ($\frac{K}{S}$) below 0.8 or above 1.2. These options have little value beyond their intrinsic value and are also very thinly traded.


In [None]:
# Level 2, step 3: moneyness filter with bounds 0.8 and 1.2
spx_filtered_7 = f2.moneyness_filter(
    spx_filtered_6, min_moneyness=0.8, max_moneyness=1.2
)
spx_filtered_7  # last expression in the cell: shown as the cell output

In [None]:
# Print how many rows the filter removed relative to its input
compare_filtered_data_to_orig(spx_filtered_7, spx_filtered_6, "Moneyness Filter")

### Implied Interest Rate < 0 Filter

We remove all option quotes with negative implied interest rates, assuming put-call parity. 


In [None]:
# Level 2, step 4: implied-interest-rate filter applied to the step 3 output
spx_filtered_8 = f2.implied_interest_rate_filter(spx_filtered_7)
spx_filtered_8  # last expression in the cell: shown as the cell output

In [None]:
# Print how many rows the filter removed relative to its input
compare_filtered_data_to_orig(
    spx_filtered_8, spx_filtered_7, "Negative Implied Interest Rate Filter"
)

### Unable to Compute IV Filter

We remove all option quotes that imply negative time value. 


In [None]:
# Level 2, step 5: unable-to-compute-IV filter applied to the step 4 output
spx_filtered_9 = f2.unable_to_compute_iv_filter(spx_filtered_8)
spx_filtered_9  # last expression in the cell: shown as the cell output

In [None]:
# Print how many rows the filter removed relative to its input
compare_filtered_data_to_orig(
    spx_filtered_9, spx_filtered_8, "Unable to Compute IV Filter"
)

In [None]:
# Persist the Level 2 output (written without the index)
l2_savefile = DATA_DIR / f"L2_filtered_{DATE_RANGE}.parquet"
spx_l2_filtered = spx_filtered_9.copy()  # copy() is deep by default
spx_l2_filtered.to_parquet(l2_savefile, index=False)

---

## Level 3 Filters


* **IV Filter:** The IV filter removes volatility outliers to reduce the prevalence of apparent butterfly arbitrage. 

* **Put-Call Parity Filter:** The puts and calls need to be matched up based on trading date, expiry date, and option type.


### IV Filter

The IV filter removes volatility outliers to reduce the prevalence of apparent butterfly arbitrage. This involves dropping calls and puts that have the same expiration date and strike price, but have anomalous prices due to extreme implied volatility values. For each *date* and *maturity*, we fit a quadratic curve to the implied volatility of puts and calls (separately) through the observed log implied volatilities.

In [None]:
# Level 3, step 1: IV filter applied to the Level 2 output.
# NOTE(review): DATE_RANGE is passed through to the filter; its use is not
# visible here (presumably for naming saved outputs) -- confirm in level_3_filters.
spx_filtered_10 = f3.IV_filter(spx_filtered_9, DATE_RANGE)
spx_filtered_10  # last expression in the cell: shown as the cell output

In [None]:
# Print how many rows the filter removed relative to its input
compare_filtered_data_to_orig(spx_filtered_10, spx_filtered_9, "IV Filter")

### Put-Call Parity Filter

The puts and calls need to be matched up based on trading date, expiry date, and option type. We then calculate the put-call parity implied interest rate, and filter out outliers based on the standard deviation of the relative distance between the interest rate implied by put-call parity, and the calculated daily median 3-month T-bill rate from the pulled data.

In [None]:
# Level 3, step 2: put-call parity filter applied to the IV-filtered data.
# NOTE(review): DATE_RANGE is passed through to the filter; its use is not
# visible here -- confirm in level_3_filters.
spx_filtered_11 = f3.put_call_filter(spx_filtered_10, DATE_RANGE)
spx_filtered_11  # last expression in the cell: shown as the cell output

In [None]:
# Print how many rows the filter removed relative to its input
compare_filtered_data_to_orig(
    spx_filtered_11, spx_filtered_10, "Put-Call Parity Filter"
)

In [None]:
# Persist the Level 3 output (written without the index)
l3_savefile = DATA_DIR / f"L3_filtered_{DATE_RANGE}.parquet"
spx_l3_filtered = spx_filtered_11.copy()  # copy() is deep by default
spx_l3_filtered.to_parquet(l3_savefile, index=False)

In [None]:
# Save the final cleaned data (the Level 3 output) to DATA_DIR
spx_filtered_final = spx_filtered_11.copy(deep=True)

final_savefile = DATA_DIR / f"spx_filtered_final_{DATE_RANGE}.parquet"

# Make sure the target directory exists before the single write, instead of
# attempting the write, catching FileNotFoundError, and repeating it.
if not DATA_DIR.exists():
    print(f">> {DATA_DIR} does not exist. Creating directory...")
    DATA_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): this save keeps the index (index=True) while the L1-L3 saves
# drop it (index=False) -- confirm this difference is intended.
spx_filtered_final.to_parquet(final_savefile, index=True)
print(f">> Final filtered data saved to {final_savefile}")

----