In [1]:
from IPython.display import Image
import datetime
import numpy as np
import os
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import time
from option_strats import BullCallSpread

# Ensure working directory path is correct.
# Walk up from the current directory until a directory named "Options-Project"
# is found, then switch into it. The search is done on path strings first so
# that the working directory is left untouched if the search fails.
project_root = os.getcwd()
while os.path.basename(project_root) != "Options-Project":
    parent_dir = os.path.dirname(project_root)
    # At the filesystem root the parent is the directory itself; without this
    # guard the loop would spin forever when run outside the project tree.
    if parent_dir == project_root:
        raise FileNotFoundError(
            'Could not find an "Options-Project" directory at or above '
            f"{os.getcwd()!r}; start the notebook from inside the project."
        )
    project_root = parent_dir
os.chdir(project_root)

# Set plotly theme
pio.templates.default = "ggplot2"

### User Defined Variables

12 sub-models, representing lags (days): `1`, `5`, `10`, `15`, `20`, `40`, `65` (1/4 year), `90`, `130` (1/2 year), `260` (1 year), `390`, `520` (2 years)

In [2]:
# Ticker symbol used to build every per-ticker file path below
ticker = "AAPL"

# Input directories (relative paths)
model_params_path, adj_close_path, dividends_path = (
    "data/model_params/",
    "data/adj_close/",
    "data/dividends/",
)

# Modelling settings
# One sub-model per lag; lags are expressed in days
sub_model_lags = [
    1, 5, 10, 15, 20, 40,
    65, 90, 130, 260, 390, 520,
]
train_test_ratio = 0.9
kernel_resolution = 100

### Load Features Derived From Options

In [10]:
start_time = time.time()

# Maps parameter type -> date-indexed frame with one flattened column per feature
options_params = {}

for param_type in ["Delta", "VIX", "custom"]:
    param_dir = os.path.join(model_params_path, param_type, ticker)
    yearly_frames = []

    # Sanity check: the per-ticker folder must exist for this parameter type
    assert os.path.isdir(param_dir), f"Can't find {param_type} parameters for {ticker}!"

    for file_name in os.listdir(param_dir):
        # Skip everything that is not a "param" type file
        if file_name.split("_")[-1] != "param.csv":
            continue

        year_df = pd.read_csv(os.path.join(param_dir, file_name))

        # Normalise column types: plain dates, and string intervals so they
        # can be joined into column names further down
        year_df["date"] = pd.to_datetime(year_df["date"]).dt.date
        if "interval" in year_df.columns:
            year_df["interval"] = year_df["interval"].astype(str)

        yearly_frames.append(year_df)

    # Nothing was loaded for this parameter type, so there is nothing to add
    if not yearly_frames:
        continue

    # "custom" parameters are pivoted on the tag alone; the other types are
    # pivoted on the (tag, interval) pair
    pivot_cols = ["tag"] if param_type == "custom" else ["tag", "interval"]

    # Stack all years, then spread to one row per date
    wide_df = pd.concat(yearly_frames, ignore_index=True).pivot(index="date", columns=pivot_cols)

    # Collapse the multi-level column labels into single "-"-joined names
    wide_df.columns = ["-".join(col) for col in wide_df.columns.values]

    options_params[param_type] = wide_df

print(f"Load options parameters - {round(time.time() - start_time, 2)} seconds")

Load options parameters - 0.21 seconds


### Load Treasury Yields

In [11]:
start_time = time.time()

rate_types = [
    "1_Month", "3_Month", "6_Month",
    "1_Year", "2_Year", "3_Year",
    "5_Year", "5_Year_Inflation",
]

# Maps rate type -> frame with "date" and a uniquely named rate column
rates_params = {}

for rate_type in rate_types:
    rate_df = pd.read_csv(os.path.join(model_params_path, "treasury_yields", f"{rate_type}.csv"))

    # Convert columns to correct format
    rate_df["date"] = pd.to_datetime(rate_df["date"]).dt.date

    # Every file exposes its value under the same "continuous rate" column, so
    # give each one a name derived from its rate type to keep them distinct
    rate_col = f"{rate_type.lower()}_rate"
    rates_params[rate_type] = rate_df[["date", "continuous rate"]].rename(
        columns={"continuous rate": rate_col})

print(f"Load rate parameters - {round(time.time() - start_time, 2)} seconds")

Load rate parameters - 0.09 seconds


### Load Prediction Target

In [5]:
# Closing prices for the ticker, with "date" converted to plain dates
close_csv_path = os.path.join(adj_close_path, ticker, f"{ticker}.csv")
date_close_df = pd.read_csv(close_csv_path)
date_close_df["date"] = pd.to_datetime(date_close_df["date"]).dt.date

# Dividend time series for the same ticker, same date handling
dividends_csv_path = os.path.join(dividends_path, ticker, f"{ticker}_ts.csv")
dividends_df = pd.read_csv(dividends_csv_path)
dividends_df["date"] = pd.to_datetime(dividends_df["date"]).dt.date

# Keep only the dates present in both tables
date_close_df = date_close_df[["date", "close"]].merge(
    right=dividends_df,
    how="inner",
    on="date",
)

### Preprocess and Combine

In [17]:
# Closing price with the "dividend" column subtracted
date_close_df["adj_close"] = date_close_df["close"].sub(date_close_df["dividend"])

# Start from the target series and attach each feature table on "date"
params_df = date_close_df[["date", "adj_close"]].copy()

# Treasury rates first, then options params (same order as they were loaded)
feature_tables = [*rates_params.values(), *options_params.values()]

for feature_df in feature_tables:
    # validate="1:1" makes pandas raise if "date" is duplicated on either side
    params_df = params_df.merge(feature_df, how="inner", on="date", validate="1:1")

params_df.shape

(4240, 78)

By inner-joining the closing prices, treasury yields, and options-derived features on date, we have a total of **76** features over **4240** days.

However, because some features are missing on various days (e.g. FRED takes a few more holidays a year than NYSE) and some options features could not be computed (e.g. a feature could not be derived from the Delta curve), the actual number of usable days is somewhat smaller.

In [18]:
# Keep only the days on which every column has a value
params_df = params_df.dropna()

params_df.shape

(3711, 78)