In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
def read_files(path, filenames):
    """Load every CSV listed in `filenames` and stack them into a single dataframe.

    `path` is prepended to each file name by plain string concatenation, so it
    must already end with a path separator. Leading whitespace after each
    delimiter is skipped while parsing. The per-file row indexes are kept as-is
    in the concatenated result.
    """
    frames = []
    for name in filenames:
        frames.append(pd.read_csv(path + name, skipinitialspace=True))
    return pd.concat(frames)

def _clean_column_name(name):
    """Turn a raw header of the form "[QUOTE_DATE]" into "Quote_date".

    The text between the first "[" and the first "]" is kept, its first character
    is left untouched and the remainder is lower-cased. Headers without a
    bracket pair are only re-cased, never truncated.
    """
    start, end = name.find("["), name.find("]")
    inner = name[start + 1:end] if start != -1 and end > start else name
    return inner[:1] + inner[1:].lower()


def process_options(df_opt, call = True):
    """Clean up column names and add time to live (Ttl), moneyness and volatility.

    Parameters
    ----------
    df_opt : pandas.DataFrame
        Raw option quotes. After header clean-up it must contain "Quote_date",
        "Expire_date", "Underlying_last", "Strike" and the bid/ask columns of the
        requested option side ("C_ask"/"C_bid" or "P_ask"/"P_bid").
    call : bool, default True
        Use the call quotes ("C_*") when True, the put quotes ("P_*") otherwise.

    Returns
    -------
    pandas.DataFrame
        One row per quote whose Ttl is non-zero, with the columns Quote_date,
        Expire_date, Underlying_last, Strike, Moneyness, Ask, Bid, Ttl and
        Volatility. The caller's dataframe is not modified.
    """
    df_opt = df_opt.rename(columns=_clean_column_name)

    # Expose the chosen side's quotes under the side-neutral names "Ask"/"Bid".
    side = "C" if call else "P"
    df_opt = df_opt.rename(columns={side + "_ask": "Ask", side + "_bid": "Bid"})

    df_opt["Quote_date"] = pd.to_datetime(df_opt["Quote_date"])
    df_opt["Expire_date"] = pd.to_datetime(df_opt["Expire_date"])
    # Vectorised day difference; a row-wise apply is needlessly slow on large frames.
    df_opt["Ttl"] = (df_opt["Expire_date"] - df_opt["Quote_date"]).dt.days

    df_opt["Moneyness"] = df_opt["Underlying_last"] / df_opt["Strike"]

    # Volatility is computed per quote date and broadcast back onto every quote.
    df_vol = calculate_volatility(df_opt)
    df_opt = pd.merge(df_opt, df_vol, on ="Quote_date", how = "left")

    columns = ["Quote_date", "Expire_date",  "Underlying_last", "Strike", "Moneyness", "Ask", "Bid", "Ttl", "Volatility"]
    # Quotes with zero days to expiry are dropped.
    return df_opt.loc[df_opt["Ttl"] != 0, columns]

def calculate_volatility(df, window = 90, trading_days = 252):
    """Calculate the underlying's annualized rolling volatility from a dataset of options.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Quote_date" and "Underlying_last". Many rows may share a
        quote date (one per option quote).
    window : int, default 90
        Number of consecutive log returns in each rolling standard deviation.
    trading_days : int, default 252
        Annualization factor; the rolling standard deviation is multiplied by
        its square root.

    Returns
    -------
    pandas.DataFrame
        Columns "Quote_date" and "Volatility", exactly one row per distinct quote
        date, ordered by date. The first `window` dates have NaN volatility
        because the rolling window is not yet full.
    """
    # Keep a single price per date (first occurrence) so that merging the result
    # back on "Quote_date" can never multiply rows, and order by date so that
    # each log return really is between consecutive quote dates.
    daily = (
        df[["Quote_date", "Underlying_last"]]
        .drop_duplicates(subset="Quote_date")
        .sort_values("Quote_date")
    )
    log_returns = np.log(daily["Underlying_last"] / daily["Underlying_last"].shift())
    daily = daily.assign(Volatility=log_returns.rolling(window).std() * trading_days ** 0.5)
    return daily[["Quote_date", "Volatility"]]

def process_rates(df_r):
    """Extract the "3 Mo" rate column as "R", keyed by a datetime "Quote_date".

    Parameters
    ----------
    df_r : pandas.DataFrame
        Rate table with at least a "Date" column and a "3 Mo" column.

    Returns
    -------
    pandas.DataFrame
        Two columns: "Quote_date" (the parsed "Date") and "R" (the "3 Mo"
        values, unchanged). The caller's dataframe is left untouched.
    """
    rates = df_r.rename(columns = {"Date" : "Quote_date", "3 Mo": "R"})[["Quote_date", "R"]]
    # assign() builds a new frame, so the input's "Date" column is not overwritten.
    return rates.assign(Quote_date=pd.to_datetime(rates["Quote_date"]))

def get_model_dataset(path_opt, filenames_opt, path_r, filenames_r, call = True):
    """Load option quotes and rates from disk and join them into one modelling dataframe.

    Option files and rate files are read, processed independently, and the rate
    is then left-joined onto every option quote by "Quote_date". Any row that
    still contains a missing value after the join is discarded.
    """
    raw_options = read_files(path_opt, filenames_opt)
    raw_rates = read_files(path_r, filenames_r)
    options = process_options(raw_options, call)
    rates = process_rates(raw_rates)
    merged = options.merge(rates, on="Quote_date", how="left")
    return merged.dropna() #TODO: Fix handling of nan values

In [3]:
path_opt = "./data/options/"
# (year, month) pairs of the SPX end-of-day option files to load: Jan-Sep 2022.
# Extend this list to pull in further months or years.
months_opt = [(2022, month) for month in range(1, 10)]
filenames_opt = [f"spx_eod_{year}{month:02d}.txt" for year, month in months_opt]
#filenames_opt = ["spx_eod_202209.txt"]
path_r = "./data/rates/"
filenames_r = ["yield-curve-rates-2022.csv", "yield-curve-rates-1990-2021.csv"]

# Build the call-option dataset (last argument True selects the call quotes).
df_read = get_model_dataset(path_opt, filenames_opt, path_r, filenames_r, True)
print(df_read)
df_read.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 0 to 172733
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Quote_date  192 non-null    datetime64[ns]
 1   Volatility  102 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 4.5 KB
        Quote_date Expire_date  Underlying_last   Strike  Moneyness    Ask  \
725110  2022-05-09  2022-05-10          3993.26   3100.0   1.288148  893.3   
725111  2022-05-09  2022-05-10          3993.26   3200.0   1.247894  792.9   
725112  2022-05-09  2022-05-10          3993.26   3300.0   1.210079  692.2   
725113  2022-05-09  2022-05-10          3993.26   3400.0   1.174488  592.2   
725114  2022-05-09  2022-05-10          3993.26   3500.0   1.140931  493.0   
...            ...         ...              ...      ...        ...    ...   
1533707 2022-09-30  2026-12-18          3589.70   8800.0   0.407920   31.4   
1533708 2022-09-30

In [13]:
def lag_features(df, features, seq_length):
    """Transform a 2D dataframe of option quotes into a 2D dataframe of sequence data.

    Rows are ordered per contract (same "Expire_date" and "Strike") from the
    largest "Ttl" to the smallest. For every feature and every step in
    seq_length-1 .. 0 a column "<feature>-<step>" is appended holding the value
    `step` rows earlier. Only rows whose whole window of `seq_length` rows
    belongs to one contract are kept.

    Column layout of the result: the original columns, then
    len(features)*seq_length lag columns (oldest step first, features in the
    given order within a step), and finally "Bid_last" and "Ask_last", which are
    copies of "Bid" and "Ask".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Expire_date", "Strike", "Ttl", "Bid", "Ask" and every
        column named in `features`. It is not modified.
    features : list of str
        Columns to turn into lagged sequences.
    seq_length : int
        Number of rows per sequence; must be at least 1.

    Raises
    ------
    ValueError
        If `seq_length` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be at least 1")

    ordered = df.sort_values(["Expire_date", "Strike", "Ttl"], ascending = [True, True, False])

    lagged = {
        feature + "-" + str(step): ordered[feature].shift(step)
        for step in reversed(range(seq_length))
        for feature in features
    }
    ordered = ordered.assign(**lagged)

    # A window is valid only if the row seq_length-1 positions back is the same
    # contract; because rows are sorted by contract, everything in between is too.
    span = seq_length - 1
    same_contract = (
        (ordered["Strike"] == ordered["Strike"].shift(span))
        & (ordered["Expire_date"] == ordered["Expire_date"].shift(span))
    )
    sequences = ordered.loc[same_contract]
    return sequences.assign(Bid_last=sequences["Bid"], Ask_last=sequences["Ask"])

def create_train_test(df, features, split_date, seq_length):
    """Split the data at `split_date` and turn each part into lagged sequence rows.

    Rows with "Quote_date" strictly before `split_date` form the training part,
    rows on or after it the test part. Each part is lagged separately, so no
    sequence spans the split. Returns the pair (train, test).
    """
    before_split = df["Quote_date"] < split_date
    from_split = df["Quote_date"] >= split_date
    train = lag_features(df.loc[before_split], features, seq_length)
    test = lag_features(df.loc[from_split], features, seq_length)
    return train, test

def df_to_xy(df, num_features, seq_length):
    """Split a lagged dataframe into float32 input and target arrays.

    The last two columns of `df` are taken as the targets; the
    num_features*seq_length columns immediately before them are the inputs.
    All other (leading) columns are ignored.

    Returns
    -------
    (array_x, array_y) : tuple of numpy.ndarray
        `array_x` has shape (len(df), num_features*seq_length) and `array_y`
        has shape (len(df), 2); both are float32.
    """
    n_inputs = num_features * seq_length
    # Convert only the needed columns: turning the whole mixed-dtype frame into
    # one object array first is slow and memory hungry on large frames.
    array_x = np.ascontiguousarray(df.iloc[:, -n_inputs - 2:-2].to_numpy(dtype=np.float32))
    array_y = np.ascontiguousarray(df.iloc[:, -2:].to_numpy(dtype=np.float32))
    return array_x, array_y

def min_max_scale(train, test):
    """Min-max scale a training and a test set with one shared MinMaxScaler.

    The scaler is fitted on `train` only and then applied to both sets, so the
    test set is scaled with the training set's ranges. Returns (train, test)
    as transformed arrays.
    """
    fitted = MinMaxScaler().fit(train)
    return fitted.transform(train), fitted.transform(test)


features = ["Moneyness", "Ttl", "R", "Volatility"]
# Rows per input sequence; shared by the lagging step and the x/y split below.
seq_length = 5
train, test = create_train_test(df_read, features,  "2022-09-18", seq_length)

# Split the lagged training frame into input and target arrays.
train_x, train_y = df_to_xy(train, len(features), seq_length)

print(train.shape)
print(train)
print(test.shape)
print(test)

print(train_x.shape)
print(train_x)
print(train_y.shape)
print(train_y)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 595433 entries, 757749 to 1449828
Data columns (total 32 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   Quote_date       595433 non-null  datetime64[ns]
 1   Expire_date      595433 non-null  datetime64[ns]
 2   Underlying_last  595433 non-null  float64       
 3   Strike           595433 non-null  float64       
 4   Moneyness        595433 non-null  float64       
 5   Ask              595433 non-null  float64       
 6   Bid              595433 non-null  float64       
 7   Ttl              595433 non-null  int64         
 8   Volatility       595433 non-null  float64       
 9   R                595433 non-null  float64       
 10  Moneyness-4      595433 non-null  float64       
 11  Ttl-4            595433 non-null  float64       
 12  R-4              595433 non-null  float64       
 13  Volatility-4     595433 non-null  float64       
 14  Moneyness-3   

In [5]:
# Print column dtypes, non-null counts and memory usage of the merged dataset.
df_read.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 776381 entries, 725110 to 1533711
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   Quote_date       776381 non-null  datetime64[ns]
 1   Expire_date      776381 non-null  datetime64[ns]
 2   Underlying_last  776381 non-null  float64       
 3   Strike           776381 non-null  float64       
 4   Moneyness        776381 non-null  float64       
 5   Ask              776381 non-null  float64       
 6   Bid              776381 non-null  float64       
 7   Ttl              776381 non-null  int64         
 8   Volatility       776381 non-null  float64       
 9   R                776381 non-null  float64       
dtypes: datetime64[ns](2), float64(7), int64(1)
memory usage: 65.2 MB


In [9]:
from lstm import LSTM