In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from preprocessing import read_files, process_options

In [2]:
def process_rates(df_r):
    """Normalise a yield-curve table so its columns are keyed by maturity in days.

    The ``Date`` column is parsed to datetimes and renamed ``Quote_date``; each
    maturity label (``"1 Mo"`` ... ``"10 Yr"``) is renamed to an integer number
    of days, using 30-day months and 365-day years. Columns that are not part
    of the mapping are dropped from the result.

    Args:
        df_r: DataFrame holding a ``Date`` column and the nine maturity columns
            listed in the mapping below.

    Returns:
        A new DataFrame with columns ``Quote_date, 30, 90, 180, 365, 730, 1095,
        1825, 2555, 3650`` in that order. ``df_r`` itself is left untouched.

    Raises:
        KeyError: if ``Date`` or any of the mapped maturity columns is missing.
    """
    maturity_days = {
        "1 Mo": 30,
        "3 Mo": 90,
        "6 Mo": 180,
        "1 Yr": 365,
        "2 Yr": 365 * 2,
        "3 Yr": 365 * 3,
        "5 Yr": 365 * 5,
        "7 Yr": 365 * 7,
        "10 Yr": 365 * 10,
    }
    column_map = {"Date": "Quote_date", **maturity_days}

    # rename() hands back a new frame, so parsing the dates on it does not
    # write into the caller's DataFrame.
    renamed = df_r.rename(columns=column_map)
    renamed["Quote_date"] = pd.to_datetime(renamed["Quote_date"])

    # Select with an explicit list rather than a dict view.
    return renamed[list(column_map.values())]

def combine_opt_rates(df_opt, df_r):
    """Attach to each option row the rate whose maturity is closest to its ``Ttl``.

    The rates table is left-joined onto the options on ``Quote_date``. Every
    column of ``df_r`` other than ``Quote_date`` is treated as a maturity and
    its label must be numeric, in the same unit as ``Ttl``. For each row the
    maturity with the smallest absolute distance to ``Ttl`` is chosen (the
    first one wins on ties) and its rate is stored in a new column ``R``.

    Args:
        df_opt: option rows with at least ``Quote_date`` and ``Ttl`` columns.
        df_r: rates with a ``Quote_date`` column plus one column per maturity.

    Returns:
        A new DataFrame with the columns of ``df_opt`` plus ``R``. Rows that
        contain a NaN in any column are dropped, which includes rows whose
        quote date has no entry in ``df_r``.
    """
    df_opt = pd.merge(df_opt, df_r, on="Quote_date", how="left")
    rates = list(df_r.columns)
    rates.remove("Quote_date")

    # Vectorised nearest-maturity lookup: one (rows x maturities) distance
    # matrix instead of a Python-level lambda evaluated per row.
    maturities = np.array(rates)
    ttl = df_opt["Ttl"].to_numpy(dtype=float)
    nearest = np.abs(maturities[None, :] - ttl[:, None]).argmin(axis=1)

    # Pick, for every row, the rate sitting in the chosen maturity column.
    rate_matrix = df_opt[rates].to_numpy()
    df_opt["R"] = rate_matrix[np.arange(len(df_opt)), nearest]

    df_opt = df_opt.drop(columns=rates)
    return df_opt.dropna()

def get_model_dataset(path_opt, filenames_opt, path_r, filenames_r, call = True):
    """Load option and rate files and return them joined into one DataFrame.

    Both file sets are read with ``read_files``; options are then passed
    through ``process_options`` (``call`` is forwarded unchanged -- presumably
    it selects calls vs. puts, confirm in ``preprocessing``) and rates through
    ``process_rates``. The two results are merged by ``combine_opt_rates``.
    """
    options_raw = read_files(path_opt, filenames_opt)
    rates_raw = read_files(path_r, filenames_r)

    options = process_options(options_raw, call)
    rates = process_rates(rates_raw)

    # TODO: Fix handling of nan values
    return combine_opt_rates(options, rates)

In [3]:
path_opt = "./data/options/"
# Whole-year files: the year range below is currently empty, so this part
# contributes no filenames.
filenames_opt = [
    f"spx_eod_{year}{month:02d}.txt"
    for year in range(2022, 2022)
    for month in range(1, 13)
]
# Monthly 2022 files for months 5 through 9 (the range end is exclusive).
filenames_opt += [f"spx_eod_2022{month:02d}.txt" for month in range(5, 10)]
#filenames_opt = ["spx_eod_202209.txt"]
path_r = "./data/rates/"
filenames_r = ["yield-curve-rates-2022.csv", "yield-curve-rates-1990-2021.csv"]

df_read = get_model_dataset(path_opt, filenames_opt, path_r, filenames_r, call=True)


In [4]:
def lag_features(df, features, seq_length):
    """Turn a 2D frame of option quotes into a 2D frame of lagged sequences.

    Rows are sorted by ``Expire_date``, ``Strike`` and descending ``Ttl``. For
    every ``step`` from ``seq_length - 1`` down to 0 and every name in
    ``features``, a column ``"<feature>-<step>"`` holding the value ``step``
    rows earlier is appended. A row is kept only when the row
    ``seq_length - 1`` positions earlier has the same ``Strike`` and
    ``Expire_date``, i.e. when its whole window lies inside one contract.
    Finally ``Bid`` and ``Ask`` are copied to ``Bid_last`` / ``Ask_last``.

    The last two columns of the result are therefore the bid and ask price,
    and the ``len(features) * seq_length`` columns before them are the
    feature sequences.

    Args:
        df: frame with ``Expire_date``, ``Strike``, ``Ttl``, ``Bid``, ``Ask``
            and every column named in ``features``.
        features: names of the columns to lag.
        seq_length: window length in rows; must be at least 1.

    Returns:
        A new DataFrame; ``df`` is not modified.

    Raises:
        ValueError: if ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        # A non-positive length would turn the contract check into a forward
        # shift and silently produce meaningless rows.
        raise ValueError("seq_length must be at least 1")

    df = df.sort_values(["Expire_date", "Strike", "Ttl"], ascending=[True, True, False])

    # Build every lag column first and attach them in a single concat;
    # inserting them one by one fragments the frame when there are many.
    lagged = {
        feature + "-" + str(step): df[feature].shift(step)
        for step in reversed(range(seq_length))
        for feature in features
    }
    if lagged:
        df = pd.concat([df, pd.DataFrame(lagged)], axis=1)

    # Comparisons against the NaN/NaT produced by shift() are False, so the
    # first rows of the frame are filtered out as well.
    offset = seq_length - 1
    same_strike = df["Strike"].eq(df["Strike"].shift(offset))
    same_expiry = df["Expire_date"].eq(df["Expire_date"].shift(offset))
    df = df[same_strike & same_expiry].copy()

    df["Bid_last"] = df["Bid"]
    df["Ask_last"] = df["Ask"]
    return df

def create_train_test(df, features, split_date, seq_length):
    """Split ``df`` at ``split_date`` and build lagged sequences for each part.

    Rows with ``Quote_date`` strictly before ``split_date`` form the training
    set; rows on or after it form the test set. Each part is passed through
    ``lag_features`` separately, so no window spans the split.

    Returns:
        A ``(train, test)`` tuple of DataFrames.
    """
    before_split = df["Quote_date"] < split_date
    train = lag_features(df[before_split], features, seq_length)

    from_split = df["Quote_date"] >= split_date
    test = lag_features(df[from_split], features, seq_length)

    return train, test

def df_to_xy(df, num_features, seq_length):
    """Split a frame into float32 input and target arrays by column position.

    The last two columns become the targets; the
    ``num_features * seq_length`` columns immediately before them become the
    inputs. Any earlier columns are ignored.

    Returns:
        ``(array_x, array_y)`` as ``np.float32`` arrays with one row per row
        of ``df``.
    """
    values = df.to_numpy()
    n_inputs = num_features * seq_length
    first_input_col = -(n_inputs + 2)

    array_x = values[:, first_input_col:-2].astype(np.float32)
    array_y = values[:, -2:].astype(np.float32)
    return array_x, array_y

def min_max_scale(train, test):
    """Min-max scale a training and a test set with one shared scaler.

    A ``MinMaxScaler`` is fitted on ``train`` only and then applied to both
    sets, so the test data does not influence the scaling parameters.

    Returns:
        A ``(train, test)`` tuple holding the scaled data.
    """
    scaler = MinMaxScaler()
    scaled_train = scaler.fit_transform(train)
    scaled_test = scaler.transform(test)
    return scaled_train, scaled_test

