In [75]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [76]:
def read_files(path, filenames):
    """
    Load several CSV files and stack them into one dataframe.

    Each entry of ``filenames`` is appended directly to ``path`` (plain string
    concatenation, so ``path`` must already end with a separator) and parsed
    with ``skipinitialspace=True``. The frames are concatenated in the given
    order; every frame keeps its own row index.
    """
    frames = [pd.read_csv(path + name, skipinitialspace=True) for name in filenames]
    return pd.concat(frames)

def process_options(df_opt, call = True):
    """
    Cleans up column names and adds a time to live (Ttl) column to the dataframe.

    Column names such as "[QUOTE_DATE]" are reduced to the text between the
    brackets with everything after the first character lower-cased
    ("Quote_date"). Names without brackets get the same casing applied to the
    whole (stripped) name instead of being truncated.

    Parameters:
        df_opt: raw option quotes. After name clean-up it must contain
            Quote_date, Expire_date, Underlying_last, Strike and the
            ask/bid pair for the selected side (C_ask/C_bid or P_ask/P_bid).
        call: if True the C_ask/C_bid columns become Ask/Bid, otherwise the
            P_ask/P_bid columns do.

    Returns:
        A new dataframe with the columns Quote_date, Expire_date,
        Underlying_last, Strike, Ask, Bid and Ttl (whole days between quote
        and expiry). Rows with Ttl == 0 are removed. The input frame is not
        modified.

    Raises:
        KeyError: if a required column is missing after renaming.
    """

    def _clean_name(name):
        # Keep the text between the first "[" and the first "]" when both are
        # present; otherwise fall back to the stripped name itself.
        text = str(name)
        start, end = text.find("["), text.find("]")
        inner = text[start + 1:end] if 0 <= start < end else text.strip()
        # Slicing (instead of indexing) keeps an empty name from raising.
        return inner[:1] + inner[1:].lower()

    df_opt = df_opt.rename(columns=_clean_name)

    if call:
        keys = {"C_ask": "Ask", "C_bid": "Bid"}
    else:
        keys = {"P_ask": "Ask", "P_bid": "Bid"}
    df_opt = df_opt.rename(columns=keys)

    columns = ["Quote_date", "Expire_date",  "Underlying_last", "Strike", "Ask", "Bid", "Ttl"]
    missing = [name for name in columns if name != "Ttl" and name not in df_opt.columns]
    if missing:
        raise KeyError("Option data is missing required columns: " + ", ".join(missing))

    df_opt = df_opt.assign(
        Quote_date=pd.to_datetime(df_opt["Quote_date"]),
        Expire_date=pd.to_datetime(df_opt["Expire_date"]),
    )
    # Vectorised date difference: same values as a row-wise apply, but much
    # faster and it also works when the frame has no rows.
    df_opt["Ttl"] = (df_opt["Expire_date"] - df_opt["Quote_date"]).dt.days

    df_opt = df_opt[columns]
    return df_opt[df_opt["Ttl"] != 0]

def process_rates(df_r):
    """
    Reduce the yield-curve table to a quote date and a single rate column.

    The "Date" column is parsed to datetimes and renamed to "Quote_date";
    the "3 Mo" column is renamed to "R". All other columns are dropped.
    A tenor-matched rate (picking the maturity closest to each option's Ttl)
    was sketched earlier but is not implemented; only "3 Mo" is used.

    Parameters:
        df_r: rates table containing at least the columns "Date" and "3 Mo".

    Returns:
        A new dataframe with the columns Quote_date and R. The input frame
        is left unchanged.

    Raises:
        KeyError: if "Date" or "3 Mo" is missing.
    """
    missing = [name for name in ("Date", "3 Mo") if name not in df_r.columns]
    if missing:
        raise KeyError("Rate data is missing required columns: " + ", ".join(missing))

    # rename() returns a copy, so parsing the dates on it does not write back
    # into the caller's dataframe.
    rates = df_r.rename(columns = {"Date" : "Quote_date", "3 Mo": "R"})
    rates = rates.assign(Quote_date=pd.to_datetime(rates["Quote_date"]))
    columns = ["Quote_date", "R"]
    return rates[columns]

def combine_opt_r(df_opt, df_r):
    """
    Attach the rate columns to the option quotes.

    Left join on "Quote_date": every option row is kept, and rows whose quote
    date has no match in ``df_r`` get NaN in the columns coming from ``df_r``.
    """
    return df_opt.merge(df_r, how="left", on="Quote_date")

def get_model_dataset(path_opt, filenames_opt, path_r, filenames_r, call = True):
    """
    Build the combined options-plus-rates dataframe from the raw files.

    Reads the option files and the rate files, cleans each with
    process_options / process_rates, joins them with combine_opt_r and
    finally drops every row that contains a NaN.
    """
    raw_options = read_files(path_opt, filenames_opt)
    raw_rates = read_files(path_r, filenames_r)

    options = process_options(raw_options, call)
    rates = process_rates(raw_rates)

    merged = combine_opt_r(options, rates)
    # TODO: rows with missing values are simply discarded for now.
    return merged.dropna()

In [77]:
# Locations of the raw inputs, relative to the notebook's working directory.
path_opt = "./data/options/"
path_r = "./data/rates/"

# Only September 2022 is loaded here. The full history uses the same naming
# scheme, spx_eod_YYYYMM.txt, for 2010-01 through 2022-09.
filenames_opt = ["spx_eod_202209.txt"]
filenames_r = ["yield-curve-rates-2022.csv", "yield-curve-rates-1990-2021.csv"]

# call=True selects the C_ask / C_bid columns as Ask / Bid.
df_read = get_model_dataset(
    path_opt=path_opt,
    filenames_opt=filenames_opt,
    path_r=path_r,
    filenames_r=filenames_r,
    call=True,
)
print(df_read)
df_read.info()

       Quote_date Expire_date  Underlying_last   Strike     Ask     Bid   Ttl  \
0      2022-09-01  2022-09-02          3968.05   1000.0  2972.1  2961.6     1   
1      2022-09-01  2022-09-02          3968.05   1200.0  2771.9  2761.7     1   
2      2022-09-01  2022-09-02          3968.05   1400.0  2572.1  2561.7     1   
3      2022-09-01  2022-09-02          3968.05   1600.0  2368.2  2357.8     1   
4      2022-09-01  2022-09-02          3968.05   1800.0  2165.9  2158.1     1   
...           ...         ...              ...      ...     ...     ...   ...   
177070 2022-09-30  2026-12-18          3589.70   8800.0    31.4    10.0  1540   
177071 2022-09-30  2026-12-18          3589.70   9000.0    29.5     6.1  1540   
177072 2022-09-30  2026-12-18          3589.70   9200.0    16.2    11.2  1540   
177073 2022-09-30  2026-12-18          3589.70   9600.0    24.5     3.3  1540   
177074 2022-09-30  2026-12-18          3589.70  10000.0    21.8     7.3  1540   

           R  
0       2.97

In [155]:
def lstm_format(df, seq_length):
    """
    Work in progress.
    Turns the flat table of option quotes into per-row sequences of feature
    vectors for an LSTM model.

    Rows are ordered by Expire_date, Strike and then descending Ttl. For each
    row, "F-0" holds the list [Underlying_last, Strike, Ttl, R] and "F-k"
    holds the "F-0" list from k rows earlier in that ordering. A row is kept
    only if the row ``seq_length - 1`` positions earlier has the same Strike
    and Expire_date.

    Returns a pair of arrays: the "F-0" .. "F-(seq_length-1)" columns (one
    feature list per cell) and the matching [Bid, Ask] values.
    """
    ordered = df.sort_values(
        by=["Expire_date", "Strike", "Ttl"],
        ascending=[True, True, False],
    )

    feature_names = ["Underlying_last", "Strike", "Ttl", "R"]
    step_columns = [f"F-{lag}" for lag in range(seq_length)]

    # Step 0 is the row's own feature vector; later steps look further back.
    ordered["F-0"] = ordered[feature_names].values.tolist()
    for lag, column in enumerate(step_columns):
        if lag == 0:
            continue
        ordered[column] = ordered["F-0"].shift(lag)

    # The oldest row of the window must belong to the same contract; rows
    # without enough history compare against NaN and are dropped as well.
    lookback = seq_length - 1
    same_strike = ordered["Strike"] == ordered["Strike"].shift(lookback)
    same_expiry = ordered["Expire_date"] == ordered["Expire_date"].shift(lookback)
    complete = ordered[same_strike & same_expiry]

    return complete[step_columns].to_numpy(), complete[["Bid", "Ask"]].to_numpy()


def specific_option(df, seq_length):
    """
    Work in progress.
    Adds lagged copies of column "F" to ``df`` for a single option's quotes.

    For every lag from 1 to ``seq_length - 1`` a column "F-<lag>" is written
    into ``df`` itself, holding "F" shifted down by that many rows. The frame
    summary is printed via ``df.info()`` and the same ``df`` object is returned.
    """
    source = df["F"]
    for lag in range(1, seq_length):
        df[f"F-{lag}"] = source.shift(periods=lag)
    df.info()
    return df


# Build 5-step sequences from the loaded quotes; y is not inspected here.
x, y = lstm_format(df_read, 5)
print("X shape", x.shape)
print(x)

# Same data with a trailing axis of length 1 added (kept for later inspection).
x_3d = x.reshape((x.shape[0], 5, 1))

# Stack the rows of x along a third axis and show the result and its shape.
x_stacked = np.dstack(x)
print(x_stacked.shape)
print(x_stacked)


ValueError: Wrong number of items passed 4, placement implies 1