In [1]:
import pandas as pd 
from utils import get_config, print_config, get_results, write_results
from utils.dataloader import dataloader, drop_settlement_dup, bin_avg
from utils.loss import plot_loss
import yaml
import time
from datetime import datetime
from model.ahbs import AHBS
from pathlib import Path
import os
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [14]:
option_type = 'put'
run = 'short_ttm'
smooth = True
folder_path = 'data/final/binned/'

In [None]:

def load_data(run, option_type, covariate_columns, full_train=False):
        
    # Let's reshape our input data of the thing... we are going to need labels, and we are going to need train surface.
    # The labels will be, the smoothed IVs of our data
    # The train will be the dimensions, with time x ttm x moneyness encoders
    # If we have covariates, the channels will be larger? -> Yes, starting channels will be added to the layers
    # Load the data for train/val/test

    if not full_train:
        if run == 'short_ttm':
            data_train = pd.read_csv('data/final/smoothed/data_train.csv')
            data_val = pd.read_csv('data/final/evaluation/validation_set.csv')
            data_test = pd.read_csv('data/final/evaluation/test_set.csv')

            if option_type =='put':
                print(f'selected {option_type}')
                data_train = data_train[data_train['cp_flag']=='P']
                data_val = data_val[data_val['cp_flag']=='P']
                data_test = data_test[data_test['cp_flag']=='P']
            elif option_type =='call':
                print(f'selected {option_type}')
                data_train = data_train[data_train['cp_flag']=='C']
                data_val = data_val[data_val['cp_flag']=='C']
                data_test = data_test[data_test['cp_flag']=='C']

            if covariate_columns:
                covar_df = pd.read_excel('data/final/covariates/covariates_train.xlsx')
                covar_df_val = pd.read_excel('data/final/covariates/covariates_validation.xlsx')

                covar_df = covar_df.rename(columns={'Date':'date'})
                covar_df_val = covar_df_val.rename(columns={'Date':'date'})
                covar_df = covar_df[['date'] + covariate_columns]
                covar_df_val = covar_df_val[['date'] + covariate_columns]
            else:
                covar_df = None

        else:
            print('Select a dataset')

        return data_train, data_val, data_test, covar_df
    else:
        if run == 'short_ttm':
            data_train = pd.read_csv('data/final/smoothed/data_train_val.csv')
            data_test = pd.read_csv('data/final/evaluation/test_set.csv')

            if option_type =='put':
                data_train = data_train[data_train['cp_flag']=='P']
                data_test = data_test[data_test['cp_flag']=='P']
            elif option_type =='call':
                data_train = data_train[data_train['cp_flag']=='C']
                data_test = data_test[data_test['cp_flag']=='C']

            if covariate_columns:
                covar_df_val = pd.read_excel('data/final/covariates/covariates_validation.xlsx')
                covar_df_val = covar_df_val.rename(columns={'Date':'date'})
                covar_df_val = covar_df_val[['date'] + covariate_columns]
            else:
                covar_df_val = None

        elif run == 'long_ttm':
            data_train = pd.read_csv('data/final/smoothed/data_train_val_long.csv')
            data_test = pd.read_csv('data/final/evaluation/test_set_long.csv')

            if option_type =='put':
                data_train = data_train[data_train['cp_flag']=='P']
                data_test = data_test[data_test['cp_flag']=='P']
            elif option_type =='call':
                data_train = data_train[data_train['cp_flag']=='C']
                data_test = data_test[data_test['cp_flag']=='C']
                
            if covariate_columns:
                covar_df_val = pd.read_excel('data/final/covariates/covariates_validation_long.xlsx')
                covar_df_val = covar_df_val.rename(columns={'Date':'date'})
                covar_df_val = covar_df_val[['date'] + covariate_columns]
            else:
                covar_df_val = None
        
        else:
            print('Select a dataset')

        return data_train, data_test, covar_df_val

In [19]:
data_train, data_val, data_test, _ = load_data(run, option_type ,[],False) # full train True is bugged!

selected put


In [20]:
def retrieve_data(run, filename, folder_path, raw, covar_df, smooth=False):

    print(raw['cp_flag'].unique())
    # check if specific file exists. If so, just load them. if not, then compute the whole thing
    if os.path.isfile(filename):
        data = pd.read_csv(filename)
    else:
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        # Make train data
        df = drop_settlement_dup(raw)

        if run =='long_ttm':
            bins = [0, 21, 63, 126, 189, 252]
            labels = [1, 2, 3, 4, 5] # bin the maturities if it is a long term ttm
            df['maturity'] = pd.cut(df['maturity'], bins=bins, labels=labels, include_lowest=True, right=True).astype('Int64') 
            df = df.dropna(subset=['maturity'])
        
        moneyness_grid = np.arange(0.80, 1.21, 0.05)
        data = bin_avg(df, moneyness_grid, train=smooth)
        data.to_csv(filename)
    
    data['date'] = pd.to_datetime(data['date'])
    if covar_df is not None:
        data = pd.merge(data, covar_df, on='date', how='left')
    data = data.dropna()

    return data

train_name = f"data/final/binned/train_{run}_{option_type}_{smooth}.csv"
val_name = f"data/final/binned/val_{run}_{option_type}.csv"
test_name = f"data/final/binned/test_{run}_{option_type}.csv"

data_train = retrieve_data(run, train_name,  folder_path, data_train, None, smooth)
data_val = retrieve_data(run, val_name, folder_path, data_val, None)
data_test = retrieve_data(run, test_name, folder_path, data_test, None)

['C' 'P']
['C' 'P']
['C' 'P']


In [13]:
print(data_val)

      Unnamed: 0       date  maturity  moneyness  impl_volatility
3              3 2021-12-07         1       0.95         0.483243
4              4 2021-12-07         1       1.00         0.174480
5              5 2021-12-07         1       1.05         0.391089
6              6 2021-12-07         1       1.10         0.526888
12            12 2021-12-07         4       0.95         0.121325
...          ...        ...       ...        ...              ...
5115        5115 2022-02-03         2       0.95         0.186731
5116        5116 2022-02-03         2       1.00         0.207458
5117        5117 2022-02-03         2       1.05         0.285085
5118        5118 2022-02-03         2       1.10         0.370778
5119        5119 2022-02-03         2       1.15         0.483184

[2298 rows x 5 columns]


In [18]:
print(data_val)

      Unnamed: 0       date  maturity  moneyness  impl_volatility
3              3 2021-12-07         1       0.95         0.483243
4              4 2021-12-07         1       1.00         0.174480
5              5 2021-12-07         1       1.05         0.391089
6              6 2021-12-07         1       1.10         0.526888
12            12 2021-12-07         4       0.95         0.121325
...          ...        ...       ...        ...              ...
5115        5115 2022-02-03         2       0.95         0.186731
5116        5116 2022-02-03         2       1.00         0.207458
5117        5117 2022-02-03         2       1.05         0.285085
5118        5118 2022-02-03         2       1.10         0.370778
5119        5119 2022-02-03         2       1.15         0.483184

[2298 rows x 5 columns]


In [94]:
df= pd.read_csv('data/final/smoothed/data_train.csv')

In [96]:
print(df[df['cp_flag']=='C'])

        Unnamed: 0        date               symbol      exdate   last_date  \
0                0  2012-01-03  SPXW 120106C1200000  2012-01-06  03/01/2012   
1                1  2012-01-03  SPXW 120106C1210000  2012-01-06  03/01/2012   
2                2  2012-01-03  SPXW 120106C1220000  2012-01-06  03/01/2012   
3                3  2012-01-03  SPXW 120106C1230000  2012-01-06  03/01/2012   
4                4  2012-01-03  SPXW 120106C1235000  2012-01-06  03/01/2012   
...            ...         ...                  ...         ...         ...   
579882      579882  2021-12-06  SPXW 211213C4830000  2021-12-13  06/12/2021   
579883      579883  2021-12-06  SPXW 211213C4835000  2021-12-13  06/12/2021   
579884      579884  2021-12-06  SPXW 211213C4840000  2021-12-13  06/12/2021   
579885      579885  2021-12-06  SPXW 211213C4845000  2021-12-13  06/12/2021   
579886      579886  2021-12-06  SPXW 211213C4850000  2021-12-13  06/12/2021   

       cp_flag  strike_price  best_bid  best_offer 

In [97]:
print(df[df['cp_flag']=='P'])

        Unnamed: 0        date               symbol      exdate   last_date  \
23              23  2012-01-03  SPXW 120106P1195000  2012-01-06  03/01/2012   
24              24  2012-01-03  SPXW 120106P1200000  2012-01-06  03/01/2012   
25              25  2012-01-03  SPXW 120106P1205000  2012-01-06  03/01/2012   
26              26  2012-01-03  SPXW 120106P1210000  2012-01-06  03/01/2012   
27              27  2012-01-03  SPXW 120106P1215000  2012-01-06  03/01/2012   
...            ...         ...                  ...         ...         ...   
580063      580063  2021-12-06  SPXW 211213P4760000  2021-12-13  06/12/2021   
580064      580064  2021-12-06  SPXW 211213P4770000  2021-12-13  06/12/2021   
580065      580065  2021-12-06  SPXW 211213P4775000  2021-12-13  06/12/2021   
580066      580066  2021-12-06  SPXW 211213P4820000  2021-12-13  06/12/2021   
580067      580067  2021-12-06  SPXW 211213P4840000  2021-12-13  06/12/2021   

       cp_flag  strike_price  best_bid  best_offer 

In [79]:
print(data_train)

             date  maturity  moneyness  impl_volatility
3      2012-01-03         3       0.95         0.185412
4      2012-01-03         3       1.00         0.185676
5      2012-01-03         3       1.05         0.278790
57     2020-07-06         5       0.95         0.180420
58     2020-07-06         5       1.00         0.191077
...           ...       ...        ...              ...
108203 2015-08-14         4       1.05         0.183059
108204 2015-08-14         4       1.10         0.282980
108228 2015-08-17         3       0.95         0.168562
108229 2015-08-17         3       1.00         0.117394
108230 2015-08-17         3       1.05         0.249170

[23862 rows x 4 columns]


In [80]:
# AHBS accepts maturity in years, not in days

data_train['maturity'] = data_train['maturity'] / 252
data_val['maturity'] = data_val['maturity'] / 252
data_test['maturity'] = data_test['maturity'] / 252

In [81]:
# AHBS is rolling, and we don't need the entire train sample to make a prediction
# it just trains on the IVS of one particular day

last_date = data_val['date'].max()
last_ivs= data_val[data_val['date']==last_date].copy()
data_test = pd.concat([last_ivs, data_test], ignore_index=True)

In [82]:
print(data_test) # run this through the abs

           date  maturity  moneyness  impl_volatility
0    2022-05-19  0.003968       0.95         0.370043
1    2022-05-19  0.003968       1.00         0.357870
2    2022-05-19  0.003968       1.05         0.431853
3    2022-05-19  0.003968       1.10         0.602428
4    2022-05-19  0.015873       0.90         0.284850
...         ...       ...        ...              ...
5563 2022-08-22  0.011905       0.95         0.206340
5564 2022-08-22  0.011905       1.00         0.219023
5565 2022-08-22  0.011905       1.05         0.262478
5566 2022-08-22  0.011905       1.10         0.408368
5567 2022-08-22  0.011905       1.15         0.500751

[5568 rows x 4 columns]


In [83]:

results = []
# Build full (moneyness, maturity) grid
all_m = sorted(data_test['moneyness'].unique())
all_t = sorted(data_test['maturity'].unique())
full_grid = pd.DataFrame([(m, t) for t in all_t for m in all_m], columns=['moneyness', 'maturity'])

# Iterate through each date
for date, group in data_test.groupby('date'):
    group = group.sort_values(['maturity', 'moneyness'])

    # Fit on available points
    m = group['moneyness'].values
    t = group['maturity'].values   # convert to years
    iv = group['impl_volatility'].values


    X_train = np.column_stack([
        np.ones_like(m),
        m,
        m**2,
        t,
        t**2,
        m * t
    ])

    model = LinearRegression().fit(X_train, iv)

    # Predict on full grid
    m_full = full_grid['moneyness'].values
    t_full = full_grid['maturity'].values 

    X_full = np.column_stack([
        np.ones_like(m_full),
        m_full,
        m_full**2,
        t_full,
        t_full**2,
        m_full * t_full
    ])

    iv_pred = model.predict(X_full)
    # 1 step, now 4 extra steps
    iv_pred_5 = iv_pred.copy()
    for i in range(4):
        model_5 = LinearRegression().fit(X_full, iv_pred_5)
        iv_pred_5 = model_5.predict(X_full)

    iv_pred_10 = iv_pred_5.copy()
    for i in range(5):
        model_10 = LinearRegression().fit(X_full, iv_pred_10)
        iv_pred_10 = model_10.predict(X_full)

    result_df = full_grid.copy()
    result_df['date'] = date
    result_df['iv_pred'] = iv_pred
    result_df['iv_pred_5'] = iv_pred_5
    result_df['iv_pred_10'] = iv_pred_10

    results.append(result_df)

# Combine and save
final_df = pd.concat(results, ignore_index=True)

print(final_df)

      moneyness  maturity       date   iv_pred  iv_pred_5  iv_pred_10
0          0.80  0.003968 2022-05-19  0.383757   0.383757    0.383757
1          0.85  0.003968 2022-05-19  0.349721   0.349721    0.349721
2          0.90  0.003968 2022-05-19  0.337053   0.337053    0.337053
3          0.95  0.003968 2022-05-19  0.345755   0.345755    0.345755
4          1.00  0.003968 2022-05-19  0.375825   0.375825    0.375825
...         ...       ...        ...       ...        ...         ...
8770       1.00  0.019841 2023-02-28  0.171737   0.171737    0.171737
8771       1.05  0.019841 2023-02-28  0.115461   0.115461    0.115461
8772       1.10  0.019841 2023-02-28  0.155284   0.155284    0.155284
8773       1.15  0.019841 2023-02-28  0.291206   0.291206    0.291206
8774       1.20  0.019841 2023-02-28  0.523228   0.523228    0.523228

[8775 rows x 6 columns]


In [84]:
# we got the predictions, now just merge with the orginal data_test to get the metric

output_df = pd.merge(final_df, data_test, on=['date', 'moneyness','maturity'], how='left')

In [85]:
print(output_df)

      moneyness  maturity       date   iv_pred  iv_pred_5  iv_pred_10  \
0          0.80  0.003968 2022-05-19  0.383757   0.383757    0.383757   
1          0.85  0.003968 2022-05-19  0.349721   0.349721    0.349721   
2          0.90  0.003968 2022-05-19  0.337053   0.337053    0.337053   
3          0.95  0.003968 2022-05-19  0.345755   0.345755    0.345755   
4          1.00  0.003968 2022-05-19  0.375825   0.375825    0.375825   
...         ...       ...        ...       ...        ...         ...   
8770       1.00  0.019841 2023-02-28  0.171737   0.171737    0.171737   
8771       1.05  0.019841 2023-02-28  0.115461   0.115461    0.115461   
8772       1.10  0.019841 2023-02-28  0.155284   0.155284    0.155284   
8773       1.15  0.019841 2023-02-28  0.291206   0.291206    0.291206   
8774       1.20  0.019841 2023-02-28  0.523228   0.523228    0.523228   

      impl_volatility  
0                 NaN  
1                 NaN  
2                 NaN  
3            0.370043  
4  

In [86]:
# THE THING IS SHIFTED
output_df['iv_pred_shifted'] = output_df.groupby(['moneyness', 'maturity'])['iv_pred'].shift(1)
output_df['iv_pred_shifted_5'] = output_df.groupby(['moneyness', 'maturity'])['iv_pred_5'].shift(5)
output_df['iv_pred_shifted_10'] = output_df.groupby(['moneyness', 'maturity'])['iv_pred_10'].shift(10)

output_df_1 = output_df.dropna(subset=['impl_volatility', 'iv_pred_shifted'])
output_df_5 = output_df.dropna(subset=['impl_volatility', 'iv_pred_shifted_5'])
output_df_10 = output_df.dropna(subset=['impl_volatility', 'iv_pred_shifted_10'])

In [87]:
def calculate_ivrmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))

In [88]:
rmse_per_date_1 = output_df_1.groupby('date').apply(
    lambda df: calculate_ivrmse(df['impl_volatility'].values, df['iv_pred_shifted'].values)
).reset_index(name='rmse')

  rmse_per_date_1 = output_df_1.groupby('date').apply(


In [89]:
rmse_per_date_5 = output_df_5.groupby('date').apply(
    lambda df: calculate_ivrmse(df['impl_volatility'].values, df['iv_pred_shifted_5'].values)
).reset_index(name='rmse')

  rmse_per_date_5 = output_df_5.groupby('date').apply(


In [90]:
rmse_per_date_10 = output_df_10.groupby('date').apply(
    lambda df: calculate_ivrmse(df['impl_volatility'].values, df['iv_pred_shifted_10'].values)
).reset_index(name='rmse')

  rmse_per_date_10 = output_df_10.groupby('date').apply(


In [91]:
rmse_per_date_1.to_csv(f'AHBS_rmse_1_{option_type}.csv')
rmse_per_date_5.to_csv(f'AHBS_rmse_5.{option_type}.csv')
rmse_per_date_10.to_csv(f'AHBS_rmse_10_{option_type}.csv')

In [92]:
print(np.average(rmse_per_date_1['rmse']))
print(np.average(rmse_per_date_5['rmse']))
print(np.average(rmse_per_date_10['rmse']))

0.22606503291390986
0.20706854078459339
0.2160839495735118


In [93]:
print(r2_score(output_df_1['impl_volatility'].values, output_df_1['iv_pred_shifted'].values))
print(r2_score(output_df_5['impl_volatility'].values, output_df_5['iv_pred_shifted_5'].values))
print(r2_score(output_df_10['impl_volatility'].values, output_df_10['iv_pred_shifted_10'].values))

0.22606620861136817
0.31476480373865545
0.2875533887884636
