In [1]:
import pandas as pd 
from utils import get_config, print_config, get_results, write_results
from utils.dataloader import dataloader, drop_settlement_dup, bin_avg, load_data
from utils.loss import plot_loss
import yaml
import time
from datetime import datetime
from model.ahbs import AHBS
from pathlib import Path
import os
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [25]:
# Notebook-wide settings.
folder_path = "data/final/binned/"  # directory that holds the binned CSV caches
run = "short_ttm"                   # run label; 'long_ttm' triggers maturity bucketing in retrieve_data
option_type = "call"                # used in cache and result file names
smooth = True                       # forwarded to bin_avg for the training split only

In [26]:
# Load the raw train/validation/test splits; the fourth return value is not used here.
# Keep the last argument False: the "full train" mode (True) is known to be bugged.
# NOTE(review): the empty list is presumably the covariate selection -- confirm against load_data.
data_train, data_val, data_test, _ = load_data(
    run,
    option_type,
    [],
    False,
)

In [27]:
def retrieve_data(run, filename, folder_path, raw, covar_df, smooth=False):
    """Return a binned IV panel, reading it from a CSV cache or building the cache on a miss.

    Parameters
    ----------
    run : str
        Run label. For ``'long_ttm'`` the ``maturity`` column is bucketed into
        five labelled bins before averaging; any other value leaves it as is.
    filename : str
        Path of the CSV cache. If the file exists it is loaded and ``raw`` is ignored.
    folder_path : str
        Directory that is created (if missing) before the cache is written.
    raw : pandas.DataFrame
        Un-binned data; only used when the cache file does not exist.
    covar_df : pandas.DataFrame or None
        Optional frame that is left-merged onto the result on ``'date'``.
    smooth : bool, default False
        Forwarded to ``bin_avg`` as its ``train`` argument.

    Returns
    -------
    pandas.DataFrame
        The binned data with ``date`` parsed as datetime and every row that
        contains a missing value removed.
    """
    if os.path.isfile(filename):
        data = pd.read_csv(filename)
        # Caches written with the index included come back with an
        # 'Unnamed: N' column; drop it so a cache hit returns the same columns
        # as a fresh build.
        stale_index_cols = [col for col in data.columns if str(col).startswith('Unnamed:')]
        data = data.drop(columns=stale_index_cols)
    else:
        # exist_ok avoids the check-then-create race; also make sure the
        # directory that will actually receive the file exists.
        os.makedirs(folder_path, exist_ok=True)
        cache_dir = os.path.dirname(filename)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)

        df = drop_settlement_dup(raw)

        if run == 'long_ttm':
            # Bucket maturities into five labelled bins; values outside
            # [0, 252] become NA and are dropped.
            bins = [0, 21, 63, 126, 189, 252]
            labels = [1, 2, 3, 4, 5]
            df['maturity'] = pd.cut(
                df['maturity'], bins=bins, labels=labels, include_lowest=True, right=True
            ).astype('Int64')
            df = df.dropna(subset=['maturity'])

        moneyness_grid = np.arange(0.80, 1.21, 0.05)
        data = bin_avg(df, moneyness_grid, train=smooth)
        # index=False keeps the cache free of a spurious index column.
        data.to_csv(filename, index=False)

    data['date'] = pd.to_datetime(data['date'])
    if covar_df is not None:
        data = pd.merge(data, covar_df, on='date', how='left')
    # Rows with any missing value (including unmatched covariates) are discarded.
    data = data.dropna()

    return data

# Build the cache paths from `folder_path` so the directory that retrieve_data
# creates is the one the files are written to. Only the training cache name
# depends on `smooth`, because validation and test are never smoothed here.
train_name = os.path.join(folder_path, f"train_{run}_{option_type}_{smooth}.csv")
val_name = os.path.join(folder_path, f"val_{run}_{option_type}.csv")
test_name = os.path.join(folder_path, f"test_{run}_{option_type}.csv")

# Replace the raw splits with their binned counterparts (loaded from cache when available).
data_train = retrieve_data(run, train_name, folder_path, data_train, None, smooth)
data_val = retrieve_data(run, val_name, folder_path, data_val, None)
data_test = retrieve_data(run, test_name, folder_path, data_test, None)

In [28]:
# Inspect the binned validation frame returned by retrieve_data.
print(data_val)

           date  maturity  moneyness  impl_volatility
4    2021-12-07         1       1.00         0.148244
12   2021-12-07         3       0.95         0.146452
13   2021-12-07         3       1.00         0.162244
14   2021-12-07         3       1.05         0.219659
21   2021-12-07         4       0.95         0.119259
...         ...       ...        ...              ...
5116 2022-02-07         5       1.00         0.176799
5117 2022-02-07         5       1.05         0.193010
5124 2022-02-07         2       0.95         0.194682
5125 2022-02-07         2       1.00         0.202714
5126 2022-02-07         2       1.05         0.105126

[1478 rows x 4 columns]


In [29]:
# Same inspection of data_val as the previous cell (duplicate; data_val is unchanged in between).
print(data_val)

           date  maturity  moneyness  impl_volatility
4    2021-12-07         1       1.00         0.148244
12   2021-12-07         3       0.95         0.146452
13   2021-12-07         3       1.00         0.162244
14   2021-12-07         3       1.05         0.219659
21   2021-12-07         4       0.95         0.119259
...         ...       ...        ...              ...
5116 2022-02-07         5       1.00         0.176799
5117 2022-02-07         5       1.05         0.193010
5124 2022-02-07         2       0.95         0.194682
5125 2022-02-07         2       1.00         0.202714
5126 2022-02-07         2       1.05         0.105126

[1478 rows x 4 columns]


In [30]:
# Load the smoothed training file for a quick sanity check of its contents.
# NOTE(review): the printed frames show an 'Unnamed: 0' column, so this file was saved with its index.
df = pd.read_csv('data/final/smoothed/data_train.csv')

In [31]:
# Show only the rows flagged 'C' in cp_flag.
print(df.loc[df['cp_flag'] == 'C'])

        Unnamed: 0        date               symbol  ... midpoint  year IV_smooth
0                0  2012-01-03  SPXW 120106C1200000  ...    77.25  2012  0.356446
1                1  2012-01-03  SPXW 120106C1210000  ...    67.40  2012  0.333749
2                2  2012-01-03  SPXW 120106C1220000  ...    57.45  2012  0.296131
3                3  2012-01-03  SPXW 120106C1230000  ...    47.65  2012  0.268058
4                4  2012-01-03  SPXW 120106C1235000  ...    42.75  2012  0.251624
...            ...         ...                  ...  ...      ...   ...       ...
579882      579882  2021-12-06  SPXW 211213C4830000  ...     0.20  2021  0.144974
579883      579883  2021-12-06  SPXW 211213C4835000  ...     0.15  2021  0.146664
579884      579884  2021-12-06  SPXW 211213C4840000  ...     0.15  2021  0.148513
579885      579885  2021-12-06  SPXW 211213C4845000  ...     0.15  2021  0.150522
579886      579886  2021-12-06  SPXW 211213C4850000  ...     0.15  2021  0.152694

[223649 rows x 

In [32]:
# Show only the rows flagged 'P' in cp_flag.
print(df.loc[df['cp_flag'] == 'P'])

        Unnamed: 0        date               symbol  ... midpoint  year IV_smooth
23              23  2012-01-03  SPXW 120106P1195000  ...    0.150  2012  0.323585
24              24  2012-01-03  SPXW 120106P1200000  ...    0.175  2012  0.312246
25              25  2012-01-03  SPXW 120106P1205000  ...    0.225  2012  0.304897
26              26  2012-01-03  SPXW 120106P1210000  ...    0.275  2012  0.295065
27              27  2012-01-03  SPXW 120106P1215000  ...    0.275  2012  0.275789
...            ...         ...                  ...  ...      ...   ...       ...
580063      580063  2021-12-06  SPXW 211213P4760000  ...  170.450  2021  0.133277
580064      580064  2021-12-06  SPXW 211213P4770000  ...  180.100  2021  0.129533
580065      580065  2021-12-06  SPXW 211213P4775000  ...  185.150  2021  0.127670
580066      580066  2021-12-06  SPXW 211213P4820000  ...  229.350  2021  0.111192
580067      580067  2021-12-06  SPXW 211213P4840000  ...  249.550  2021  0.104033

[356419 rows x 

In [33]:
# Inspect the binned training frame returned by retrieve_data.
print(data_train)

             date  maturity  moneyness  impl_volatility
3      2012-01-03         3       0.95         0.184449
4      2012-01-03         3       1.00         0.185676
5      2012-01-03         3       1.05         0.282994
47     2020-06-24         3       0.90         0.246565
48     2020-06-24         3       0.95         0.245337
...           ...       ...        ...              ...
108184 2015-09-14         3       1.00         0.296425
108185 2015-09-14         3       1.05         0.360390
108264 2015-09-11         4       0.95         0.179961
108265 2015-09-11         4       1.00         0.217244
108266 2015-09-11         4       1.05         0.232054

[16923 rows x 4 columns]


In [34]:
# AHBS expects maturity expressed in years rather than days, so rescale every split by 252.
# NOTE(review): the column is overwritten in place, so running this cell a second time
# divides by 252 again -- restart and run all instead of re-executing it.
for split in (data_train, data_val, data_test):
    split['maturity'] = split['maturity'] / 252

In [35]:
# AHBS is a rolling model: each forecast is fitted on the IV surface of a single day,
# so the full training sample is not needed. Prepend the surface of the final
# validation date to the test set so the first test day has a previous surface.
# NOTE(review): data_test is reassigned here, so re-running this cell prepends the rows again.
last_date = data_val['date'].max()
last_ivs = data_val.loc[data_val['date'].eq(last_date)].copy()
data_test = pd.concat([last_ivs, data_test], ignore_index=True)

In [36]:
print(data_test) # this frame (last validation day + test days) is what gets run through the AHBS fit below

           date  maturity  moneyness  impl_volatility
0    2022-05-19  0.003968       0.95         0.370043
1    2022-05-19  0.003968       1.00         0.364060
2    2022-05-19  0.003968       1.05         0.546334
3    2022-05-19  0.003968       1.10         0.885447
4    2022-05-19  0.011905       0.90         0.282979
...         ...       ...        ...              ...
3438 2022-09-01  0.015873       1.00         0.191120
3439 2022-09-01  0.015873       1.05         0.194892
3440 2022-09-01  0.019841       0.95         0.192258
3441 2022-09-01  0.019841       1.00         0.206488
3442 2022-09-01  0.019841       1.05         0.215623

[3443 rows x 4 columns]


In [37]:

def _ahbs_design_matrix(moneyness, maturity):
    """Return the AHBS regressors [1, m, m^2, t, t^2, m*t] stacked column-wise."""
    return np.column_stack([
        np.ones_like(moneyness),
        moneyness,
        moneyness**2,
        maturity,
        maturity**2,
        moneyness * maturity,
    ])


results = []

# Build the full (moneyness, maturity) grid from every value seen in data_test.
all_m = sorted(data_test['moneyness'].unique())
all_t = sorted(data_test['maturity'].unique())
full_grid = pd.DataFrame([(m, t) for t in all_t for m in all_m], columns=['moneyness', 'maturity'])

# The prediction grid is the same for every date, so its design matrix is built once.
m_full = full_grid['moneyness'].values
t_full = full_grid['maturity'].values
X_full = _ahbs_design_matrix(m_full, t_full)

# Fit one cross-sectional regression per date on the points observed that day.
for date, group in data_test.groupby('date'):
    group = group.sort_values(['maturity', 'moneyness'])

    m = group['moneyness'].values
    t = group['maturity'].values   # used as stored; no unit conversion happens here
    iv = group['impl_volatility'].values

    X_train = _ahbs_design_matrix(m, t)
    model = LinearRegression().fit(X_train, iv)

    # Predict the whole surface for this date.
    iv_pred = model.predict(X_full)

    # The 5- and 10-step columns carry the same fitted surface forward: refitting
    # a linear regression on its own predictions over the same design matrix
    # reproduces those predictions, so the repeated refits are not needed.
    # The horizon is applied later, when the columns are shifted by date.
    iv_pred_5 = iv_pred.copy()
    iv_pred_10 = iv_pred.copy()

    result_df = full_grid.copy()
    result_df['date'] = date
    result_df['iv_pred'] = iv_pred
    result_df['iv_pred_5'] = iv_pred_5
    result_df['iv_pred_10'] = iv_pred_10

    results.append(result_df)

# Stack the per-date surfaces into one frame (nothing is written to disk here).
final_df = pd.concat(results, ignore_index=True)

print(final_df)

      moneyness  maturity       date   iv_pred  iv_pred_5  iv_pred_10
0          0.85  0.003968 2022-05-19  0.401434   0.401434    0.401434
1          0.90  0.003968 2022-05-19  0.354119   0.354119    0.354119
2          0.95  0.003968 2022-05-19  0.362815   0.362815    0.362815
3          1.00  0.003968 2022-05-19  0.427519   0.427519    0.427519
4          1.05  0.003968 2022-05-19  0.548233   0.548233    0.548233
...         ...       ...        ...       ...        ...         ...
7795       1.00  0.019841 2023-02-28  0.143445   0.143445    0.143445
7796       1.05  0.019841 2023-02-28  0.232196   0.232196    0.232196
7797       1.10  0.019841 2023-02-28  0.475976   0.475976    0.475976
7798       1.15  0.019841 2023-02-28  0.874784   0.874784    0.874784
7799       1.20  0.019841 2023-02-28  1.428620   1.428620    1.428620

[7800 rows x 6 columns]


In [38]:
# Predictions are ready: left-join the original data_test onto the prediction grid so
# every grid point keeps its forecast and gets the observed IV where one exists (NaN otherwise).
output_df = final_df.merge(data_test, how='left', on=['date', 'moneyness', 'maturity'])

In [39]:
# Inspect the merged frame; impl_volatility is NaN where data_test has no matching row.
print(output_df)

      moneyness  maturity       date  ...  iv_pred_5  iv_pred_10  impl_volatility
0          0.85  0.003968 2022-05-19  ...   0.401434    0.401434              NaN
1          0.90  0.003968 2022-05-19  ...   0.354119    0.354119              NaN
2          0.95  0.003968 2022-05-19  ...   0.362815    0.362815         0.370043
3          1.00  0.003968 2022-05-19  ...   0.427519    0.427519         0.364060
4          1.05  0.003968 2022-05-19  ...   0.548233    0.548233         0.546334
...         ...       ...        ...  ...        ...         ...              ...
7795       1.00  0.019841 2023-02-28  ...   0.143445    0.143445         0.185687
7796       1.05  0.019841 2023-02-28  ...   0.232196    0.232196         0.261263
7797       1.10  0.019841 2023-02-28  ...   0.475976    0.475976              NaN
7798       1.15  0.019841 2023-02-28  ...   0.874784    0.874784              NaN
7799       1.20  0.019841 2023-02-28  ...   1.428620    1.428620              NaN

[7800 rows x 7 

In [40]:
# Align forecasts with the observations they are meant to predict: within each
# (moneyness, maturity) cell, move every prediction down by h rows, so a row holds
# the surface fitted h rows earlier (h = 1, 5, 10).
# NOTE(review): this relies on the rows of each cell being in date order -- confirm.
for horizon, pred_col, shifted_col in (
    (1, 'iv_pred', 'iv_pred_shifted'),
    (5, 'iv_pred_5', 'iv_pred_shifted_5'),
    (10, 'iv_pred_10', 'iv_pred_shifted_10'),
):
    output_df[shifted_col] = output_df.groupby(['moneyness', 'maturity'])[pred_col].shift(horizon)

# Keep only rows that have both an observed IV and a shifted forecast for the given horizon.
output_df_1 = output_df.dropna(subset=['impl_volatility', 'iv_pred_shifted'])
output_df_5 = output_df.dropna(subset=['impl_volatility', 'iv_pred_shifted_5'])
output_df_10 = output_df.dropna(subset=['impl_volatility', 'iv_pred_shifted_10'])

In [41]:
def calculate_ivrmse(y_true, y_pred):
    """Return the root-mean-squared error between observed and predicted values.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays).
    """
    residuals = y_true - y_pred
    mean_squared_error = np.mean(np.square(residuals))
    return np.sqrt(mean_squared_error)

In [42]:
# Per-date IVRMSE of the 1-step forecast.
# Only the two columns the metric needs are selected before .apply, so the grouping
# column is not handed to the lambda (presumably the source of the warning this
# cell used to emit); the resulting values are unchanged.
rmse_per_date_1 = (
    output_df_1
    .groupby('date')[['impl_volatility', 'iv_pred_shifted']]
    .apply(lambda day: calculate_ivrmse(day['impl_volatility'].values, day['iv_pred_shifted'].values))
    .reset_index(name='rmse')
)

  rmse_per_date_1 = output_df_1.groupby('date').apply(


In [43]:
# Per-date IVRMSE of the 5-step forecast.
# Only the two columns the metric needs are selected before .apply, so the grouping
# column is not handed to the lambda (presumably the source of the warning this
# cell used to emit); the resulting values are unchanged.
rmse_per_date_5 = (
    output_df_5
    .groupby('date')[['impl_volatility', 'iv_pred_shifted_5']]
    .apply(lambda day: calculate_ivrmse(day['impl_volatility'].values, day['iv_pred_shifted_5'].values))
    .reset_index(name='rmse')
)

  rmse_per_date_5 = output_df_5.groupby('date').apply(


In [44]:
# Per-date IVRMSE of the 10-step forecast.
# Only the two columns the metric needs are selected before .apply, so the grouping
# column is not handed to the lambda (presumably the source of the warning this
# cell used to emit); the resulting values are unchanged.
rmse_per_date_10 = (
    output_df_10
    .groupby('date')[['impl_volatility', 'iv_pred_shifted_10']]
    .apply(lambda day: calculate_ivrmse(day['impl_volatility'].values, day['iv_pred_shifted_10'].values))
    .reset_index(name='rmse')
)

  rmse_per_date_10 = output_df_10.groupby('date').apply(


In [45]:
# Persist the per-date RMSE tables, one file per forecast horizon.
# The 5-step file now uses the same `AHBS_rmse_<h>_<option_type>.csv` pattern as the
# other two (it previously had a '.' where the '_' separator belongs).
rmse_per_date_1.to_csv(f'AHBS_rmse_1_{option_type}.csv')
rmse_per_date_5.to_csv(f'AHBS_rmse_5_{option_type}.csv')
rmse_per_date_10.to_csv(f'AHBS_rmse_10_{option_type}.csv')

In [46]:
# Average of the per-date RMSE values for the 1-, 5- and 10-step horizons, in that order.
for rmse_table in (rmse_per_date_1, rmse_per_date_5, rmse_per_date_10):
    print(np.average(rmse_table['rmse']))

0.15983521426281852
0.15033366091456868
0.15904768599978736


In [47]:
# Pooled R^2 of shifted forecast vs. observed IV for the 1-, 5- and 10-step horizons, in that order.
for scored, forecast_col in (
    (output_df_1, 'iv_pred_shifted'),
    (output_df_5, 'iv_pred_shifted_5'),
    (output_df_10, 'iv_pred_shifted_10'),
):
    print(r2_score(scored['impl_volatility'].values, scored[forecast_col].values))

0.12704082497395508
0.24810232058203185
0.22694430728749126
