In [82]:
import pandas as pd 
from utils import get_config, print_config, get_results, write_results
from utils.dataloader import dataloader, drop_settlement_dup, bin_avg, load_data
from utils.loss import plot_loss
import yaml
import time
from datetime import datetime
from pathlib import Path
import os
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [83]:
option_type = 'call'
run = 'short_ttm'
smooth = True
folder_path = 'data/final/binned/'

In [84]:
data_train, data_val, data_test, _ = load_data(run, option_type ,[],False) # full train True is bugged!

In [85]:
def retrieve_data(run, filename, folder_path, raw, covar_df, smooth=False):

    # check if specific file exists. If so, just load them. if not, then compute the whole thing
    if os.path.isfile(filename):
        data = pd.read_csv(filename)
    else:
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        # Make train data
        df = drop_settlement_dup(raw)

        if run =='long_ttm':
            bins = [0, 21, 63, 126, 189, 252]
            labels = [1, 2, 3, 4, 5] # bin the maturities if it is a long term ttm
            df['maturity'] = pd.cut(df['maturity'], bins=bins, labels=labels, include_lowest=True, right=True).astype('Int64') 
            df = df.dropna(subset=['maturity'])
        
        moneyness_grid = np.arange(0.80, 1.21, 0.05)
        data = bin_avg(df, moneyness_grid, train=smooth)
        data.to_csv(filename)
    
    data['date'] = pd.to_datetime(data['date'])
    if covar_df is not None:
        data = pd.merge(data, covar_df, on='date', how='left')
    data = data.dropna()

    return data

train_name = f"data/final/binned/train_{run}_{option_type}_{smooth}.csv"
val_name = f"data/final/binned/val_{run}_{option_type}.csv"
test_name = f"data/final/binned/test_{run}_{option_type}.csv"

data_train = retrieve_data(run, train_name,  folder_path, data_train, None, smooth)
data_val = retrieve_data(run, val_name, folder_path, data_val, None)
data_test = retrieve_data(run, test_name, folder_path, data_test, None)

In [86]:
# df= pd.read_csv('data/final/smoothed/data_train.csv')

In [87]:
# AHBS accepts maturity in years, not in days

data_train['maturity'] = data_train['maturity'] / 252
data_val['maturity'] = data_val['maturity'] / 252
data_test['maturity'] = data_test['maturity'] / 252

In [88]:
# AHBS is rolling, and we don't need the entire train sample to make a prediction
# it just trains on the IVS of one particular day

last_date = data_val['date'].max()
last_ivs= data_val[data_val['date']==last_date].copy()
data_test = pd.concat([last_ivs, data_test], ignore_index=True)

In [89]:
print(data_test) # run this through the abs

      Unnamed: 0       date  maturity  moneyness  impl_volatility
0           1308 2022-05-19  0.003968       0.95         0.370043
1           1309 2022-05-19  0.003968       1.00         0.364060
2           1310 2022-05-19  0.003968       1.05         0.546334
3           1311 2022-05-19  0.003968       1.10         0.885447
4           1316 2022-05-19  0.011905       0.90         0.282979
...          ...        ...       ...        ...              ...
3438        8707 2022-09-01  0.015873       1.00         0.191120
3439        8708 2022-09-01  0.015873       1.05         0.194892
3440        8724 2022-09-01  0.019841       0.95         0.192258
3441        8725 2022-09-01  0.019841       1.00         0.206488
3442        8726 2022-09-01  0.019841       1.05         0.215623

[3443 rows x 5 columns]


In [90]:

results = []
# Build full (moneyness, maturity) grid
all_m = sorted(data_test['moneyness'].unique())
all_t = sorted(data_test['maturity'].unique())
full_grid = pd.DataFrame([(m, t) for t in all_t for m in all_m], columns=['moneyness', 'maturity'])

# Iterate through each date
for date, group in data_test.groupby('date'):
    group = group.sort_values(['maturity', 'moneyness'])

    # Fit on available points
    m = group['moneyness'].values
    t = group['maturity'].values   # convert to years
    iv = group['impl_volatility'].values


    X_train = np.column_stack([
        np.ones_like(m),
        m,
        m**2,
        t,
        t**2,
        m * t
    ])

    model = LinearRegression().fit(X_train, iv)

    # Predict on full grid
    m_full = full_grid['moneyness'].values
    t_full = full_grid['maturity'].values 

    X_full = np.column_stack([
        np.ones_like(m_full),
        m_full,
        m_full**2,
        t_full,
        t_full**2,
        m_full * t_full
    ])

    iv_pred = model.predict(X_full)
    # 1 step, now 4 extra steps
    iv_pred_5 = iv_pred.copy()
    for i in range(4):
        model_5 = LinearRegression().fit(X_full, iv_pred_5)
        iv_pred_5 = model_5.predict(X_full)

    iv_pred_10 = iv_pred_5.copy()
    for i in range(5):
        model_10 = LinearRegression().fit(X_full, iv_pred_10)
        iv_pred_10 = model_10.predict(X_full)
    
    iv_pred_21 = iv_pred_10.copy()
    for i in range(11):
        model_21 = LinearRegression().fit(X_full, iv_pred_21)
        iv_pred_21 = model_21.predict(X_full)

    result_df = full_grid.copy()
    result_df['date'] = date
    result_df['iv_pred'] = iv_pred
    result_df['iv_pred_5'] = iv_pred_5
    result_df['iv_pred_10'] = iv_pred_10
    result_df['iv_pred_21'] = iv_pred_21

    results.append(result_df)

# Combine and save
final_df = pd.concat(results, ignore_index=True)

print(final_df)

      moneyness  maturity       date   iv_pred  iv_pred_5  iv_pred_10  \
0          0.85  0.003968 2022-05-19  0.401434   0.401434    0.401434   
1          0.90  0.003968 2022-05-19  0.354119   0.354119    0.354119   
2          0.95  0.003968 2022-05-19  0.362815   0.362815    0.362815   
3          1.00  0.003968 2022-05-19  0.427519   0.427519    0.427519   
4          1.05  0.003968 2022-05-19  0.548233   0.548233    0.548233   
...         ...       ...        ...       ...        ...         ...   
7795       1.00  0.019841 2023-02-28  0.143445   0.143445    0.143445   
7796       1.05  0.019841 2023-02-28  0.232196   0.232196    0.232196   
7797       1.10  0.019841 2023-02-28  0.475976   0.475976    0.475976   
7798       1.15  0.019841 2023-02-28  0.874784   0.874784    0.874784   
7799       1.20  0.019841 2023-02-28  1.428620   1.428620    1.428620   

      iv_pred_21  
0       0.401434  
1       0.354119  
2       0.362815  
3       0.427519  
4       0.548233  
...      

In [91]:
# we got the predictions, now just merge with the orginal data_test to get the metric

output_df = pd.merge(final_df, data_test, on=['date', 'moneyness','maturity'], how='left')

In [92]:
# THE THING IS SHIFTED
output_df['iv_pred_shifted'] = output_df.groupby(['moneyness', 'maturity'])['iv_pred'].shift(1)
output_df['iv_pred_shifted_5'] = output_df.groupby(['moneyness', 'maturity'])['iv_pred_5'].shift(5)
output_df['iv_pred_shifted_10'] = output_df.groupby(['moneyness', 'maturity'])['iv_pred_10'].shift(10)
output_df['iv_pred_shifted_21'] = output_df.groupby(['moneyness', 'maturity'])['iv_pred_21'].shift(21)


output_df_1 = output_df.dropna(subset=['impl_volatility', 'iv_pred_shifted'])
output_df_5 = output_df.dropna(subset=['impl_volatility', 'iv_pred_shifted_5'])
output_df_10 = output_df.dropna(subset=['impl_volatility', 'iv_pred_shifted_10'])
output_df_21 = output_df.dropna(subset=['impl_volatility', 'iv_pred_shifted_21'])

In [93]:
def calculate_ivrmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))

In [94]:
rmse_per_date_1 = output_df_1.groupby('date').apply(
    lambda df: calculate_ivrmse(df['impl_volatility'].values, df['iv_pred_shifted'].values)
).reset_index(name='rmse')

  rmse_per_date_1 = output_df_1.groupby('date').apply(


In [95]:
rmse_per_date_5 = output_df_5.groupby('date').apply(
    lambda df: calculate_ivrmse(df['impl_volatility'].values, df['iv_pred_shifted_5'].values)
).reset_index(name='rmse')

  rmse_per_date_5 = output_df_5.groupby('date').apply(


In [96]:
rmse_per_date_10 = output_df_10.groupby('date').apply(
    lambda df: calculate_ivrmse(df['impl_volatility'].values, df['iv_pred_shifted_10'].values)
).reset_index(name='rmse')

  rmse_per_date_10 = output_df_10.groupby('date').apply(


In [97]:
rmse_per_date_21 = output_df_21.groupby('date').apply(
    lambda df: calculate_ivrmse(df['impl_volatility'].values, df['iv_pred_shifted_21'].values)
).reset_index(name='rmse')

  rmse_per_date_21 = output_df_21.groupby('date').apply(


In [98]:
rmse_per_date_1.to_csv(f'AHBS_rmse_1_{option_type}.csv')
rmse_per_date_5.to_csv(f'AHBS_rmse_5.{option_type}.csv')
rmse_per_date_10.to_csv(f'AHBS_rmse_10_{option_type}.csv')
rmse_per_date_21.to_csv(f'AHBS_rmse_21_{option_type}.csv')

In [99]:
print(f"{np.average(rmse_per_date_1['rmse'])*100:.2f}")
print(f"{np.average(rmse_per_date_5['rmse'])*100:.2f}")
print(f"{np.average(rmse_per_date_10['rmse'])*100:.2f}")
print(f"{np.average(rmse_per_date_21['rmse'])*100:.2f}")

15.98
15.03
15.90
17.68


In [100]:
print(f"{r2_score(output_df_1['impl_volatility'].values, output_df_1['iv_pred_shifted'].values)*100:.2f}")
print(f"{r2_score(output_df_5['impl_volatility'].values, output_df_5['iv_pred_shifted_5'].values)*100:.2f}")
print(f"{r2_score(output_df_10['impl_volatility'].values, output_df_10['iv_pred_shifted_10'].values)*100:.2f}")
print(f"{r2_score(output_df_21['impl_volatility'].values, output_df_21['iv_pred_shifted_21'].values)*100:.2f}")

12.70
24.81
22.69
8.37
