In [1]:
import pandas as pd
from sklearn.discriminant_analysis import StandardScaler
from sklearn.compose import ColumnTransformer
from utils import run_kfold, run_kfold_eval, run_kfold_trainonly, feature_selection, calculate_metric
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import time


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
TRAIN_FEATURE_FILE = "../Data/final/train_features2.xlsx"
TEST_FEATURE_FILE = "../Data/final/test_features2.xlsx"
TRAIN_LABEL_FILE = "../Data/final/train_labels2.xlsx"
TEST_LABEL_FILE = "../Data/final/test_labels2.xlsx"
OUTPUT_PATH = '../OUTPUT/svm'

In [3]:
train_features = pd.read_excel(TRAIN_FEATURE_FILE)
test_features = pd.read_excel(TEST_FEATURE_FILE)
train_labels = pd.read_excel(TRAIN_LABEL_FILE)
test_labels = pd.read_excel(TEST_LABEL_FILE)

In [4]:
print("Train feature shape: ",train_features.shape)
print("Train label shape: ",train_labels.shape)
print("Test feature shape: ",test_features.shape)
print("Train label shape: ",test_labels.shape)

Train feature shape:  (1293, 317)
Train label shape:  (1293, 1)
Test feature shape:  (432, 317)
Train label shape:  (432, 1)


In [5]:
train_features.head()

Unnamed: 0,coupon rate,SP500 MD,Average daily 1-year SP500 return,Ratio to MA,US Corporate Bond Yield Spread,US Corporate Bond Yield Spread(3-5 year),US Corporate Bond Yield Spread(5-7 year),US Corporate Bond Yield Spread(7-10 year),US Corporate Bond Yield Spread(10+ year),US Generic Govt 3 Month Yield,...,event_type_subcategory_sum_Missing Coupon payment only,event_type_subcategory_sum_Missing Interest payment,event_type_subcategory_sum_Missing Loan payment,event_type_subcategory_sum_Missing Principal payment,event_type_subcategory_sum_Others,event_type_subcategory_sum_Pre-Negotiated Chapter 11,event_type_subcategory_sum_Protection,event_type_subcategory_sum_Receivership,event_type_subcategory_sum_Rehabilitation,event_type_subcategory_sum_Restructuring
0,7.5,-117.4602,-0.000189,125.407139,177.213028,134.012054,198.8153,191.364395,223.346344,0.1983,...,True,False,False,False,False,False,False,False,False,False
1,6.0,166.38276,0.000768,-4.603446,101.613617,77.032829,123.3998,105.932022,139.111115,0.0355,...,False,False,False,False,False,False,False,False,False,False
2,11.0,119.85752,0.000678,-11.95038,104.545959,77.416649,129.4317,111.818001,139.717407,0.0101,...,False,False,False,False,False,False,False,False,False,False
3,9.125,653.51208,0.001638,-2.494861,90.736633,64.654129,95.3731,92.141212,121.666237,0.0152,...,False,False,False,False,False,False,False,False,False,False
4,9.25,231.89472,0.000664,4.823413,98.533821,68.759308,93.4174,107.424469,139.741165,1.2865,...,False,False,False,False,True,False,False,False,False,False


In [6]:
train_labels.head()

Unnamed: 0,rr1_30
0,0.082481
1,0.378845
2,0.836149
3,0.987208
4,1.021458


In [7]:
feature_list = train_features.columns
category_feature_key = ['currency', 'seniorioty_adj', 'domicile_country',	'exchange_country',	'Industry_sector',	'Industry_group',	'Industry_subgroup', 'event_type',
                 'event_type_subcategory_sum', 'defaulted_in_last_6_months']
category_features = [i for i in feature_list if any(sub in i for sub in category_feature_key)]
non_category_features = [i for i in feature_list if i not in category_features]

print(len(category_features))
print(len(non_category_features))

163
154


In [8]:
# Normalize the data
# Prepare the ColumnTransformer
scaler = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), non_category_features)
    ],
    remainder='passthrough'  # Leave categorical features untouched
)

In [15]:
def objective(trial):
    # train_x, valid_x, train_y, valid_y = train_test_split(train_feature_df, train_label_df, test_size=0.2, random_state=42)
    # Hyperparameter search space
    C = trial.suggest_float('C', 1e-10, 100)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
    epsilon = trial.suggest_float('epsilon', 1e-3, 1.0)
    
    # Define the SVM model
    model = SVR(C=C, kernel=kernel, epsilon=epsilon, gamma=gamma)
    pipeline = Pipeline([
        ('preprocessor', scaler),
        ('svr', model)
    ])
    
    score = cross_val_score(pipeline, train_features, train_labels.to_numpy(), cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1).mean()
    return score

In [16]:
# Create study object
study = optuna.create_study(direction="maximize")

# Run optimization process
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-08-20 18:59:19,661] A new study created in memory with name: no-name-daf44c98-c489-48d3-9d7b-4568a29622e3
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 18:59:22,760] Trial 0 finished with value: -255.0342311035848 and parameters: {'C': 89.7406321207304, 'kernel': 'sigmoid', 'gamma': 'auto', 'epsilon': 0.11042829235366182}. Best is trial 0 with value: -255.0342311035848.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 18:59:23,689] Trial 1 finished with value: -0.3358284131913639 and parameters: {'C': 77.41119273953214, 'kernel': 'linear', 'gamma': 'scale', 'epsilon': 0.9191999339458705}. Best is trial 1 with value: -0.3358284131913639.
[I 2024-08-20 18:59:23,843] Trial 2 finished with value: -0.3358284131913639 and parameters: {'C': 80.47249707240157, 'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.8026958335575711}. Best is trial 1 with value: -0.3358284131913639.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 18:59:24,004] Trial 3 finished with value: -0.3358284131913639 and parameters: {'C': 43.16112569023, 'kernel': 'poly', 'gamma': 'scale', 'epsilon': 0.9557474358779097}. Best is trial 1 with value: -0.3358284131913639.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
Best trial: 1. Best value: -0.335828:  12%|█▏        | 6/50 [00:05<00:24,  1.78it/s]

[I 2024-08-20 18:59:24,841] Trial 4 finished with value: -0.372073589577341 and parameters: {'C': 56.269134316091716, 'kernel': 'poly', 'gamma': 'scale', 'epsilon': 0.44936837795309137}. Best is trial 1 with value: -0.3358284131913639.
[I 2024-08-20 18:59:25,034] Trial 5 finished with value: -0.56806195733678 and parameters: {'C': 62.78734151286025, 'kernel': 'poly', 'gamma': 'auto', 'epsilon': 0.4100084544986265}. Best is trial 1 with value: -0.3358284131913639.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 18:59:25,252] Trial 6 finished with value: -1.2176032368030032 and parameters: {'C': 51.13728369907657, 'kernel': 'poly', 'gamma': 'scale', 'epsilon': 0.35550093171348546}. Best is trial 1 with value: -0.3358284131913639.
[I 2024-08-20 18:59:25,413] Trial 7 finished with value: -0.3358284131913639 and parameters: {'C': 1.7056141533336295, 'kernel': 'linear', 'gamma': 'scale', 'epsilon': 0.7365601564761026}. Best is trial 1 with value: -0.3358284131913639.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 18:59:25,571] Trial 8 finished with value: -0.3358284131913639 and parameters: {'C': 28.67024796364848, 'kernel': 'poly', 'gamma': 'auto', 'epsilon': 0.6461494859480882}. Best is trial 1 with value: -0.3358284131913639.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:00:37,511] Trial 9 finished with value: -0.3117260490317983 and parameters: {'C': 19.70885475324191, 'kernel': 'linear', 'gamma': 'auto', 'epsilon': 0.20617585087649487}. Best is trial 9 with value: -0.3117260490317983.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:01:29,817] Trial 10 finished with value: -0.3409112217647109 and parameters: {'C': 8.818834978191575, 'kernel': 'linear', 'gamma': 'auto', 'epsilon': 0.03524567475052054}. Best is trial 9 with value: -0.3117260490317983.


Best trial: 9. Best value: -0.311726:  24%|██▍       | 12/50 [07:32<1:15:55, 119.88s/it]

[I 2024-08-20 19:06:51,705] Trial 11 finished with value: -0.3145720578224862 and parameters: {'C': 74.88143276277653, 'kernel': 'linear', 'gamma': 'auto', 'epsilon': 0.2233976960879135}. Best is trial 9 with value: -0.3117260490317983.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:08:42,276] Trial 12 finished with value: -0.32292345213565654 and parameters: {'C': 28.069455440009417, 'kernel': 'linear', 'gamma': 'auto', 'epsilon': 0.23189727917823932}. Best is trial 9 with value: -0.3117260490317983.


  y = column_or_1d(y, warn=True)
Best trial: 9. Best value: -0.311726:  28%|██▊       | 14/50 [15:42<1:57:54, 196.51s/it]

[I 2024-08-20 19:15:02,353] Trial 13 finished with value: -0.3150526540620496 and parameters: {'C': 99.3008772633323, 'kernel': 'linear', 'gamma': 'auto', 'epsilon': 0.23275013976516457}. Best is trial 9 with value: -0.3117260490317983.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
Best trial: 14. Best value: -0.26262:  32%|███▏      | 16/50 [15:43<54:27, 96.10s/it]   

[I 2024-08-20 19:15:02,773] Trial 14 finished with value: -0.26261998398880576 and parameters: {'C': 23.950333845015717, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.22752491256490687}. Best is trial 14 with value: -0.26261998398880576.
[I 2024-08-20 19:15:02,958] Trial 15 finished with value: -0.33475483408240864 and parameters: {'C': 20.851809884710025, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.5395143557421325}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:03,439] Trial 16 finished with value: -0.28026282115922246 and parameters: {'C': 38.9729625887343, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.015098229974097988}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:03,908] Trial 17 finished with value: -0.27959983971755914 and parameters: {'C': 39.587105845497206, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.018954826283828358}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:05,023] Trial 18 finished with value: -0.2662563832687662 and parameters: {'C': 38.18478646460658, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.1368755189334775}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:05,873] Trial 19 finished with value: -0.27279487567306804 and parameters: {'C': 11.225100349372024, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.3146761439086918}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:06,234] Trial 20 finished with value: -153.2427558812438 and parameters: {'C': 65.05951166206175, 'kernel': 'sigmoid', 'gamma': 'auto', 'epsilon': 0.5262936535263424}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:06,468] Trial 21 finished with value: -0.2816973830494981 and parameters: {'C': 13.866199614595796, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.35218179675391414}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:06,770] Trial 22 finished with value: -0.26413347939870074 and parameters: {'C': 28.303843615272214, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.15964693645290945}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:07,707] Trial 23 finished with value: -0.2649121711471272 and parameters: {'C': 34.98064244021427, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.1610371250837703}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:08,030] Trial 24 finished with value: -0.26720539977391444 and parameters: {'C': 30.873130089739178, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.1098412657333361}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:08,279] Trial 25 finished with value: -0.26815686483153206 and parameters: {'C': 22.735608561641712, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.2930764308603966}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:08,593] Trial 26 finished with value: -0.2677334434742778 and parameters: {'C': 47.38493708647178, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.12969109104871396}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:08,935] Trial 27 finished with value: -88.02851474695277 and parameters: {'C': 30.85998686739976, 'kernel': 'sigmoid', 'gamma': 'auto', 'epsilon': 0.4319304714948073}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:09,224] Trial 28 finished with value: -0.2645547906112393 and parameters: {'C': 34.47742423229856, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.17508821627285104}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:09,573] Trial 29 finished with value: -59.13341975711281 and parameters: {'C': 20.829983562714652, 'kernel': 'sigmoid', 'gamma': 'auto', 'epsilon': 0.07170122574541143}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:09,804] Trial 30 finished with value: -0.2718770588847548 and parameters: {'C': 0.9048916376651732, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.2875360783988411}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:10,086] Trial 31 finished with value: -0.26488775204060844 and parameters: {'C': 35.458635224753515, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.16402240051820483}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:10,412] Trial 32 finished with value: -0.26602879662074697 and parameters: {'C': 44.86026605835454, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.1781585364745432}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:10,769] Trial 33 finished with value: -0.26732653776258536 and parameters: {'C': 25.837887245869638, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.08816340386135119}. Best is trial 14 with value: -0.26261998398880576.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:11,020] Trial 34 finished with value: -0.26050336963513115 and parameters: {'C': 16.43603147321819, 'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.2546729864161042}. Best is trial 34 with value: -0.26050336963513115.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
Best trial: 34. Best value: -0.260503:  74%|███████▍  | 37/50 [15:51<00:04,  3.20it/s]

[I 2024-08-20 19:15:11,263] Trial 35 finished with value: -0.26380252707060636 and parameters: {'C': 16.413497285571164, 'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.275757293713154}. Best is trial 34 with value: -0.26050336963513115.
[I 2024-08-20 19:15:11,462] Trial 36 finished with value: -0.29068811856231147 and parameters: {'C': 16.07728681328241, 'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.37749899795896763}. Best is trial 34 with value: -0.26050336963513115.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:11,824] Trial 37 finished with value: -0.2662214618131916 and parameters: {'C': 7.258109493451496, 'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.28364887715398585}. Best is trial 34 with value: -0.26050336963513115.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
Best trial: 34. Best value: -0.260503:  80%|████████  | 40/50 [15:52<00:02,  3.45it/s]

[I 2024-08-20 19:15:12,165] Trial 38 finished with value: -52.07848337642687 and parameters: {'C': 6.285790967204967, 'kernel': 'sigmoid', 'gamma': 'scale', 'epsilon': 0.48749094728122166}. Best is trial 34 with value: -0.26050336963513115.
[I 2024-08-20 19:15:12,345] Trial 39 finished with value: -0.3358284131913639 and parameters: {'C': 16.37183396750085, 'kernel': 'poly', 'gamma': 'scale', 'epsilon': 0.8842874990821354}. Best is trial 34 with value: -0.26050336963513115.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:12,537] Trial 40 finished with value: -0.3358284131913639 and parameters: {'C': 14.152187997858988, 'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.608370974160463}. Best is trial 34 with value: -0.26050336963513115.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:12,789] Trial 41 finished with value: -0.2617946740459335 and parameters: {'C': 54.01795054796422, 'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.24942600118209862}. Best is trial 34 with value: -0.26050336963513115.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:13,069] Trial 42 finished with value: -0.2626909780403118 and parameters: {'C': 60.08047009982501, 'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.25339156930742135}. Best is trial 34 with value: -0.26050336963513115.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:13,304] Trial 43 finished with value: -0.2784491263806812 and parameters: {'C': 55.64045104497668, 'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.3399035607646218}. Best is trial 34 with value: -0.26050336963513115.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:13,542] Trial 44 finished with value: -3.51855727893187 and parameters: {'C': 67.36289574150577, 'kernel': 'poly', 'gamma': 'scale', 'epsilon': 0.25649957442830407}. Best is trial 34 with value: -0.26050336963513115.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:13,770] Trial 45 finished with value: -0.2909644519814115 and parameters: {'C': 59.10802430449592, 'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.3842486526585228}. Best is trial 34 with value: -0.26050336963513115.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
Best trial: 34. Best value: -0.260503:  96%|█████████▌| 48/50 [15:54<00:00,  4.38it/s]

[I 2024-08-20 19:15:14,032] Trial 46 finished with value: -0.2647967975031078 and parameters: {'C': 69.28150087382149, 'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.26359228491862274}. Best is trial 34 with value: -0.26050336963513115.
[I 2024-08-20 19:15:14,213] Trial 47 finished with value: -0.31884862399336744 and parameters: {'C': 50.539386265911254, 'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.4621594767915211}. Best is trial 34 with value: -0.26050336963513115.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2024-08-20 19:15:14,517] Trial 48 finished with value: -5.064524686669613 and parameters: {'C': 81.91361584943246, 'kernel': 'poly', 'gamma': 'scale', 'epsilon': 0.20743206182900364}. Best is trial 34 with value: -0.26050336963513115.


Best trial: 34. Best value: -0.260503: 100%|██████████| 50/50 [16:02<00:00, 19.26s/it]

[I 2024-08-20 19:15:22,528] Trial 49 finished with value: -0.3634642140205272 and parameters: {'C': 58.5154165333264, 'kernel': 'linear', 'gamma': 'scale', 'epsilon': 0.41704361966355263}. Best is trial 34 with value: -0.26050336963513115.
Number of finished trials:  50
Best trial:
  Value: -0.26050336963513115
  Params: 
    C: 16.43603147321819
    kernel: rbf
    gamma: scale
    epsilon: 0.2546729864161042





In [17]:
trial.params

{'C': 16.43603147321819,
 'kernel': 'rbf',
 'gamma': 'scale',
 'epsilon': 0.2546729864161042}

In [18]:
start_time = time.time()

# Get the best hyperparameters
best_params = study.best_params

# Train the final model on the entire training data
model = SVR(**best_params)
pipeline = Pipeline([
    ('preprocessor', scaler),
    ('svr', model)
])
pipeline.fit(train_features, train_labels.to_numpy())

# print out training time
elapsed_time = time.time() - start_time
print(f"Training time: {elapsed_time:.3f} seconds")

predictions = pipeline.predict(test_features)


Training time: 0.141 seconds


  y = column_or_1d(y, warn=True)


In [19]:
# save metrics
mae, mape, rmse, rsqr = calculate_metric(predictions, test_labels.to_numpy())
print(f"Train average mean absolute error: {mae}")
print(f"Train average mean absolute percentage error: {mape}")
print(f"Train average root mean squared error: {rmse}")
print(f"Train average R2: {rsqr}")

Train average mean absolute error: 0.32879432415585896
Train average mean absolute percentage error: 411.31102699514787
Train average root mean squared error: 0.25482462627677854
Train average R2: 0.43309475661290453


5-fold CV

In [20]:
features = pd.concat([train_features, test_features], axis=0, ignore_index=True)
features

Unnamed: 0,coupon rate,SP500 MD,Average daily 1-year SP500 return,Ratio to MA,US Corporate Bond Yield Spread,US Corporate Bond Yield Spread(3-5 year),US Corporate Bond Yield Spread(5-7 year),US Corporate Bond Yield Spread(7-10 year),US Corporate Bond Yield Spread(10+ year),US Generic Govt 3 Month Yield,...,event_type_subcategory_sum_Missing Coupon payment only,event_type_subcategory_sum_Missing Interest payment,event_type_subcategory_sum_Missing Loan payment,event_type_subcategory_sum_Missing Principal payment,event_type_subcategory_sum_Others,event_type_subcategory_sum_Pre-Negotiated Chapter 11,event_type_subcategory_sum_Protection,event_type_subcategory_sum_Receivership,event_type_subcategory_sum_Rehabilitation,event_type_subcategory_sum_Restructuring
0,7.500,-117.46020,-0.000189,125.407139,177.213028,134.012054,198.8153,191.364395,223.346344,0.1983,...,True,False,False,False,False,False,False,False,False,False
1,6.000,166.38276,0.000768,-4.603446,101.613617,77.032829,123.3998,105.932022,139.111115,0.0355,...,False,False,False,False,False,False,False,False,False,False
2,11.000,119.85752,0.000678,-11.950380,104.545959,77.416649,129.4317,111.818001,139.717407,0.0101,...,False,False,False,False,False,False,False,False,False,False
3,9.125,653.51208,0.001638,-2.494861,90.736633,64.654129,95.3731,92.141212,121.666237,0.0152,...,False,False,False,False,False,False,False,False,False,False
4,9.250,231.89472,0.000664,4.823413,98.533821,68.759308,93.4174,107.424469,139.741165,1.2865,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,3.250,147.33344,0.000556,3.538252,128.976395,95.360374,138.8445,138.946106,172.733887,0.0000,...,False,False,False,False,False,False,False,False,False,False
1721,10.000,175.31656,0.000554,-4.067337,121.170998,92.879501,123.2500,131.104904,162.916901,0.7516,...,False,True,False,False,False,False,False,False,False,False
1722,7.450,315.81748,0.000747,8.604100,126.786606,89.018188,114.9728,131.522430,171.701096,0.0864,...,False,False,False,False,False,False,False,False,False,False
1723,0.500,31.75120,-0.000157,-4.373852,126.595230,105.460007,121.6212,147.382416,163.772141,4.8375,...,False,False,False,False,False,False,False,False,False,False


In [21]:
labels = pd.concat([train_labels, test_labels], axis=0, ignore_index=True)
labels

Unnamed: 0,rr1_30
0,0.082481
1,0.378845
2,0.836149
3,0.987208
4,1.021458
...,...
1720,0.471411
1721,0.823750
1722,0.241612
1723,0.762054


In [22]:
model = SVR(**best_params)
pipeline = Pipeline([
    ('preprocessor', scaler),
    ('svr', model)
])
pipeline.fit(train_features, train_labels)

  y = column_or_1d(y, warn=True)
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [23]:
import os
import pickle

import numpy as np
from sklearn.model_selection import KFold
from tqdm import tqdm


def run_kfold_eval(
	features: np.ndarray, 
  labels: np.ndarray, 
  n_folds: int, 
  model: object, 
  model_params: dict | None,
  output_path: str,
  name: str='rf_model', 
  seed=42
):  
    kf = KFold(n_splits=n_folds)
    if seed:
        kf = KFold(
            n_splits=n_folds, 
            shuffle=True,
            random_state=seed
        )
    oof_preds = []
    train_preds = []
    test_preds = []

    train_metrics = {}
    val_metrics = {}

    feat_importances = np.zeros(features.shape[1])

    bar = tqdm(total=n_folds)
    for fold, (train_idx, val_idx) in enumerate(kf.split(features)):

        if model_params != None:
            x_train, x_val = features[train_idx], features[val_idx]
            y_train, y_val = labels[train_idx], labels[val_idx]
            reg = model(**model_params)
        else:
            x_train, x_val = features.iloc[train_idx], features.iloc[val_idx]
            y_train, y_val = labels[train_idx], labels[val_idx]
            reg = model
        reg.fit(x_train, y_train)
        
        # Prediction on train data
        preds = reg.predict(x_train)
        train_preds.append(preds)

        # save metrics
        mae, mape, rmse, rsqr = calculate_metric(preds, y_train)
        train_metrics[f"mae_fold{fold}"] = mae
        train_metrics[f"mape_fold{fold}"] = mape
        train_metrics[f"rmse_fold{fold}"] = rmse
        train_metrics[f"rsqr_fold{fold}"] = rsqr
        
        # Prediction on val data
        preds = reg.predict(x_val)
        oof_preds.append(preds)

        # save metrics
        mae, mape, rmse, rsqr = calculate_metric(preds, y_val)
        val_metrics[f"mae_fold{fold}"] = mae
        val_metrics[f"mape_fold{fold}"] = mape
        val_metrics[f"rmse_fold{fold}"] = rmse
        val_metrics[f"rsqr_fold{fold}"] = rsqr

        # Feature importance
        if hasattr(reg, 'feature_importances_'):
            feat_importances += reg.feature_importances_
        elif hasattr(reg, 'coef_'):
            feat_importances += reg.coef_

        pickle.dump(
            reg, 
            open(os.path.join(output_path, f"{name}_{seed}_fold-{fold + 1}.pkl"), "wb")
        )
        bar.update()
        
    feat_importances /= n_folds

    return train_preds, oof_preds, feat_importances, train_metrics, val_metrics

In [24]:
train_preds, oof_preds, feat_importances, train_metrics, val_metrics = run_kfold_eval(features, labels.to_numpy(), n_folds=5, model=pipeline, model_params=None, output_path=OUTPUT_PATH, name="svm")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
100%|██████████| 5/5 [00:01<00:00,  3.71it/s]


In [25]:
# print test resuls
mae = 0
mape = 0
rmse = 0
rsqr = 0
i = 0

for key, value in val_metrics.items():
  if "mae" in key:
    i += 1
    mae += value

  if "mape" in key:
    mape += value

  if "rmse" in key:
    rmse += value

  if "rsqr" in key:
    rsqr += value

print(f"Test average mean absolute error: {mae/i}")
print(f"Test average mean absolute percentage error: {mape/i}")
print(f"Test average root mean squared error: {rmse/i}")
print(f"Test average R2: {rsqr/i}")

Test average mean absolute error: 0.32231699440063866
Test average mean absolute percentage error: 3810.3529474132906
Test average root mean squared error: 0.24667910108606264
Test average R2: 0.4441938280204784
