In [7]:
import pandas as pd
from sklearn.discriminant_analysis import StandardScaler
from sklearn.compose import ColumnTransformer
from utils import run_kfold, run_kfold_eval, run_kfold_trainonly, feature_selection, calculate_metric
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import time
from pathlib import Path
from sklearn.decomposition import PCA
import joblib

In [33]:
DATA_FOLDER = "../data"
TRAIN_FEATURES = DATA_FOLDER + '/train_features2.xlsx'
TRAIN_LABELS = DATA_FOLDER + "/train_labels2.xlsx"
TEST_FEATURES = DATA_FOLDER + "/test_features2.xlsx"
TEST_LABELS = DATA_FOLDER + "/test_labels2.xlsx"
OUTPUT_PATH = '../OUTPUT/svm_pca'
OUTPUT_FILE = OUTPUT_PATH + '/model.pkl'

TRAINING_OUTPUT_FILE = '../output/train_predictions.xlsx'
TEST_OUTPUT_FILE = '../output/test_predictions.xlsx'
SHEET_NAME = "svm"

# Create output directory if it doesn't exist
output_dir = Path(OUTPUT_PATH)
output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
train_features = pd.read_excel(TRAIN_FEATURES)
train_labels = pd.read_excel(TRAIN_LABELS)
test_features = pd.read_excel(TEST_FEATURES)
test_labels = pd.read_excel(TEST_LABELS)

In [4]:
print("Train feature shape: ",train_features.shape)
print("Train label shape: ",train_labels.shape)
print("Test feature shape: ",test_features.shape)
print("Train label shape: ",test_labels.shape)

Train feature shape:  (1293, 317)
Train label shape:  (1293, 1)
Test feature shape:  (432, 317)
Train label shape:  (432, 1)


In [5]:
train_features.head()

Unnamed: 0,coupon rate,SP500 MD,Average daily 1-year SP500 return,Ratio to MA,US Corporate Bond Yield Spread,US Corporate Bond Yield Spread(3-5 year),US Corporate Bond Yield Spread(5-7 year),US Corporate Bond Yield Spread(7-10 year),US Corporate Bond Yield Spread(10+ year),US Generic Govt 3 Month Yield,...,event_type_subcategory_sum_Missing Coupon payment only,event_type_subcategory_sum_Missing Interest payment,event_type_subcategory_sum_Missing Loan payment,event_type_subcategory_sum_Missing Principal payment,event_type_subcategory_sum_Others,event_type_subcategory_sum_Pre-Negotiated Chapter 11,event_type_subcategory_sum_Protection,event_type_subcategory_sum_Receivership,event_type_subcategory_sum_Rehabilitation,event_type_subcategory_sum_Restructuring
0,7.5,-117.4602,-0.000189,125.407139,177.213028,134.012054,198.8153,191.364395,223.346344,0.1983,...,True,False,False,False,False,False,False,False,False,False
1,6.0,166.38276,0.000768,-4.603446,101.613617,77.032829,123.3998,105.932022,139.111115,0.0355,...,False,False,False,False,False,False,False,False,False,False
2,11.0,119.85752,0.000678,-11.95038,104.545959,77.416649,129.4317,111.818001,139.717407,0.0101,...,False,False,False,False,False,False,False,False,False,False
3,9.125,653.51208,0.001638,-2.494861,90.736633,64.654129,95.3731,92.141212,121.666237,0.0152,...,False,False,False,False,False,False,False,False,False,False
4,9.25,231.89472,0.000664,4.823413,98.533821,68.759308,93.4174,107.424469,139.741165,1.2865,...,False,False,False,False,True,False,False,False,False,False


In [6]:
feature_list = train_features.columns
category_feature_key = ['currency', 'seniorioty_adj', 'domicile_country',	'exchange_country',	'Industry_sector',	'Industry_group',	'Industry_subgroup', 'event_type',
                 'event_type_subcategory_sum', 'defaulted_in_last_6_months']
category_features = [i for i in feature_list if any(sub in i for sub in category_feature_key)]
non_category_features = [i for i in feature_list if i not in category_features]

print(len(category_features))
print(len(non_category_features))

163
154


In [14]:
def create_pca_pipeline(n_components=50):
    """
    Create a pipeline with StandardScaler and PCA
    n_components: float between 0 and 1 (explained variance ratio)
    """
    return Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=n_components))
    ])

In [15]:
# Normalize the data
# Prepare the ColumnTransformer
scaler = ColumnTransformer(
    transformers=[
        ('num', create_pca_pipeline(), non_category_features)
    ],
    remainder='passthrough'  # Leave categorical features untouched
)

In [21]:
def objective(trial):
    # train_x, valid_x, train_y, valid_y = train_test_split(train_feature_df, train_label_df, test_size=0.2, random_state=42)
    # Hyperparameter search space
    C = trial.suggest_float('C', 1e-10, 100)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto', 0.001, 0.01, 0.1, 1])
    epsilon = trial.suggest_float('epsilon', 1e-3, 1.0)
    
    # Define the SVM model
    model = SVR(C=C, kernel=kernel, epsilon=epsilon, gamma=gamma)
    pipeline = Pipeline([
        ('preprocessor', scaler),
        ('svr', model)
    ])
    
    score = cross_val_score(pipeline, train_features, train_labels.to_numpy(), cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1).mean()
    return score

In [23]:
# Create study object
study = optuna.create_study(direction="maximize")

# Run optimization process
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2025-04-23 19:06:51,379] A new study created in memory with name: no-name-754d594c-6673-4297-9b9d-2c1d219eeb1e


  0%|          | 0/50 [00:00<?, ?it/s]

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:07:38,435] Trial 0 finished with value: -0.35156749836636053 and parameters: {'C': 8.123221285581922, 'kernel': 'linear', 'gamma': 0.01, 'epsilon': 0.05152848615466631}. Best is trial 0 with value: -0.35156749836636053.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:07:39,315] Trial 1 finished with value: -0.3358284131913639 and parameters: {'C': 50.78559346765934, 'kernel': 'poly', 'gamma': 0.001, 'epsilon': 0.8053439813632026}. Best is trial 1 with value: -0.3358284131913639.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:07:39,843] Trial 2 finished with value: -38.56574070371899 and parameters: {'C': 94.05622037842404, 'kernel': 'sigmoid', 'gamma': 0.001, 'epsilon': 0.32198479942296643}. Best is trial 1 with value: -0.3358284131913639.
[I 2025-04-23 19:07:40,018] Trial 3 finished with value: -0.3358284131913639 and parameters: {'C': 38.69783770166273, 'kernel': 'sigmoid', 'gamma': 'scale', 'epsilon': 0.8653228841993363}. Best is trial 1 with value: -0.3358284131913639.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:07:40,205] Trial 4 finished with value: -0.3358284131913639 and parameters: {'C': 70.66936688565528, 'kernel': 'linear', 'gamma': 0.1, 'epsilon': 0.6700156253048347}. Best is trial 1 with value: -0.3358284131913639.
[I 2025-04-23 19:07:40,374] Trial 5 finished with value: -0.3358284131913639 and parameters: {'C': 18.20509043834604, 'kernel': 'linear', 'gamma': 0.1, 'epsilon': 0.8706964266328854}. Best is trial 1 with value: -0.3358284131913639.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:07:40,810] Trial 6 finished with value: -3582.7358978838442 and parameters: {'C': 42.043888778420815, 'kernel': 'sigmoid', 'gamma': 0.1, 'epsilon': 0.4011514954445878}. Best is trial 1 with value: -0.3358284131913639.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:07:41,182] Trial 7 finished with value: -0.3295236277051223 and parameters: {'C': 28.674765108784175, 'kernel': 'linear', 'gamma': 1, 'epsilon': 0.48056208053213395}. Best is trial 7 with value: -0.3295236277051223.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:07:41,507] Trial 8 finished with value: -5.086261901953571 and parameters: {'C': 35.223438390317305, 'kernel': 'poly', 'gamma': 'auto', 'epsilon': 0.16606877504068698}. Best is trial 7 with value: -0.3295236277051223.
[I 2025-04-23 19:07:41,692] Trial 9 finished with value: -0.3358284131913639 and parameters: {'C': 40.59453392452763, 'kernel': 'poly', 'gamma': 1, 'epsilon': 0.7416243450713749}. Best is trial 7 with value: -0.3295236277051223.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:07:41,881] Trial 10 finished with value: -0.3358284131913639 and parameters: {'C': 1.2795715871095297, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.5643480873011201}. Best is trial 7 with value: -0.3295236277051223.
[I 2025-04-23 19:07:42,067] Trial 11 finished with value: -0.3302108428909899 and parameters: {'C': 66.15046008238947, 'kernel': 'poly', 'gamma': 0.001, 'epsilon': 0.5101330775240547}. Best is trial 7 with value: -0.3295236277051223.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:07:42,271] Trial 12 finished with value: -0.33113930271193986 and parameters: {'C': 69.64688409093718, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.4761839077836212}. Best is trial 7 with value: -0.3295236277051223.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:00,031] Trial 13 finished with value: -0.42683188924205034 and parameters: {'C': 64.36653782749215, 'kernel': 'linear', 'gamma': 0.001, 'epsilon': 0.2775649266022242}. Best is trial 7 with value: -0.3295236277051223.
[I 2025-04-23 19:11:00,207] Trial 14 finished with value: -0.3358284131913639 and parameters: {'C': 93.28530095461196, 'kernel': 'poly', 'gamma': 'scale', 'epsilon': 0.606486261298821}. Best is trial 7 with value: -0.3295236277051223.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:00,381] Trial 15 finished with value: -0.3358284131913639 and parameters: {'C': 24.729104964545378, 'kernel': 'linear', 'gamma': 'auto', 'epsilon': 0.9745473422752686}. Best is trial 7 with value: -0.3295236277051223.
[I 2025-04-23 19:11:00,569] Trial 16 finished with value: -0.3456944849392106 and parameters: {'C': 58.224239767954366, 'kernel': 'poly', 'gamma': 0.01, 'epsilon': 0.47352141425634403}. Best is trial 7 with value: -0.3295236277051223.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:00,859] Trial 17 finished with value: -0.2762835093568163 and parameters: {'C': 81.73021830388387, 'kernel': 'rbf', 'gamma': 0.001, 'epsilon': 0.3023114803730189}. Best is trial 17 with value: -0.2762835093568163.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:01,193] Trial 18 finished with value: -0.2798060743727087 and parameters: {'C': 78.98146706608465, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.2029939701648389}. Best is trial 17 with value: -0.2762835093568163.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:01,950] Trial 19 finished with value: -0.28808768200119805 and parameters: {'C': 80.87898183613349, 'kernel': 'rbf', 'gamma': 0.001, 'epsilon': 0.011913845623590158}. Best is trial 17 with value: -0.2762835093568163.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:02,326] Trial 20 finished with value: -0.2687505196904992 and parameters: {'C': 84.3040638351872, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.13426299590706342}. Best is trial 20 with value: -0.2687505196904992.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:02,685] Trial 21 finished with value: -0.2736567589125009 and parameters: {'C': 82.47010269650596, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.1684374202142693}. Best is trial 20 with value: -0.2687505196904992.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:03,071] Trial 22 finished with value: -0.270488038184987 and parameters: {'C': 84.42360652953354, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.14736921973193942}. Best is trial 20 with value: -0.2687505196904992.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:03,474] Trial 23 finished with value: -0.26691738986444136 and parameters: {'C': 99.7157085163822, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.11869367077492762}. Best is trial 23 with value: -0.26691738986444136.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:03,886] Trial 24 finished with value: -0.26466507127730016 and parameters: {'C': 98.50192646727982, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.09431079979860307}. Best is trial 24 with value: -0.26466507127730016.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:04,292] Trial 25 finished with value: -0.26400245494749736 and parameters: {'C': 99.70676147163357, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.08181225789855412}. Best is trial 25 with value: -0.26400245494749736.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:04,710] Trial 26 finished with value: -0.2647640252430866 and parameters: {'C': 99.23633550658268, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.0958167648066239}. Best is trial 25 with value: -0.26400245494749736.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:05,129] Trial 27 finished with value: -0.26360333118904605 and parameters: {'C': 99.74821487739622, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.06631946450266343}. Best is trial 27 with value: -0.26360333118904605.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:05,464] Trial 28 finished with value: -0.28769417843269035 and parameters: {'C': 91.37338266840968, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.24108327054140583}. Best is trial 27 with value: -0.26360333118904605.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:05,987] Trial 29 finished with value: -0.27789775405838946 and parameters: {'C': 89.97700266972109, 'kernel': 'rbf', 'gamma': 0.01, 'epsilon': 0.028191200553550073}. Best is trial 27 with value: -0.26360333118904605.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:06,760] Trial 30 finished with value: -0.272538799520053 and parameters: {'C': 74.75789831878448, 'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.054233335659149444}. Best is trial 27 with value: -0.26360333118904605.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:07,216] Trial 31 finished with value: -0.2639231136095499 and parameters: {'C': 98.88912708739667, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.0794929683664079}. Best is trial 27 with value: -0.26360333118904605.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:07,641] Trial 32 finished with value: -0.2640133335658441 and parameters: {'C': 99.19292375766818, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.0818888631898085}. Best is trial 27 with value: -0.26360333118904605.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:07,947] Trial 33 finished with value: -0.3141582601936474 and parameters: {'C': 88.81583060687633, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.36068248387949264}. Best is trial 27 with value: -0.26360333118904605.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:08,802] Trial 34 finished with value: -763.3379750705599 and parameters: {'C': 94.77063629392508, 'kernel': 'sigmoid', 'gamma': 'scale', 'epsilon': 0.004117541447602027}. Best is trial 27 with value: -0.26360333118904605.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:09,099] Trial 35 finished with value: -0.2654562395504997 and parameters: {'C': 87.42862975602714, 'kernel': 'rbf', 'gamma': 0.01, 'epsilon': 0.22565309876325604}. Best is trial 27 with value: -0.26360333118904605.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:09,670] Trial 36 finished with value: -5518.516542502121 and parameters: {'C': 54.376738227164196, 'kernel': 'sigmoid', 'gamma': 1, 'epsilon': 0.07790239805597614}. Best is trial 27 with value: -0.26360333118904605.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:09,910] Trial 37 finished with value: -0.3124134236890728 and parameters: {'C': 75.25385950315078, 'kernel': 'rbf', 'gamma': 0.1, 'epsilon': 0.38341059610323563}. Best is trial 27 with value: -0.26360333118904605.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:10,218] Trial 38 finished with value: -0.2911453387452399 and parameters: {'C': 95.18961276143676, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.25691779523915415}. Best is trial 27 with value: -0.26360333118904605.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:10,724] Trial 39 finished with value: -790.9433570654612 and parameters: {'C': 95.34960410686527, 'kernel': 'sigmoid', 'gamma': 'scale', 'epsilon': 0.17658155870107828}. Best is trial 27 with value: -0.26360333118904605.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:11,196] Trial 40 finished with value: -0.2636009986187986 and parameters: {'C': 89.56536595182459, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.041770514876494996}. Best is trial 40 with value: -0.2636009986187986.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:11,647] Trial 41 finished with value: -0.26348772121020236 and parameters: {'C': 88.81572003366466, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.05942453137897204}. Best is trial 41 with value: -0.26348772121020236.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:12,077] Trial 42 finished with value: -0.26348051657080634 and parameters: {'C': 87.73545220136845, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.04917064770051137}. Best is trial 42 with value: -0.26348051657080634.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:12,637] Trial 43 finished with value: -0.2663528572605065 and parameters: {'C': 86.85626258312402, 'kernel': 'rbf', 'gamma': 1, 'epsilon': 0.0036698702077783557}. Best is trial 42 with value: -0.26348051657080634.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:11:13,062] Trial 44 finished with value: -0.25667807657584885 and parameters: {'C': 75.58194070686703, 'kernel': 'rbf', 'gamma': 0.1, 'epsilon': 0.04736646306986983}. Best is trial 44 with value: -0.25667807657584885.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:15:53,047] Trial 45 finished with value: -0.5423813597622064 and parameters: {'C': 74.70304943920931, 'kernel': 'linear', 'gamma': 0.1, 'epsilon': 0.20961425237801928}. Best is trial 44 with value: -0.25667807657584885.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:15:53,441] Trial 46 finished with value: -0.2560774557045929 and parameters: {'C': 45.43518949659049, 'kernel': 'rbf', 'gamma': 0.1, 'epsilon': 0.0452375920424255}. Best is trial 46 with value: -0.2560774557045929.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:15:53,657] Trial 47 finished with value: -0.25617073671709667 and parameters: {'C': 46.53201059336333, 'kernel': 'rbf', 'gamma': 0.1, 'epsilon': 0.1345743190504004}. Best is trial 46 with value: -0.2560774557045929.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:15:57,697] Trial 48 finished with value: -23.37353354831473 and parameters: {'C': 47.35090911713364, 'kernel': 'poly', 'gamma': 0.1, 'epsilon': 0.12901714710735596}. Best is trial 46 with value: -0.2560774557045929.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[I 2025-04-23 19:18:44,291] Trial 49 finished with value: -0.4514781503487589 and parameters: {'C': 47.020690350929335, 'kernel': 'linear', 'gamma': 0.1, 'epsilon': 0.17330125417627493}. Best is trial 46 with value: -0.2560774557045929.
Number of finished trials:  50
Best trial:
  Value: -0.2560774557045929
  Params: 
    C: 45.43518949659049
    kernel: rbf
    gamma: 0.1
    epsilon: 0.0452375920424255


In [24]:
trial.params

{'C': 45.43518949659049,
 'kernel': 'rbf',
 'gamma': 0.1,
 'epsilon': 0.0452375920424255}

In [25]:
# best_params = {'C': 16.43603147321819,
#  'kernel': 'rbf',
#  'gamma': 'scale',
#  'epsilon': 0.2546729864161042}

In [26]:
start_time = time.time()

# Get the best hyperparameters
best_params = study.best_params

# Train the final model on the entire training data
model = SVR(**best_params)
pipeline = Pipeline([
    ('preprocessor', scaler),
    ('svr', model)
])
pipeline.fit(train_features, train_labels.to_numpy())

# print out training time
elapsed_time = time.time() - start_time
print(f"Training time: {elapsed_time:.3f} seconds")

predictions = pipeline.predict(test_features)


  y = column_or_1d(y, warn=True)


Training time: 7.177 seconds


In [27]:
# save metrics
mae, mape, rmse, rsqr = calculate_metric(predictions, test_labels.to_numpy())
print(f"Test average mean absolute error: {mae}")
print(f"Test average mean absolute percentage error: {mape}")
print(f"Test average root mean squared error: {rmse}")
print(f"Test average R2: {rsqr}")

Test average mean absolute error: 0.3527669771854595
Test average mean absolute percentage error: 404.67668751015367
Test average root mean squared error: 0.24218208707460917
Test average R2: 0.4879507704447188


In [None]:
training_predictions = pipeline.predict(train_features)
# save metrics
mae, mape, rmse, rsqr = calculate_metric(training_predictions, train_labels.to_numpy())
print(f"Train average mean absolute error: {mae}")
print(f"Train average mean absolute percentage error: {mape}")
print(f"Train average root mean squared error: {rmse}")
print(f"Train average R2: {rsqr}")

Train average mean absolute error: 0.3606632732950329
Train average mean absolute percentage error: 4700.961112135371
Train average root mean squared error: 0.071901498093537
Train average R2: 0.9523462935299593


In [None]:

# save train predictions
train_predictions_df = pd.DataFrame({"predictions": training_predictions})
with pd.ExcelWriter(TRAINING_OUTPUT_FILE, mode='a', engine='openpyxl', if_sheet_exists='replace') as writer:
    # Write the new DataFrame to a new sheet
    train_predictions_df.to_excel(writer, sheet_name=SHEET_NAME, index=False)
    
# save test predictions
test_predictions_df = pd.DataFrame({"predictions": predictions})
with pd.ExcelWriter(TEST_OUTPUT_FILE, mode='a', engine='openpyxl', if_sheet_exists='replace') as writer:
    # Write the new DataFrame to a new sheet
    test_predictions_df.to_excel(writer, sheet_name=SHEET_NAME, index=False)

In [34]:
# Save the best model
joblib.dump(pipeline, OUTPUT_FILE)

['../OUTPUT/svm_pca/model.pkl']

5-fold CV

In [20]:
features = pd.concat([train_features, test_features], axis=0, ignore_index=True)
features

Unnamed: 0,coupon rate,SP500 MD,Average daily 1-year SP500 return,Ratio to MA,US Corporate Bond Yield Spread,US Corporate Bond Yield Spread(3-5 year),US Corporate Bond Yield Spread(5-7 year),US Corporate Bond Yield Spread(7-10 year),US Corporate Bond Yield Spread(10+ year),US Generic Govt 3 Month Yield,...,event_type_subcategory_sum_Missing Coupon payment only,event_type_subcategory_sum_Missing Interest payment,event_type_subcategory_sum_Missing Loan payment,event_type_subcategory_sum_Missing Principal payment,event_type_subcategory_sum_Others,event_type_subcategory_sum_Pre-Negotiated Chapter 11,event_type_subcategory_sum_Protection,event_type_subcategory_sum_Receivership,event_type_subcategory_sum_Rehabilitation,event_type_subcategory_sum_Restructuring
0,7.500,-117.46020,-0.000189,125.407139,177.213028,134.012054,198.8153,191.364395,223.346344,0.1983,...,True,False,False,False,False,False,False,False,False,False
1,6.000,166.38276,0.000768,-4.603446,101.613617,77.032829,123.3998,105.932022,139.111115,0.0355,...,False,False,False,False,False,False,False,False,False,False
2,11.000,119.85752,0.000678,-11.950380,104.545959,77.416649,129.4317,111.818001,139.717407,0.0101,...,False,False,False,False,False,False,False,False,False,False
3,9.125,653.51208,0.001638,-2.494861,90.736633,64.654129,95.3731,92.141212,121.666237,0.0152,...,False,False,False,False,False,False,False,False,False,False
4,9.250,231.89472,0.000664,4.823413,98.533821,68.759308,93.4174,107.424469,139.741165,1.2865,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,3.250,147.33344,0.000556,3.538252,128.976395,95.360374,138.8445,138.946106,172.733887,0.0000,...,False,False,False,False,False,False,False,False,False,False
1721,10.000,175.31656,0.000554,-4.067337,121.170998,92.879501,123.2500,131.104904,162.916901,0.7516,...,False,True,False,False,False,False,False,False,False,False
1722,7.450,315.81748,0.000747,8.604100,126.786606,89.018188,114.9728,131.522430,171.701096,0.0864,...,False,False,False,False,False,False,False,False,False,False
1723,0.500,31.75120,-0.000157,-4.373852,126.595230,105.460007,121.6212,147.382416,163.772141,4.8375,...,False,False,False,False,False,False,False,False,False,False


In [21]:
labels = pd.concat([train_labels, test_labels], axis=0, ignore_index=True)
labels

Unnamed: 0,rr1_30
0,0.082481
1,0.378845
2,0.836149
3,0.987208
4,1.021458
...,...
1720,0.471411
1721,0.823750
1722,0.241612
1723,0.762054


In [22]:
model = SVR(**best_params)
pipeline = Pipeline([
    ('preprocessor', scaler),
    ('svr', model)
])
pipeline.fit(train_features, train_labels)

  y = column_or_1d(y, warn=True)
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [23]:
import os
import pickle

import numpy as np
from sklearn.model_selection import KFold
from tqdm import tqdm


def run_kfold_eval(
	features: np.ndarray, 
  labels: np.ndarray, 
  n_folds: int, 
  model: object, 
  model_params: dict | None,
  output_path: str,
  name: str='rf_model', 
  seed=42
):  
    kf = KFold(n_splits=n_folds)
    if seed:
        kf = KFold(
            n_splits=n_folds, 
            shuffle=True,
            random_state=seed
        )
    oof_preds = []
    train_preds = []
    test_preds = []

    train_metrics = {}
    val_metrics = {}

    feat_importances = np.zeros(features.shape[1])

    bar = tqdm(total=n_folds)
    for fold, (train_idx, val_idx) in enumerate(kf.split(features)):

        if model_params != None:
            x_train, x_val = features[train_idx], features[val_idx]
            y_train, y_val = labels[train_idx], labels[val_idx]
            reg = model(**model_params)
        else:
            x_train, x_val = features.iloc[train_idx], features.iloc[val_idx]
            y_train, y_val = labels[train_idx], labels[val_idx]
            reg = model
        reg.fit(x_train, y_train)
        
        # Prediction on train data
        preds = reg.predict(x_train)
        train_preds.append(preds)

        # save metrics
        mae, mape, rmse, rsqr = calculate_metric(preds, y_train)
        train_metrics[f"mae_fold{fold}"] = mae
        train_metrics[f"mape_fold{fold}"] = mape
        train_metrics[f"rmse_fold{fold}"] = rmse
        train_metrics[f"rsqr_fold{fold}"] = rsqr
        
        # Prediction on val data
        preds = reg.predict(x_val)
        oof_preds.append(preds)

        # save metrics
        mae, mape, rmse, rsqr = calculate_metric(preds, y_val)
        val_metrics[f"mae_fold{fold}"] = mae
        val_metrics[f"mape_fold{fold}"] = mape
        val_metrics[f"rmse_fold{fold}"] = rmse
        val_metrics[f"rsqr_fold{fold}"] = rsqr

        # Feature importance
        if hasattr(reg, 'feature_importances_'):
            feat_importances += reg.feature_importances_
        elif hasattr(reg, 'coef_'):
            feat_importances += reg.coef_

        pickle.dump(
            reg, 
            open(os.path.join(output_path, f"{name}_{seed}_fold-{fold + 1}.pkl"), "wb")
        )
        bar.update()
        
    feat_importances /= n_folds

    return train_preds, oof_preds, feat_importances, train_metrics, val_metrics

In [24]:
train_preds, oof_preds, feat_importances, train_metrics, val_metrics = run_kfold_eval(features, labels.to_numpy(), n_folds=5, model=pipeline, model_params=None, output_path=OUTPUT_PATH, name="svm")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
100%|██████████| 5/5 [00:01<00:00,  3.71it/s]


In [25]:
# print test resuls
mae = 0
mape = 0
rmse = 0
rsqr = 0
i = 0

for key, value in val_metrics.items():
  if "mae" in key:
    i += 1
    mae += value

  if "mape" in key:
    mape += value

  if "rmse" in key:
    rmse += value

  if "rsqr" in key:
    rsqr += value

print(f"Test average mean absolute error: {mae/i}")
print(f"Test average mean absolute percentage error: {mape/i}")
print(f"Test average root mean squared error: {rmse/i}")
print(f"Test average R2: {rsqr/i}")

Test average mean absolute error: 0.32231699440063866
Test average mean absolute percentage error: 3810.3529474132906
Test average root mean squared error: 0.24667910108606264
Test average R2: 0.4441938280204784
