In [63]:
import os
import time

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.compose import ColumnTransformer
# NOTE: StandardScaler is only incidentally importable from discriminant_analysis
# (that module imports it for its own use). The sklearn.preprocessing import below
# is the public location and rebinds the same name.
from sklearn.discriminant_analysis import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from utils import run_kfold_eval, calculate_metric

In [2]:
# Input workbooks live under DATA_FOLDER; fitted models are written to OUTPUT_PATH.
DATA_FOLDER = "../data"
TRAIN_FEATURES = f"{DATA_FOLDER}/train_features2.xlsx"
TRAIN_LABELS = f"{DATA_FOLDER}/train_labels2.xlsx"
TEST_FEATURES = f"{DATA_FOLDER}/test_features2.xlsx"
TEST_LABELS = f"{DATA_FOLDER}/test_labels2.xlsx"
OUTPUT_PATH = "../output/linear_regression"

In [3]:
# Load the four workbooks in a fixed order:
# train features, train labels, test features, test labels.
train_features, train_labels, test_features, test_labels = (
    pd.read_excel(workbook)
    for workbook in (TRAIN_FEATURES, TRAIN_LABELS, TEST_FEATURES, TEST_LABELS)
)

In [4]:
# Peek at the first five test rows to sanity-check the parsed columns.
test_features.head(n=5)

Unnamed: 0,coupon rate,SP500 MD,Average daily 1-year SP500 return,Ratio to MA,US Corporate Bond Yield Spread,US Corporate Bond Yield Spread(3-5 year),US Corporate Bond Yield Spread(5-7 year),US Corporate Bond Yield Spread(7-10 year),US Corporate Bond Yield Spread(10+ year),US Generic Govt 3 Month Yield,...,event_type_subcategory_sum_Missing Coupon payment only,event_type_subcategory_sum_Missing Interest payment,event_type_subcategory_sum_Missing Loan payment,event_type_subcategory_sum_Missing Principal payment,event_type_subcategory_sum_Others,event_type_subcategory_sum_Pre-Negotiated Chapter 11,event_type_subcategory_sum_Protection,event_type_subcategory_sum_Receivership,event_type_subcategory_sum_Rehabilitation,event_type_subcategory_sum_Restructuring
0,3.125,-122.3968,-0.001341,1.116103,396.408936,399.031738,455.0625,383.612488,341.675812,0.1734,...,False,False,False,False,False,False,False,False,False,False
1,15.0,156.01912,0.00079,6.372588,107.083778,76.41436,118.2482,113.403519,147.756165,0.0253,...,False,False,False,False,False,False,False,False,False,False
2,0.0,-451.10824,-0.002033,43.914617,637.607544,689.770996,683.3611,589.969666,529.166992,0.0406,...,False,False,False,True,False,False,False,False,False,False
3,5.875,-751.2212,-0.000435,74.783398,153.653778,129.425842,158.3252,172.708054,187.14798,1.4908,...,False,False,False,False,False,False,False,False,False,False
4,10.0,55.69036,0.000712,-28.522133,152.290619,148.38533,222.1811,142.140915,159.624313,0.0763,...,False,False,False,False,False,False,False,False,False,False


In [5]:
# Row/column counts for each split; features and labels must agree on row count.
for frame in (train_features, train_labels, test_features, test_labels):
    print(frame.shape)

(1293, 317)
(1293, 1)
(432, 317)
(432, 1)


In [58]:
# Split the columns by dtype: integer and boolean columns are treated as
# categorical indicators, every other column as a continuous feature.
# (An earlier, now disabled, approach matched column names against a list of
# category prefixes such as 'currency', 'Industry_sector' or 'event_type'.)
feature_list = train_features.columns
category_features = train_features.select_dtypes(include=['int', 'bool']).columns
non_category_features = [
    column for column in feature_list if column not in category_features
]

# Report the size of each group: categorical first, then continuous.
for column_group in (category_features, non_category_features):
    print(len(column_group))

164
153


In [59]:
# Display the columns that will be standardised (everything not int/bool typed).
non_category_features

['coupon rate',
 'SP500 MD',
 'Average daily 1-year SP500 return',
 'Ratio to MA',
 'US Corporate Bond Yield Spread',
 'US Corporate Bond Yield Spread(3-5 year)',
 'US Corporate Bond Yield Spread(5-7 year)',
 'US Corporate Bond Yield Spread(7-10 year)',
 'US Corporate Bond Yield Spread(10+ year)',
 'US Generic Govt 3 Month Yield',
 'US Generic Govt 6 Month Yield',
 'US Generic Govt 12 Month Yield',
 'US Generic Govt 2 Year Yield',
 'US Generic Govt 3 Year Yield',
 'US Generic Govt 5 Year Yield',
 'US Generic Govt 7 Year Yield',
 'US Generic Govt 10 Year Yield',
 'marketcap',
 'sector_domicile_dtd',
 'sector_exchange_dtd',
 'sector_dtd',
 'PD_1_domicile_sector',
 'PD_3_domicile_sector',
 'PD_12_domicile_sector',
 'PD_1_domicile_subsec',
 'PD_3_domicile_subsec',
 'PD_12_domicile_subsec',
 'PD_1_exch_sector',
 'PD_3_exch_sector',
 'PD_12_exch_sector',
 'PD_1_exch_subsector',
 'PD_3_exch_subsector',
 'PD_12_exch_subsector',
 'PD_1_global_sector',
 'PD_3_global_sector',
 'PD_12_global_secto

In [60]:
# Column-wise preprocessing: standardise the continuous columns only and pass
# every remaining (categorical) column through unchanged.
numeric_scaling = ('num', StandardScaler(), non_category_features)

scaler = ColumnTransformer(
    transformers=[numeric_scaling],
    remainder='passthrough',
)

In [62]:
# Fit the scaling + linear-regression pipeline on the whole training split,
# report the wall-clock training time, then predict on the test split.
start_time = time.time()

pipeline = Pipeline([
        ('scaler', scaler),
        ('linear_reg', LinearRegression())
    ])

# Fit on the 1-D 'rr1_30' target instead of the (n, 1) label frame. With a
# column-vector target the regressor returns (n, 1) predictions, which is
# inconsistent with the plain-model cell and can broadcast against the 1-D
# test labels inside element-wise metric code.
pipeline.fit(train_features, train_labels['rr1_30'].to_numpy())

elapsed_time = time.time() - start_time
print(f"Training time: {elapsed_time:.3f} seconds")

# Predict on the test split; the result is 1-D with one value per test row.
predictions = pipeline.predict(test_features)


Training time: 28.077 seconds


In [54]:
# Baseline without preprocessing: ordinary least squares on the raw features.
# The timed window covers model construction and fitting.
start_time = time.time()

model = LinearRegression().fit(train_features, train_labels['rr1_30'].to_numpy())

elapsed_time = time.time() - start_time
print(f"Training time: {elapsed_time:.3f} seconds")

# Test-split predictions; this rebinds `predictions`, replacing any earlier value.
predictions = model.predict(test_features)

Training time: 28.184 seconds


In [57]:

# Score the current `predictions` against the held-out test labels and print
# each metric. Nothing is written to disk here.
test_targets = test_labels['rr1_30'].to_numpy()
mae, mape, rmse, rsqr = calculate_metric(predictions, test_targets)

holdout_report = {
    "Val average mean absolute error": mae,
    "Val average mean absolute percentage error": mape,
    "Val average root mean squared error": rmse,
    "Val average R2": rsqr,
}
for metric_label, metric_value in holdout_report.items():
    print(f"{metric_label}: {metric_value}")

Val average mean absolute error: 0.21056645309037056
Val average mean absolute percentage error: 472.0187800535398
Val average root mean squared error: 0.28605640896599416
Val average R2: 0.2856171060613343


In [65]:
# Persist the fitted scaling + regression pipeline to OUTPUT_PATH.
# joblib.dump does not create missing folders, so make sure the target
# directory exists first (a fresh checkout has no output folder).
os.makedirs(OUTPUT_PATH, exist_ok=True)
joblib.dump(pipeline, OUTPUT_PATH + '/linear_model.pkl')

['../output/linear_regression/linear_model.pkl']

5-fold CV

In [36]:
# Pool the train and test feature rows (train first) under a fresh 0..n-1 index
# so the cross-validation below can re-split the full data set.
features = pd.concat((train_features, test_features), ignore_index=True)
features

Unnamed: 0,coupon rate,SP500 MD,Average daily 1-year SP500 return,Ratio to MA,US Corporate Bond Yield Spread,US Corporate Bond Yield Spread(3-5 year),US Corporate Bond Yield Spread(5-7 year),US Corporate Bond Yield Spread(7-10 year),US Corporate Bond Yield Spread(10+ year),US Generic Govt 3 Month Yield,...,event_type_subcategory_sum_Missing Coupon payment only,event_type_subcategory_sum_Missing Interest payment,event_type_subcategory_sum_Missing Loan payment,event_type_subcategory_sum_Missing Principal payment,event_type_subcategory_sum_Others,event_type_subcategory_sum_Pre-Negotiated Chapter 11,event_type_subcategory_sum_Protection,event_type_subcategory_sum_Receivership,event_type_subcategory_sum_Rehabilitation,event_type_subcategory_sum_Restructuring
0,7.500,-117.46020,-0.000189,125.407139,177.213028,134.012054,198.8153,191.364395,223.346344,0.1983,...,True,False,False,False,False,False,False,False,False,False
1,6.000,166.38276,0.000768,-4.603446,101.613617,77.032829,123.3998,105.932022,139.111115,0.0355,...,False,False,False,False,False,False,False,False,False,False
2,11.000,119.85752,0.000678,-11.950380,104.545959,77.416649,129.4317,111.818001,139.717407,0.0101,...,False,False,False,False,False,False,False,False,False,False
3,9.125,653.51208,0.001638,-2.494861,90.736633,64.654129,95.3731,92.141212,121.666237,0.0152,...,False,False,False,False,False,False,False,False,False,False
4,9.250,231.89472,0.000664,4.823413,98.533821,68.759308,93.4174,107.424469,139.741165,1.2865,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,3.250,147.33344,0.000556,3.538252,128.976395,95.360374,138.8445,138.946106,172.733887,0.0000,...,False,False,False,False,False,False,False,False,False,False
1721,10.000,175.31656,0.000554,-4.067337,121.170998,92.879501,123.2500,131.104904,162.916901,0.7516,...,False,True,False,False,False,False,False,False,False,False
1722,7.450,315.81748,0.000747,8.604100,126.786606,89.018188,114.9728,131.522430,171.701096,0.0864,...,False,False,False,False,False,False,False,False,False,False
1723,0.500,31.75120,-0.000157,-4.373852,126.595230,105.460007,121.6212,147.382416,163.772141,4.8375,...,False,False,False,False,False,False,False,False,False,False


In [37]:
# Pool the labels in the same order as the features (train first, then test)
# so the rows stay aligned after re-indexing.
labels = pd.concat((train_labels, test_labels), ignore_index=True)
labels

Unnamed: 0,rr1_30
0,0.082481
1,0.378845
2,0.836149
3,0.987208
4,1.021458
...,...
1720,0.471411
1721,0.823750
1722,0.241612
1723,0.762054


In [47]:
# 5-fold evaluation over the pooled data using the project helper.
# NOTE(review): `model` is the plain LinearRegression fitted earlier, not the
# scaling pipeline; presumably the helper refits it per fold. Confirm in utils.
train_preds, oof_preds, feat_importances, train_metrics, val_metrics = run_kfold_eval(
    features,
    labels["rr1_30"].to_numpy(),
    n_folds=5,
    model=model,
    model_params=None,
    output_path=OUTPUT_PATH,
    name="linear_reg",
)

100%|██████████| 5/5 [02:22<00:00, 28.48s/it]


In [48]:
# Average each training metric across the CV folds.
# A key contributes to a metric when the metric name is a substring of the key
# (presumably keys look like "mae_fold0"; verify against run_kfold_eval).
# The number of "mae" keys is used as the fold count.
totals = {"mae": 0, "mape": 0, "rmse": 0, "rsqr": 0}
i = 0

for key, value in train_metrics.items():
  for metric_name in totals:
    if metric_name in key:
      totals[metric_name] += value
  if "mae" in key:
    i += 1

mae, mape, rmse, rsqr = (totals[name] for name in ("mae", "mape", "rmse", "rsqr"))

print(f"Training average mean absolute error: {mae/i}")
print(f"Training average mean absolute percentage error: {mape/i}")
print(f"Training average root mean squared error: {rmse/i}")
print(f"Training average R2: {rsqr/i}")

Training average mean absolute error: 0.15602937662812555
Training average mean absolute percentage error: 1447.9778511561885
Training average root mean squared error: 0.2071453858624984
Training average R2: 0.6097044969149962


In [49]:
# Average each held-out (validation fold) metric across the CV folds.
# A key such as "mae_fold0" contributes to a metric when the metric name is a
# substring of the key; the number of "mae" keys is used as the fold count.
totals = {"mae": 0, "mape": 0, "rmse": 0, "rsqr": 0}
i = 0

for key, value in val_metrics.items():
  for metric_name in totals:
    if metric_name in key:
      totals[metric_name] += value
  if "mae" in key:
    i += 1

mae, mape, rmse, rsqr = (totals[name] for name in ("mae", "mape", "rmse", "rsqr"))

print(f"Test average mean absolute error: {mae/i}")
print(f"Test average mean absolute percentage error: {mape/i}")
print(f"Test average root mean squared error: {rmse/i}")
print(f"Test average R2: {rsqr/i}")

Test average mean absolute error: 0.21607959666366022
Test average mean absolute percentage error: 2255.951794265154
Test average root mean squared error: 0.34231185398368325
Test average R2: -0.07838209164255212


In [50]:
# Show the raw per-fold validation metrics behind the averages; a single
# fold with an extreme value can dominate an averaged metric.
val_metrics

{'mae_fold0': 0.21737792434139963,
 'mape_fold0': 93.6694475920351,
 'rmse_fold0': 0.4011637560693921,
 'rsqr_fold0': -0.49989174176432094,
 'mae_fold1': 0.21554973096457702,
 'mape_fold1': 106.56791571734804,
 'rmse_fold1': 0.29261814065592445,
 'rsqr_fold1': 0.19112106809911156,
 'mae_fold2': 0.22227362593129396,
 'mape_fold2': 10849.800434914046,
 'rmse_fold2': 0.3384556240920935,
 'rsqr_fold2': -0.02168428759211305,
 'mae_fold3': 0.20441889433076663,
 'mape_fold3': 116.8064200868734,
 'rmse_fold3': 0.29684942415213983,
 'rsqr_fold3': 0.12509647064016893,
 'mae_fold4': 0.2207778077502637,
 'mape_fold4': 112.9147530154673,
 'rmse_fold4': 0.3824723249488666,
 'rsqr_fold4': -0.1865519675956071}