## Loading

In [314]:
%load_ext autoreload
%autoreload 2

import model as m
import data_processing as dp
import feature_engineering as fe
import make_plots as mp
import parameters as p
import evaluation as e

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import os


default_max_rows = pd.get_option('display.max_rows')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [317]:
assert len(p.missing_files) == 0
merged_data_daily, merged_data_monthly = dp.data_loading(p.daily_files, p.monthly_files)
merged_data_daily = dp.get_sector_data(merged_data_daily)

Loading daily data from existing file.
Loading monthly data from existing file.


In [318]:
data_daily, data_monthly = fe.feature_construction(merged_data_daily, merged_data_monthly)
data_monthly_merged = dp.merge_daily_and_monthly_data(data_daily, data_monthly)

In [319]:
data_monthly_imputed = dp.handle_crosssectional_na(data_monthly_merged)

Dropped 2568 rows with more than 66.53888275111062% missing values.


Increase in missing statistics for each column:
beta_000905: 7.35%
daily_ret_vol_roll_126: 0.48%
return_daily: 0.00%
total_market_value: 5.53%
turnover_daily: 5.53%
000905_close: 0.00%
000905_return_daily: 0.00%
000905_return_monthly: 0.44%
maxret: 0.00%
illiquidity_monthly: 1.01%
mve_log: 6.66%
return_monthly: 0.98%
ret_vol_monthly: 0.98%
std_dolvol_monthly: 1.01%
std_turnover_monthly: 6.40%
zero_trade_days: 1.25%
chmom: 10.99%
mom1m: 1.55%
mom12m: 7.38%
mom6m: 4.30%
mom36m: 17.81%


## feature engineering

## Baseline MOdel

In [330]:
data_monthly_nona = data_monthly_imputed.dropna()
removed_percentage = (1 - len(data_monthly_nona) / len(data_monthly_imputed)) * 100
print(f"Removed {len(data_monthly_nona)} rows, which is {removed_percentage:.2f}%.")

Removed 425857 rows, which is 17.73%.


In [339]:
X_train, y_train, X_val, y_val, X_test, y_test = m.split_train_val_test(data_monthly_nona, predictor="return_monthly")
input_dim=X_train.shape[1]

print("Training set range:")
print(X_train.index.min(), "to", X_train.index.max())

print("Validation set range:")
print(X_val.index.min(), "to", X_val.index.max())

print("Testing set range:")
print(X_test.index.min(), "to", X_test.index.max())

Training set range:
2015-03-31 00:00:00 to 2021-05-31 00:00:00
Validation set range:
2021-05-31 00:00:00 to 2022-11-30 00:00:00
Testing set range:
2022-11-30 00:00:00 to 2024-03-31 00:00:00


In [345]:
model_classes = [
    m.OLSModel(),
    m.OLS3Model(),
    m.PLSModel(),
    m.LASSOModel(),
    m.ElasticNetModel(),
    # m.GBRTModel(),
    m.RFModel(),
    m.XGBoostModel()
]

num_layers_range = range(1, 6)
for num_layers in num_layers_range:
    model_classes.append(m.NNModel(input_dim=input_dim, num_layers=num_layers))

model_r_2 = {}
models_fitted = {}
model_res = pd.DataFrame()
model_res.index = y_val.index
for model_class in model_classes:
    model_name = model_class.name if hasattr(model_class, "name") else model_class.__class__.__name__
    model_class.name = model_name
    print(model_name)
    model_fitted, scaler = m.train(X_train, y_train, model_class)
    validation_res = m.validation(X_val, model_fitted, scaler)
    r_2 = e.calculate_r2_oos(validation_res, y_val.values)

    models_fitted[model_name] = model_fitted
    model_res[model_name] = validation_res
    model_r_2[model_name] = r_2
model_res['y'] = y_val
for model_name, r_2 in model_r_2.items():
    print(f"{model_name}: {r_2}")

OLSModel
OLS3Model
PLSModel
LASSOModel
ElasticNetModel
GBRTModel


KeyboardInterrupt: 

## Hyperparameter tuning

In [None]:
for model_class in model_classes:
    model_name = model_class.name if hasattr(model_class, "name") else model_class.__class__.__name__
    print(model_name)

In [None]:
models_to_tune = [
    m.OLSModel,
    m.OLS3Model,
    m.PLSModel,
    m.LASSOModel,
    m.ElasticNetModel,
    m.GBRTModel,
    m.RFModel,
    m.XGBoostModel
]
best_trials = e.hyperparameter_tuning(X_train, y_train, X_val, y_val, models_to_tune, n_trials=3)

## Feature Importance

In [None]:
data_imputed = dp.fillnas_and_convert(data, dataOffset="Y")
X_train, y_train, X_val, y_val, X_test, y_test = m.split_train_val_test(data_imputed)

In [None]:
features = X_train.columns[:10]  
importance_df, percentage_change_df = e.feathre_importance(model_classes, X_train, y_train, features=features, permutation_importance=False)

In [None]:
mp.macroeconomic_feature_importance(percentage_change_df)
sorted_df = mp.characteristic_feature_importance(importance_df)

In [None]:
importance_df.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)

## Portfolio analysis

In [None]:

deciles  = e.sort_into_deciles(model_res['NNModel_nn4']['y_pred'])


In [None]:
e.form_portfolios(model_res)

In [None]:
model_res