In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import json
import yaml
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, plot_importance
import scienceplots

import os
import sys
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
load_dotenv()
REPO_PATH = os.getenv("REPO_PATH")
if not REPO_PATH:
    # Without this guard an unset variable would silently put 'Nonesrc' on sys.path
    raise EnvironmentError(
        "REPO_PATH is not set; define it in the environment or in a .env file"
    )

# Import main utility functions
# os.path.join works whether or not REPO_PATH ends with a path separator
sys.path.insert(0, os.path.join(REPO_PATH, 'src'))
from utils.main_utils import load_processed
from utils.forecast_utils import mean_directional_accuracy
# NOTE(review): the 'science' style is presumably registered by the scienceplots import — confirm
plt.style.use('science')

### Import data

In [None]:
FUTURE = 'CLc1'
TOPICS = ['CRU', 'CWP', 'CEN']
TEST_SIZE = 0.2  # fraction of rows held out at the end of the sample

# Load the YAML variable config file
with open(os.path.join(REPO_PATH, 'variable_config.yaml'), 'r') as file:
    var_config = yaml.load(file, Loader=yaml.FullLoader)

df = load_processed(FUTURE)[FUTURE]

# Features are every column except the target.
# Alternative feature set: df[df.filter(like='_SI').columns]
X_raw = df.drop(columns=['TARGET_1'])
y = df['TARGET_1']

# Split BEFORE scaling. shuffle=False keeps row order, so the test set is the
# last TEST_SIZE share of rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=TEST_SIZE, shuffle=False
)

# Fit the scaler on the training rows only; fitting on the full frame would
# leak test-set medians/IQRs into the training features.
scaler = RobustScaler().fit(X_train_raw)


def _scale(frame):
    """Apply the train-fitted scaler, keeping the frame's labels and index."""
    return pd.DataFrame(
        scaler.transform(frame), columns=frame.columns, index=frame.index
    )


X_train = _scale(X_train_raw)
X_test = _scale(X_test_raw)

# Full scaled feature frame, kept because later cells reference X.columns
X = pd.concat([X_train, X_test])

### Multiple OLS regression

In [None]:
# Multiple linear regression of the target on all training features.
# statsmodels does not add an intercept by itself, so a constant column is added first.
X_const = sm.add_constant(X_train)
ols_spec = sm.OLS(endog=y_train, exog=X_const)
model = ols_spec.fit()

# Text summary table: coefficients, standard errors and fit statistics
print(model.summary())



### Compare regression forecasting performance

In [None]:

def _select_features(spec, columns):
    """Translate the saved LASSO feature filter into labels taken from ``columns``.

    NOTE(review): the layout of lasso_coefs.json is not visible from this
    notebook, so the plausible layouts are all accepted — confirm which applies:
      * dict {feature name: coefficient} -> names with a non-zero coefficient
      * list of feature names            -> those names (in column order)
      * list with one entry per column   -> columns whose entry is non-zero/True
      * list of integers                 -> positional column indices

    Raises ValueError when the filter is empty, unrecognised, or matches no column.
    """
    if isinstance(spec, dict):
        spec = [name for name, coef in spec.items() if coef]
    entries = list(spec)
    if not entries:
        raise ValueError("LASSO feature filter is empty")

    if all(isinstance(entry, str) for entry in entries):
        wanted = set(entries)
        selected = [col for col in columns if col in wanted]
    elif len(entries) == len(columns):
        selected = [col for col, keep in zip(columns, entries) if keep]
    elif all(isinstance(entry, int) for entry in entries):
        selected = [columns[pos] for pos in entries]
    else:
        raise ValueError("Unrecognised layout for the LASSO feature filter")

    if not selected:
        raise ValueError("LASSO feature filter does not match any column of X")
    return pd.Index(selected)


# OLS
# has_constant='add' forces an intercept column even when a feature happens to
# be constant within the slice, so train and test designs always line up.
X_const = sm.add_constant(X_train, has_constant='add')
model = sm.OLS(y_train, X_const).fit()
X_const = sm.add_constant(X_test, has_constant='add')

# LASSO
LAMBDA: float = 0.20696
lasso = Lasso(alpha=LAMBDA, max_iter=10000)
lasso.fit(X_train, y_train)

# predict with XGBoost
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

# Reduce number of features
# (context manager closes the file; the original left the handle open)
with open('feature_filters/lasso_coefs.json', 'r') as fh:
    lasso_features = json.load(fh)

# The original indexed X.columns with an undefined name (`features_names`),
# which raises NameError on a fresh kernel; derive the selection from the
# loaded filter instead.
features = _select_features(lasso_features, X.columns)
X_train_BE = X_train[features]
X_test_BE = X_test[features]

# OLS Reduced
X_const_R = sm.add_constant(X_train_BE, has_constant='add')
model_R = sm.OLS(y_train, X_const_R).fit()
X_const_R = sm.add_constant(X_test_BE, has_constant='add')

# LASSO Reduced
lasso_R = Lasso(alpha=LAMBDA, max_iter=10000)
lasso_R.fit(X_train_BE, y_train)

# predict with Reduced XGBoost
xgb_R = XGBRegressor()
xgb_R.fit(X_train_BE, y_train)


# Out-of-sample forecasts of every fitted model next to the realised target.
# The full-feature OLS forecast (model.predict(X_const)) is deliberately left out.
forecasts = {
    'True': y_test,
    'OLS_R': model_R.predict(X_const_R),
    'Lasso': lasso.predict(X_test),
    'Lasso_R': lasso_R.predict(X_test_BE),
    'XGBoost': xgb.predict(X_test),
    'XGBoost_R': xgb_R.predict(X_test_BE),
}
predict_df = pd.DataFrame(forecasts, index=y_test.index)

# Plot only the last `window` test rows so the individual series stay readable
window = 500
fig, ax = plt.subplots(figsize=(10, 5), dpi=200)
recent_forecasts = predict_df.tail(window)
recent_forecasts.plot(ax=ax, alpha=0.7)
ax.legend(frameon=False)

# XGBoost feature importance: top 20 features of the full-feature model

fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=200)
plot_importance(xgb, ax=ax, max_num_features=20)
# No ax.legend() here: the importance bars carry no labels, so the call only
# produced a "no artists with labels" warning and an empty legend.
ax.grid(alpha=0.2)


### Metrics

In [None]:
# Score every column of predict_df against the realised target.
# The 'True' column is scored against itself as well, giving a reference row.
actual = predict_df['True']
scorers = {
    'MSE': mean_squared_error,
    'MAE': mean_absolute_error,
    'DA': mean_directional_accuracy,
}

metrics = pd.DataFrame(
    {
        label: predict_df.apply(lambda col, score=score: score(actual, col), axis=0)
        for label, score in scorers.items()
    }
)

display(metrics)