In [None]:
import numpy as np
import pandas as pd
import json
import yaml
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import scienceplots

import os
import sys
from dotenv import load_dotenv

# Read environment variables from a local .env file (if any) so REPO_PATH
# can be configured per machine instead of being hard-coded here.
load_dotenv()

_repo_path_env = os.getenv("REPO_PATH")
if not _repo_path_env:
    # Without this guard a missing variable is formatted as the literal string
    # 'None' in every path below, and the failure surfaces much later as a
    # confusing import / file-not-found error.
    raise RuntimeError(
        "REPO_PATH is not set; define it in the environment or in a .env file."
    )

# Normalise to a path that always ends with a separator: the notebook builds
# paths as f'{REPO_PATH}<name>', which only works with the trailing separator.
REPO_PATH = os.path.join(_repo_path_env, "")

# Import main utility functions
_src_path = os.path.join(REPO_PATH, 'src')
if _src_path not in sys.path:
    # Guarded so re-running this cell does not keep stacking duplicate entries.
    sys.path.insert(0, _src_path)
from utils.main_utils import load_processed
from utils.lasso_utils import plot_lasso
from utils.forecast_utils import calculate_metrics

# The 'science' style is registered by the `scienceplots` import above.
plt.style.use('science')

### Load data

In [None]:
FUTURES = ['CLc1', 'LCOc1']
TOPICS = ['CRU', 'CWP', 'CEN']

# `load_processed` is a project helper; its result is indexed by future code
# below (presumably one DataFrame per future — confirm in utils.main_utils).
dfs = load_processed(FUTURES)

# NOTE(review): FullLoader can build more than plain scalars/lists/dicts. That
# is acceptable for a trusted, repo-local config file; consider
# yaml.safe_load if the file only holds plain data.
with open(f'{REPO_PATH}variable_config.yaml', 'r') as file:
    var_config = yaml.load(file, Loader=yaml.FullLoader)

df = dfs['CLc1']

# Tag used as the prefix of the figure file name saved later in the notebook.
ID = 'BT'

# Feature columns are the concatenation of the 'BASE' and 'TEMPORAL' entries
# of the config (assumed to be lists of column names).
X_raw = df[var_config['BASE'] + var_config['TEMPORAL']]

y = df['TARGET_1']

# Split chronologically (no shuffling) BEFORE scaling, so the scaler never
# sees the hold-out rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, shuffle=False
)

# scale feature variables: fit on the training rows only, then apply the same
# transformation to the test rows (fitting on the full data would leak
# test-period statistics into training).
scaler = RobustScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index
)

# Keep `X` available as the full scaled feature frame (same columns and row
# order as before) for later cells that reference it.
X = pd.concat([X_train, X_test])

### LASSO Regression

In [None]:
# Candidate regularisation strengths: 500 values, log-spaced in [1e-5, 1].
lambdas = np.logspace(-5, 0, 500)

# Carve a chronological validation tail out of the training data. Scoring each
# lambda on the test set and then reporting test metrics for the winner would
# bias those metrics optimistically, so the test set is not touched here.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=False
)

coefs_lasso = []   # coefficient vector for each lambda, in `lambdas` order
CV_MSE_lasso = []  # validation MSE for each lambda, in `lambdas` order

for alpha in tqdm(lambdas, desc='Finding optimal lambda'):
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X_fit, y_fit)
    coefs_lasso.append(lasso.coef_)
    CV_MSE_lasso.append(
        mean_squared_error(y_val, lasso.predict(X_val))
    )

# One row per lambda; 'MSE' is the validation error used to pick the best one.
lasso_df = pd.DataFrame({'lambda': lambdas, 'MSE': CV_MSE_lasso})

### Plot LASSO results, evaluate the best model, and export the selected features

In [None]:
# Output folders are relative to the working directory; create them up front
# so a fresh checkout does not fail with FileNotFoundError when saving.
os.makedirs('images', exist_ok=True)
os.makedirs('feature_filters', exist_ok=True)

fig = plot_lasso(lasso_df, coefs_lasso)
fig.savefig(f'images/{ID}_LASSO_Results.png')

# get coefficients for the best lambda (the one with the lowest recorded MSE)
best_lambda = lambdas[np.argmin(lasso_df['MSE'])]
lasso = Lasso(alpha=best_lambda, max_iter=10000)
lasso.fit(X_train, y_train)
print('Intercept for the best lambda:', lasso.intercept_)

# Evaluate the refitted model on the hold-out set. `calculate_metrics` is a
# project helper; its result is wrapped into a single-row frame for display.
metrics_df = pd.DataFrame(
    calculate_metrics(
        y_test,
        lasso.predict(X_test),
        decimals=4
    ),
    index=[0]
)

display(metrics_df)

# 1 if LASSO kept the feature (non-zero coefficient), 0 if it was shrunk to zero.
lasso_coefs = {
    col: int(coef != 0.0)
    for col, coef in zip(X.columns, lasso.coef_)
}

# print the number of non-zero coefficients
n_selected = sum(lasso_coefs.values())
print(f'Number of non-zero coefficients: {n_selected}')
print(f'Number of zero coefficients: {len(lasso_coefs) - n_selected}')

# remove all RV_LAG features (they are excluded from the exported filter; the
# counts printed above still include them)
lasso_coefs = {k: v for k, v in lasso_coefs.items() if 'RV_LAG' not in k}

with open('feature_filters/lasso_coefs.json', 'w') as f:
    json.dump(lasso_coefs, f, indent=4)