In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import scienceplots

import os
import sys
from dotenv import load_dotenv

# Load environment variables from a local .env file and resolve the repository root.
load_dotenv()
REPO_PATH = os.getenv("REPO_PATH")
if not REPO_PATH:
    # Fail here with a clear message instead of later with an opaque path error.
    raise RuntimeError(
        "REPO_PATH is not set; define it in the environment or in a .env file."
    )

# Import main utility functions
# os.path.join inserts the separator when REPO_PATH has no trailing slash, which
# matches how REPO_PATH is combined with sub-directories when the data is loaded.
sys.path.insert(0, os.path.join(REPO_PATH, 'src_HF'))
plt.style.use('science')

### Import data

In [None]:
# Futures tickers to load and the topic prefixes defined alongside them.
FUTURES = ['CLc1', 'LCOc1']
TOPICS = ['CRU', 'CWP', 'CEN']

# One DataFrame per future, read from <REPO_PATH>/data/prepared_data and
# indexed by the parsed 'date' column.
prepared_dir = os.path.join(REPO_PATH, 'data', 'prepared_data')
dfs = {
    ticker: pd.read_csv(
        os.path.join(prepared_dir, f"{ticker}_5min_resampled.csv"),
        index_col='date',
        parse_dates=True,
    )
    for ticker in FUTURES
}



In [None]:
# Work with the CLc1 frame from here on and show which columns it provides.
df = dfs['CLc1']
display(df.columns)

# Overlay the two CRU series on a single axis.
fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=200)

plot = ['CRU_TextBlob_SI', 'CRU_VADER_SI']
for column in plot:
    df[column].plot(ax=ax, label=column)

ax.grid(alpha=0.2)
ax.legend(frameon=False)

### Multiple linear regression

In [None]:
# Multiple linear regression of TARGET_1 on every other column of `df`.
# (A narrower alternative is to keep only the '_SI' columns:
#  X = df[df.filter(like='_SI').columns])
y = df['TARGET_1']
X = df.drop(columns=['TARGET_1'])

# Prepend a constant column so the OLS fit includes an intercept term.
X_const = sm.add_constant(X)
model = sm.OLS(endog=y, exog=X_const).fit()

print(model.summary())



### LASSO and Ridge regression

In [None]:

# Hold out the last 20% of rows as the test set. shuffle=False keeps the original
# row order, so the test set is the tail of the sample.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Scale variables. The scaler is fitted on the training rows only and then applied
# to the test rows; fitting it on the full sample would leak test-set means and
# variances into the training features. `X` itself is left unscaled, so re-running
# this cell always starts from the same input.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

# Grid of 500 log-spaced regularisation strengths between 1e-2 and 1e5.
lamdas = np.logspace(-2, 5, 500)

# Per-lambda coefficient vectors and test-set errors.
# NOTE: despite the "CV_" prefix these lists hold the MSE on the single hold-out
# split above, not a cross-validated error. The names are kept because later
# cells read them.
coefs_lasso = []
CV_MSE_lasso = []

coefs_ridge = []
CV_MSE_ridge = []

for lam in tqdm(lamdas, desc='Fitting Lasso and Ridge'):
    lasso = Lasso(alpha=lam, max_iter=10000)
    lasso.fit(X_train, y_train)
    coefs_lasso.append(lasso.coef_)
    CV_MSE_lasso.append(mean_squared_error(y_test, lasso.predict(X_test)))

    ridge = Ridge(alpha=lam, max_iter=10000)
    ridge.fit(X_train, y_train)
    coefs_ridge.append(ridge.coef_)
    CV_MSE_ridge.append(mean_squared_error(y_test, ridge.predict(X_test)))


In [None]:
# Two-panel figure: coefficient paths (left) and test MSE (right) against lambda.
fig, axs = plt.subplots(1, 2, figsize=(12, 5), dpi=200)

# Only the first two entries are drawn because the figure has two axes; the ridge
# entries are kept so the lists line up if more panels are added.
plot_y = [coefs_lasso, CV_MSE_lasso, coefs_ridge, CV_MSE_ridge]
label_y = ['Mean Square Error', 'Coefficients']
vlines = [CV_MSE_lasso, CV_MSE_ridge]

# get twilight colors
# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which was deprecated
# and later removed from matplotlib.
colormap = plt.get_cmap('twilight_r', 20)

# Set the color cycle to the twilight colormap
axs[0].set_prop_cycle(color=colormap(np.linspace(0, 1, 8)))
axs[1].set_prop_cycle(color=colormap(np.linspace(0, 1, 2)))

for i, ax in enumerate(axs.flatten()):
    # The MSE panel (i == 1) is drawn in black; the coefficient panel uses the cycle.
    ax.plot(lamdas, plot_y[i], color='black' if i == 1 else None, lw=1.2)
    ax.set_xscale('log')
    # adjust tick size
    ax.tick_params(axis='both', which='major', labelsize=13)
    # Raw string: '\l' is not a valid escape sequence in a normal string literal.
    ax.set_xlabel(r'$\lambda$', fontsize=15)
    ax.set_ylabel(label_y[1] if i % 2 == 0 else label_y[0], fontsize=15)
    # Mark the lambda with the lowest test MSE.
    ax.axvline(
        lamdas[np.argmin(vlines[0] if i < 2 else vlines[1])],
        color='red',
        linestyle='-.',
        lw=1
    )

fig.tight_layout()
fig.savefig('LASSO_Results.png')

# Refit Lasso at the lambda that minimises its test MSE and report the fit.
best_lambda_lasso = lamdas[np.argmin(CV_MSE_lasso)]
lasso = Lasso(alpha=best_lambda_lasso, max_iter=10000)
lasso.fit(X_train, y_train)
print('Intercept for the best lambda:', lasso.intercept_)
print('MSE for the best lambda:', mean_squared_error(y_test, lasso.predict(X_test)))

# Same for Ridge. This must be a Ridge estimator: fitting a Lasso at the
# ridge-optimal lambda would report numbers for the wrong model.
best_lambda_ridge = lamdas[np.argmin(CV_MSE_ridge)]
ridge = Ridge(alpha=best_lambda_ridge, max_iter=10000)
ridge.fit(X_train, y_train)
print('Intercept for the best lambda:', ridge.intercept_)
print('MSE for the best lambda:', mean_squared_error(y_test, ridge.predict(X_test)))


In [None]:
# Show the coefficients of the current `lasso` model, one row per column of X.
display(
    pd.Series(lasso.coef_, index=X.columns, name='Coefficients').to_frame()
)


In [None]:
# Imports sit at the top of the cell so the xgboost dependency is visible up front.
from xgboost import XGBRegressor, plot_importance

# Number of trailing test observations shown in the comparison plot.
window = 500

# plot test results
fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=200)

y_test.iloc[-window:].plot(ax=ax, label='True', lw=0.9)

# predict with lasso and backward elimination
# Step 1: fit Lasso at the best lambda on all features to see which coefficients
# are shrunk exactly to zero.
best_lambda = lamdas[np.argmin(CV_MSE_lasso)]
lasso = Lasso(alpha=best_lambda, max_iter=10000)
lasso.fit(X_train, y_train)

# Step 2: keep only the features with a non-zero coefficient.
features = X_train.columns[lasso.coef_ != 0]
X_train_BE = X_train[features]
X_test_BE = X_test[features]

# Step 3: refit on the reduced feature set. The reduced model gets its own name so
# that `lasso` still has one coefficient per column of X; rebinding `lasso` here
# would break the coefficient-table cell if it were re-run afterwards.
lasso_BE = Lasso(alpha=best_lambda, max_iter=10000)
lasso_BE.fit(X_train_BE, y_train)
y_pred_lasso_BE = pd.Series(lasso_BE.predict(X_test_BE), index=y_test.index)
y_pred_lasso_BE.iloc[-window:].plot(ax=ax, label='Lasso BE', lw=0.9)
print('MSE Lasso BE:', mean_squared_error(y_test, y_pred_lasso_BE))

# number of variables in reduced model vs original
print('Number of variables in reduced model:', len(features))
print('Number of variables in original model:', len(X.columns))

# predict with normal OLS
# has_constant='add' forces the intercept column in both matrices. The default
# ('skip') leaves it out whenever a column is already constant, which can give the
# train and test design matrices different shapes.
X_const = sm.add_constant(X_train, has_constant='add')
model = sm.OLS(y_train, X_const).fit()
X_const = sm.add_constant(X_test, has_constant='add')
y_pred = model.predict(X_const)
pd.Series(y_pred, index=y_test.index).iloc[-window:].plot(ax=ax, label='OLS', lw=0.9)
# calculate mse
print('MSE OLS:', mean_squared_error(y_test, y_pred))

# predict with XGBoost (trained on the Lasso-selected features)
xgb = XGBRegressor()
xgb.fit(X_train_BE, y_train)
# Predict once and reuse the result for both the plot and the error metric.
y_pred_xgb = pd.Series(xgb.predict(X_test_BE), index=y_test.index)
y_pred_xgb.iloc[-window:].plot(ax=ax, label='XGBoost', lw=0.6)
# calculate mse
print('MSE XGBoost:', mean_squared_error(y_test, y_pred_xgb))

ax.legend(frameon=False)


# xgb feature importance
fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=200)

# only show top 20 features
plot_importance(xgb, ax=ax, max_num_features=20)

# No legend call here: this axis has no labelled artists, so asking for a legend
# would only emit a warning.
ax.grid(alpha=0.2)
