In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sys

# Put the project's src directory first on the import path so that main_utils can be imported.
# NOTE(review): absolute, machine-specific path - as written this only resolves on this machine.
sys.path.insert(0, r'c:\Users\joneh\master_thesis\src')
from main_utils import *  # wildcard import; load_json and save_path used in this notebook presumably come from here - TODO confirm

# Load the JSON file plot_dict.json via load_json (presumably plotting settings, judging by the file name - TODO confirm).
plot_info = load_json(r'c:\Users\joneh\master_thesis\src\plot_dict.json')

import statsmodels.api as sm
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

### Data combination

In [None]:
# Read both inputs; the first CSV column of each file becomes the row index.
sentiment_data = pd.read_csv(
    r'c:\Users\joneh\master_thesis\data\time_series\sentiment_index.csv',
    index_col=0,
)
futures_data = pd.read_csv(
    r'c:\Users\joneh\master_thesis\data\time_series\TSA_data.csv',
    index_col=0,
)

# Target column: the cond_vol value of the following row (the last row has none and becomes NaN).
futures_data = futures_data.assign(cond_vol_tomorrow=futures_data['cond_vol'].shift(-1))

# Place the two frames side by side, aligned on their index, and keep only complete rows.
data = pd.concat(objs=[futures_data, sentiment_data], axis='columns').dropna()

# Convert the index labels to datetimes.
data.index = pd.to_datetime(data.index)

display(data.head())

### Lag order creation

### Save model input data

In [None]:
# Output location for the model-input table; edit these two values to change it.
# Relative path of the target folder:
relative_path = 'data/model_input'
# Name of the CSV file:
file_name = 'model_input_1.csv'

# Write the combined frame, index included, to the path returned by save_path.
output_file = save_path(relative_path, file_name)
data.to_csv(output_file, index=True)

### Multiple linear regression

In [None]:
# Ordinary least squares: regress cond_vol_tomorrow on every other column of `data`.
y = data['cond_vol_tomorrow']

# Regressors are all remaining columns plus a constant column for the intercept.
X = sm.add_constant(data.drop(columns='cond_vol_tomorrow'))

ols = sm.OLS(y, X)
model = ols.fit()

print(model.summary())


### LASSO and Ridge regression

In [None]:
dependent_variable = 'cond_vol_tomorrow'

# Separate the regressors from the target.
X_unscaled = data.drop(columns=[dependent_variable])
y = data[dependent_variable]

# Ordered hold-out split: with shuffle=False the last 20% of rows form the test set.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X_unscaled, y, test_size=0.2, shuffle=False
)

# Fit the scaler on the training rows only, so that no test-set statistics
# (mean / standard deviation) leak into the models being evaluated.
scaler = StandardScaler().fit(X_train_unscaled)


def _standardize(frame):
    """Apply the train-fitted scaler to `frame`, keeping its column and index labels."""
    return pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)


X_train = _standardize(X_train_unscaled)
X_test = _standardize(X_test_unscaled)
# Full standardized design matrix, kept under its original name `X`.
X = pd.concat([X_train, X_test])

# Grid of regularization strengths: 500 values, log-spaced from 1e-2 to 1e5.
lamdas = np.logspace(-2, 5, 500)

coefs_lasso, CV_MSE_lasso = [], []
coefs_ridge, CV_MSE_ridge = [], []

# NOTE: despite the CV_ prefix these are MSEs on the single hold-out set above.
# The lambda is chosen on that same set, so the "best" MSE reported below is optimistic.
for lam in lamdas:
    lasso = Lasso(alpha=lam, max_iter=10000)
    lasso.fit(X_train, y_train)
    coefs_lasso.append(lasso.coef_)
    CV_MSE_lasso.append(mean_squared_error(y_test, lasso.predict(X_test)))

    ridge = Ridge(alpha=lam)
    ridge.fit(X_train, y_train)
    coefs_ridge.append(ridge.coef_)
    CV_MSE_ridge.append(mean_squared_error(y_test, ridge.predict(X_test)))

# Lambda with the lowest hold-out MSE for each model (computed once, reused below).
best_lambda_lasso = lamdas[np.argmin(CV_MSE_lasso)]
best_lambda_ridge = lamdas[np.argmin(CV_MSE_ridge)]


def _plot_regularization_path(ax_coefs, ax_mse, coefs, mses, best_lambda, model_name):
    """Draw one model's coefficient paths and hold-out MSE curve, marking the best lambda."""
    coef_lines = ax_coefs.plot(lamdas, coefs)
    ax_coefs.set_xscale('log')
    ax_coefs.set_xlabel('Lambda')
    ax_coefs.set_ylabel('Coefficients')
    ax_coefs.set_title(f'{model_name} coefficients as a function of the regularization')
    best_marker = ax_coefs.axvline(best_lambda, color='red', linestyle='--')
    # Build the legend after the marker exists so that it is listed next to the features.
    ax_coefs.legend(
        [*coef_lines, best_marker],
        [*X.columns, 'Lambda with lowest MSE'],
        frameon=False,
    )

    ax_mse.plot(lamdas, mses)
    ax_mse.set_xscale('log')
    ax_mse.set_xlabel('Lambda')
    ax_mse.set_ylabel('MSE')
    ax_mse.set_title('MSE as a function of the regularization')
    ax_mse.axvline(best_lambda, color='red', linestyle='--', label='Lambda with lowest MSE')
    ax_mse.legend(frameon=False)


def _report_best_model(fitted_model):
    """Show the coefficients, intercept and hold-out MSE of a model fitted at its best lambda."""
    display(pd.DataFrame(fitted_model.coef_, X.columns, columns=['Coefficients']))
    print('Intercept for the best lambda:', fitted_model.intercept_)
    print('MSE for the best lambda:', mean_squared_error(y_test, fitted_model.predict(X_test)))


fig, axs = plt.subplots(2, 2, figsize=(15, 12))

# Top row: Lasso.
_plot_regularization_path(
    axs[0, 0], axs[0, 1], coefs_lasso, CV_MSE_lasso, best_lambda_lasso, 'Lasso'
)
lasso = Lasso(alpha=best_lambda_lasso, max_iter=10000)
lasso.fit(X_train, y_train)
_report_best_model(lasso)

# Bottom row: Ridge. The refit uses Ridge, so the reported numbers belong to the ridge model.
_plot_regularization_path(
    axs[1, 0], axs[1, 1], coefs_ridge, CV_MSE_ridge, best_lambda_ridge, 'Ridge'
)
ridge = Ridge(alpha=best_lambda_ridge)
ridge.fit(X_train, y_train)
_report_best_model(ridge)

fig.tight_layout()