## Import packages used

In [2]:
import numpy as np
import pandas as pd
import midasmlpy.date_functions as datef # used to handle different frequencies of data and to create lags
import midasmlpy.sparse_group_lasso as sgl
from sklearn.model_selection import train_test_split

## Load data

Load data from Excel

In [3]:
import os

# Resolve both workbook names to absolute paths (relative to the current working directory)
predictors_path = os.path.abspath('predictors-monthly.xlsx')
target_path = os.path.abspath('gdp-quarterly.xlsx')

# Read each workbook and convert the resulting frame to a plain NumPy array
Predictors = pd.read_excel(predictors_path).to_numpy()
Target = pd.read_excel(target_path).to_numpy()

In [4]:
# Quick look at the raw predictor array; column 0 holds the observation timestamps.
print(Predictors)

[[Timestamp('1980-01-01 00:00:00') 0.00141245089699638
  -0.0011552922185345 ... 0.00903300474278623 -0.00604040201996625
  21.0903]
 [Timestamp('1980-02-01 00:00:00') -0.00456617930994874
  -0.00389234838978325 ... -0.0148620800568615 0.0140006758812898 22.2919]
 [Timestamp('1980-03-01 00:00:00') -0.00530077784951999
  -0.00343020822286455 ... 0.000185183409865175 -0.00496429192770176
  29.2535]
 ...
 [Timestamp('2023-08-01 00:00:00') 0.000748921515789647
  0.00175864553192895 ... 0.0007155070429814 -0.00600931526191673 15.7822]
 [Timestamp('2023-09-01 00:00:00') -0.000509381619856697
  0.000725495468836712 ... -0.00163704170406653 0.00331903172316572
  15.0424]
 [Timestamp('2023-10-01 00:00:00') 0.00221083275208933
  0.00309969441820179 ... -0.000859076321829022 -0.00445350854847071
  19.0462]]


Split data into dates and data tables

In [5]:
# Column 0 of each table holds the dates.
# Y is column 1 of Target; X is every remaining column of Predictors.
Y_date, Y = Target[:, 0], Target[:, 1]
X_date, X = Predictors[:, 0], Predictors[:, 1:]

## Transform data using functions from date_functions

Define variables used in transformation

In [6]:
# Lag structure and forecast horizon handed to the data transformation
x_lags, y_lags, horizon = 3, 0, 0

# Legendre matrix setting: polynomial 0 plus 3 further degrees
degree = 4

Call data transformation function

In [7]:
# Run the package's transformation on the raw series, using the lag, horizon
# and degree settings chosen in the previous cell; standardisation is switched on.
transformed_data = datef.data_transform(
    Y, Y_date, X, X_date,
    x_lags, y_lags, horizon,
    degree=degree,
    standardize=True,
)

In [8]:
# Inspect the dictionary returned by the transformation
# (the 'Y' and 'X_tilde' entries are the ones used further down).
print(transformed_data)

{'Y': array([ -8.32783,  -0.4757 ,   7.39099,   7.76189,  -2.97534,   4.76156,
        -4.38231,  -6.26336,   1.82056,  -1.532  ,   0.16   ,   5.23673,
         9.00007,   7.91695,   8.25888,   7.74372,   6.85202,   3.83757,
         3.26969,   3.85726,   3.50648,   6.0629 ,   2.9628 ,   3.71766,
         1.79714,   3.80848,   2.13984,   2.95643,   4.28963,   3.45376,
         6.80953,   2.06221,   5.2219 ,   2.33704,   5.29463,   4.04506,
         3.04081,   2.95202,   0.78715,   4.34781,   1.44925,   0.26624,
        -3.6583 ,  -1.87616,   3.10666,   2.01634,   1.39176,   4.76057,
         4.31387,   3.93304,   4.14982,   0.66722,   2.32193,   1.90408,
         5.40393,   3.86274,   5.38433,   2.33153,   4.55623,   1.41661,
         1.19151,   3.38851,   2.70715,   2.98503,   6.61675,   3.57214,
         4.13206,   2.57348,   6.60602,   4.96335,   3.40094,   3.995  ,
         3.68572,   5.00433,   6.38554,   3.74019,   3.32511,   5.26868,
         6.50941,   1.44855,   7.2196 ,   0.4

In [9]:
# Pull the target and the design matrix out of the transformation result.
# Note: this rebinds X, which up to this point referred to the raw predictors.
y = transformed_data['Y']
X = transformed_data['X_tilde']

# 80/20 train/test split, shuffled with a fixed seed.
# NOTE(review): the observations appear to be time-ordered; a shuffled split
# mixes earlier and later periods across train and test - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [10]:
# X now refers to transformed_data['X_tilde'], not the raw predictor columns.
print(X)

[[-0.60800157 -1.39179907 -0.37670042 ...  1.31756328  1.10760184
  -1.38044112]
 [-0.44238384 -1.51913724  0.30793826 ...  2.7631645   0.59630858
  -0.37073024]
 [-0.0887135   0.18339862 -0.34210778 ... -0.9591036   0.54172414
  -2.72972071]
 ...
 [-2.51649855 -0.47471425 -2.62842862 ...  2.27120835  0.67846499
  -0.27717866]
 [-0.7712069  -0.64768512  2.71367121 ...  4.26015672 -1.33798006
  -2.80561436]
 [-0.99450877 -0.725138   -2.19637    ...  3.46932807 -0.24269991
  -0.42339739]]


In [11]:
# Target vector taken from transformed_data['Y'].
print(y)

[ -8.32783  -0.4757    7.39099   7.76189  -2.97534   4.76156  -4.38231
  -6.26336   1.82056  -1.532     0.16      5.23673   9.00007   7.91695
   8.25888   7.74372   6.85202   3.83757   3.26969   3.85726   3.50648
   6.0629    2.9628    3.71766   1.79714   3.80848   2.13984   2.95643
   4.28963   3.45376   6.80953   2.06221   5.2219    2.33704   5.29463
   4.04506   3.04081   2.95202   0.78715   4.34781   1.44925   0.26624
  -3.6583   -1.87616   3.10666   2.01634   1.39176   4.76057   4.31387
   3.93304   4.14982   0.66722   2.32193   1.90408   5.40393   3.86274
   5.38433   2.33153   4.55623   1.41661   1.19151   3.38851   2.70715
   2.98503   6.61675   3.57214   4.13206   2.57348   6.60602   4.96335
   3.40094   3.995     3.68572   5.00433   6.38554   3.74019   3.32511
   5.26868   6.50941   1.44855   7.2196    0.40752   2.38105  -1.31335
   2.49024  -1.60578   1.09759   3.33152   2.44335   1.62275   0.49411
   2.10104   3.52689   6.59808   4.61615   2.26021   3.08786   3.77649
   4.0

## sgLasso gaussian

In [12]:
# Call the module's self-test hook; it prints a status line when invoked.
sgl.test_module()

Testing the sparse group LASSO module...


In [13]:
# `sgl` is already imported in the notebook's first cell, so it is not re-imported here.

# Let the package search for the best model on the training split:
# gaussian family, groups of `degree` columns, 3-fold cross-validation,
# no intercept (intr=False). alpha is left as None with alpha_values=11.
model2 = sgl.best_model(
    x=X_train,
    y=y_train,
    group_size=degree,
    family='gaussian',
    nlam=100,
    pmax=122,
    intr=False,
    k_folds=3,
    disp_flag=True,
    alpha_values=11,
    alpha=None,
)

print(model2)

In [None]:
# Show the 'best_performance' entry of the result returned by sgl.best_model.
model2['best_performance']

In [None]:
# Inputs: the training portion of the earlier train/test split
x, y = X_train, y_train

# Estimation settings (same values as passed to sgl.best_model above)
group_size = degree
family = 'gaussian'
nlam, pmax = 100, 122
intr = False
k_folds = 3
disp_flag = True
alpha_values, alpha = 11, None

# Fixed value handed to sgLASSO_estimation as its fourth argument
alsparse = 0.5

In [None]:
# Fit on the whole training set first. `alam` from this fit is handed to every
# fold fit below via `ulam`, so all folds are evaluated on one common sequence
# (presumably the lambda grid - confirm against sgLASSO_estimation).
b0, beta, alam, npass, jerr, mse = sgl.sgLASSO_estimation(x, y, group_size, alsparse, family, pmax, intr)

# Pick the splitter that matches the model family; fail loudly on anything else
# instead of hitting an undefined `kf` further down.
# NOTE(review): the splitters are reached through the sgl namespace; presumably
# these are scikit-learn's KFold / StratifiedKFold - confirm.
if family == 'binomial':
    kf = sgl.StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
elif family == 'gaussian':
    kf = sgl.KFold(n_splits=k_folds, shuffle=True, random_state=42)
else:
    raise ValueError(f"Unsupported family: {family!r} (expected 'gaussian' or 'binomial')")

# One entry per fold; each entry is whatever the evaluate_* helper returns for that fold.
performance = []
for fold_train_idx, fold_test_idx in kf.split(x, y):
    # Fold-local names on purpose: the notebook-level y_train / y_test (the outer
    # hold-out split) and alam / npass / jerr (the full fit) must not be overwritten,
    # otherwise re-running the setup cell would pair a fold-sized y with the full x.
    x_fold_train, x_fold_test = x[fold_train_idx], x[fold_test_idx]
    y_fold_train, y_fold_test = y[fold_train_idx], y[fold_test_idx]

    # Re-estimate on this fold's training part, reusing the sequence from the full fit
    b0test, beta_test, alam_fold, npass_fold, jerr_fold, mse_test = sgl.sgLASSO_estimation(
        x_fold_train, y_fold_train, group_size, alsparse, family,
        pmax=pmax, intr=intr, ulam=alam,
    )

    # Score the fold with the same intercept setting that was used for estimation
    if family == 'gaussian':
        performance.append(sgl.evaluate_gaussian(x_fold_test, y_fold_test, b0test, beta_test, intr=intr, eval='mse'))
    else:  # 'binomial' - any other value was rejected above
        performance.append(sgl.evaluate_binomials(x_fold_test, y_fold_test, b0test, beta_test, intr=intr, eval='auc', threshold=0.5))

# Average over folds (axis 0), then select the best position:
# highest AUC for binomial, lowest MSE for gaussian.
performance = np.array(performance)
mean_performance = np.mean(performance, axis=0)
if family == 'binomial':
    best_lambda = np.argmax(mean_performance)
else:
    best_lambda = np.argmin(mean_performance)

In [None]:
# Fold scores averaged over the folds (np.mean with axis=0).
mean_performance

In [None]:
# Mean cross-validation score at the selected index
# (argmin for the gaussian family, argmax for binomial).
mean_performance[best_lambda]

In [None]:
# Index of the smallest mean score; this equals best_lambda when family == 'gaussian'.
np.argmin(mean_performance)

In [None]:
# Start again from the full transformed data set
y = transformed_data['Y']
x = transformed_data['X_tilde']

# Ordered 80/20 split: the first 80% of rows form the training set and the
# remaining rows the test set (no shuffling).
n_rows = x.shape[0]
train_size = int(0.8 * n_rows)
x_train, y_train = x[:train_size], y[:train_size]
x_test, y_test = x[train_size:], y[train_size:]

In [None]:

# Estimate on the ordered training slice
b0, beta, alam, npass, jerr, mse = sgl.sgLASSO_estimation(x_train, y_train, group_size, alsparse, family, pmax, intr)

# Score every fitted model (one per entry of b0) on the held-out slice.
# `predictions` is left holding the values of the final iteration, which the
# comparison cell further down reads.
evaluation_scores = []
for model_idx in range(len(b0)):
    predictions = np.dot(x_test, beta[:, model_idx]) + b0[model_idx]
    evaluation_scores.append(sgl.mean_squared_error(y_test, predictions))


In [None]:
# Hold-out mean squared error for each fitted model, one entry per element of b0.
evaluation_scores

In [None]:
# Single-column frame of predictions with the actual test values as the row index.
# `predictions` comes from the last iteration of the evaluation loop only.
pd.DataFrame(data=predictions, index=y_test)

In [None]:
# Same hold-out scores as a one-column DataFrame for tabular display.
pd.DataFrame(evaluation_scores)