## Import packages used

In [1]:
import numpy as np
import pandas as pd
import midasmlpy.date_functions as datef # used to handle different frequencies of data and to create lags
import midasmlpy.sparse_group_lasso as sgl
from sklearn.model_selection import train_test_split

## Load data

Load data from excel

In [2]:
import os

# load data from xlsx files and create a dataFrame
Predictors = pd.read_excel(os.path.abspath('predictors-monthly.xlsx')).to_numpy()
Target = pd.read_excel(os.path.abspath('gdp-quarterly.xlsx')).to_numpy()

In [3]:
print(Predictors.shape)
print(Target.shape)

(526, 123)
(175, 2)


In [45]:
pred_columns = pd.read_excel(os.path.abspath('predictors-monthly.xlsx')).columns
pred_desc = pd.read_excel(os.path.abspath('predictors-monthly.xlsx')).describe()
# print(pred_columns)
# print(pred_desc)

pred_columns_df = pd.DataFrame(pred_columns)
pred_columns_df = pred_columns_df.iloc[1:]
print(pred_columns_df)

                   0
1                RPI
2            W875RX1
3    DPCERA3M086SBEA
4          CMRMTSPLx
5            RETAILx
..               ...
118         UMCSENTx
119      DTCOLNVHFNM
120         DTCTHFNM
121           INVEST
122          VIXCLSx

[122 rows x 1 columns]


Split data into dates and data tables

In [5]:
# Y data and X and Y dates can also be defined as they are the same for all iterations
Y_date = Target[:,0]
Y = Target[:,1]
X_date = Predictors[:,0]
X = Predictors[:,1:]

## Transform data using functions from data_functions

Define variables ued in transformation

In [6]:
# Lag variables
x_lags = 3
y_lags = 0
horizon = 0

# Legendre matrix
degree = 4 # 3 degrees + polynomial 0

Call data transformation function

In [7]:
transformed_data = datef.data_transform(Y, Y_date, X, X_date, x_lags, y_lags, horizon, degree=degree, standardize=True)

In [8]:
X = transformed_data['X_tilde']
y = transformed_data['Y']

# Split x and y into a 80/20 train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [9]:
len(X)

174

In [10]:
print('X.shape:', X.shape)
print('y.shape:', y.shape)
print('X_train.shape:', X_train.shape)
print('y_train.shape:', y_train.shape)
print('X_test.shape:', X_test.shape)
print('y_test.shape:', y_test.shape)

X.shape: (174, 488)
y.shape: (174,)
X_train.shape: (139, 488)
y_train.shape: (139,)
X_test.shape: (35, 488)
y_test.shape: (35,)


## sgLasso gaussian

In [11]:
# Call the function
model2 = sgl.best_model(x=X_train, y=y_train, group_size=degree, family='gaussian', nlam=100, pmax=122, intr=False, k_folds=3, disp_flag=True, alpha_values=11, alpha=None)

print(model2)

The performance at different values of alpha are:
{np.float64(1.0): np.float64(28.32304), np.float64(0.9): np.float64(28.32275), np.float64(0.8): np.float64(28.32294), np.float64(0.7): np.float64(28.32296), np.float64(0.6): np.float64(28.32561), np.float64(0.5): np.float64(28.31837), np.float64(0.3999999999999999): np.float64(28.31288), np.float64(0.29999999999999993): np.float64(28.30532), np.float64(0.19999999999999996): np.float64(28.30512), np.float64(0.09999999999999998): np.float64(28.31985), np.float64(0.0): np.float64(28.33513)}
{'best_alsparse': np.float64(0.19999999999999996), 'best_performance': np.float64(28.30511676883688), 'b0': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [41]:
beta_values = model2['beta']
beta_shape = beta_values.shape
print('Beta shape:', beta_shape)
beta_values[487]

Beta shape: (488, 100)


array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.00029074, -0.00423961, -0.00620669, -0.00814635, -0.01039119,
       -0.01296431, -0.01581324, -0.01760151, -0.01747689, -0.01715696,
       -0.01683805, -0.01854241, -0.02078368, -0.02268575, -0.02396199,
       -0.02440516, -0.02477555, -0.02498203, -0.02516607, -0.0254674 ,
       -0.02532761, -0.02483421, -0.0240004 , -0.02298018, -0.02171426,
       -0.02050333, -0.01922263, -0.01788042, -0.0165885 , -0.01555803,
       -0.0144565 , -0.01317645, -0.01187635, -0.01019368, -0.0087171 ,
       -0.00731551, -0.00621747, -0.00538327, -0.00510187, -0.00483701,
       -0.0045705 , -0.00365449, -0.0026651 , -0.00209281, -0.00

In [13]:
# Get the best beta values
best_beta_values = model2['best_beta']
best_beta_shape = model2['best_beta'].shape
print(best_beta_values)
print(best_beta_shape)

[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  4.48760646e-05  1.92287789e-04  4.72889880e-04  8.72963442e-04
  1.36246312e-03  1.76361373e-03  2.22455270e-03  2.69262519e-03
  2.78736266e-03  2.78972788e-03  2.68816829e-03  2.64611276e-03
  2.70334889e-03  2.79201433e-03  2.83121289e-03  2.80430332e-03
  2.64363023e-03  2.51633183e-03  2.46739134e-03  2.57590293e-03
  2.43308118e-03  2.15516031e-03  1.83564645e-03  1.48915780e-03
  1.14469558e-03  7.37619610e-04  1.99402857e-04  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -1.37398329e-03 -2.65644289e-03 -3.78854197e-03
 -4.97751800e-03 -6.21364541e-03 -7.37915134e-03 -8.57465579e-03
 -9.87746007e-03 -1.12655

In [29]:
best_lambda_index = model2['best_lambda_index']
print(best_lambda_index)

4


In [67]:
import math

p = X.shape[1]
print("dimension of X_tilde:", p)
q = math.floor(p / degree)
print("dimension of X:", q)
idx = math.floor(best_lambda_index / degree)
print("selected column index:", idx)
selected_column = pred_columns_df.iloc[idx].values[0]
print("selected column:", selected_column)

dimension of X_tilde: 488
dimension of X: 122
selected column index: 1
selected column: W875RX1


In [14]:
model2['best_performance']

np.float64(28.30511676883688)

In [15]:
x = X_train
y = y_train
group_size = degree
family = 'gaussian'
nlam = 100
pmax = 122
intr = False
k_folds = 3
disp_flag = True
alpha_values = 11
alpha = None
alsparse = 0.5

In [16]:
from sklearn.model_selection import StratifiedKFold, KFold

# Find model nlam number of models
b0, beta, alam, npass, jerr, mse = sgl.sgLASSO_estimation(x, y, group_size, alsparse, family, pmax, intr)

# Find mean performance for each lambda
# Split the data into k_folds
if family == 'binomial':
    kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
if family == 'gaussian':   
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# initialize performance list
performance = []
for train_index, test_index in kf.split(x,y):
    # Based on the split, create the training and test data for this fold
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Estimate the model on the training data
    b0_test, beta_test, alam, npass, jerr, mse_test = sgl.sgLASSO_estimation(x_train, y_train, group_size, alsparse, family, pmax, intr, ulam=alam)
    if family == 'gaussian':
        performance.append(sgl.evaluate_gaussian(x_test, y_test, b0_test, beta_test, intr=False, eval='mse'))
    if family == 'binomial':
        performance.append(sgl.evaluate_binomials(x_test, y_test, b0_test, beta_test, intr=False, eval='auc', threshold=0.5))

performance = np.array(performance)
mean_performance = np.mean(performance, axis=0)
if family == 'binomial':
    best_lambda = np.argmax(mean_performance)
if family == 'gaussian':
    best_lambda = np.argmin(mean_performance)

In [17]:
mean_performance

array([28.26838518, 28.28011485, 28.37724124, 28.50389207, 28.63417447,
       28.77692044, 28.93964735, 29.12136597, 29.279358  , 29.4421717 ,
       29.55579606, 29.65752807, 29.81983072, 29.9954351 , 30.17946564,
       30.34987428, 30.52935536, 30.6942738 , 30.87604281, 31.06752714,
       31.266835  , 31.47846996, 31.62354462, 31.79415347, 31.97093137,
       32.02652478, 32.10643163, 32.24052474, 32.41381407, 32.65802225,
       32.94359855, 33.26107897, 33.62329184, 33.99951126, 34.36166634,
       34.73210698, 35.07714045, 35.43574149, 35.79903252, 36.17512073,
       36.5718271 , 36.95989077, 37.37092327, 37.80609898, 38.28181913,
       38.72842259, 39.17234052, 39.67856547, 40.19166741, 40.70706755,
       41.21599602, 41.72007536, 42.21475208, 42.70359803, 43.16211098,
       43.61558334, 44.06268855, 44.53326322, 44.98530107, 45.43346776,
       45.82135061, 46.22498236, 46.62338801, 47.01842057, 47.36018073,
       47.67935484, 47.98443286, 48.27467287, 48.57308514, 48.87

In [18]:
mean_performance[best_lambda]

np.float64(28.26838517629615)

In [19]:
np.argmin(mean_performance)

np.int64(0)

In [20]:
x = transformed_data['X_tilde']
y = transformed_data['Y']

# # Split x and y into a 80/20 train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

Evaluating the gaussian test data

In [21]:
b0, beta, alam, npass, jerr, mse = sgl.sgLASSO_estimation(X_test, y_test, group_size, alsparse, family, pmax, intr)
evaluation_scores = sgl.evaluate_gaussian(X_test, y_test, b0, beta, intr=False, eval='mse')

In [22]:
print(len(evaluation_scores))

100


In [23]:
evaluation_scores

[np.float64(16.113010112194285),
 np.float64(16.06603033295657),
 np.float64(15.75185112559228),
 np.float64(15.313694113310765),
 np.float64(14.89759317782088),
 np.float64(14.518133477326318),
 np.float64(14.171977343677225),
 np.float64(13.856078330911695),
 np.float64(13.567655036027281),
 np.float64(13.304167082139452),
 np.float64(13.063293694078824),
 np.float64(12.842915322415537),
 np.float64(12.519770916781562),
 np.float64(12.212446940620525),
 np.float64(11.932027812310228),
 np.float64(11.676061936365041),
 np.float64(11.442311711839292),
 np.float64(11.228211088400176),
 np.float64(11.030452280794062),
 np.float64(10.847243699558273),
 np.float64(10.57970785390325),
 np.float64(10.25163707265923),
 np.float64(9.93852359880164),
 np.float64(9.5858011801479),
 np.float64(9.26016434320158),
 np.float64(8.954031366233467),
 np.float64(8.664277235758094),
 np.float64(8.305168462733388),
 np.float64(7.912597167784603),
 np.float64(7.54551561033803),
 np.float64(7.19770561607807

In [24]:
pd.DataFrame(evaluation_scores)

Unnamed: 0,0
0,16.113010
1,16.066030
2,15.751851
3,15.313694
4,14.897593
...,...
95,0.041674
96,0.038014
97,0.034671
98,0.031620
