## Import packages used

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import midasmlpy.date_functions as datef # used to handle different frequencies of data and to create lags
from sklearn.model_selection import train_test_split

In [2]:
from midasmlpy.src.sparseglf90 import sparsegllog_module

## Load data

Load data from excel

In [3]:
import os
# load data from xlsx files and create a dataframe

Predictors = pd.read_excel(os.path.abspath('predictors-monthly.xlsx')).to_numpy()
Target = pd.read_excel(os.path.abspath('gdp-quarterly.xlsx')).to_numpy()

Split data into dates and data tables

In [4]:
# Y data and X and Y dates can also be defined as they are the same for all iterations
Y_date = Target[:,0]
Y = Target[:,1]
X_date = Predictors[:,0]
X = Predictors[:,1:]

## Transform data using functions from data_functions

Define variables ued in transformation

In [5]:
# Lag variables
x_lags = 3
y_lags = 0
horizon = 0

# Legendre matrix
degree = 4 # 3 degrees + polynomial 0

Call data transformation function

In [6]:
transformed_data = datef.data_transform(Y, Y_date, X, X_date, x_lags, y_lags, horizon, degree=degree, standardize = True)

In [7]:
X = transformed_data['X_tilde']
y = transformed_data['Y']

# Split x and y into a 80/20 train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## sgLasso

In [8]:
from midasmlpy.sparse_group_lasso import *

# Call the function
model2 = best_model(x=X_train, y=y_train, group_size=degree, family='gaussian', nlam=50, pmax=122, intr=True,
                    k_folds=2, disp_flag=False, alpha_values=3, alpha=None)


# model2 = best_model(x = x_train, y = y_train, group_size = degree, family = 'gaussian', nlam = 100, pmax = 122, intr = False, k_folds = 3, disp_flag = True, alpha_values = 11, alpha = None)

In [None]:
model2['best_performance']

In [None]:
x = x_train
y = y_train
group_size = degree
family = 'gaussian'
nlam = 100
pmax = 122
intr = False
k_folds = 3
disp_flag = True
alpha_values = 11
alpha = None
alsparse = 0.5

In [None]:
# Find model nlam number of models
b0, beta, alam, npass, jerr, mse = sgLASSO_estimation(x, y, group_size, alsparse,family, pmax, intr)

# Find mean performance for each lambda
# Split the data into k_folds
if family == 'binomial':
    kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
if family == 'gaussian':   
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# initialize performance list
performance = []
for train_index, test_index in kf.split(x,y):
    # Based on the split, create the training and test data for this fold
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Estimate the model on the training data
    b0test, betatest, alam, npass, jerr, msetrain = sgLASSO_estimation(x_train, y_train, group_size, alsparse, family, pmax = pmax, intr = intr, ulam = alam)
    if family == 'gaussian':
        performance.append(evaluate_gaussian(x_test, y_test, b0test, betatest, intr=False, eval='mse'))
    if family == 'binomial':
        performance.append(evaluate_binomials(x_test, y_test, b0test, betatest, intr=False, eval='auc', threshold=0.5))

performance = np.array(performance)
mean_performance = np.mean(performance, axis=0)
if family == 'binomial':
    best_lambda = np.argmax(mean_performance)
if family == 'gaussian':
    best_lambda = np.argmin(mean_performance)

In [None]:
mean_performance

array([15.72073806, 15.68469278, 15.49494481, 15.32664924, 15.1753788 ,
       15.0394381 , 14.91728718, 14.79831989, 14.69161439, 14.58669592,
       14.4925171 , 14.40975478, 14.32792115, 14.23311733, 14.0924809 ,
       13.97235306, 13.86846429, 13.77427229, 13.6950444 , 13.62711568,
       13.57304908, 13.52009265, 13.45608846, 13.37643942, 13.3041933 ,
       13.24390963, 13.20272572, 13.17317758, 13.16261396, 13.16207891,
       13.15776153, 13.15411543, 13.15249136, 13.13512304, 13.11654678,
       13.09464479, 13.05082401, 12.98869776, 12.91041804, 12.84006636,
       12.77987338, 12.7202505 , 12.6619418 , 12.61368418, 12.58416023,
       12.56948842, 12.58089292, 12.60514016, 12.64270585, 12.68403573,
       12.74605675, 12.81651275, 12.9162938 , 13.02654741, 13.14018596,
       13.24087544, 13.3647866 , 13.50583955, 13.65282761, 13.81935746,
       13.99596268, 14.17420344, 14.36136385, 14.54756812, 14.71971168,
       14.88544037, 15.07012714, 15.26097823, 15.45872708, 15.66

In [None]:
mean_performance[best_lambda]

12.569488420636413

In [None]:
np.argmin(mean_performance)

45

In [None]:
x = transformed_data['X_tilde']
y = transformed_data['Y']

# # Split x and y into a 80/20 train test split
train_size = int(0.8*x.shape[0])
x_train, x_test = x[:train_size], x[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [None]:
b0, beta, alam, npass, jerr, mse = sgLASSO_estimation(x_train, y_train, group_size, alsparse,family, pmax, intr)
evaluation_scores = [0] * len(b0)  # this will store evaluation scores
for l in range(len(b0)):
    predictions = np.dot(x_test, beta[:,l]) + b0[l]
    evaluation_scores[l] = mean_squared_error(y_test, predictions)


In [None]:
evaluation_scores

[66.67973619271427,
 66.66931066055061,
 66.59851725606049,
 66.5317382351875,
 66.46871618002568,
 66.40920931545486,
 66.3529970300571,
 66.29987845217595,
 66.24967026451827,
 66.20220621963709,
 66.15733641090557,
 66.1149261202311,
 66.07485511895634,
 66.03701712057271,
 66.00131938223261,
 65.95432307928232,
 65.902637478755,
 65.86062823728464,
 65.81928510611839,
 65.77666324237987,
 65.72177092426341,
 65.71595239753776,
 65.74122524773539,
 65.76844263013173,
 65.71892815299486,
 65.67069136843438,
 65.63721084185885,
 65.61596415275122,
 65.64115886486319,
 65.67454284698647,
 65.82231958955516,
 66.00603189368738,
 66.15102225346607,
 66.25443014565182,
 66.32635088372936,
 66.41786136425802,
 66.41424197901995,
 66.39330802493555,
 66.43509613506544,
 66.5046442677282,
 66.58454308107342,
 66.66999263480321,
 66.75863384605294,
 66.851708892422,
 67.00356943408615,
 67.19962874767405,
 67.40932367352032,
 67.64931674004487,
 67.91035460480109,
 68.18692489683862,
 68.4656

In [None]:
pd.DataFrame(predictions, y_test)

Unnamed: 0,0
3.58582,1.033435
2.46998,5.012366
1.59778,2.844964
0.73724,6.968606
2.31139,10.076522
1.28244,5.805969
2.82754,5.754215
2.21107,3.512108
1.94283,4.712126
2.23324,7.100055


In [None]:
pd.DataFrame(evaluation_scores)

Unnamed: 0,0
0,15.726799
1,15.695337
2,15.484573
3,15.291053
4,15.113180
...,...
95,0.482967
96,0.451148
97,0.421517
98,0.393788
