## Import packages used

In [1]:
import os

# The relative data paths used below (and, presumably, the local VariousJupyterScripts
# package) are resolved against the repository root. Set the MIDASMLPY_ROOT environment
# variable to point at your own checkout; the original author's location is kept as the
# fallback so existing setups keep working unchanged.
PROJECT_ROOT = os.environ.get('MIDASMLPY_ROOT', '/Users/m.egelundmuller/Documents/GitHub/midasmlpy')
os.chdir(PROJECT_ROOT)
import numpy as np
import pandas as pd
from datetime import datetime
from VariousJupyterScripts.date_functions import * # used to handle different frequencies of data and to create lags
from VariousJupyterScripts.sparse_group_lasso import * # used to run the sparse group lasso and related functions
from sklearn.model_selection import StratifiedKFold

## Load data

Load data from excel

In [2]:
# Read the predictor and target workbooks (monthly / quarterly per the file names),
# then convert each DataFrame to a plain NumPy array.
predictors_frame = pd.read_excel('midasmlpy/data_files/predictors-monthly.xlsx')
target_frame = pd.read_excel('midasmlpy/data_files/recessions-quarterly.xlsx')
Predictors = predictors_frame.to_numpy()
Target = target_frame.to_numpy()

Split data into dates and data tables

In [3]:
# Column 0 of each table is taken as the date column; Y is column 1 of the target table
# and X is every remaining column of the predictor table. These splits are defined once
# here because they are the same for all iterations.
Y_date, Y = Target[:, 0], Target[:, 1]
X_date, X = Predictors[:, 0], Predictors[:, 1:]

## Transform data using functions from date_functions

Define variables used in the transformation

In [4]:
# Lag settings handed to the data transformation
x_lags, y_lags, horizon = 3, 0, 0

# Legendre matrix setting
legendre_degree = 4  # 3 degrees + polynomial 0

Call data transformation function

In [5]:
# Build the transformed data set from the raw series and the settings defined above;
# later cells read the 'X_tilde' and 'Y' entries of the result.
transformed_data = data_transform(
    Y,
    Y_date,
    X,
    X_date,
    x_lags,
    y_lags,
    horizon,
    legendre_degree=legendre_degree,
    standardize=True,
)

## sgLasso

Use log_sparse_four. Idea is: User inserts x_tilde, y, legendre degree

In [6]:
# Inputs for the sgLasso model search
x, y = transformed_data['X_tilde'], transformed_data['Y']
group_size = legendre_degree  # columns per group follow the Legendre setting
# Reference signature: sgLasso_estimation(x, y, group_size, alsparse, pmax = 100, intr = True, nlam=None, ulam=None)
nlam, pmax, intr = 50, 122, True
k_folds, alpha_values = 5, 5

In [11]:
# Candidate alsparse values: `alpha_values` evenly spaced points from 1 down to 0
alsparse_values = np.linspace(1, 0, alpha_values)

# Maximized performance reported by bestmodel for each alsparse value
performance_dict = {}
# Start from -inf rather than 0 so that a candidate is always selected, even when every
# score is <= 0. With a start value of 0, best_alsparse could stay None and the final
# performance_dict[best_alsparse] lookup failed with KeyError: None.
best_performance = -np.inf
best_alsparse = None
b0, beta = None, None  # Coefficients of the best model found so far


# Search over the alsparse grid (k_folds is forwarded to bestmodel, which presumably
# runs the cross-validation internally)
for alsparse in alsparse_values:
    model_result = bestmodel(x, y, group_size, alsparse, nlam=nlam, pmax=pmax, intr=intr, k_folds=k_folds)
    performance = model_result['maximized_performance']
    # Record the maximized performance for this alsparse value
    performance_dict[alsparse] = performance
    # Keep this model if it beats the best alsparse value seen so far
    if performance > best_performance:
        best_performance = performance
        best_alsparse = alsparse
        b0 = model_result['b0']
        beta = model_result['beta']

if best_alsparse is None:
    # Reached only when the grid is empty or no score compares greater than -inf (e.g. all NaN)
    raise ValueError('No alsparse value produced a comparable performance score')
# best_performance already equals performance_dict[best_alsparse] at this point,
# so no further lookup is needed.

In [12]:
# Show the maximized performance recorded for each alsparse value
performance_dict

{1.0: 0.5,
 0.75: 0.51,
 0.5: 0.5033333333333333,
 0.25: 0.5033333333333333,
 0.0: 0.5033333333333333}

### log_sparse_four testing

In [254]:
# Re-declare the inputs used by the direct log_sparse_four tests that follow
x, y = transformed_data['X_tilde'], transformed_data['Y']
group_size = legendre_degree
# Reference signature: sgLasso_estimation(x, y, group_size, alsparse, pmax = 100, intr = True, nlam=None, ulam=None)
nlam, pmax, intr = 50, 122, True

The sgLasso does not always return the same values, even when the inputs are regenerated identically. When the exact same variables are passed again, it does return the same values.

In [223]:
# Settings for a direct call to the SG-Lasso routine
alsparse = 0.5
ulam = np.ones(nlam)
nobs, nvars = x.shape[0], x.shape[1]  # Observations (rows) and features (columns)
eps = 1e-8  # Convergence threshold
maxit = 1000000  # Maximum number of iterations
bn = x.shape[1] // group_size  # Number of groups as an integer
bs = np.full(bn, group_size, dtype=int)  # Elements in each group
ix = list(range(0, nvars, group_size))  # Position of the first column of each group in x
iy = list(range(group_size - 1, nvars, group_size))  # Position of the last column of each group in x
gam = 0.25 * calc_gamma(x, ix, iy, bn)  # Gamma values for each group of features (columns)
pf = np.sqrt(bs)  # Penalty factors for the L2 penalty
pfl1 = np.ones(nvars)  # Penalty factors for the L1 penalty
dfmax = bn + 1  # Maximum number of groups
flmin = 1e-04 if nobs >= nvars else 0.01
lb = np.full(bn, -np.inf)  # Lower bounds for the coefficients
ub = np.full(bn, np.inf)  # Upper bounds for the coefficients

# Collect the keyword arguments once so both calls receive exactly the same objects
solver_inputs = dict(
    x=x, y=y, bn=bn, bs=bs, ix=ix, iy=iy, gam=gam, nobs=nobs,
    nvars=nvars, pf=pf, pfl1=pfl1, dfmax=dfmax, pmax=pmax,
    nlam=nlam, flmin=flmin, ulam=ulam, eps=eps, maxit=maxit,
    intr=intr, lb=lb, ub=ub, alsparse=alsparse,
)

# When the function is run twice with the same input it returns the same output;
# element [1] of the return value is kept for the comparison.
b0a1 = sgl.log_sparse_four(**solver_inputs)[1]
b0b2 = sgl.log_sparse_four(**solver_inputs)[1]
print(b0a1==b0b2)
print(b0a1)
print(b0b2)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True]
[0.         8.92991656 8.93006259 8.93020861 8.93035459 8.93050054
 8.93064645 8.9307923  8.9309381  8.93108382 8.93122945 8.93725695
 8.94325032 8.94859773 8.95346936 8.95776523 8.96158077 8.96506319
 8.96817001 8.97097823 8.97350194 8.97575416 8.97780167 8.97959876
 8.98126191 8.98279579 8.98415294 8.98533926 8.98641074 8.98742082
 8.98837078 8.98920942 8.98994364 8.99057562 8.99120355 8.99177959
 8.99225699 8.99268462 8.99311031 8.99348708 8.99386232 8.99414275
 8.99442232 8.99470104 8.99493265 8.99516366 8.99534804 8.99553205
 8.9956698  8.99580734]
[0.         8.92991656 8.93006259 8.93020861 8.93035459 8.93050054
 8.93064645 8.9307923  8.9309381  8.93108382 8.93122945 8.93725695


The values below are deemed 'wrong'. The inputs were saved after a previous run of the above code that gave the same output.

In [230]:
# Snapshot of the inputs behind the 'wrong' output (suffix w). The assignments below are
# commented out, so the *w variables must already exist in the running kernel.
# NOTE(review): unless they are defined elsewhere, this cell raises NameError after
# Restart & Run All — persist the snapshot (e.g. np.savez) if the case must stay reproducible.
# NOTE(review): `xw = x,` ends with a trailing comma; uncommented as written it would bind
# a 1-tuple containing x rather than x itself.
# xw = x,
# yw = y
# bnw = bn
# bsw = bs
# ixw = ix
# iyw = iy
# gamw = gam
# nobsw = nobs
# nvarsw = nvars
# pfw = pf
# pfl1w = pfl1
# dfmaxw = dfmax
# pmaxw = pmax 
# nlamw = nlam
# flminw = flmin
# ulamw = ulam
# epsw = eps
# maxitw = maxit 
# intrw = intr
# lbw = lb
# ubw = ub
# alsparsew = alsparse
# Call the solver with the 'wrong' snapshot only and keep element [1] of the result
b0b = sgl.log_sparse_four(x = xw,
                y = yw, bn = bnw, bs = bsw, ix = ixw, iy = iyw, gam = gamw, nobs = nobsw, 
                nvars = nvarsw, pf = pfw, pfl1 = pfl1w, dfmax = dfmaxw, pmax = pmaxw, 
                nlam = nlamw, flmin = flminw, ulam = ulamw, eps = epsw, maxit = maxitw, 
                intr = intrw, lb = lbw, ub = ubw, alsparse = alsparsew)[1]
b0b

array([0.00000000e+00, 8.92991656e+00, 8.93006259e+00, 8.93020861e+00,
       8.93035459e+00, 8.93050054e+00, 8.93064645e+00, 8.93079230e+00,
       8.93093810e+00, 8.93108382e+00, 8.93122945e+00, 8.93725695e+00,
       8.94325032e+00, 8.94859773e+00, 8.95346936e+00, 8.95776523e+00,
       8.96158077e+00, 8.96506319e+00, 8.96817001e+00, 8.97097823e+00,
       8.97350194e+00, 8.97575416e+00, 8.97780167e+00, 8.97959876e+00,
       8.98126191e+00, 8.98279579e+00, 8.98415294e+00, 8.98533926e+00,
       8.98641074e+00, 8.98742082e+00, 1.11217363e-05, 1.11214800e-05,
       1.11211396e-05, 1.11207226e-05, 1.11202357e-05, 1.11196835e-05,
       1.11190696e-05, 1.11183975e-05, 1.11176713e-05, 1.11168940e-05,
       1.11160698e-05, 1.11152027e-05, 1.11133893e-05, 1.11124462e-05,
       1.11104938e-05, 1.11084806e-05, 1.11074463e-05, 1.11063864e-05,
       1.11053032e-05, 1.11041989e-05])

Below output is deemed 'right'. The variables are saved from a previous run.

In [225]:
# Snapshot of the inputs behind the 'right' output (suffix r). The assignments below are
# commented out, so the *r variables must already exist in the running kernel.
# NOTE(review): unless they are defined elsewhere, this cell raises NameError after
# Restart & Run All — persist the snapshot (e.g. np.savez) if the case must stay reproducible.
# xr = x
# yr = y
# bnr = bn
# bsr = bs
# ixr= ix 
# iyr = iy
# gamr = gam
# nobsr = nobs 
# nvarsr = nvars 
# pfr = pf
# pfl1r = pfl1
# dfmaxr = dfmax
# pmaxr = pmax 
# nlamr = nlam
# flminr = flmin
# ulamr = ulam
# epsr = eps
# maxitr = maxit 
# intrr = intr
# lbr = lb
# ubr = ub
# alsparser = alsparse
# Call the solver with the 'right' snapshot only and keep element [1] of the result
b0a = sgl.log_sparse_four(x = xr,
                y = yr, bn = bnr, bs = bsr, ix = ixr, iy = iyr, gam = gamr, nobs = nobsr, 
                nvars = nvarsr, pf = pfr, pfl1 = pfl1r, dfmax = dfmaxr, pmax = pmaxr, 
                nlam = nlamr, flmin = flminr, ulam = ulamr, eps = epsr, maxit = maxitr, 
                intr = intrr, lb = lbr, ub = ubr, alsparse = alsparser)[1]
b0a

array([0.        , 8.92991656, 8.93006259, 8.93020861, 8.93035459,
       8.93050054, 8.93064645, 8.9307923 , 8.9309381 , 8.93108382,
       8.93122945, 8.93725695, 8.94325032, 8.94859773, 8.95346936,
       8.95776523, 8.96158077, 8.96506319, 8.96817001, 8.97097823,
       8.97350194, 8.97575416, 8.97780167, 8.97959876, 8.98126191,
       8.98279579, 8.98415294, 8.98533926, 8.98641074, 8.98742082,
       8.98837078, 8.98920942, 8.98994364, 8.99057562, 8.99120355,
       8.99177959, 8.99225699, 8.99268462, 8.99311031, 8.99348708,
       8.99386232, 8.99414275, 8.99442232, 8.99470104, 8.99493265,
       8.99516366, 8.99534804, 8.99553205, 8.9956698 , 8.99580734])

When running the code with various combinations of 'right' and 'wrong' inputs, it becomes apparent that it is the pfl1 input that causes the difference

In [239]:
# 'Wrong' pfl1 input (pfl1w) combined with the 'right' (*r) snapshot for every other input
b0d = sgl.log_sparse_four(x = xr,
                y = yr, bn = bnr, bs = bsr, ix = ixr, iy = iyr, gam = gamr, nobs = nobsr, 
                nvars = nvarsr, pf = pfr, pfl1 = pfl1w, dfmax = dfmaxr, pmax = pmaxr, 
                nlam = nlamr, flmin = flminr, ulam = ulamr, eps = epsr, maxit = maxitr, 
                intr = intrr, lb = lbr, ub = ubr, alsparse = alsparser)[1]
print(b0d)
# Compare against the two reference results computed earlier (b0b = 'wrong', b0a = 'right')
print('is equal to wrong value?: ',np.array_equal(b0b,b0d))
print('is equal to right value?: ',np.array_equal(b0a,b0d))

[0.00000000e+00 8.92991656e+00 8.93006259e+00 8.93020861e+00
 8.93035459e+00 8.93050054e+00 8.93064645e+00 8.93079230e+00
 8.93093810e+00 8.93108382e+00 8.93122945e+00 8.93725695e+00
 8.94325032e+00 8.94859773e+00 8.95346936e+00 8.95776523e+00
 8.96158077e+00 8.96506319e+00 8.96817001e+00 8.97097823e+00
 8.97350194e+00 8.97575416e+00 8.97780167e+00 8.97959876e+00
 8.98126191e+00 8.98279579e+00 8.98415294e+00 8.98533926e+00
 8.98641074e+00 8.98742082e+00 1.11217363e-05 1.11214800e-05
 1.11211396e-05 1.11207226e-05 1.11202357e-05 1.11196835e-05
 1.11190696e-05 1.11183975e-05 1.11176713e-05 1.11168940e-05
 1.11160698e-05 1.11152027e-05 1.11133893e-05 1.11124462e-05
 1.11104938e-05 1.11084806e-05 1.11074463e-05 1.11063864e-05
 1.11053032e-05 1.11041989e-05]
is equal to wrong value?:  True
is equal to right value?:  False


The inputs are identical; both were created using np.ones()

In [240]:
# Shape-and-value comparison of the 'right' and 'wrong' pfl1 arrays.
# NOTE(review): np.array_equal does not compare dtype, memory layout or flags, so two
# arrays that compare equal here can still differ in how they are handed to compiled
# code — consider also comparing pfl1r.dtype / pfl1r.flags with those of pfl1w.
np.array_equal(pfl1r,pfl1w)

True

When casting the 'wrong' variable to an int or rounding it, the correct output is only obtained some of the time

In [241]:
# Rounded copy of the 'wrong' pfl1 (pfl1w.round(0)), the current x, and the 'wrong' (*w)
# snapshot for every other input
b0c = sgl.log_sparse_four(x = x,
                y = yw, bn = bnw, bs = bsw, ix = ixw, iy = iyw, gam = gamw, nobs = nobsw, 
                nvars = nvarsw, pf = pfw, pfl1 = pfl1w.round(0), dfmax = dfmaxw, pmax = pmaxw, 
                nlam = nlamw, flmin = flminw, ulam = ulamw, eps = epsw, maxit = maxitw, 
                intr = intrw, lb = lbw, ub = ubw, alsparse = alsparsew)[1]
print(b0c)
# Compare against the two reference results computed earlier (b0b = 'wrong', b0a = 'right')
print('is equal to wrong value?: ',np.array_equal(b0b,b0c))
print('is equal to right value?: ',np.array_equal(b0a,b0c))

[0.         8.92991656 8.93006259 8.93020861 8.93035459 8.93050054
 8.93064645 8.9307923  8.9309381  8.93108382 8.93122945 8.93725695
 8.94325032 8.94859773 8.95346936 8.95776523 8.96158077 8.96506319
 8.96817001 8.97097823 8.97350194 8.97575416 8.97780167 8.97959876
 8.98126191 8.98279579 8.98415294 8.98533926 8.98641074 8.98742082
 8.98837078 8.98920942 8.98994364 8.99057562 8.99120355 8.99177959
 8.99225699 8.99268462 8.99311031 8.99348708 8.99386232 8.99414275
 8.99442232 8.99470104 8.99493265 8.99516366 8.99534804 8.99553205
 8.9956698  8.99580734]
is equal to wrong value?:  False
is equal to right value?:  True


In [242]:
# Same call as the preceding cell, re-run to check whether the result is stable:
# rounded copy of the 'wrong' pfl1 (pfl1w.round(0)), the current x, and the 'wrong' (*w)
# snapshot for every other input
b0c = sgl.log_sparse_four(x = x,
                y = yw, bn = bnw, bs = bsw, ix = ixw, iy = iyw, gam = gamw, nobs = nobsw, 
                nvars = nvarsw, pf = pfw, pfl1 = pfl1w.round(0), dfmax = dfmaxw, pmax = pmaxw, 
                nlam = nlamw, flmin = flminw, ulam = ulamw, eps = epsw, maxit = maxitw, 
                intr = intrw, lb = lbw, ub = ubw, alsparse = alsparsew)[1]
print(b0c)
# Compare against the two reference results computed earlier (b0b = 'wrong', b0a = 'right')
print('is equal to wrong value?: ',np.array_equal(b0b,b0c))
print('is equal to right value?: ',np.array_equal(b0a,b0c))

[0.00000000e+00 8.92991656e+00 8.93006259e+00 8.93020861e+00
 8.93035459e+00 8.93050054e+00 8.93064645e+00 8.93079230e+00
 8.93093810e+00 8.93108382e+00 8.93122945e+00 8.93725695e+00
 8.94325032e+00 8.94859773e+00 8.95346936e+00 8.95776523e+00
 8.96158077e+00 8.96506319e+00 8.96817001e+00 8.97097823e+00
 8.97350194e+00 8.97575416e+00 8.97780167e+00 8.97959876e+00
 8.98126191e+00 8.98279579e+00 8.98415294e+00 8.98533926e+00
 8.98641074e+00 8.98742082e+00 1.11217363e-05 1.11214800e-05
 1.11211396e-05 1.11207226e-05 1.11202357e-05 1.11196835e-05
 1.11190696e-05 1.11183975e-05 1.11176713e-05 1.11168940e-05
 1.11160698e-05 1.11152027e-05 1.11133893e-05 1.11124462e-05
 1.11104938e-05 1.11084806e-05 1.11074463e-05 1.11063864e-05
 1.11053032e-05 1.11041989e-05]
is equal to wrong value?:  True
is equal to right value?:  False


And when copying the 'right' pfl1 but perturbing one entry at a far decimal place, the output is still correct

In [244]:
# Independent copy of the 'right' pfl1, so the perturbation applied next leaves pfl1r untouched
pfl1rslight = np.copy(pfl1r)

In [252]:
# Perturb only the first entry, in the seventh decimal place (in-place change to the copy)
pfl1rslight[0]=1.0000001

In [253]:
# wrong pfl1 input and right other inputs
b0x = sgl.log_sparse_four(x = x,
                y = yw, bn = bnw, bs = bsw, ix = ixw, iy = iyw, gam = gamw, nobs = nobsw, 
                nvars = nvarsw, pf = pfw, pfl1 = pfl1rslight, dfmax = dfmaxw, pmax = pmaxw, 
                nlam = nlamw, flmin = flminw, ulam = ulamw, eps = epsw, maxit = maxitw, 
                intr = intrw, lb = lbw, ub = ubw, alsparse = alsparsew)[1]
print(b0x)
print('is equal to wrong value?: ',np.array_equal(b0b,b0x))
print('is equal to right value?: ',np.array_equal(b0a,b0x))

[0.         8.92991656 8.93006259 8.93020861 8.93035459 8.93050054
 8.93064645 8.9307923  8.9309381  8.93108382 8.93122945 8.93725695
 8.94325032 8.94859773 8.95346936 8.95776523 8.96158077 8.96506319
 8.96817001 8.97097823 8.97350194 8.97575416 8.97780167 8.97959876
 8.98126191 8.98279579 8.98415294 8.98533926 8.98641074 8.98742082
 8.98837078 8.98920942 8.98994364 8.99057562 8.99120355 8.99177959
 8.99225699 8.99268462 8.99311031 8.99348708 8.99386232 8.99414275
 8.99442232 8.99470104 8.99493265 8.99516366 8.99534804 8.99553205
 8.9956698  8.99580734]
is equal to wrong value?:  False
is equal to right value?:  True
