# Packages

In [1]:
# Load Packages 
import pandas as pd
import numpy as np
import math
import os
import random
from tqdm import tqdm

import warnings 

import time

#import multiprocess as mp
from multiprocess import Pool

#from functools import partial  

import matplotlib.pyplot as plt
import seaborn as sns

from itertools import combinations
from itertools import chain
from itertools import product

from statsmodels.tsa.tsatools import lagmat
import statsmodels.api as sm
from statsmodels.tsa.ar_model import AutoReg

from sklearn.preprocessing import StandardScaler
#from sklearn.model_selection import TimeSeriesSplit
from sklearn import random_projection
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
#from sklearn.linear_model import LarsCV
from sklearn.linear_model import Lars
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LassoCV
#from sklearn.linear_model import Lasso
from sklearn.feature_selection import SequentialFeatureSelector

from dimod import BinaryQuadraticModel
from dwave.samplers import SimulatedAnnealingSampler
#from dwave.samplers import SteepestDescentSolver
#from dwave.preprocessing import roof_duality
# from dwave.system import LeapHybridSampler
# from dwave.system import DWaveSampler
# from dwave.system import EmbeddingComposite
# from dimod import ExactSolver
# from dwave.samplers import TabuSampler
# from dwave.samplers import TreeDecompositionSolver

import gurobipy as gp
from gurobi_optimods.qubo import solve_qubo
gp.setParam('OutputFlag', 0)

if os.name == 'nt':
    import dill
    dill.settings['recurse'] = True

Set parameter Username
Academic license - for non-commercial use only - expires 2025-01-09


# Set Path

In [2]:
# Set Path: parent directory of the current working directory
# (depends on where the kernel was started, not on the notebook's own location)
path  =  os.path.dirname(os.getcwd())

# Functions

## Simulate Data

In [3]:
# Simulate Data
def sim_data(n_obs, n_preds, non_zero, p, rho, scenario, snr, random_state):
    
    """
    Simulate a sparse linear regression data set with correlated Gaussian predictors.

    The first `non_zero` coefficients are non-zero, the remaining
    `n_preds - non_zero` coefficients are exactly zero.

    Parameters
    ----------
    n_obs : int
        Number of observations.
    n_preds : int
        Number of predictors.
    non_zero : int
        Number of non-zero coefficients (placed first).
    p : float
        Bernoulli probability with which a non-zero coefficient gets a negative sign.
    rho : float
        Correlation used to fill the predictor covariance matrix.
    scenario : int
        1: all predictors are pairwise correlated with `rho`.
        2: only the first `non_zero` predictors are pairwise correlated with `rho`.
    snr : float
        Signal-to-noise ratio; the noise is scaled by sqrt(b' Sigma b / snr).
    random_state : int
        Seed passed to `np.random.seed` (re-seeds NumPy's global generator).

    Returns
    -------
    y : np.ndarray of shape (n_obs,)
    X : np.ndarray of shape (n_obs, n_preds)
    pred_names : pd.Series with the labels "X1", ..., "X{n_preds}"

    Raises
    ------
    ValueError
        If `scenario` is neither 1 nor 2.
    """
    
    # Reject unknown scenarios up front (otherwise cov_mat would never be defined)
    if scenario not in (1, 2):
        raise ValueError(f"Unknown scenario {scenario!r}: expected 1 or 2.")
    
    # Set Seed
    np.random.seed(random_state)
    
    # Simulating nonzero betas: random sign times (4*log(n)/sqrt(n) + standard normal)
    # (the order of the two random draws is kept so that seeds reproduce earlier results)
    b_nonzero = (-1) ** np.random.binomial(1, p, non_zero) * ((4 * np.log(n_obs) / np.sqrt(n_obs)) + np.random.standard_normal(non_zero))
    
    # Simulate all Betas
    b = np.append(b_nonzero, np.repeat(0, n_preds - non_zero))
    
    # Set Covariance Matrix between Predictors
    if scenario == 1:
        # Equi-correlation between all predictors
        cov_mat = np.full((n_preds, n_preds), rho)
    else:
        # Correlation only within the block of predictors with non-zero coefficients
        cov_mat = np.zeros((n_preds, n_preds))
        cov_mat[:non_zero, :non_zero] = rho
    np.fill_diagonal(cov_mat, 1.0)

    # Simulate Predictor-Time-Series
    X = np.random.multivariate_normal([0.0]*n_preds, cov_mat, n_obs)
    pred_names = "X"+pd.Series(range(1, n_preds+1)).astype(str) 
    
    # Simulate Noise (scaled so that signal variance / noise variance equals snr)
    adj   =  np.sqrt((b.transpose() @ cov_mat @ b) / snr)
    error =  np.random.standard_normal(n_obs)   

    # Set Target Variable
    y = X @ b + adj * error
    
    # Return
    return(y, X, pred_names)

In [4]:
# Simulate Data
def sim(n_obs, n_preds, b, p, r):
    
    """
    Simulate a regression data set from given coefficients.

    Predictors are jointly Gaussian with unit variance and pairwise
    correlation p; the coefficients b are rescaled before use and
    unit-variance Gaussian noise is added. r seeds NumPy's global generator.

    Returns the tuple (y, X, pred_names).
    """
    
    # Seed the global generator
    np.random.seed(r)
    
    # Noise standard deviation and rescaled coefficients (no random draws involved)
    noise_sd = 1.0
    b_scl = np.sqrt(n_preds / np.sum(b)) * b * noise_sd / np.sqrt(100)
    
    # Labels "X1", ..., "X{n_preds}"
    pred_names = "X"+pd.Series(range(1, n_preds+1)).astype(str)
    
    # Equi-correlated covariance matrix with ones on the diagonal
    sigma = np.full((n_preds, n_preds), p)
    np.fill_diagonal(sigma, 1.0)

    # Random draws: predictors first, noise second
    X = np.random.multivariate_normal([0.0]*n_preds, sigma, n_obs)
    eps = np.random.normal(0.0, noise_sd, n_obs)

    # Target variable
    y = X @ b_scl + eps
    
    return(y, X, pred_names)

In [5]:
# Function to create Lags
def create_lags(y, X, mlags):
    
    """
    Prepend lag columns of y (built with statsmodels' lagmat, maxlag = mlags) to X.
    """
    
    # Lag block of y; lagmat's default trimming / original-series handling applies
    y_lags = lagmat(y, maxlag = mlags, use_pandas = True)
    
    # Lag columns first, original predictors after
    X = np.concatenate((y_lags, X), axis = 1)
    
    return X

In [6]:
# Function to Pre-Process Data
def prepro(y, X, t):
    
    """
    Split the data at index t and standardize the predictors.

    Rows [0, t) form the training sample, row t is the observation to predict.
    The scaler is fitted on the training predictors only and then applied to
    the prediction row; a constant column is prepended to both.

    Returns (y_train, X_train, y_pred, X_pred).
    """
    
    # Train / predict split (X[[t]] keeps the prediction row two-dimensional)
    y_train, y_pred  =  y[:t], y[t]
    X_train, X_pred  =  X[:t], X[[t]]
    
    # Standardize with training moments only
    scaler  =  StandardScaler().fit(X_train)
    X_train_std  =  scaler.transform(X_train)
    X_pred_std   =  scaler.transform(X_pred)
    
    # Add Constant (forced for the single prediction row, where every column is constant)
    X_train  =  sm.add_constant(X_train_std)
    X_pred   =  sm.add_constant(X_pred_std, has_constant = 'add')
    
    return y_train, X_train, y_pred, X_pred

## (Complete) Subset Forecasts

In [7]:
# Function to return array of all subsets of length k
def complete_sub(arr, k):
    
    """
    Return every k-element combination of arr as a list of tuples.

    Combinations are formed by position, so duplicates in arr are treated
    as distinct elements.
    """
    
    # Materialize the combinations iterator
    return [subset for subset in combinations(arr, k)]

In [8]:
# Function to calculate number of models
def n_models(K, k):
    
    """
    Number of distinct k-element subsets of K predictors ("K choose k").

    Returns an exact Python int. For k > K the result is 0, which matches
    complete_sub returning an empty list in that case.
    """
    
    # math.comb stays in exact integer arithmetic (no float rounding for large K)
    return math.comb(K, k)

In [9]:
# Function to randomly select n_max items from array
def random_select(arr, n_max, random_state):
    
    """
    Draw at most n_max items from arr without replacement.

    random_state seeds Python's global `random` generator before drawing.
    If arr has fewer than n_max items, all of them are returned (in random order).
    """
    
    # Seed the global generator
    random.seed(random_state)
    
    # Sample size is capped by the number of available items
    n_draw  =  min(n_max, len(arr))
    
    # Draw without repetition
    return random.sample(arr, k = n_draw)

In [10]:
# Function to produce Subset Regression Forecasts
def ssf(y_train, X_train, X_pred, feature, mlags):
    
    """
    OLS forecast from a subset of predictors.

    The first mlags+1 columns (constant and lags) are always included,
    followed by the column indices given in feature.

    Returns (point forecast for the first row of X_pred, fitted coefficients without the first one).
    """
    
    # Column selection shared by train and predict data
    cols = list(range(0, mlags+1)) + list(feature)
    
    # Fit OLS on the selected columns
    regr = sm.OLS(y_train, X_train[:, cols]).fit()
    
    # Predict
    pred = regr.predict(X_pred[:, cols])
    
    return(pred[0], regr.params[1:])

## Compressed Regressions

In [11]:
# Compressed Regression (Gaussian random projection)
def cr_reg(y_train, X_train, X_pred, n_comp, mlags, ran_st):
    
    """
    Compressed regression: OLS on a Gaussian random projection of the predictors.

    The first mlags+1 columns (constant and lags) are kept as they are; the
    remaining columns are projected onto n_comp random components.

    Parameters
    ----------
    y_train, X_train : training target and design matrix (constant/lags first)
    X_pred : design matrix of the observation(s) to predict, same column layout
    n_comp : number of random-projection components
    mlags : number of lag columns following the constant
    ran_st : random_state of the projection

    Returns
    -------
    Point forecast for the first row of X_pred.
    """
    
    # Set up Random-Projection-Matrix
    projector = random_projection.GaussianRandomProjection(n_components = n_comp, random_state = ran_st)
    
    # Fit the projection on the training predictors only ...
    X_train_proj =  projector.fit_transform(X_train[:, (mlags+1):])
    # ... and re-use that same projection for the prediction row.
    # (Previously the projector was re-fitted on X_pred; an integer seed regenerates the
    #  same matrix, but any other random_state would project train and predict data differently.)
    X_pred_proj  =  projector.transform(X_pred[:,  (mlags+1):])

    # Add Constant + Lags
    rp_train =  np.concatenate([X_train[:, :(mlags+1)], X_train_proj], axis = 1)
    rp_pred  =  np.concatenate([X_pred[:,  :(mlags+1)], X_pred_proj],  axis = 1)

    # Fit Model
    regr = sm.OLS(y_train, rp_train).fit()
    
    # Predict
    pred = regr.predict(rp_pred)
    
    return pred[0]

## Decision Trees

In [12]:
# Decision Tree Regression
def dt_reg(y_train, X_train, X_pred, ran_st):
    
    """
    Fit a randomized-split regression tree and return its forecast for the first row of X_pred.
    """
    
    # Tree configuration (squared-error criterion, random splitter, depth capped at 20)
    tree_params = dict(criterion = "squared_error",
                       max_depth = 20,
                       splitter  = "random",
                       random_state = ran_st)
    
    # Fit
    tree = DecisionTreeRegressor(**tree_params)
    tree.fit(X_train, y_train)
    
    # Forecast for the first prediction row
    return tree.predict(X_pred)[0]

## Loop Candidate Forecasts

In [13]:
# Combine all Candidate Models in one Function
def candidate_models_t(y_, X_, t_, k_range_, cr_range_, rep_range_):
    
    """
    Build all candidate forecasts for observation t_.

    Subset-regression forecasts (all predictor subsets of the sizes in k_range_)
    come first, followed by compressed-regression forecasts for every
    (components, seed) pair from cr_range_ x rep_range_.
    A family is skipped when the sum of its range is zero (e.g. [0]).

    Returns a 1-d array of forecasts.
    """
    
    ### Pre-Process Data ###
    y_train, X_train, y_pred, X_pred = prepro(y_, X_, t_)
    
    ### Subset Forecasts ###
    if np.sum(k_range_) != 0:
        # Column 0 is the constant, predictors start at column 1
        predictor_idx = list(range(1, X_train.shape[1]))
        feature_set   = [subset for k in k_range_ for subset in complete_sub(predictor_idx, k)]
        # Only the forecasts are used further; the coefficients are discarded
        preds_ssf, _coeffs = zip(*[ssf(y_train, X_train, X_pred, subset, 0) for subset in feature_set])
    else:
        preds_ssf = np.array([])
    
    ### Compressed Regressions ###
    if (np.sum(cr_range_) != 0) and (np.sum(rep_range_) != 0):
        preds_cr = np.array([cr_reg(y_train, X_train, X_pred, n_comp, 0, seed)
                             for n_comp, seed in product(cr_range_, rep_range_)])
    else:
        preds_cr = np.array([])
            
    ### Concatenate Predictions ###
    return np.concatenate([preds_ssf, preds_cr])

In [14]:
# Combine all Candidate Models in Parallel
def candidate_models(y, X, t_range, k_range, cr_range, rep_range, n_core):
    
    """
    Build the candidate forecasts for every t in t_range in parallel.

    Parameters
    ----------
    y, X : target and predictor matrix passed on to candidate_models_t
    t_range : iterable of observation indices to forecast
    k_range, cr_range, rep_range : candidate-model settings (see candidate_models_t)
    n_core : number of worker processes

    Returns
    -------
    np.ndarray with one row per element of t_range and one column per candidate model.

    Notes
    -----
    The former `if __name__ == '__main__'` check inside the function body was removed:
    when the function lived anywhere but `__main__` it skipped the computation and then
    failed on the unbound return value. Guarding the entry point of a script remains the
    caller's responsibility.
    """
    
    ### Create Candidate Models ###
    # The context manager shuts the pool down even if a worker raises
    with Pool(n_core) as pool_:
        cand_forecasts = np.array(pool_.map(lambda t: candidate_models_t(y, X, t, k_range, cr_range, rep_range), t_range))
        
    # Return
    return cand_forecasts
                
#f = partial(candidate_models, y_ = y, X_ = X, k_range_ = k_range, cr_range_ = cr_range, rep_range_ = rep_range)
#with mp.Pool(5) as pool:
#    cand_forecasts = np.array([result for result in pool.map(lambda t: f(t_ = t), range(init, n_obs))])

## AR-Model

In [15]:
# Autoregressive Model
def ar_mod(y_train, lags):
    
    """
    Fit an autoregressive model to y_train and return its one-step-ahead forecast.
    """

    # Fit AR-Model
    fitted = AutoReg(y_train, lags=lags).fit()
    
    # One-step-ahead forecast
    one_step = fitted.forecast(steps=1)

    return one_step[0]

## Least Angle Regression

In [16]:
# Function to fit and predict LARS-Models
def lars(y_train, cf_train, cf_pred, n_range):
    
    """
    Least-angle-regression forecasts for several numbers of non-zero coefficients.

    For every n in n_range a Lars model limited to n non-zero coefficients is
    fitted on (cf_train, y_train); the forecast for cf_pred is stored at the
    matching position of the returned array.
    """

    # One slot per subset size
    predictions = np.full(len(n_range), np.nan)
    
    for pos, n in enumerate(n_range):
        
        # Define Model
        model = Lars(fit_intercept = True,
                     fit_path = False,
                     jitter = None,
                     n_nonzero_coefs = n,
                     random_state = 123)

        # Fit Model
        model.fit(cf_train, y_train)

        # Predict (with fit_path = False the prediction is indexed twice to get a scalar)
        predictions[pos] = model.predict(cf_pred)[0][0]
    
    return(predictions)

## Forward Stepwise Regression

In [17]:
# Function to perform Forward Stepwise Selection
def fss_n(y_train, cf_train, cf_pred, n):
    
    """
    Forward stepwise selection of n candidate forecasts followed by an OLS fit.

    Returns the forecast for the first row of cf_pred.
    """
    
    # Base estimator used both for the selection and for the final fit
    model = LinearRegression()
    
    # Sequential forward selection of n columns
    sfs = SequentialFeatureSelector(model,
                                    n_features_to_select = n,
                                    direction = 'forward')
    sfs.fit(cf_train, y_train)
    
    # Boolean mask of the selected columns
    active_set = sfs.get_support()
    
    # Fit on the selected columns only
    model.fit(cf_train[:, active_set], y_train)
    
    # Forecast for the first prediction row
    return model.predict(cf_pred[:, active_set])[0]

In [18]:
# Function to perform Forward Stepwise Selection in Parallel
def fss(y_train, cf_train, cf_pred, n_range, n_core):
    
    """
    Forward stepwise selection forecasts for every subset size in n_range, in parallel.

    Parameters
    ----------
    y_train, cf_train, cf_pred : data passed on to fss_n
    n_range : iterable of numbers of features to select
    n_core : number of worker processes

    Returns
    -------
    np.ndarray with one forecast per element of n_range.

    Notes
    -----
    The former `if __name__ == '__main__'` check inside the function body was removed:
    outside `__main__` it skipped the computation and then failed on the unbound return value.
    """
    
    # Parallelize over Subset Size; the context manager shuts the pool down even on errors
    with Pool(n_core) as pool_:
        predictions = np.array(pool_.map(lambda n: fss_n(y_train, cf_train, cf_pred, n), n_range))

    # Return
    return predictions

## Partially-Egalitarian Lasso

In [19]:
# Function to fit and predict peLASSO-Models
def peLASSO(y_train, cf_train, cf_pred, n_alpha, n_iter, cv_splits, cv_repeats):
    
    """
    Partially-egalitarian LASSO forecast combination (two steps).

    Step 1 ("select to zero"): a cross-validated LASSO of y_train on the candidate
    forecasts; candidates with a zero coefficient are dropped.
    Step 2 ("shrink towards equality"): a cross-validated LASSO of the deviation
    y_train - (equal-weight average of the surviving candidates) on the surviving
    candidates. The final forecast is the equal-weight average of the surviving
    candidates for the prediction row plus the predicted deviation.

    Parameters
    ----------
    y_train : target values of the training sample
    cf_train : candidate forecasts of the training sample (rows: time, columns: candidates)
    cf_pred : candidate forecasts for the observation to predict (one row)
    n_alpha : number of alphas on the LassoCV path
    n_iter : maximum number of iterations of the LASSO solver
    cv_splits, cv_repeats : settings of the repeated K-fold cross-validation

    Returns
    -------
    Scalar forecast. If step 1 removes every candidate, the mean of y_train is returned.
    """
    
    # No warnings (note: this changes the process-wide warnings filter and is not undone)
    warnings.filterwarnings('ignore')
    
    def _make_lasso_cv():
        # Both steps use the same cross-validated LASSO configuration
        cv = RepeatedKFold(n_splits = cv_splits,
                           n_repeats = cv_repeats,
                           random_state = 1)
        return LassoCV(fit_intercept = True,
                       n_alphas = n_alpha,
                       max_iter = n_iter,
                       cv = cv,
                       n_jobs=-1)
    
    ### Step 1: Select to zero
    model_lasso = _make_lasso_cv()
    model_lasso.fit(cf_train, y_train)
    
    # Get & select only active candidate models
    active = model_lasso.coef_.astype(bool)
    active_cf_train = cf_train[:, active]
    active_cf_pred  = cf_pred[: , active]

    ### Step 2: Shrink towards equality
    # Check if active candidate models exist
    if active_cf_train.shape[1] > 0:
        
        # Equal-weight average of the active candidates (training sample)
        mean_cf = active_cf_train.mean(axis = 1)
    
        # Fit the deviation from the equal-weight average
        model_elasso = _make_lasso_cv()
        model_elasso.fit(active_cf_train, (y_train-mean_cf))
    
        # Predict: the model explains y - mean_cf, so the equal-weight average of the
        # prediction row has to be added back to obtain a forecast of y itself
        # (previously only the predicted deviation was returned)
        pred = active_cf_pred.mean(axis = 1)[0] + model_elasso.predict(active_cf_pred)[0]
    
    else:
        print("No active candidate models")
        
        # Set Prediction to mean
        pred = y_train.mean()
    
    # Return Prediction
    return(pred)

## Average-Best Forecast Combination

In [20]:
# Individual-based average-best forecast combination
def avg_best(y_train, cf_train, cf_pred, n_range):
    
    """
    Average-best forecast combination.

    Candidates are ranked by their mean squared error on the training sample;
    for every n in n_range the forecast is the plain average of the n
    best-ranked candidates' predictions in cf_pred.

    Returns an array with one forecast per element of n_range.
    """
    
    # Squared errors: y_train as a column against every candidate column
    y_col = np.asarray(y_train)[..., np.newaxis]
    se = (y_col - cf_train) ** 2

    # Candidates ordered from lowest to highest in-sample MSE
    ranking = np.argsort(np.mean(se, axis = 0))

    # One slot per subset size
    predictions = np.full(len(n_range), np.nan)
    for pos, n in enumerate(n_range):
        # Average over the n best candidates
        predictions[pos] = np.mean(cf_pred[:, ranking[:n]])
        
    return predictions

## Best Subset Selection of Forecasts (Gurobi)

In [56]:
# Best Selection of Forecasts
def bssf(y_train, cf_train, cf_pred, alpha, n_sub, bssf_timeout, method):
    
    """
    Best subset selection of forecasts: pick n_sub candidates and average them.

    The equal weight 1/n_sub is folded into the candidate forecasts, so the
    combined forecast is `solution @ cf_pred.T` with a binary selection vector.

    Parameters
    ----------
    y_train : target values of the training sample
    cf_train : candidate forecasts of the training sample (rows: time, columns: candidates)
    cf_pred : candidate forecasts for the observation to predict (one row)
    alpha : penalty weight of the cardinality term (only used by method "qubo")
    n_sub : number of candidates to select
    bssf_timeout : Gurobi time limit
    method : "qubo"  - unconstrained binary quadratic problem; the subset size enters
                       through the alpha-penalty in Q
             "qcbo"  - least-squares objective with an explicit L0-norm constraint

    Returns
    -------
    (forecast, solution) where solution is the binary selection vector.

    Raises
    ------
    ValueError
        If method is neither "qubo" nor "qcbo".
    """
    
    # Fail early on an unknown method (otherwise `solution` would never be defined)
    if method not in ("qubo", "qcbo"):
        raise ValueError(f"Unknown method '{method}': expected 'qubo' or 'qcbo'.")
    
    # Adapt X-Matrix: fold the equal weight 1/n_sub into the candidate forecasts
    cf_train  =  cf_train / n_sub
    cf_pred   =  cf_pred  / n_sub
    
    # Note: an earlier variant solved the QUBO with dimod / D-Wave samplers
    # (BinaryQuadraticModel.from_qubo(Q) + SimulatedAnnealingSampler); both paths below use Gurobi.
    
    if method == "qubo":
        
        # Generate Q-Matrix (only this path needs it).
        # Plain ndarrays are used instead of np.mat, which no longer exists in NumPy 2.0.
        n_cand   =  cf_train.shape[1]
        aux_mat  =  np.array(y_train.transpose() @ cf_train + alpha * n_sub)
        Q        =  - 2 * np.diag(aux_mat) + cf_train.transpose() @ cf_train + alpha * np.ones((n_cand, n_cand))
    
        # Set up Model
        model = gp.Model()
        model.Params.TimeLimit = bssf_timeout
        
        # Decision Variables
        b = model.addMVar(shape=Q.shape[0], vtype=gp.GRB.BINARY, name="b")
        
        # Objective Function
        model.setObjective(b @ Q @ b, gp.GRB.MINIMIZE)
        
        # Optimize
        model.optimize()
        solution = np.array(model.x)
        
    else:
        
        # Set up Model
        model = gp.Model()
        model.params.timelimit = bssf_timeout

        # Decision Variables (norm_0 is fixed to n_sub through its bounds)
        b = model.addMVar(shape=cf_train.shape[1], vtype=gp.GRB.BINARY, name="b")
        norm_0 = model.addVar(lb=n_sub, ub=n_sub, name="norm")

        # Objective Function: squared error of the equal-weight combination
        model.setObjective(b.T @ cf_train.T @ cf_train @ b
                           - 2*y_train.T @ cf_train @ b
                           + np.dot(y_train, y_train), gp.GRB.MINIMIZE)

        # L0-Norm Constraint
        model.addGenConstrNorm(norm_0, b, which=0, name="budget")

        # Optimize (the last entry of model.x is norm_0, not a selection variable)
        model.optimize()
        solution = np.array(model.x)[:-1]
    
    # Test Solution
    if np.sum(solution) != n_sub:
        print(f"Warning: Number of selected features does not match --- {np.sum(solution)} instead of {n_sub}!")
    
    # Prediction 
    pred = solution @ cf_pred.transpose()
    
    # Return 
    return(pred[0], solution)

In [18]:
# Small test configuration for the data simulation (used by the sim_data call below)
n_obs = 100          # number of simulated observations
init  =  50          # first forecast index: candidate forecasts are built for t in range(init, n_obs)
n_preds = 4          # number of simulated predictors
b = 1                # passed to sim_data as `non_zero` (number of non-zero coefficients)
bernoulli_p = 0.2    # passed to sim_data as `p` (probability of a negative coefficient sign)
rho = 0.5            # correlation used to fill the predictor covariance matrix
scenario = 1         # covariance layout: 1 = all predictors pairwise correlated
snr = 3              # signal-to-noise ratio used to scale the noise
r = 123              # random seed passed to sim_data

In [22]:
# Small test configuration for the candidate models
kfolds = 2            # not referenced in the cells visible here — TODO confirm where it is used
krepeats = 1          # not referenced in the cells visible here — TODO confirm where it is used
k_range = [1, 2]      # subset sizes for the subset regressions
cr_range = [1]        # numbers of components for the compressed regressions
rep_range = range(5)  # random states of the compressed regressions (one projection per value)
n_core = 1            # number of worker processes
ran_st = 123          # not referenced in the cells visible here — TODO confirm where it is used

In [23]:
### Simulate Data ###
# Note: `b` is passed as sim_data's `non_zero` argument and `bernoulli_p` as its `p`
y, X, pred_names = sim_data(n_obs, n_preds, b, bernoulli_p, rho, scenario, snr, r)

### Create Candidate Models ###
# One row per t in range(init, n_obs), one column per candidate model
cand_forecasts = candidate_models(y, X, range(init, n_obs), k_range, cr_range, rep_range, n_core)

In [25]:
# Inspect the raw candidate-forecast array (rows: forecast dates, columns: candidate models)
cand_forecasts

array([[ 2.22498112e-01,  2.66514093e-02, -2.02284461e-02,
        -2.38016593e-02,  2.20830563e-01,  2.18093790e-01,
         2.27956356e-01,  4.69403908e-03,  1.53767655e-02,
        -5.84781784e-03,  1.06022326e-01, -5.14908155e-02,
        -4.30154655e-02,  9.36764933e-02, -8.90364084e-02],
       [-4.79778224e-02, -6.95853328e-02, -2.43829477e-01,
        -4.42537477e-01, -4.78686705e-02, -6.95719767e-02,
         6.06128623e-02, -2.31681828e-01, -3.79339851e-01,
        -3.80673698e-01, -4.47863327e-01,  2.07963589e-01,
         1.04436848e-01,  5.21464512e-01, -4.92083866e-02],
       [ 4.84182703e-01, -1.73668432e-01, -1.63966759e-01,
         1.54506407e-01,  4.86681499e-01,  4.51990470e-01,
         4.60800553e-01, -1.87892246e-01,  6.34529447e-02,
        -5.02224790e-02,  2.51804805e-01,  1.00031148e-01,
        -2.87307753e-01, -9.13246412e-02, -8.46176101e-02],
       [ 1.14779557e-01, -1.82276288e-01,  2.48370986e-01,
         4.36317479e-01,  1.12823421e-01,  1.49557875

In [26]:
# Same array shown as a DataFrame for a tabular view
pd.DataFrame(cand_forecasts)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.222498,0.026651,-0.020228,-0.023802,0.220831,0.218094,0.227956,0.004694,0.015377,-0.005848,0.106022,-0.051491,-0.043015,0.093676,-0.089036
1,-0.047978,-0.069585,-0.243829,-0.442537,-0.047869,-0.069572,0.060613,-0.231682,-0.37934,-0.380674,-0.447863,0.207964,0.104437,0.521465,-0.049208
2,0.484183,-0.173668,-0.163967,0.154506,0.486681,0.45199,0.460801,-0.187892,0.063453,-0.050222,0.251805,0.100031,-0.287308,-0.091325,-0.084618
3,0.11478,-0.182276,0.248371,0.436317,0.112823,0.149558,-0.018894,0.190639,0.288022,0.413627,0.50983,-0.310167,-0.217228,-0.768337,-0.071875
4,-0.319761,0.187043,0.100741,0.134221,-0.313431,-0.287685,-0.401649,0.165092,0.228809,0.157413,0.138013,-0.447666,-0.116646,-0.431286,-0.091987
5,-0.128517,0.183409,-0.694249,-0.248165,-0.121087,-0.207783,-0.078503,-0.572376,-0.084523,-0.666072,-0.383446,0.041262,-0.455453,0.25803,-0.145884
6,0.75857,0.391676,0.466075,0.117219,0.763298,0.799139,0.760076,0.574178,0.318559,0.45702,0.627897,-0.191317,0.331826,0.441447,-0.084956
7,-0.008429,0.106267,0.686839,0.350751,-0.003474,0.097938,-0.123396,0.676049,0.357576,0.732347,0.579491,-0.514761,0.240812,-0.544983,-0.034208
8,1.616201,0.47587,0.302482,0.666981,1.612732,1.608689,1.472922,0.466116,0.816237,0.536796,1.369351,-0.211394,-0.236602,0.107902,-0.140873
9,1.168761,0.233615,0.629712,0.21665,1.170552,1.186075,1.188718,0.656553,0.307273,0.604038,0.815859,0.037632,0.46033,0.569953,-0.05036


---

# Simulation

### Setting 1:

In [77]:
###### MC-Parameter ######
# Number of MC-Runs and number of worker processes
n_mc   =  1000
n_core =  25

# Set Parameter
n_obs       =  250               # observations per simulated data set
n_preds     =  10                # predictors per simulated data set
init        =  50                # first forecast index: forecasts are built for t in range(init, n_obs)
mlags       =  0
b_range     =  [2, 5, 8]         # passed to sim_data as `non_zero` (number of non-zero coefficients)
bernoulli_p =  0.2               # passed to sim_data as `p`
corr_range  =  [0.2, 0.5, 0.8]   # predictor correlations (rho)
scenario_range = [1, 2]          # covariance layouts of sim_data
snr_range   =  [0.3, 0.5]        # signal-to-noise ratios

###### Parameter Subset Forecasts ######
# Subset Lengths
k_range_ssf = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 

# Upper Bound for Subset-Sizes
# NOTE(review): the cap only enters the count below; candidate_models_t as shown does not
# sub-sample, so n_sub is only correct while no subset size exceeds n_max models
n_max = 10000  

# Number of Subset-Forecasts
if np.sum(k_range_ssf) == 0:
    n_sub  =  0
else:
    n_sub  =  int(sum([min(n_models(n_preds, k), n_max) for k in k_range_ssf]))

###### Parameter Compressed Regressions ######
# Number of Components for Compressed Regression ([0] switches them off)
k_range_cr  =  [0] #[1, 2, 3, 4] 

# Number of runs for each random projection ([0] switches them off)
rep_range_cr  =  [0] #range(0, 60) 

# Number of Compressed-Regression-Forecasts
# Same "switched off" test as in candidate_models_t (sum == 0); the former len() check
# counted forecasts for rep_range_cr = [0] that candidate_models_t never produces
if (np.sum(k_range_cr) == 0) or (np.sum(rep_range_cr) == 0):
    n_cr = 0
else: 
    n_cr  =  len(k_range_cr) * len(rep_range_cr)

# ###### Parameter Decision Tree ######
# dt_range  =  range(0, 5) 
# 
# # Number of Decision Trees
# n_dt  =  len(dt_range)

###### Parameter LARS ######
k_range_lars = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

###### Parameter Forward Stepwise Selection ######
k_range_fss = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

###### Parameter peLasso ######
n_alpha        = 200
n_iter_peL     = 1000
cv_splits_peL  = 5
cv_repeats_peL = 1

###### Parameter Average-Best ######
# 1023 is the total number of candidate models for the settings above
k_range_avg_best = [1, 10, 25, 50, 100, 1023]

###### Parameter BSSF ######
bssf_alpha   =  2500
bssf_timeout =  3.0
k_range_bssf =  [1, 5, 10, 25, 50, 100, 250, 500, 750, 1023] 
n_bssf       =  len(k_range_bssf)
bssf_method  =  "qcbo"

######## Objects ########
# Number of simulated data sets: product of the lengths of all simulation loops.
# Computed once so that every result array is guaranteed to share the same first dimension.
n_runs_total = len(scenario_range) * len(snr_range) * len(corr_range) * len(b_range) * n_mc

# Set up Matrices for Results (NaN marks entries that have not been filled yet)
cand_forecasts   = np.full((len(range(init, n_obs)), (n_sub + n_cr)), np.nan)  

benchmark        = np.full( n_runs_total, np.nan)
se_benchmark     = np.full( n_runs_total, np.nan)

cf_weights       = np.full((n_runs_total, n_bssf, (n_sub + n_cr)), np.nan)
bssf_forecast    = np.full((n_runs_total, n_bssf), np.nan)
se_bssf_forecast = np.full((n_runs_total, n_bssf), np.nan)

csr_forecast     = np.full((n_runs_total, len(k_range_ssf)), np.nan)
se_csr_forecast  = np.full((n_runs_total, len(k_range_ssf)), np.nan)

#lars_forecast    = np.full((n_runs_total, len(k_range_lars)), np.nan)
#se_lars_forecast = np.full((n_runs_total, len(k_range_lars)), np.nan)

#fss_forecast    = np.full((n_runs_total, len(k_range_fss)), np.nan)
#se_fss_forecast = np.full((n_runs_total, len(k_range_fss)), np.nan)

#pelasso_forecast    = np.full( n_runs_total, np.nan)
#se_pelasso_forecast = np.full( n_runs_total, np.nan)

#avg_best_forecast    = np.full((n_runs_total, len(k_range_avg_best)), np.nan)
#se_avg_best_forecast = np.full((n_runs_total, len(k_range_avg_best)), np.nan)

###### Start ######
# Column boundaries of the subset-size groups inside the first n_sub candidate forecasts
# (one group per k in k_range_ssf). They depend only on n_preds and k_range_ssf, so they are
# computed once up front; math.comb counts the subsets without enumerating every combination.
csr_bounds = np.cumsum([0] + [math.comb(n_preds, k) for k in k_range_ssf])

# Loop over Monte-Carlo Runs
i = 0
program_starts = time.time()
for r in tqdm(range(n_mc)):

    # Loop over Covariance-Scenario, Signal-to-Noise-Ratio, Covariance-Sets and Coefficient-Sets.
    # product() varies its right-most argument fastest, so b changes fastest and scenario slowest;
    # the strided selection in the evaluation step relies on exactly this ordering of i.
    for scenario, snr, rho, b in product(scenario_range, snr_range, corr_range, b_range):

        ### Simulate Data ###
        # b is passed as sim_data's `non_zero`, the MC run number r as its `random_state`
        y, X, pred_names = sim_data(n_obs, n_preds, b, bernoulli_p, rho, scenario, snr, r)

        ### Create Candidate Models ###
        cand_forecasts = candidate_models(y, X, range(init, n_obs), k_range_ssf, k_range_cr, rep_range_cr, n_core)

        ### Benchmark: PHM ###
        # Mean of all observations except the last one, scored against the last observation
        benchmark[i]    = y[:-1].mean()
        se_benchmark[i] = (y[-1] - benchmark[i]) ** 2

        ### Benchmark: Complete Subset Regression ###
        # Average the last-period subset forecasts within each subset-size group
        last_subset_forecasts = cand_forecasts[-1, :n_sub]
        csr_forecast[i]     =  [np.mean(last_subset_forecasts[lo:hi]) for lo, hi in zip(csr_bounds[:-1], csr_bounds[1:])]
        se_csr_forecast[i]  =  (y[-1] - csr_forecast[i]) ** 2

        ### Benchmark: LARS ###
        #lars_forecast[i]    = lars(y[init:-1], cand_forecasts[:-1], cand_forecasts[[-1]], k_range_lars)
        #se_lars_forecast[i] = (y[-1] - lars_forecast[i]) ** 2

        ### Benchmark: Forward Stepwise Selection ###
        #fss_forecast[i]    = fss(y[init:-1], cand_forecasts[:-1], cand_forecasts[[-1]], k_range_fss, n_core)
        #se_fss_forecast[i] = (y[-1] - fss_forecast[i]) ** 2

        ### Benchmark: peLASSO ###
        #pelasso_forecast[i]    = peLASSO(y[init:-1], cand_forecasts[:-1], cand_forecasts[[-1]], n_alpha, n_iter_peL, cv_splits_peL, cv_repeats_peL)
        #se_pelasso_forecast[i] = (y[-1] - pelasso_forecast[i]) ** 2

        ### Benchmark: Average-Best ###
        #avg_best_forecast[i]    = avg_best(y[init:-1], cand_forecasts[:-1], cand_forecasts[[-1]], k_range_avg_best)
        #se_avg_best_forecast[i] = (y[-1] - avg_best_forecast[i]) ** 2

        ### Best Selection of Forecast ###
        # One bssf() call per combination size; each call returns a (forecast, weights) pair,
        # which zip(*) regroups into all forecasts and all weight vectors of this run.
        bssf_results = [bssf(y[init:-1], cand_forecasts[:-1], cand_forecasts[[-1]], bssf_alpha, s, bssf_timeout, bssf_method) for s in k_range_bssf]
        bssf_forecast[i], cf_weights[i] = zip(*bssf_results)
        se_bssf_forecast[i] = (y[-1] - bssf_forecast[i]) ** 2

        # Update index
        i += 1

# Time
program_ends = time.time()
         
# Candidate-Model-Names
# cf_weights is allocated with n_sub + n_cr columns, so a group that contributes zero candidates
# must contribute zero names as well, even if its range list still holds a placeholder value.
# Otherwise the names no longer line up with the weight columns when used as DataFrame columns.
ssf_names = [f"SSF{k}_" + "_".join(map(str, sub)) for k in k_range_ssf for sub in combinations(range(n_preds), k)] if n_sub > 0 else []
cr_names  = [f"CR{n_comp}_{r}" for n_comp in k_range_cr for r in rep_range_cr] if n_cr > 0 else []

### Evaluation ###
# Set up Lengths: number of runs in total and after filtering on b, then rho, then snr
seq_1 = n_mc * len(scenario_range) * len(snr_range) * len(corr_range) * len(b_range)
seq_2 = n_mc * len(scenario_range) * len(snr_range) * len(corr_range)
seq_3 = n_mc * len(scenario_range) * len(snr_range)
seq_4 = n_mc * len(scenario_range)


def _format_oos_r2(oos_r2):
    """Render OOS-R2 value(s) with two decimals and without brackets (text stored in the CSV)."""
    return np.array2string(oos_r2, formatter={'float_kind':'{0:.2f}'.format}).replace('[', '').replace(']', '')


# Values that are identical for every result row
elapsed_minutes = np.floor((program_ends - program_starts) / 60)
output_path     = path + "/Results/my_csv.csv"

# Make sure the target directory exists so the append below cannot fail after the long simulation
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Loop over all combinations
for b in range(len(b_range)):
    for rho in range(len(corr_range)):
        for snr in range(len(snr_range)):
            for scenario in range(len(scenario_range)):

                # Result slots belonging to this combination. The simulation filled the slots with
                # b varying fastest, then rho, snr and scenario, so each strided selection peels
                # off one of these dimensions and leaves one slot per MC run.
                run_idx = np.arange(seq_1)[np.arange(b, seq_1, len(b_range))][np.arange(rho, seq_2, len(corr_range))][np.arange(snr, seq_3, len(snr_range))][np.arange(scenario, seq_4, len(scenario_range))]

                # Squared errors of the Forecast Combination Methods
                se_bssf      =  se_bssf_forecast[run_idx]
                se_csr       =  se_csr_forecast[run_idx]
                #se_lars      =  se_lars_forecast[run_idx]
                #se_fss       =  se_fss_forecast[run_idx]
                #se_pelasso   =  se_pelasso_forecast[run_idx]
                #se_avg_best  =  se_avg_best_forecast[run_idx]

                # Squared errors of the Benchmark
                se_phm  = se_benchmark[run_idx]

                # OOS-R2 in percent: 100 * (1 - SSE_method / SSE_benchmark), one value per column
                oos_r2_bssf = 100 * (1 - sum(se_bssf) / sum(se_phm))
                oos_r2_csr  = 100 * (1 - sum(se_csr)  / sum(se_phm))

                # Settings shared by all rows of this combination. Key order is kept stable because
                # rows are appended to an existing CSV whose header is written only once.
                base_row = { "n_mc": n_mc, "n_obs": n_obs, "n_preds": n_preds, "init": init, "mlags": mlags,
                             "rho": corr_range[rho], "scenario": scenario_range[scenario], "bernoulli_p": bernoulli_p,
                             "snr": snr_range[snr], "b": b_range[b],
                             "k_range_ssf": k_range_ssf, "k_range_cr": k_range_cr, "rep_range_cr": rep_range_cr,
                             "k_range_lars": k_range_lars, "k_range_fss": k_range_fss,
                             "n_alpha": n_alpha, "n_iter_peL": n_iter_peL, "cv_splits_peL": cv_splits_peL, "cv_repeats_peL": cv_repeats_peL,
                             "k_range_avg_best": k_range_avg_best,
                             "bssf_alpha": bssf_alpha, "bssf_timeout": bssf_timeout, "bssf_range": k_range_bssf,
                             "error": 0, "time" : elapsed_minutes }

                # Create Rows
                new_row_bssf = {**base_row, "type": "BSSF", "OOS-R2": _format_oos_r2(oos_r2_bssf)}
                new_row_csr  = {**base_row, "type": "CSR",  "OOS-R2": _format_oos_r2(oos_r2_csr)}
                #new_row_lars = {**base_row, "type": "LARS",       "OOS-R2": _format_oos_r2(100 * (1 - sum(se_lars) / sum(se_phm)))}
                #new_row_fss  = {**base_row, "type": "FSS",        "OOS-R2": _format_oos_r2(100 * (1 - sum(se_fss) / sum(se_phm)))}
                #new_row_peL  = {**base_row, "type": "peLASSO",    "OOS-R2": _format_oos_r2(100 * (1 - sum(se_pelasso) / sum(se_phm)))}
                #new_row_avgB = {**base_row, "type": "Avg_Best_N", "OOS-R2": _format_oos_r2(100 * (1 - sum(se_avg_best) / sum(se_phm)))}

                # Add Rows
                new_rows = pd.DataFrame.from_dict([new_row_bssf,
                                                   new_row_csr,
                                                   #new_row_lars,
                                                   #new_row_fss,
                                                   #new_row_peL,
                                                   #new_row_avgB
                                                   ])

                # Add to CSV (the header is written only when the file does not exist yet)
                new_rows.to_csv(output_path, mode='a', header=not os.path.exists(output_path))

                # Print Results (the scenario is reported by value, like SNR, Corr. and Betas)
                print(f"BSSF:        Avg. OOS-R2 for Scenario {scenario_range[scenario]}, SNR {snr_range[snr]}, Corr. {corr_range[rho]} and Betas {b_range[b]} is: " + str(np.round(oos_r2_bssf, 2)) + "%")
                print(f"CSR:         Avg. OOS-R2 for Scenario {scenario_range[scenario]}, SNR {snr_range[snr]}, Corr. {corr_range[rho]} and Betas {b_range[b]} is: " + str(np.round(oos_r2_csr, 2)) + "%")
                #print(f"LARS:       Avg. OOS-R2 for Scenario {scenario_range[scenario]}, SNR {snr_range[snr]}, Corr. {corr_range[rho]} and Betas {b_range[b]} is: " + str(np.round(100 * (1 - sum(se_lars) / sum(se_phm)), 2)) + "%")
                #print(f"FSS:        Avg. OOS-R2 for Scenario {scenario_range[scenario]}, SNR {snr_range[snr]}, Corr. {corr_range[rho]} and Betas {b_range[b]} is: " + str(np.round(100 * (1 - sum(se_fss)  / sum(se_phm)), 2)) + "%")
                #print(f"peLASSO:    Avg. OOS-R2 for Scenario {scenario_range[scenario]}, SNR {snr_range[snr]}, Corr. {corr_range[rho]} and Betas {b_range[b]} is: " + str(np.round(100 * (1 - sum(se_pelasso)  / sum(se_phm)), 2)) + "%")
                #print(f"Avg-Best N: Avg. OOS-R2 for Scenario {scenario_range[scenario]}, SNR {snr_range[snr]}, Corr. {corr_range[rho]} and Betas {b_range[b]} is: " + str(np.round(100 * (1 - sum(se_avg_best) / sum(se_phm)), 2)) + "%")

100%|██████████| 10/10 [59:02<00:00, 354.29s/it]

BSSF:        Avg. OOS-R2 for Scenario 0, SNR 0.3, Corr. 0.2 and Betas 2 is: [47.32 49.03 48.8  48.77 48.74]%
CSR:         Avg. OOS-R2 for Scenario 0, SNR 0.3, Corr. 0.2 and Betas 2 is: [11.06 18.58 24.36 29.11 33.17 36.67 39.69 42.27 44.44 46.21]%
BSSF:        Avg. OOS-R2 for Scenario 1, SNR 0.3, Corr. 0.2 and Betas 2 is: [39.99 41.63 42.43 42.78 42.77]%
CSR:         Avg. OOS-R2 for Scenario 1, SNR 0.3, Corr. 0.2 and Betas 2 is: [ 8.38 15.59 21.73 26.87 31.09 34.46 37.04 38.88 40.06 40.63]%
BSSF:        Avg. OOS-R2 for Scenario 0, SNR 0.5, Corr. 0.2 and Betas 2 is: [55.07 58.84 58.41 58.75 58.56]%
CSR:         Avg. OOS-R2 for Scenario 0, SNR 0.5, Corr. 0.2 and Betas 2 is: [13.98 23.32 30.39 36.14 41.   45.17 48.76 51.81 54.35 56.4 ]%
BSSF:        Avg. OOS-R2 for Scenario 1, SNR 0.5, Corr. 0.2 and Betas 2 is: [49.13 51.73 54.44 53.8  54.24]%
CSR:         Avg. OOS-R2 for Scenario 1, SNR 0.5, Corr. 0.2 and Betas 2 is: [10.7  19.9  27.73 34.3  39.7  44.03 47.37 49.79 51.38 52.22]%
BSSF:   




### Setting 2:

In [None]:
###### MC-Parameter ######
# Monte-Carlo repetitions and the core count handed to candidate_models()
n_mc   = 10
n_core = 6

# Simulation design
n_obs      = 250
n_preds    = 8
init       = 50
mlags      = 0
corr_range = [0.0, 0.5, 0.95]

# Coefficient sets: the first 1, 4 or 8 entries are 1.0, the remaining ones 0.0
b_range = np.array([[1.0] * n_nonzero + [0.0] * (n_preds - n_nonzero) for n_nonzero in (1, 4, 8)])

###### Parameter Subset Forecasts ######
# Subset Lengths
k_range = list(range(1, 9))

# Upper Bound for Subset-Sizes
n_max = 10000

# Number of Subset-Forecasts (a k_range summing to zero switches them off)
n_sub = 0 if np.sum(k_range) == 0 else int(sum(min(n_models(n_preds, k), n_max) for k in k_range))

###### Parameter Compressed Regressions ######
# Number of Components for Compressed Regression
cr_range = [0]  # [1, 2, 3, 4]

# Number of runs for each random projection
rep_range = [0]  # range(0, 60)

# Number of Compressed-Regression-Forecasts (off when cr_range sums to zero or rep_range is empty)
n_cr = 0 if (np.sum(cr_range) == 0) or (len(rep_range) == 0) else len(cr_range) * len(rep_range)

# ###### Parameter Decision Tree ######
# dt_range  =  range(0, 5)
#
# # Number of Decision Trees
# n_dt  =  len(dt_range)

###### Parameter LARS ######
n_range_lars = [1, 10, 25, 50, 100, 255]

###### Parameter Forward Stepwise Selection ######
n_range_fss = list(range(1, 11))

###### Parameter peLasso ######
n_alpha        = 200
n_iter_peL     = 1000
cv_splits_peL  = 4
cv_repeats_peL = 1

###### Parameter Average-Best ######
n_range_avg_best = [1, 10, 25, 50, 100, 255]

###### Parameter BSSF ######
alpha        = 12.5
bssf_timeout = 1
bssf_range   = [1, 10, 25, 50, 100, 255]
n_bssf       = len(bssf_range)

######## Objects ########
# One result slot per (MC run, correlation, coefficient set) combination
n_runs_total = len(corr_range) * len(b_range) * n_mc

# Number of candidate models: subset forecasts followed by compressed regressions
n_cand = n_sub + n_cr

# Candidate forecasts: one row per out-of-sample period, one column per candidate model
cand_forecasts = np.full((len(range(init, n_obs)), n_cand), np.nan)

# Benchmark forecast and its squared error (one value per run)
benchmark    = np.full(n_runs_total, np.nan)
se_benchmark = np.full(n_runs_total, np.nan)

# BSSF: selected weights, forecasts and squared errors (one column per combination size)
cf_weights       = np.full((n_runs_total, n_bssf, n_cand), np.nan)
bssf_forecast    = np.full((n_runs_total, n_bssf), np.nan)
se_bssf_forecast = np.full((n_runs_total, n_bssf), np.nan)

# Complete Subset Regression (one column per subset size)
csr_forecast    = np.full((n_runs_total, len(k_range)), np.nan)
se_csr_forecast = np.full((n_runs_total, len(k_range)), np.nan)

# LARS (one column per entry of n_range_lars)
lars_forecast    = np.full((n_runs_total, len(n_range_lars)), np.nan)
se_lars_forecast = np.full((n_runs_total, len(n_range_lars)), np.nan)

# Forward Stepwise Selection (one column per entry of n_range_fss)
fss_forecast    = np.full((n_runs_total, len(n_range_fss)), np.nan)
se_fss_forecast = np.full((n_runs_total, len(n_range_fss)), np.nan)

# peLASSO (one value per run)
pelasso_forecast    = np.full(n_runs_total, np.nan)
se_pelasso_forecast = np.full(n_runs_total, np.nan)

# Average-Best (one column per entry of n_range_avg_best)
avg_best_forecast    = np.full((n_runs_total, len(n_range_avg_best)), np.nan)
se_avg_best_forecast = np.full((n_runs_total, len(n_range_avg_best)), np.nan)

###### Start ######
# Column boundaries of the subset-size groups inside the first n_sub candidate forecasts
# (one group per k in k_range). They depend only on n_preds and k_range, so they are computed
# once up front; math.comb counts the subsets without enumerating every combination.
csr_bounds = np.cumsum([0] + [math.comb(n_preds, k) for k in k_range])

# Loop over Monte-Carlo Runs
i = 0
for r in tqdm(range(n_mc)):

    # Loop over Covariance-Sets and Coefficient-Sets.
    # product() varies its right-most argument fastest, so b changes fastest; the strided
    # selection in the evaluation step relies on exactly this ordering of i.
    for p, b in product(corr_range, b_range):

        ### Simulate Data ###
        # NOTE(review): the simulator visible at the top of the notebook is
        # sim_data(n_obs, n_preds, non_zero, p, rho, scenario, snr, random_state);
        # confirm that a five-argument `sim` is still defined before running this cell.
        y, X, pred_names = sim(n_obs, n_preds, b, p, r)

        ### Create Candidate Models ###
        cand_forecasts = candidate_models(y, X, range(init, n_obs), k_range, cr_range, rep_range, n_core)

        #print(f"Computing Covariance {p} and Coefficients {b}")

        ### Benchmark: PHM ###
        # Mean of all observations except the last one, scored against the last observation
        benchmark[i]    = y[:-1].mean()
        se_benchmark[i] = (y[-1] - benchmark[i]) ** 2

        ### Benchmark: Complete Subset Regression ###
        # Average the last-period subset forecasts within each subset-size group
        last_subset_forecasts = cand_forecasts[-1, :n_sub]
        csr_forecast[i]     =  [np.mean(last_subset_forecasts[lo:hi]) for lo, hi in zip(csr_bounds[:-1], csr_bounds[1:])]
        se_csr_forecast[i]  =  (y[-1] - csr_forecast[i]) ** 2

        ### Benchmark: LARS ###
        #lars_forecast[i]    = lars(y[init:-1], cand_forecasts[:-1], cand_forecasts[[-1]], n_range_lars)
        #se_lars_forecast[i] = (y[-1] - lars_forecast[i]) ** 2

        ### Benchmark: Forward Stepwise Selection ###
        #fss_forecast[i]    = fss(y[init:-1], cand_forecasts[:-1], cand_forecasts[[-1]], n_range_fss, n_core)
        #se_fss_forecast[i] = (y[-1] - fss_forecast[i]) ** 2

        ### Benchmark: peLASSO ###
        #pelasso_forecast[i]    = peLASSO(y[init:-1], cand_forecasts[:-1], cand_forecasts[[-1]], n_alpha, n_iter_peL, cv_splits_peL, cv_repeats_peL)
        #se_pelasso_forecast[i] = (y[-1] - pelasso_forecast[i]) ** 2

        ### Benchmark: Average-Best ###
        #avg_best_forecast[i]    = avg_best(y[init:-1], cand_forecasts[:-1], cand_forecasts[[-1]], n_range_avg_best)
        #se_avg_best_forecast[i] = (y[-1] - avg_best_forecast[i]) ** 2

        ### Best Selection of Forecast ###
        # The sixth argument is the timeout configured in this cell (bssf_timeout); the previous
        # name `n_times` is not defined anywhere in this cell.
        # NOTE(review): Setting 1 additionally passes bssf_method as a seventh argument;
        # confirm that bssf() provides a default for it.
        bssf_results = [bssf(y[init:-1], cand_forecasts[:-1], cand_forecasts[[-1]], alpha, s, bssf_timeout) for s in bssf_range]
        bssf_forecast[i], cf_weights[i] = zip(*bssf_results)
        se_bssf_forecast[i] = (y[-1] - bssf_forecast[i]) ** 2

        # Update index
        i += 1
            
# Candidate-Model-Names
# cf_weights is allocated with n_sub + n_cr columns, so a group that contributes zero candidates
# must contribute zero names as well. With the placeholder cr_range = [0] (n_cr == 0) the
# unguarded comprehension would still emit "CR0_0" and the names would no longer line up with
# the weight columns when they are used as DataFrame columns in the plotting cells.
ssf_names = [f"SSF{k}_" + "_".join(map(str, sub)) for k in k_range for sub in combinations(range(n_preds), k)] if n_sub > 0 else []
cr_names  = [f"CR{n_comp}_{r}" for n_comp in cr_range for r in rep_range] if n_cr > 0 else []

### Evaluation ###
# Number of result slots in total and per coefficient set
n_slots       = n_mc * len(corr_range) * len(b_range)
n_slots_per_b = n_mc * len(corr_range)

for b in range(len(b_range)):
    for p in range(len(corr_range)):

        # Result slots of this (coefficient set, correlation) pair: every len(b_range)-th slot
        # starting at b, and of those every len(corr_range)-th one starting at p
        run_idx = np.arange(b, n_slots, len(b_range))[np.arange(p, n_slots_per_b, len(corr_range))]

        # Squared errors of the Forecast Combination Methods
        se_bssf      =  se_bssf_forecast[run_idx]
        se_csr       =  se_csr_forecast[run_idx]
        #se_lars      =  se_lars_forecast[run_idx]
        #se_fss       =  se_fss_forecast[run_idx]
        #se_pelasso   =  se_pelasso_forecast[run_idx]
        #se_avg_best  =  se_avg_best_forecast[run_idx]

        # Squared errors of the Benchmark
        se_phm  = se_benchmark[run_idx]

        # OOS-R2 in percent: 100 * (1 - SSE_method / SSE_benchmark), rounded for display
        oos_r2_bssf = np.round(100 * (1 - sum(se_bssf) / sum(se_phm)), 2)
        oos_r2_csr  = np.round(100 * (1 - sum(se_csr)  / sum(se_phm)), 2)

        # Print Results
        print(f"BSSF:       Avg. OOS-R2 for Corr. {corr_range[p]} and Betas {b_range[b]} is: " + str(oos_r2_bssf) + "%")
        print(f"CSR:        Avg. OOS-R2 for Corr. {corr_range[p]} and Betas {b_range[b]} is: " + str(oos_r2_csr) + "%")
        #print(f"LARS:       Avg. OOS-R2 for Corr. {corr_range[p]} and Betas {b_range[b]} is: " + str(np.round(100 * (1 - sum(se_lars) / sum(se_phm)), 2)) + "%")
        #print(f"FSS:        Avg. OOS-R2 for Corr. {corr_range[p]} and Betas {b_range[b]} is: " + str(np.round(100 * (1 - sum(se_fss)  / sum(se_phm)), 2)) + "%")
        #print(f"peLASSO:    Avg. OOS-R2 for Corr. {corr_range[p]} and Betas {b_range[b]} is: " + str(np.round(100 * (1 - sum(se_pelasso)  / sum(se_phm)), 2)) + "%")
        #print(f"Avg-Best N: Avg. OOS-R2 for Corr. {corr_range[p]} and Betas {b_range[b]} is: " + str(np.round(100 * (1 - sum(se_avg_best) / sum(se_phm)), 2)) + "%")

In [None]:
BSSF:       Avg. OOS-R2 for Corr. 0.0 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [5.3  5.15 5.24 5.32 5.43 4.8 ]%
CSR:        Avg. OOS-R2 for Corr. 0.0 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [1.73 3.09 4.11 4.78 5.12 5.12 4.78 4.09]%
LARS:       Avg. OOS-R2 for Corr. 0.0 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [ 2.07000000e+000 -3.79000000e+000 -9.96590000e+002 -3.57842000e+004 -5.30253757e+007 -1.88807783e+132]%
FSS:        Avg. OOS-R2 for Corr. 0.0 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [ 2.37  1.91  1.3   1.06  0.37 -0.16 -0.25 -0.18 -0.27 -0.72]%
peLASSO:    Avg. OOS-R2 for Corr. 0.0 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: -0.83%
Avg-Best N: Avg. OOS-R2 for Corr. 0.0 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [5.3  5.29 5.39 5.34 5.44 4.8 ]%
BSSF:       Avg. OOS-R2 for Corr. 0.5 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [7.72 7.61 7.65 7.66 7.7  7.08]%
CSR:        Avg. OOS-R2 for Corr. 0.5 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [4.06 5.47 6.41 7.09 7.52 7.72 7.66 7.35]%
LARS:       Avg. OOS-R2 for Corr. 0.5 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [ 2.66000000e+000 -1.35000000e+000 -4.59200000e+002 -1.55664300e+004 -4.68720454e+006 -6.82237413e+198]%
FSS:        Avg. OOS-R2 for Corr. 0.5 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [ 4.52  4.83  3.08  1.97  1.28  0.63  0.07 -1.   -1.32 -2.09]%
peLASSO:    Avg. OOS-R2 for Corr. 0.5 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: -1.95%
Avg-Best N: Avg. OOS-R2 for Corr. 0.5 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [7.72 7.75 7.7  7.84 7.82 7.08]%
BSSF:       Avg. OOS-R2 for Corr. 0.95 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [6.91 7.7  7.76 7.86 7.95 7.97]%
CSR:        Avg. OOS-R2 for Corr. 0.95 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [7.77 7.95 8.02 7.98 7.81 7.53 7.12 6.59]%
LARS:       Avg. OOS-R2 for Corr. 0.95 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [ 2.55000000e+00 -1.77000000e+00 -4.39920000e+02 -2.42853800e+04 -4.45249327e+15            -inf]%
FSS:        Avg. OOS-R2 for Corr. 0.95 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [ 5.41  3.26  1.83  1.68  1.27  0.83  0.69  1.44  0.42 -0.41]%
peLASSO:    Avg. OOS-R2 for Corr. 0.95 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: -3.45%
Avg-Best N: Avg. OOS-R2 for Corr. 0.95 and Betas [1. 0. 0. 0. 0. 0. 0. 0.] is: [6.91 7.31 7.62 7.76 7.93 7.97]%

BSSF:       Avg. OOS-R2 for Corr. 0.0 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [5.87 6.36 6.36 6.44 6.54 6.18]%
CSR:        Avg. OOS-R2 for Corr. 0.0 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [2.05 3.75 5.13 6.17 6.88 7.25 7.29 6.99]%
LARS:       Avg. OOS-R2 for Corr. 0.0 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [ 2.52000000e+000 -1.06000000e+001 -5.83570000e+002 -1.73518900e+004 -8.37694114e+007 -3.82112811e+163]%
FSS:        Avg. OOS-R2 for Corr. 0.0 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [3.76 3.67 3.52 3.34 3.54 3.07 2.32 2.43 2.04 2.19]%
peLASSO:    Avg. OOS-R2 for Corr. 0.0 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: -2.13%
Avg-Best N: Avg. OOS-R2 for Corr. 0.0 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [5.87 6.08 5.95 6.   6.32 6.18]%
BSSF:       Avg. OOS-R2 for Corr. 0.5 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [14.27 15.61 15.74 15.91 16.08 16.4 ]%
CSR:        Avg. OOS-R2 for Corr. 0.5 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [12.92 15.28 16.13 16.49 16.57 16.44 16.13 15.64]%
LARS:       Avg. OOS-R2 for Corr. 0.5 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [ 5.54000000e+000 -7.40000000e+000 -1.85594000e+003 -7.03743600e+004 -9.77012471e+014 -6.10886863e+267]%
FSS:        Avg. OOS-R2 for Corr. 0.5 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [12.86 12.05 11.95 11.76 10.65  9.94  9.11  8.54  8.4   8.15]%
peLASSO:    Avg. OOS-R2 for Corr. 0.5 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: -1.45%
Avg-Best N: Avg. OOS-R2 for Corr. 0.5 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [14.27 15.4  15.79 15.84 16.08 16.4 ]%
BSSF:       Avg. OOS-R2 for Corr. 0.95 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [23.45 24.72 24.82 24.87 24.93 25.1 ]%
CSR:        Avg. OOS-R2 for Corr. 0.95 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [25.07 25.21 25.2  25.1  24.91 24.63 24.25 23.77]%
LARS:       Avg. OOS-R2 for Corr. 0.95 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [ 7.72000000e+000 -7.37000000e+000 -4.14030000e+003 -3.77355250e+005 -5.00717015e+013 -1.08530823e+304]%
FSS:        Avg. OOS-R2 for Corr. 0.95 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [22.37 21.16 20.79 19.99 19.41 19.21 18.77 17.56 16.95 17.16]%
peLASSO:    Avg. OOS-R2 for Corr. 0.95 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: -1.81%
Avg-Best N: Avg. OOS-R2 for Corr. 0.95 and Betas [1. 1. 1. 1. 0. 0. 0. 0.] is: [23.45 24.42 24.6  24.73 24.83 25.1 ]%

BSSF:       Avg. OOS-R2 for Corr. 0.0 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [2.83 4.57 4.71 4.93 5.15 5.4 ]%
CSR:        Avg. OOS-R2 for Corr. 0.0 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [1.82 3.33 4.51 5.38 5.94 6.19 6.12 5.73]%
LARS:       Avg. OOS-R2 for Corr. 0.0 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [ 1.89000000e+000 -9.83000000e+000 -4.75880000e+002 -1.50787200e+004 -9.58904251e+006 -7.54728041e+147]%
FSS:        Avg. OOS-R2 for Corr. 0.0 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [ 2.23  0.88  0.61  0.52  0.76  0.28  0.09 -0.04 -0.7  -0.22]%
peLASSO:    Avg. OOS-R2 for Corr. 0.0 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: -3.02%
Avg-Best N: Avg. OOS-R2 for Corr. 0.0 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [2.83 3.68 4.15 4.34 4.8  5.4 ]%
BSSF:       Avg. OOS-R2 for Corr. 0.5 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [25.71 26.31 26.52 26.74 27.05 27.57]%
CSR:        Avg. OOS-R2 for Corr. 0.5 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [22.5  26.26 27.38 27.74 27.77 27.61 27.31 26.89]%
LARS:       Avg. OOS-R2 for Corr. 0.5 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [ 8.47000000e+000 -6.66000000e+000 -3.59007000e+003 -3.30016220e+005 -1.26080654e+012 -2.23377050e+203]%
FSS:        Avg. OOS-R2 for Corr. 0.5 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [24.15 23.91 22.67 23.14 23.11 23.4  23.01 23.16 22.43 22.45]%
peLASSO:    Avg. OOS-R2 for Corr. 0.5 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: 0.44%
Avg-Best N: Avg. OOS-R2 for Corr. 0.5 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [25.71 26.69 26.84 26.79 27.02 27.57]%
BSSF:       Avg. OOS-R2 for Corr. 0.95 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [38.32 39.43 39.56 39.58 39.65 39.82]%
CSR:        Avg. OOS-R2 for Corr. 0.95 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [39.81 39.94 39.92 39.83 39.66 39.42 39.1  38.71]%
LARS:       Avg. OOS-R2 for Corr. 0.95 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [ 1.15800000e+001 -7.48900000e+001 -1.15969600e+004 -9.39080260e+005 -9.88798732e+010 -5.78881317e+202]%
FSS:        Avg. OOS-R2 for Corr. 0.95 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [37.28 36.41 36.08 35.96 35.32 35.15 34.85 35.04 34.14 33.78]%
peLASSO:    Avg. OOS-R2 for Corr. 0.95 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: -0.08%
Avg-Best N: Avg. OOS-R2 for Corr. 0.95 and Betas [1. 1. 1. 1. 1. 1. 1. 1.] is: [38.32 39.1  39.37 39.44 39.57 39.82]%

In [None]:
# Candidate-model names (column order of cf_weights) and combination-size labels (row order)
model_names = [*ssf_names, *cr_names]
size_labels = [f"K={k}" for k in bssf_range]


def _to_long(matrix, value_name):
    """Turn a (combination size x candidate model) matrix into long format, one row per cell."""
    wide = pd.DataFrame(matrix, index = size_labels, columns = model_names)
    return wide.reset_index(names = "INDEX").melt(id_vars = 'INDEX', var_name = 'Candidate_Model', value_name = value_name)


# Create DataFrame: collect one long frame per setting and concatenate once at the end
# (growing a DataFrame with concat inside the loop copies all previous rows every time)
frames = []
for b in range(len(b_range)):
    for p in range(len(corr_range)):

        # Subset Weights of this (coefficient set, correlation) pair
        chosen_weights = cf_weights[np.arange(b, n_mc * len(corr_range) * len(b_range), len(b_range))][np.arange(p, n_mc * len(corr_range), len(corr_range))]

        # Aggregated over the MC runs; never-selected models become NaN so they are not drawn
        weight_sum = np.sum(chosen_weights, axis = 0)
        weight_sum[weight_sum == 0] = np.nan

        # y-position of every selected model: its 1-based column number. A plain 0/1 selection
        # indicator is used, so that aggregated weights below one cannot scale the position.
        position = np.where(np.isnan(weight_sum), np.nan, 1.0) * np.arange(1, weight_sum.shape[1] + 1)

        # Long format with Value (position) and Weight (aggregated weight) on the SAME row.
        # Both melts share the same row order, so the Weight column aligns by index. Stacking the
        # two frames vertically instead would leave every row with either Value or Weight missing.
        chosen_cm = _to_long(position, 'Value').assign(Weight = _to_long(weight_sum, 'Weight')['Weight'])

        # Transform Candidate Model Names to their family prefix (text before the first "_")
        chosen_cm['Candidate_Model'] = chosen_cm['Candidate_Model'].str.split('_').str[0]

        # Add Column for Correlation
        chosen_cm['Correlation'] = corr_range[p]

        # Add Column for Betas
        chosen_cm['Betas'] = str(b_range[b])

        # Collect
        frames.append(chosen_cm)

dataframe_plot = pd.concat(frames, ignore_index = True)

# Plot Theme
sns.set_theme(style="ticks")

# Plot Data: one facet per (Correlation, Betas), point size encodes the aggregated weight
g = sns.relplot(
    data = dataframe_plot,
    y = 'Value', x = 'INDEX',
    hue = "Candidate_Model", size = "Weight",
    col = "Betas", row = "Correlation",
    sizes = (3, 75), height = 5.0, aspect = 1.5,
    alpha = 0.75, palette = "muted",
    )

# Title & Axis
g.set(xlabel='Combination Size',
      ylabel='Candidate Models')

# y-ticks at the first column of every candidate-model family, derived from the names so that
# they always fit the current configuration (a fixed index list breaks as soon as the number
# of candidate models changes)
families    = [name.split("_")[0] for name in model_names]
idx         = [j for j, family in enumerate(families) if j == 0 or family != families[j - 1]]
tick_labels = [families[j] for j in idx]

## iterate over axes of FacetGrid
for ax in g.axes.flatten():
    ax.set_yticks(idx)
    ax.set_yticklabels(tick_labels, rotation='horizontal')

# Tick-Label Size
g.set_yticklabels(size = 8)
g.set_xticklabels(size = 10)

plt.show()

In [None]:
# One figure per (beta set, correlation) scenario showing which candidate
# models were selected for each combination size K.
for b in range(len(b_range)):
    for p in range(len(corr_range)):
        
        # Subset Weights: first keep every len(b_range)-th run starting at b,
        # then every len(corr_range)-th of those starting at p.
        # NOTE(review): assumes the results were stored with the beta set varying
        # fastest, then the correlation, then the Monte-Carlo run -- confirm against the simulation loop.
        chosen_cm = cf_weights[np.arange(b, n_mc * len(corr_range) * len(b_range), len(b_range))][np.arange(p, n_mc * len(corr_range), len(corr_range))]

        # Aggregated: sum the weights over the selected Monte-Carlo runs
        chosen_cm = np.sum(chosen_cm, axis = 0)

        # Replace Zero with NaN (never-selected models are not drawn)
        chosen_cm[chosen_cm == 0] = np.nan
        chosen_cm_agg = chosen_cm.copy()

        # Set to One (selection indicator; the aggregated weights stay in chosen_cm_agg)
        chosen_cm[chosen_cm > 1 ] = 1

        # Adapt Column-Values: a selected model in column j gets the value j + 1
        chosen_cm = chosen_cm * np.arange(1, chosen_cm.shape[1]+1)

        # Create DataFrame
        chosen_cm = pd.DataFrame(chosen_cm, index = [f"K={k}" for k in bssf_range], columns = [*ssf_names, *cr_names])
        chosen_cm_agg = pd.DataFrame(chosen_cm_agg, index = [f"K={k}" for k in bssf_range], columns = [*ssf_names, *cr_names])

        # Index
        chosen_cm = chosen_cm.reset_index(names="INDEX")
        chosen_cm_agg = chosen_cm_agg.reset_index(names="INDEX")

        # Melt Data
        chosen_cm = chosen_cm.melt(id_vars = 'INDEX', var_name = 'Candidate_Model', value_name = 'Value')
        chosen_cm_agg = chosen_cm_agg.melt(id_vars = 'INDEX', var_name = 'Candidate_Model', value_name = 'Weight')

        # Concatenate (stacks the two long frames row-wise, so 'Value' and 'Weight'
        # end up in different rows)
        # NOTE(review): a column-wise merge may have been intended -- confirm.
        chosen_cm = pd.concat([chosen_cm, chosen_cm_agg])

        # Transform Candidate Model Names: keep only the part before the first "_"
        chosen_cm['Candidate_Model'] = chosen_cm['Candidate_Model'].str.split('_').str[0]
        #chosen_cm['Candidate_Model'] = chosen_cm['Candidate_Model'].str.startswith("SSF").replace({True: "SSF", False: "CR"})

        # Plot Theme
        sns.set_theme(style="ticks")

        # Draw each cell as a scatter point with varying size and color
        g = sns.relplot(
            data = chosen_cm,
            y = 'Value', x = 'INDEX', hue = "Candidate_Model", 
            size = "Weight", sizes = (3, 75),
            height = 6.5, alpha = 0.75, palette="muted",
            #legend = True
            )

        # Legend: replace the figure-level legend by an axes legend with the first 13 entries
        g._legend.remove()
        h, l = g.ax.get_legend_handles_labels()
        g.ax.legend(h[0:13], l[0:13], bbox_to_anchor=(1.0, 0.75), loc=2, fontsize=10, frameon=False)

        # Title & Axis
        g.set(xlabel='Combination Size',
              ylabel='Candidate Models',
              title=f"Selected Candidate Models: \n Correlation {corr_range[p]} & Betas {b_range[b]}")

        # Margins
        g.ax.margins(x = 0.05, y = 0)

        # Positions of the y-axis ticks (0-based indices into [*ssf_names, *cr_names]).
        # NOTE(review): the plotted values are 1-based (column j -> j + 1), so each
        # tick sits one unit below the model it is labelled with.
        idx = [0, 8, 36, 92, 162, 218, 246, 255, 315, 375, 435]
        g.ax.set_yticks(idx)

        # Set tick labels for y-axis (model family = name before the first "_")
        g.ax.set_yticklabels([string.split("_")[0] for string in [[[*ssf_names, *cr_names]][0][i] for i in idx]], rotation='horizontal')

        # Tick-Label Size
        g.set_yticklabels(size = 8)
        g.set_xticklabels(size = 10)

        # Add Horizontal Lines
        #for i in idx:
        #    g.ax.axhline(y=i, color='r', linestyle='--', lw = 0.2)

        # Rotate x-axis labels
        for label in g.ax.get_xticklabels():
            label.set_rotation(90)

In [None]:
# Scatter plot of the long-format `chosen_cm` frame left over from the preceding cell.
# Set up Plot
fig, ax = plt.subplots(1,1) 
fig.set_figheight(10)
fig.set_figwidth(10)

# Plot 
# NOTE(review): the colour map only has the keys 'SSF' and 'CR'; if
# 'Candidate_Model' holds labels such as "SSF1"/"CR2", `.map` yields NaN -- confirm.
# NOTE(review): y is 'Weight' here while the y ticks below are candidate-model positions -- confirm.
colors = {'SSF':'blue', 'CR':'green'}
ax.scatter(x = chosen_cm['INDEX'], y = chosen_cm['Weight'], c=chosen_cm['Candidate_Model'].map(colors), s = 1)
#ax.plot(chosen_cm['INDEX'], chosen_cm.iloc[:, 1:], marker = "o", lw = 0, ms = 4)

# Add title and axis names
plt.title('Selected Candidate Models')
plt.ylabel('Candidate Models')
plt.xlabel('Combination Size')

# Legend
#plt.legend(loc = "upper right")

# Margins
plt.margins(x=0.10, y=0)

# Positions of the y-axis ticks (indices into [*ssf_names, *cr_names])
idx = [0, 8, 36, 92, 162, 218, 246, 255, 315, 375, 435]
ax.set_yticks(idx)

# Set tick labels for y-axis (full candidate-model names)
ax.set_yticklabels([[[*ssf_names, *cr_names]][0][i] for i in idx], rotation='horizontal', fontsize=6)

# Add horizontal lines at the tick positions
for i in idx:
    plt.axhline(y=i, color='r', linestyle='--', lw = 0.2)

# Show Plot
plt.show()

In [None]:
### Plot: Which K selected which candidate models
### Benchmark: Complete Subset Regression

In [None]:
# Print the out-of-sample R2 (in percent, relative to the benchmark) of BSSF and
# CSR for every (beta set, correlation) scenario.
for b in range(len(b_range)):
    for p in range(len(corr_range)):
        
        # Strided double selection: every len(b_range)-th run starting at b, then
        # every len(corr_range)-th of those starting at p.
        # NOTE(review): assumes runs are stored with the beta set varying fastest -- confirm.
        se_bssf = se_bssf_forecast[np.arange(b, n_mc * len(corr_range) * len(b_range), len(b_range))][np.arange(p, n_mc * len(corr_range), len(corr_range))]
        se_csr  = se_csr_forecast[np.arange(b, n_mc * len(corr_range) * len(b_range), len(b_range))][np.arange(p, n_mc * len(corr_range), len(corr_range))]
        se_phm  = se_benchmark[np.arange(b, n_mc * len(corr_range) * len(b_range), len(b_range))][np.arange(p, n_mc * len(corr_range), len(corr_range))]
        
        # OOS-R2 = 1 - (summed squared errors of the model / summed squared errors of the benchmark)
        print(f"BSSF: Avg. OOS-R2 for Corr. {corr_range[p]} and Betas {b_range[b]} is:" + str(np.round(100 * (1 - sum(se_bssf) / sum(se_phm)), 2)) + "%")
        print(f"CSR:  Avg. OOS-R2 for Corr. {corr_range[p]} and Betas {b_range[b]} is:" + str(np.round(100 * (1 - sum(se_csr) / sum(se_phm)), 2)) + "%")

In [None]:
# Manually split the benchmark forecasts by scenario.
# NOTE(review): the offsets are hard-coded for two beta sets (b1, b2) and three
# correlations (p1..p3); with a different len(b_range) / len(corr_range) some
# scenarios are skipped or the selections overlap -- confirm.

# Every len(b_range)-th entry, starting at offset 0 / 1
b1 = benchmark[np.arange(0, n_mc * len(corr_range) * len(b_range), len(b_range))]
b2 = benchmark[np.arange(1, n_mc * len(corr_range) * len(b_range), len(b_range))]

# Within the first beta selection: every len(corr_range)-th entry per correlation offset
b1_p1 = b1[np.arange(0, n_mc * len(corr_range), len(corr_range))]
b1_p2 = b1[np.arange(1, n_mc * len(corr_range), len(corr_range))]
b1_p3 = b1[np.arange(2, n_mc * len(corr_range), len(corr_range))]

# Same split for the second beta selection
b2_p1 = b2[np.arange(0, n_mc * len(corr_range), len(corr_range))]
b2_p2 = b2[np.arange(1, n_mc * len(corr_range), len(corr_range))]
b2_p3 = b2[np.arange(2, n_mc * len(corr_range), len(corr_range))]

-------------

------------------

# Application

In [None]:
# Load Goyal Welch Data (quarterly sheet exported to CSV; ',' is the thousands separator)
data  =  pd.read_csv(path + r'/Data/PredictorData2022.xlsx - Quarterly.csv', thousands=',')

# Equity Premium
data['equity_premium'] = data['CRSP_SPvw'] - data['Rfree']

# Dividend Price Ratio 
data['dp'] = np.log(data['D12']) - np.log(data['Index'])

# Dividend Yield (dividends relative to the previous period's index level)
data['dy'] = np.log(data['D12'])- np.log(data['Index'].shift(1))

# Earnings Price Ratio 
data['ep'] = np.log(data['E12']) - np.log(data['Index'])

# Dividend Payout Ratio 
data['dpayr'] = np.log(data['D12']) - np.log(data['E12'])

# Book to Market Ratio
data['bmr'] = data['b/m']

# Net Equity Expansion ('ntis'), Treasury Bill Rate ('tbl'), Long Term Rate ('ltr'),
# Inflation ('infl') and Investment to Capital Ratio ('ik') are used exactly as
# provided in the raw file, so no new columns are needed for them.

# Term Spread 
data['tsp'] = data['lty'] - data['tbl']

# Default Return Spread 
data['dfr'] = data['corpr'] - data['ltr']

# Default Yield Spread
data['dfy'] = data['BAA'] - data['AAA']

# Realized Volatility
data['rvol'] = data['svar']

# Reorganize the dataframe.
# .copy() gives an independent frame, so the column assignments below do not
# trigger chained-assignment / SettingWithCopy warnings.
# NOTE(review): 'dfy' and 'rvol' are computed above but not selected here -- confirm this is intended.
data = data[['yyyyq', "equity_premium", "dp", "dy", "ep", "dpayr", "bmr", "ntis", "tbl", "ltr", "tsp", "dfr", "infl", "ik"]].copy()

# Convert Date: first four characters are the year, the rest the quarter (e.g. "19471" -> "1947-Q1")
data['yyyyq'] = data['yyyyq'].astype(str)
data['yyyyq'] = data['yyyyq'].str[:4] + '-Q' + data['yyyyq'].str[4:]
data['yyyyq'] = pd.to_datetime(data['yyyyq'])

# Use the date as a quarterly PeriodIndex
data = data.set_index('yyyyq')
data.index = data.index.to_period('Q')

# Lag all Predictors by one quarter (the target in column 0 stays contemporaneous)
data.iloc[:,1:]  =  data.iloc[:,1:].shift(1)

# Restrict the sample to 1946Q1 onwards
data = data.loc["1946Q1":, ]

In [None]:
# Expanding-window forecasting exercise on the Goyal-Welch data:
# for every period t >= init, build all candidate forecasts (subset regressions
# and compressed regressions), then (from t > init) combine them with BSSF and
# compare against an AR benchmark.

###### Set Seed ######
#random.seed(123)

###### Data ######
# Set Target Variable
y  =  data.loc[:, ["equity_premium"]]
X  =  data.drop("equity_premium", axis = 1)

# Get Predictor Names
pred_names = list(X.columns)

# Number of AR-Terms to include
mlags =  2

# Create Lags
X  =  create_lags(y, X, mlags)

# Drop Missing Values
y  =  y.loc["1947Q2":, ] #y[mlags:]
X  =  X.loc["1947Q2":, ] #X[mlags:]

# Check NA
# NOTE(review): the result of this expression is discarded (it is not the last
# statement of the cell), so the check currently has no effect.
any(X.isna().any())

###### Parameter Subset Forecasts
# Subset Lengths
k_range = [1, 2, 3] # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

# Upper Bound for Subset-Sizes
n_max = 10000  # 20000

# Number of Subset-Forecasts (per subset size capped at n_max)
# NOTE(review): the loop below uses all subsets (random_select is commented out),
# so this count is only right while no subset size exceeds n_max.
n_sub  =  int(sum([min(n_models((X.shape[1]-mlags), k), n_max) for k in k_range]))

###### Parameter Compressed Regressions ######
# Number of Components for Compressed Regression
cr_range  =  [1, 2, 3] # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]

# Number of runs for each random projection
rep_range  =  range(0, 100) # 10000

# Number of Compressed-Regression-Forecasts
n_cr  =  len(cr_range) * len(rep_range)

# ###### Parameter Decision Tree ######
# dt_range  =  range(0, 5) # 300000
# 
# # Number of Decision Trees
# n_dt  =  len(dt_range)

###### Parameter BSSF ######
alpha       =  0.5
n_times     =  50
bssf_range  =  [1, 2, 3] # range(1, 5)
n_bssf      =  len(bssf_range)

### General Parameter ######
# Initial Training-Period (50 years of quarterly data)
init       =  4 * 50 #4 * 10

# Total Length
total =  len(y) 

# Set up Matrices for Results (rows before `init` stay NaN)
cand_forecasts  =  np.full((total, (n_sub + n_cr)), np.nan)     #np.full((total, (n_sub + n_cr + n_dt)), np.nan)
cf_weights      =  np.full((total, (n_sub + n_cr)), np.nan)
benchmark       =  np.full(total, np.nan)
bssf_forecast   =  np.full(total, np.nan)
bssf_opt        =  np.full(total, np.nan)
sse_bssf        =  np.zeros(n_bssf)

###### Start ######
# Loop over Time
for t in tqdm(range(init, total)):
        
    # Pre-Process Data
    y_train, X_train, y_pred, X_pred = prepro(y, X, t)
    
    ### Benchmark: AR(X) Model
    pred          =  ar_mod(y_train, lags = mlags)
    benchmark[t]  =  pred.iloc[0]
    
    ### Subset Forecasts
    # Set up List to store Subset-Forecasts
    preds_ssf =  np.full(n_sub, np.nan)
    idx_sub   =  0
    
    # Loop over Subset Size 
    for k in k_range:
    
        # Get all possible Subset of length k (column indices from mlags+1 onwards)
        col_idx   =  list(range(mlags+1, X_train.shape[1]))
        subs_idx  =  complete_sub(col_idx, k)

        # Use all subsets (random down-sampling to n_max is disabled)
        feature_set  =  subs_idx #random_select(subs_idx, n_max, random_state = 123)

        # Loop over Subsets
        for feature in feature_set:

            # Compute Subset-Regression-Forecast
            pred  =  ssf(y_train, X_train, X_pred, feature, mlags)
            preds_ssf[idx_sub] = pred
            idx_sub += 1
            
    ### Compressed Regressions
    # Set up List to store Compressed-Regression-Forecasts
    preds_cr = np.full(n_cr, np.nan)
    idx_cr   = 0
    
    # Loop over number of Components
    for n_comp in cr_range:

        # Loop over n repetitions (r is passed as the last argument of cr_reg)
        for r in rep_range:
        
            # Compute Compressed-Regression-Forecasts
            pred  =  cr_reg(y_train, X_train, X_pred, n_comp, mlags, r)
            preds_cr[idx_cr] = pred
            idx_cr += 1
            
    # ### Decision Tree Regressions
    # # Set up Matrix to store Decision-Tree-Forecasts
    # preds_dt   = np.full(n_dt, np.nan)
    # 
    # # Loop over number of Components
    # for idx_dt, r in enumerate(dt_range):
    #     
    #     # Compute Decision-Tree-Forecasts
    #     pred  =  dt_reg(y_train, X_train, X_pred, r)
    #     preds_dt[idx_dt] = pred[0]

    # Append Results: row t = [subset forecasts | compressed-regression forecasts]
    cand_forecasts[t][:n_sub]             =  preds_ssf 
    cand_forecasts[t][n_sub:(n_sub+n_cr)] =  preds_cr
    #cand_forecasts[t][(n_sub+n_cr):]      =  preds_dt

    ### Best Selection of Forecast
    # Needs at least one past row of candidate forecasts, hence t > init
    if t > init:
    
        # Set up Matrix to store Forecasts
        bssf_forecasts  =  np.full(n_bssf, np.nan)
        bssf_weights    =  np.zeros([n_bssf, n_sub + n_cr]) 
           
        # Get "best" Subset-Size until now (lowest Sum of Squared Errors).
        # Taken before sse_bssf is updated with the current period's error.
        s_opt  =  np.argmin(sse_bssf)
    
        # Loop over Subset Sizes
        for idx_bssf, s in enumerate(bssf_range):
    
            # Compute Best-Subset-Selection-of-Forecasts
            # NOTE(review): assumes y_train[init:] lines up row by row with cand_forecasts[init:t] -- confirm in prepro.
            pred  =  bssf(y_train[init:], cand_forecasts[init:t], cand_forecasts[t], alpha, s, n_times)
            bssf_forecasts[idx_bssf]  =  pred[0]
            bssf_weights[idx_bssf]    =  pred[1]
    
            # Compute Sum of Squared Errors
            sse_bssf[idx_bssf] =  sse_bssf[idx_bssf] + (y_pred.iloc[0,0] - pred[0]) ** 2
    
        # Select Forecast of the subset size that was best so far
        bssf_forecast[t] =  bssf_forecasts[s_opt]
        cf_weights[t]    =  bssf_weights[s_opt]
        bssf_opt[t]      =  bssf_range[s_opt]
        
# Candidate-Model-Names (same ordering as the columns filled above: by k, then by subset; by n_comp, then by repetition)
ssf_names = [f"SSF{k}_" + "_".join(map(str, sub)) for k in k_range for sub in combinations(range(len(pred_names)), k)]
cr_names  = [f"CR{n_comp}_{r}" for n_comp in cr_range for r in rep_range]
#dt_names = [f"DT_{idx_dt}" for idx_dt in dt_range]
        
# Convert Results to DataFrame
cand_forecasts  =  pd.DataFrame(cand_forecasts, index = y.index, columns = [*ssf_names, *cr_names]) #, *dt_names])
benchmark       =  pd.DataFrame(benchmark,      index = y.index, columns = ["AR"])
bssf_forecast   =  pd.DataFrame(bssf_forecast,  index = y.index, columns = ["BSSF"])
cf_weights      =  pd.DataFrame(cf_weights,     index = y.index, columns = [*ssf_names, *cr_names]) #, *dt_names])
bssf_opt        =  pd.DataFrame(bssf_opt,     index = y.index, columns = ["Subset_Size"])

# Cut off initial Training-Period
sub_y              =  y.iloc[init:].copy()
sub_cand_forecasts =  cand_forecasts.iloc[init:].copy()
sub_benchmark      =  benchmark.iloc[init:].copy()
sub_bssf_forecast  =  bssf_forecast.iloc[init:].copy()
sub_cf_weights     =  cf_weights.iloc[init:].copy()
sub_bssf_opt       =  bssf_opt.iloc[init:].copy()

# OOS-Period
oos_start  =  "1999Q4"
oos_end    =  "2022Q4" 
oos_y             =  sub_y.loc[oos_start:oos_end].copy()
oos_cand_forecast =  sub_cand_forecasts.loc[oos_start:oos_end].copy()
oos_benchmark     =  sub_benchmark.loc[oos_start:oos_end].copy()
oos_bssf_forecast =  sub_bssf_forecast.loc[oos_start:oos_end].copy()
oos_cf_weights    =  sub_cf_weights.loc[oos_start:oos_end].copy()
oos_bssf_opt      =  sub_bssf_opt.loc[oos_start:oos_end].copy()

# Evaluation: sum of squared errors of BSSF relative to the AR benchmark (below 1 means BSSF is better)
np.sum((oos_y.iloc[:,0] - oos_bssf_forecast.iloc[:,0]) ** 2) / np.sum((oos_y.iloc[:,0] - oos_benchmark.iloc[:,0]) ** 2)

In [None]:
# Plots for the out-of-sample period: selected candidate models, selected
# subset size and cumulated squared-error differences.

# Replace Zero with NaN (unselected models are not drawn).
# Note: this modifies `oos_cf_weights` in place.
oos_cf_weights.replace({0:np.nan}, inplace=True)

# Adapt Column-Values: multiply the weight in column j by j + 1
vec = list(range(1, oos_cf_weights.shape[1]+1))
tmp = oos_cf_weights * vec

# Dates
tmp = tmp.reset_index(names="date")

# Plot 
# tmp.plot(x='date', y = tmp.columns[1:],
#          figsize=(10, 5), legend=False,
#          marker="o", ms = 1, 
#          title="Selected Candidate Models", ylabel="Selected Candidate Models")
#          #yticks = (np.arange(98), list(tmp.columns[1:])))

# Long format; collapse the model names to their family ("SSF" or "CR") and
# draw one marker series per family
tmp_long = pd.melt(tmp, id_vars = "date")
tmp_long['variable'] = tmp_long['variable'].str.startswith("SSF").replace({True: "SSF", False: "CR"})
tmp_long.set_index("date", inplace = True)
tmp_long.groupby("variable")["value"].plot(legend=True, figsize = (10, 5),
                                           marker="o", ms = 2, lw = 0,
                                           ylim = [-1, cand_forecasts.shape[1]+1],
                                           title="Selected Candidate Models", ylabel="Selected Candidate Models")
plt.show()

# Subset Size chosen in each period
oos_bssf_opt.plot(figsize=(10, 5), legend=False, 
                  color = "black", marker="o", ms = 1, lw = 0,
                  title = "Subset Size", xlabel="date", ylabel="Selected Subset Size",
                  ylim  =  [min(bssf_range)-0.5, max(bssf_range)+0.5],
                  yticks = np.arange(min(bssf_range), max(bssf_range)+1, step=1.0))
plt.show()

# CSSED: cumulated (benchmark squared error - BSSF squared error); an upward move means BSSF was more accurate
cssed = np.cumsum(((oos_y.iloc[:,0] - oos_benchmark.iloc[:,0]) ** 2) - ((oos_y.iloc[:,0] - oos_bssf_forecast.iloc[:,0]) ** 2))
cssed.plot(figsize=(10, 5),
            xlabel = "date", ylabel = "CSSED", title = "Cumulated Sum of Squared Error Differences")
plt.show()

---

# Old Data

In [None]:
# Monte-Carlo experiment (old version): for every run, correlation and
# coefficient set, simulate data, build all candidate forecasts for
# t = init..n_obs-1 and evaluate BSSF and Complete Subset Regression (CSR)
# against the prevailing historical mean (PHM) on the last observation.

###### MC-Parameter ######
# Number of MC-Runs
n_mc  =  2

# Set Parameter
n_obs       =  100
n_preds     =  8
init        =  50
mlags       =  0
corr_range  =  [0.5, 0.95]
b_range     =  np.array([[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]])

###### Parameter Subset Forecasts ######
# Subset Lengths
k_range = [1, 2, 3, 4, 5, 6, 7, 8] 

# Upper Bound for Subset-Sizes
n_max = 10000  

# Number of Subset-Forecasts
n_sub  =  int(sum([min(n_models(n_preds, k), n_max) for k in k_range]))

###### Parameter Compressed Regressions ######
# Number of Components for Compressed Regression
cr_range  =  [1, 2] 

# Number of runs for each random projection
rep_range  =  range(0, 50) 

# Number of Compressed-Regression-Forecasts
n_cr  =  len(cr_range) * len(rep_range)

# ###### Parameter Decision Tree ######
# dt_range  =  range(0, 5) 
# 
# # Number of Decision Trees
# n_dt  =  len(dt_range)

###### Parameter BSSF ######
alpha       =  10.0
n_times     =  1
bssf_range  =  [1, 2, 3] 
n_bssf      =  len(bssf_range)

######## Objects ########
# Set up Matrices for Results.
# cand_forecasts is reused (overwritten) for every simulated data set; all
# other objects have one row per (run, correlation, coefficient set).
cand_forecasts   = np.full((len(range(init, n_obs)), (n_sub + n_cr)), np.nan)     
benchmark        = np.full( len(corr_range) * len(b_range) * n_mc, np.nan)
cf_weights       = np.full((len(corr_range) * len(b_range) * n_mc, n_bssf, (n_sub + n_cr)), np.nan)
bssf_forecast    = np.full((len(corr_range) * len(b_range) * n_mc, n_bssf), np.nan)
csr_forecast     = np.full((len(corr_range) * len(b_range) * n_mc, len(k_range)), np.nan)
se_benchmark     = np.full( len(corr_range) * len(b_range) * n_mc, np.nan)
se_bssf_forecast = np.full((len(corr_range) * len(b_range) * n_mc, n_bssf), np.nan)
se_csr_forecast  = np.full((len(corr_range) * len(b_range) * n_mc, len(k_range)), np.nan)

###### Start ######
# Loop over Monte-Carlo Runs.
# i is the flat result index; with this nesting the coefficient set varies
# fastest, then the correlation, then the run.
i = 0
for r in tqdm(range(n_mc)):
    
    # Loop over Covariance-Sets
    for p in corr_range:
        
        # Loop over Coefficient-Sets
        for b in b_range:
    
            ### Simulate Data ###
            # NOTE(review): the simulation helper visible at the top of the notebook is
            # `sim_data(...)` with a different signature; `sim` is not defined in the
            # visible part of the file -- confirm it still exists.
            y, X, pred_names = sim(n_obs, n_preds, b, p, r)
            
            ### Benchmark: PHM ###
            # Mean of all but the last observation, evaluated on the last observation
            benchmark[i]    = y.iloc[:-1].mean().iloc[0]
            se_benchmark[i] = (y.iloc[-1,0] - benchmark[i]) ** 2

            # Loop over t / Create Candidate Models
            for t in range(init, n_obs):
            
                ### Pre-Process Data ###
                y_train, X_train, y_pred, X_pred = prepro(y, X, t)

                ### Subset Forecasts ###
                # All subsets of the column indices 1..X_train.shape[1]-1 for every
                # size in k_range, flattened in k order; one forecast per subset.
                feature_set  =  list(chain(*list(map(lambda k: complete_sub(list(range(1, X_train.shape[1])), k), k_range))))
                preds_ssf    =  np.array(list(map(lambda feature: ssf(y_train, X_train, X_pred, feature, 0), feature_set)))

                ### Compressed Regressions ###
                # One forecast per (number of components, repetition), ordered by
                # n_comp first and repetition second.
                preds_cr = np.array(list(chain(*[list(map(lambda rep: cr_reg(y_train, X_train, X_pred, n_comp, 0, rep), rep_range)) for n_comp in cr_range])))

                # ### Decision Tree Regressions
                # preds_dt = np.array(list(map(lambda r: dt_reg(y_train, X_train, X_pred, r), dt_range)))
                
                # # Set up Matrix to store Decision-Tree-Forecasts
                # preds_dt   = np.full(n_dt, np.nan)
                # 
                # # Loop over number of Components
                # for idx_dt, r in enumerate(dt_range):
                #     
                #     # Compute Decision-Tree-Forecasts
                #     pred  =  dt_reg(y_train, X_train, X_pred, r)
                #     preds_dt[idx_dt] = pred[0]

                # Append Results: row t-init = [subset forecasts | compressed-regression forecasts]
                cand_forecasts[t-init][:n_sub]             =  preds_ssf 
                cand_forecasts[t-init][n_sub:(n_sub+n_cr)] =  preds_cr
                #cand_forecasts[t-init][(n_sub+n_cr):]     =  preds_dt
                
            ### Benchmark: Complete Subset Regression ###
            # tmp_ holds the cumulative number of subsets per size, i.e. the block
            # boundaries inside preds_ssf (here: the forecasts of the last t).
            # The `i` inside the comprehension is local to the comprehension; the
            # `i` indexing csr_forecast is still the flat result index.
            tmp_ = np.cumsum([0] + [len(list(combinations(range(n_preds), k))) for k in k_range])
            csr_forecast[i]     =  [np.mean(preds_ssf[tmp_[i]:tmp_[i+1]]) for i in range(len(tmp_)-1)]
            se_csr_forecast[i]  =  (y_pred.iloc[0,0] - csr_forecast[i]) ** 2

            ### Best Selection of Forecast ###
            # One (forecast, weights) pair per subset size in bssf_range: trained on
            # all candidate-forecast rows but the last, predicting with the last row.
            bssf_forecast[i], cf_weights[i] = zip(*list(map(lambda s: bssf(y_train[init:], cand_forecasts[:-1], cand_forecasts[-1], alpha, s, n_times), bssf_range)))
            se_bssf_forecast[i] = (y_pred.values[0] - bssf_forecast[i]) ** 2
             
            # Update index   
            i += 1
            
# # Candidate-Model-Names
ssf_names = [f"SSF{k}_" + "_".join(map(str, sub)) for k in k_range for sub in combinations(range(n_preds), k)]
cr_names  = [f"CR{n_comp}_{r}" for n_comp in cr_range for r in rep_range]

### Evaluation ###
# OOS-R2 (in percent, relative to the PHM) per scenario; the strided double
# selection picks the runs belonging to coefficient set b and correlation p.
for b in range(len(b_range)):
    for p in range(len(corr_range)):
        
        se_bssf = se_bssf_forecast[np.arange(b, n_mc * len(corr_range) * len(b_range), len(b_range))][np.arange(p, n_mc * len(corr_range), len(corr_range))]
        se_csr  = se_csr_forecast[np.arange(b, n_mc * len(corr_range) * len(b_range), len(b_range))][np.arange(p, n_mc * len(corr_range), len(corr_range))]
        se_phm  = se_benchmark[np.arange(b, n_mc * len(corr_range) * len(b_range), len(b_range))][np.arange(p, n_mc * len(corr_range), len(corr_range))]
        
        print(f"BSSF: Avg. OOS-R2 for Corr. {corr_range[p]} and Betas {b_range[b]} is: " + str(np.round(100 * (1 - sum(se_bssf) / sum(se_phm)), 2)) + "%")
        print(f"CSR:  Avg. OOS-R2 for Corr. {corr_range[p]} and Betas {b_range[b]} is: " + str(np.round(100 * (1 - sum(se_csr) / sum(se_phm)), 2)) + "%")

In [None]:
# Make Fred-MD-Data stationary
def transform_tcode(data):
    """Apply the transformation code stored in the first element of a column.

    Parameters
    ----------
    data : array-like of numbers
        One column: element 0 is the transformation code (tcode), the
        remaining elements are the observations in time order.

    Returns
    -------
    numpy.ndarray
        Float array with the same length as `data`: the tcode followed by the
        transformed observations. Observations whose required lags do not
        exist (the first one for first differences, the first two for second
        differences) are NaN.

    Notes
    -----
    tcode 1: level, 2: first difference, 3: second difference, 4: log,
    5: first difference of logs, 6: second difference of logs,
    any other value: first difference of the simple growth rate.
    """

    values  =  np.asarray(data, dtype = float)

    # Get Transformation-Code
    tcode   =  values[0]

    # Get Data
    series  =  values[1:]

    def lag(x, k):
        # Shift x by k periods and pad with NaN. (np.roll would wrap the last
        # observations around to the start and contaminate the first values.)
        out = np.full(x.shape, np.nan)
        out[k:] = x[:max(x.size - k, 0)]
        return out

    if tcode == 1:
        output = series
    elif tcode == 2:
        output = series - lag(series, 1)
    elif tcode == 3:
        output = (series - lag(series, 1)) - (lag(series, 1) - lag(series, 2))
    elif tcode == 4:
        output = np.log(series)
    elif tcode == 5:
        logs   = np.log(series)
        output = logs - lag(logs, 1)
    elif tcode == 6:
        logs   = np.log(series)
        output = (logs - lag(logs, 1)) - (lag(logs, 1) - lag(logs, 2))
    else:
        # Every remaining code is treated as the growth-rate difference
        output = (series / lag(series, 1) - 1) - (lag(series, 1) / lag(series, 2) - 1)

    return np.concatenate(([tcode], output))


In [None]:
# Build the predictor matrix (x_dataset) and the target variables (y_dataset)
# from the FRED-MD file.

# Load Data
x_dataset  =  pd.read_csv(path + r'/fred_md_202306.csv')

# Drop Variables with too many missing values
x_dataset  =  x_dataset.drop(["PERMIT", "PERMITNE", "PERMITMW", "PERMITS",
                              "PERMITW", "ACOGNO", "ANDENOx", "CP3Mx",
                              "COMPAPFFx", "TWEXAFEGSMTHx", "UMCSENTx", "VIXCLSx"],
                              axis=1)

# Transform remaining Columns (every column but the first; each column's first
# entry is read by transform_tcode as its transformation code)
x_dataset.iloc[:, 1:]  =  x_dataset.iloc[:,1:].apply(lambda x: transform_tcode(x))

# Drop First Row with Transformation-Code
x_dataset  =  x_dataset.iloc[1:,:]

# Lag Data by one period
x_dataset.iloc[:,1:]  =  x_dataset.iloc[:,1:].shift(1)

# Convert Date
x_dataset['sasdate']  =  pd.to_datetime(x_dataset['sasdate'])

# Filter Data
x_dataset  =  x_dataset[(x_dataset['sasdate'] >= '1959-04-01') & (x_dataset['sasdate'] <= '2023-03-01')]

# Use the date as a monthly PeriodIndex
x_dataset.set_index('sasdate', inplace=True)
x_dataset.index = x_dataset.index.to_period('M')

# Load Data (same file, read a second time for the targets)
y_dataset  =  pd.read_csv(path + r'/fred_md_202306.csv')

# Select and Rename Variables
y_dataset  =  y_dataset.loc[:, ["sasdate", "CPIAUCSL", "INDPRO", "UNRATE"]]
y_dataset  =  y_dataset.rename(columns={'CPIAUCSL': 'CPIAUCSL_h1', 'INDPRO': 'INDPRO_h1', 'UNRATE': 'UNRATE_h1'})

# Transform Variables: scaled log-differences for CPI and industrial production, level for the unemployment rate.
# NOTE(review): the factor 4 * 100 looks like a quarterly annualisation although the index below is monthly -- confirm.
# NOTE(review): np.roll wraps around and the transformation-code row is still part of
# the column here, so the first rows are not valid growth rates; they are presumably
# removed by the drop / date filter below -- confirm the file starts before 1959-04.
y_dataset[["CPIAUCSL_h1"]]  =  y_dataset[["CPIAUCSL_h1"]].apply(lambda x: 4 * 100 * (np.log(x) - np.roll(np.log(x), 1)))
y_dataset[["INDPRO_h1"]]    =  y_dataset[["INDPRO_h1"]].apply(lambda x: 4 * 100 * (np.log(x) - np.roll(np.log(x), 1)))
y_dataset[["UNRATE_h1"]]    =  y_dataset[["UNRATE_h1"]]

# Drop First Row with Transformation-Code
y_dataset  =  y_dataset.iloc[1:,:]

# Convert Date
y_dataset['sasdate']  =  pd.to_datetime(y_dataset['sasdate'])

# Filter Data
y_dataset  =  y_dataset[(y_dataset['sasdate'] >= '1959-04-01') & (y_dataset['sasdate'] <= '2023-03-01')]

# Use the date as a monthly PeriodIndex
y_dataset.set_index('sasdate', inplace=True)
y_dataset.index = y_dataset.index.to_period('M')

# Set Target Variable; the target's own series is removed from the predictors
y  =  y_dataset.loc[:, ["CPIAUCSL_h1"]]
X  =  x_dataset.drop("CPIAUCSL", axis = 1)

## Best Selection of Forecasts

In [None]:
# Best Selection of Forecasts
def bssf(Y_train, X_train, X_pred, alpha, n_sub, n_times):
    """Select candidate forecasts by solving a QUBO with simulated annealing.

    The QUBO matrix is built as
        Q = -2 * diag(Y'X/n_sub + alpha * n_sub) + (X/n_sub)'(X/n_sub) + alpha * 11'
    and the lowest-energy sample out of `n_times` annealing reads is returned.

    Parameters
    ----------
    Y_train : target values, one row per period (2-D, a single column).
    X_train : candidate forecasts, one row per period and one column per model.
    X_pred  : candidate forecasts for the period to be predicted.
    alpha   : weight of the alpha-terms in Q (see formula above).
    n_sub   : subset size used to scale X_train and the diagonal of Q.
    n_times : number of reads requested from the sampler.

    Returns
    -------
    (pred, solution) : `solution` is the list of binary selection indicators of
    the best sample, `pred` is `solution @ X_pred`.
    """

    # Adapt X-Matrix
    X_train  =  X_train / n_sub

    # Generate Q-Matrix.
    # A plain array of ones replaces np.mat, which was removed in NumPy 2.0;
    # alpha * ones((m, m)) equals alpha * ivec @ ivec' of the previous version.
    n_cand    =  X_train.shape[1]
    aux_mat   =  np.array(Y_train.transpose() @ X_train + alpha * n_sub)
    diag_mat  =  np.diag(aux_mat[0])
    Q         =  - 2 * diag_mat + X_train.transpose() @ X_train + alpha * np.ones((n_cand, n_cand))

    # Initialize BQM (from_qubo is a classmethod, so no throw-away instance is needed)
    bqm  =  BinaryQuadraticModel.from_qubo(Q)

    # Solve
    solver     =  SimulatedAnnealingSampler()
    #solver     =  SteepestDescentSolver()
    #solver     =  TabuSampler()
    #solver     =  TreeDecompositionSolver()

    # Keep the sample with the lowest energy
    sampleset  =  solver.sample(bqm, num_reads = n_times)
    solution   =  list(sampleset.first[0].values())

    # Prediction
    # NOTE(review): X_train is divided by n_sub above but X_pred is not, so this is
    # the sum (not the average) of the selected forecasts -- confirm this is intended.
    pred       = solution @ X_pred

    # Return
    return(pred, solution)

In [None]:
# Time-Series-Cross-Validation to Tune Parameter k
def cv_ts(Y, X, splits, k_seq, alpha, n_times):
    """Choose the subset size k by time-series cross-validation.

    Parameters
    ----------
    Y       : pandas object with the target, one row per period.
    X       : pandas DataFrame with the candidate forecasts (same rows as Y).
    splits  : number of expanding train/test splits.
    k_seq   : sequence of subset sizes to evaluate.
    alpha   : passed through to bssf.
    n_times : passed through to bssf.

    Returns
    -------
    The element of `k_seq` with the smallest test MSE averaged over all splits.
    """

    # The module-level import of TimeSeriesSplit is commented out in this
    # notebook, so import it here to keep the function self-contained.
    from sklearn.model_selection import TimeSeriesSplit

    # Set up TS-Split
    tscv    =  TimeSeriesSplit(n_splits = splits)

    # Set up Result-Matrix (rows: splits, columns: positions in k_seq)
    cv_res  =  np.zeros((splits, len(k_seq)))

    # Loop over Train-Test-Splits
    for i, (train_index, test_index) in enumerate(tscv.split(X)):

        # Split Data
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        Y_train, Y_test = Y.iloc[train_index],   Y.iloc[test_index]

        # Cross-Validation k (indexed by position, so k_seq need not be 1..n)
        for j, k in enumerate(k_seq):

            # Selected Predictors. Only the selection is used; bssf also needs a
            # prediction row, so the first test row is passed instead of relying
            # on a global X_pred.
            solution    =  bssf(Y_train, X_train, X_test.iloc[0], alpha, k, n_times)[1]

            # Prediction
            prediction  =  solution @ X_test.transpose()

            # MSE
            cv_res[i, j]  =  np.mean((Y_test.squeeze() - prediction) ** 2)

    # Select k with smallest average MSE (once all splits are evaluated)
    return k_seq[cv_res.mean(axis=0).argmin()]

In [None]:
# Candidate Forecasts
# NOTE(review): `results` is not defined in the visible part of the notebook -- it must come from an earlier cell.
cand_forecasts  =  results

# Target Variable
target_var  =  y_dataset.loc[:, ["CPIAUCSL_h1"]]

# Get Dates
dates  =  cand_forecasts.index.values

# Match Time Frames: keep only the target rows for which candidate forecasts exist
target_var  =  target_var.loc[dates]

In [None]:
# Expanding-window BSSF forecasts: for every period t >= init, tune the subset
# size by time-series cross-validation on the data before t and forecast period t.

# Set Parameter
alpha    =  1
n_mods   =  cand_forecasts.shape[1]

# Set Vector & Matrices
X     =  cand_forecasts.copy()
Y     =  target_var.copy()

# Set Time Parameter (initial training window of 12 * 5 periods)
init    =  int(12 * 5)
final   =  len(Y)

# Set up Empty Array (entries before `init` stay NaN)
predictions  =  np.zeros(final)
predictions.fill(np.nan)

# Loop
for t in tqdm(range(init, final)):
    
    # Train Data: all rows before t
    X_train  =  X.iloc[:t, ]
    Y_train  =  Y.iloc[:t, ]
    
    # Prediction Data: row t
    X_pred   =  X.iloc[t, ]
    
    # Cross-Validation
    splits   =  5
    n_times  =  500
    k_seq    =  [1, 2, 3, 4, 5]
    k_opt    =  cv_ts(Y_train, X_train, splits, k_seq, alpha, n_times)
    
    # Prediction 
    n_times  =  500
    predictions[t]  =  bssf(Y_train, X_train, X_pred, alpha, k_opt, n_times)[0]

# Convert to a one-column DataFrame named "qubo"
predictions  =  pd.Series(predictions, name = "qubo", index = Y.index).to_frame()

#### Load Data

In [None]:
# Load Target Variable
# NOTE(review): `pyreadr` is not among the imports at the top of the notebook;
# it has to be imported before this cell can run.
target_var      =  pyreadr.read_r(path + '/Data/Results/Target_Var/target_var.RDS')[None]

# Load all Candidate Forecasts
# The context manager closes the directory handle; the paths are sorted so the
# column order is reproducible (os.scandir yields entries in arbitrary order).
with os.scandir(path + '/Data/Results/Candidate_Forecasts') as files:
    rds_paths  =  sorted(file.path for file in files if file.path.endswith(".RDS"))

# Read every file once and concatenate column-wise in a single step
frames          =  [pyreadr.read_r(rds_path)[None] for rds_path in rds_paths]
cand_forecasts  =  pd.concat(frames, axis = 1) if frames else pd.DataFrame()

# Drop Na
cand_forecasts  =  cand_forecasts.dropna()

# Get Dates
dates           =  cand_forecasts.index.values

# Match Time Frames
target_var      =  target_var.loc[dates]

# Dimensions
print(cand_forecasts.shape)
print(target_var.shape)

### Set up Q-Matrix

In [None]:
# Best Selection of Forecasts
def bssf(Y_train, X_train, X_pred, alpha, n_sub, n_times):
    """Select candidate forecasts by solving a QUBO with simulated annealing.

    The QUBO matrix is built as
        Q = -2 * diag(Y'X/n_sub + alpha * n_sub) + (X/n_sub)'(X/n_sub) + alpha * 11'
    and the lowest-energy sample out of `n_times` annealing reads is returned.

    Parameters
    ----------
    Y_train : target values, one row per period (2-D, a single column).
    X_train : candidate forecasts, one row per period and one column per model.
    X_pred  : candidate forecasts for the period to be predicted.
    alpha   : weight of the alpha-terms in Q (see formula above).
    n_sub   : subset size used to scale X_train and the diagonal of Q.
    n_times : number of reads requested from the sampler.

    Returns
    -------
    (pred, solution) : `solution` is the list of binary selection indicators of
    the best sample, `pred` is `solution @ X_pred`.
    """

    # Adapt X-Matrix
    X_train  =  X_train / n_sub

    # Generate Q-Matrix.
    # A plain array of ones replaces np.mat, which was removed in NumPy 2.0;
    # alpha * ones((m, m)) equals alpha * ivec @ ivec' of the previous version.
    n_cand    =  X_train.shape[1]
    aux_mat   =  np.array(Y_train.transpose() @ X_train + alpha * n_sub)
    diag_mat  =  np.diag(aux_mat[0])
    Q         =  - 2 * diag_mat + X_train.transpose() @ X_train + alpha * np.ones((n_cand, n_cand))

    # Initialize BQM (from_qubo is a classmethod, so no throw-away instance is needed)
    bqm  =  BinaryQuadraticModel.from_qubo(Q)

    # Solve
    solver     =  SimulatedAnnealingSampler()
    #solver     =  SteepestDescentSolver()
    #solver     =  TabuSampler()
    #solver     =  TreeDecompositionSolver()

    # Keep the sample with the lowest energy
    sampleset  =  solver.sample(bqm, num_reads = n_times)
    solution   =  list(sampleset.first[0].values())

    # Prediction
    # NOTE(review): X_train is divided by n_sub above but X_pred is not, so this is
    # the sum (not the average) of the selected forecasts -- confirm this is intended.
    pred       = solution @ X_pred

    # Return
    return(pred, solution)

In [None]:
# Time-Series-Cross-Validation to Tune Parameter k
def cv_ts(Y, X, splits, k_seq, alpha, n_times):
    """Choose the subset size k by time-series cross-validation.

    Parameters
    ----------
    Y       : pandas object with the target, one row per period.
    X       : pandas DataFrame with the candidate forecasts (same rows as Y).
    splits  : number of expanding train/test splits.
    k_seq   : sequence of subset sizes to evaluate.
    alpha   : passed through to bssf.
    n_times : passed through to bssf.

    Returns
    -------
    The element of `k_seq` with the smallest test MSE averaged over all splits.
    """

    # The module-level import of TimeSeriesSplit is commented out in this
    # notebook, so import it here to keep the function self-contained.
    from sklearn.model_selection import TimeSeriesSplit

    # Set up TS-Split
    tscv    =  TimeSeriesSplit(n_splits = splits)

    # Set up Result-Matrix (rows: splits, columns: positions in k_seq)
    cv_res  =  np.zeros((splits, len(k_seq)))

    # Loop over Train-Test-Splits
    for i, (train_index, test_index) in enumerate(tscv.split(X)):

        # Split Data
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        Y_train, Y_test = Y.iloc[train_index],   Y.iloc[test_index]

        # Cross-Validation k (indexed by position, so k_seq need not be 1..n)
        for j, k in enumerate(k_seq):

            # Selected Predictors. Only the selection is used; bssf also needs a
            # prediction row, so the first test row is passed instead of relying
            # on a global X_pred.
            solution    =  bssf(Y_train, X_train, X_test.iloc[0], alpha, k, n_times)[1]

            # Prediction
            prediction  =  solution @ X_test.transpose()

            # MSE
            cv_res[i, j]  =  np.mean((Y_test.squeeze() - prediction) ** 2)

    # Select k with smallest average MSE (once all splits are evaluated)
    return k_seq[cv_res.mean(axis=0).argmin()]

In [None]:
# Penalty Weight & Number of Candidate Models
alpha    =  1
n_mods   =  cand_forecasts.shape[1]

# Working Copies of Predictors & Target
X, Y  =  cand_forecasts.copy(), target_var.copy()

# First Forecast Origin (five years of monthly data) & Sample End
init, final  =  int(12 * 5), len(Y)

# Forecast Container, NaN until a Prediction is Available
predictions  =  np.full(final, np.nan)

# Expanding-Window Loop over Forecast Origins
for t in range(init, final):

    # Everything before t is used for Training, Row t is Predicted
    X_train, Y_train  =  X.iloc[:t, ], Y.iloc[:t, ]
    X_pred            =  X.iloc[t, ]

    # Tune the Subset Size k by Time-Series Cross-Validation
    splits, n_times, k_seq  =  5, 500, [1, 2, 3, 4, 5]
    k  =  cv_ts(Y_train, X_train, splits, k_seq, alpha, n_times)

    # Forecast for Period t with the Tuned k
    n_times         =  500
    predictions[t]  =  bssf(Y_train, X_train, X_pred, alpha, k, n_times)[0]

# Store the Forecasts as a One-Column DataFrame named "qubo"
predictions  =  pd.DataFrame({"qubo": predictions}, index = Y.index)

### Evaluation

In [None]:
# Define Eval-Function (OOS-R2)
def oos_r2(observation, prediction, benchmark):
    """
    Out-of-sample R2 in percent.

    Computes 100 * (1 - SSE_model / SSE_benchmark), where each SSE is the sum of
    squared differences between the observations and the respective forecasts.
    Positive values mean the model has a smaller squared error than the benchmark.
    """

    # Sum of Squared Errors of the Model and of the Benchmark
    sse_model      =  sum((observation - prediction) ** 2)
    sse_benchmark  =  sum((observation - benchmark) ** 2)

    # Relative Improvement over the Benchmark, in Percent
    return(100 * (1 - sse_model / sse_benchmark))

In [None]:
# Boundaries of the Out-of-Sample Evaluation Window
eval_start, eval_end  =  "1974-12-01", "2020-12-01"

# Restrict Candidates, Benchmark, Target & BSSF-Forecast to the Window
oos_cand_forecasts  =  cand_forecasts[eval_start:eval_end]
oos_benchmark       =  oos_cand_forecasts["pred_hist_mean"]
oos_target_var      =  target_var.loc[eval_start:eval_end].squeeze()
oos_bssf            =  predictions.loc[eval_start:eval_end].squeeze()

# OOS-R2 of the Best Subset Selection of Forecasts against the Benchmark
oos_r2(oos_target_var, oos_bssf, oos_benchmark)

In [None]:
# OOS-R2 of a Single Candidate Forecast against the Benchmark
def _r2_vs_benchmark(forecast):
    return oos_r2(oos_target_var, forecast, oos_benchmark)

# Apply Column by Column to all Candidate Models
eval_cand_mods  =  oos_cand_forecasts.apply(_r2_vs_benchmark, axis = 0)

In [None]:
# Show the 50 Candidate Models with the Highest OOS-R2
(
    eval_cand_mods
    .sort_values(ascending = False)
    .to_frame("OOS-R2")
    .head(n = 50)
)

In [None]:
# Set OOS-Period
eval_start  =  "1974-12-01"
eval_end    =  "2020-12-01"

# Keep only OOS-Period
oos_target_var      =  target_var.loc[eval_start:eval_end].squeeze()
oos_benchmark       =  cand_forecasts[eval_start:eval_end]["pred_hist_mean"]
oos_cand_forecasts  =  cand_forecasts[eval_start:eval_end]

# Evaluate Candidate Models
eval_cand_mods  =  oos_cand_forecasts.apply(lambda x: oos_r2(oos_target_var, x, oos_benchmark), axis = 0)

# Show Results for the Models whose Name contains the given Tag
# (the former last print ended in a dangling ", n =" which is a syntax error
#  and prevented the whole cell from running)
for tag in ["XGB", "GBM", "pcr", "glm"]:
    print(eval_cand_mods.filter(like = tag))

In [None]:
# Number of Variables taken from the Q-Matrix
# NOTE(review): in the cells shown here Q is only assigned inside bssf - confirm that a
#               notebook-level Q (indexable via .iloc) exists before this cell is run.
n_vars  =  11

# Initialize BQM from the QUBO Dictionary
# (the dictionary was built before but never used; from_qubo is a classmethod,
#  so no empty instance is required)
qubo  =  { (i,j) : Q.iloc[i,j] for i in range(n_vars) for j in range(n_vars) }
bqm   =  BinaryQuadraticModel.from_qubo(qubo)

# Re-Build the same BQM Term by Term (this instance replaces the one above)
bqm = BinaryQuadraticModel('BINARY')

# Add Linear Coefficients (diagonal of Q)
for i in range(n_vars):
    lin_coef  =  Q.iloc[i,i]
    bqm.add_linear(i, lin_coef)

# Add Quadratic Coefficients (both (i,j) and (j,i) are added for every off-diagonal pair)
for i in range(n_vars):
    for j in range(n_vars):
        if i != j:
            quad_coef  =  Q.iloc[i,j]
            bqm.add_quadratic(i, j, quad_coef)