In [None]:
pip install seaborn

In [None]:
pip install statsmodels

In [None]:
pip install xgboost

In [None]:
import os
import numpy as np
import pandas as pd
import sklearn as skl
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [None]:
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import RidgeCV, MultiTaskLassoCV, MultiTaskElasticNetCV, LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor,MultiOutputClassifier
from tqdm import tqdm 
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
from statsmodels.tsa.stattools import acf

In [None]:

from xgboost import XGBClassifier

# I- Training linear models on daily data 

We will try to train models on both daily and weekly data. We may see different dynamics between the two: it could be easier to train a model on weekly data because there is less autocorrelation between data points. On the other hand, the daily dataset is larger, which could make it easier to train models on daily data. 

In [None]:
# Load the daily US dataset from a path relative to the working directory;
# the first CSV column is used as the DataFrame index.
datapath = os.path.join('data', 'US', 'us_data.csv')
dus = pd.read_csv(datapath, index_col=0)

In [None]:
# Sanity check: list every column that contains at least one +/-inf value.
has_inf = np.isinf(dus.to_numpy()).any(axis=0)
cols_inf = dus.columns[has_inf]
print(cols_inf)

In [None]:
# One histogram per column (100 bins each) to inspect the distributions.
dus.hist(figsize=(23, 23), bins=100)
plt.tight_layout()

### A) Creating new features.

Now we need to add the lagged values of yields as features. We have to choose lags and yields to add as features. 

In [None]:

# ACF and PACF of every yield series, side by side, up to 90 lags.
yield_columns = ['DGS1MO', 'DGS3MO', 'DGS6MO', 'DGS1', 'DGS2', 'DGS3','DGS5', 'DGS7', 'DGS10', 'DGS20', 'DGS30']
# Series for which lags 25 and 50 are highlighted with dashed red lines.
highlighted_yields = ('DGS5','DGS7','DGS10', 'DGS20', 'DGS30')

for USyield in yield_columns:
    series = dus[USyield]
    fig, axes = plt.subplots(1, 2, figsize=(8, 4))
    acf_ax, pacf_ax = axes
    plot_acf(series, lags=90, title=f'ACF {USyield}', ax=acf_ax)
    plot_pacf(series, lags=90, title=f'PACF {USyield}', ax=pacf_ax)

    draw_markers = USyield in highlighted_yields
    for ax in axes:
        ax.set_xlim(0.5, 90)    # start the x-axis just past lag 0
        ax.set_ylim(-0.2, 0.2)  # zoom in on small correlation values
        if draw_markers:
            for marked_lag in (25, 50):
                ax.axvline(x=marked_lag, color='red', linestyle='--', linewidth=1)

    plt.show()


- We can see that on short term yields, there is much more autocorrelation in the data, up to more than 30 days. Returns in the past few days are highly correlated to returns in the next days. 
- However, on long term yields, there is much less autocorrelation and returns in the past 2 days are only slightly correlated to next day return. Surprisingly we see some persistent autocorrelation between returns at day t and t-25 and t-50. 

For maturities less than 1y, we'll add the following lags:
- t-1,t-2,t-5,t-10,t-15,t-20,t-25,t-30,t-40,t-50

For maturities more than 1y, we will add:
- t-1,t-2,t-10,t-15,t-25,t-50

We will probably need to do some PCA to combine features as they will be very correlated.

In [None]:
# Lagged yield features. A column named '<series>_t-<lag>' holds the series
# shifted by (lag - 1) rows, so '_t-1' is the value on the same row.
lag_plan = (
    # (lags, source columns) -- short maturities get the denser lag grid
    ([1, 2, 5, 10, 15, 20, 25, 30, 40, 50],
     ['DGS1MO', 'DGS3MO', 'DGS6MO', 'DGS1']),
    # DGS1 appears in both groups; the second pass rewrites identical values.
    ([1, 2, 10, 15, 25, 50],
     ['DGS1', 'DGS2', 'DGS3', 'DGS5', 'DGS7', 'DGS10', 'DGS20', 'DGS30']),
)

for lag_values, source_columns in lag_plan:
    for lag in lag_values:
        for source in source_columns:
            dus[f'{source}_t-{lag}'] = dus[source].shift(lag - 1)

In [None]:
# Variables to forecast: each target is the next row's value of a yield series.
target_sources = {
    'Y_1MO': 'DGS1MO',
    'Y_3MO': 'DGS3MO',
    'Y_6MO': 'DGS6MO',
    'Y_1year': 'DGS1',
    'Y_2year': 'DGS2',
    'Y_3year': 'DGS3',
    'Y_5year': 'DGS5',
    'Y_7year': 'DGS7',
    'Y_10year': 'DGS10',
    'Y_20year': 'DGS20',
    'Y_30year': 'DGS30',
}

for target_name, source in target_sources.items():
    dus[target_name] = dus[source].shift(-1)

In [None]:
# The raw yield columns are now covered by the lagged features and the
# targets, so they are removed from the frame.
raw_yield_columns = ['DGS1MO', 'DGS3MO', 'DGS6MO', 'DGS1', 'DGS2', 'DGS3',
                     'DGS5', 'DGS7', 'DGS10', 'DGS20', 'DGS30']
dus = dus.drop(columns=raw_yield_columns)

We can now look at the heatmap:

In [None]:
# Pairwise correlation heatmap over all columns of dus, using a diverging
# colormap centered on 0.
plt.figure(figsize = (25,25))
sns.heatmap(dus.corr(), cmap='seismic', center=0)

Overall the features have a very low correlation with the target, so we'll remove the least correlated ones: 

- the lagged features that have a correlation coefficient < 0.05 in absolute value with all target variables. 
- the other features that have a correlation coefficient < 0.03 in absolute value with all target variables. We distinguish between lagged features and other features because filtering all features with the 0.05 threshold removes some features that should have a predictive impact: sp500, gold, VIX for instance. 

Moreover, given the very high correlation between lagged features, we'll apply a PCA in the pipeline on those to limit the number of colinear features.

In [None]:
target_columns = ['Y_1MO', 'Y_3MO', 'Y_6MO', 'Y_1year', 'Y_2year', 'Y_3year',
                  'Y_5year', 'Y_7year', 'Y_10year', 'Y_20year', 'Y_30year']
Y = dus[target_columns]

# Split the frame into lagged-yield features ('_t-' in the name) and the rest.
lagged_columns = [col for col in dus.columns if '_t-' in col]
dus_lagged_features = dus[lagged_columns]
dus_other = dus.drop(columns=lagged_columns)


def _abs_corr_with_targets(frame):
    """Absolute correlation of every column of `frame` with every target in Y."""
    return pd.DataFrame(
        {target: frame.corrwith(Y[target]) for target in Y.columns}
    ).abs()


# Lagged features: flag those whose absolute correlation is < 0.05 for ALL targets.
corrs = _abs_corr_with_targets(dus_lagged_features)
mask = corrs.lt(0.05).all(axis=1)
low_corr_features = corrs.index[mask]

# Drop the flagged lagged features.
dus_filtered = dus.drop(columns=low_corr_features)

# Other features: looser threshold, absolute correlation < 0.03 for ALL targets.
# dus_other still contains the target columns; each has correlation 1 with
# itself, so targets are never flagged.
corrs = _abs_corr_with_targets(dus_other)
mask = corrs.lt(0.03).all(axis=1)
low_corr_features_2 = corrs.index[mask]

# Drop the flagged non-lagged features.
dus_filtered = dus_filtered.drop(columns=low_corr_features_2)

print(f"{len(low_corr_features) + len(low_corr_features_2)} features were deleted")

In [None]:
# Names of the dropped features: lagged ones first, then the others.
print(low_corr_features)
print(low_corr_features_2)

In [None]:
# Correlation heatmap of the columns that survived the correlation filter.
plt.figure(figsize = (20,20))
sns.heatmap(dus_filtered.corr(), cmap='seismic', center=0)

### B) PCA 

In [None]:
# Keep only complete rows (lags and next-row targets introduce NaNs at the edges).
dus_filtered = dus_filtered.dropna()

# NOTE(review): hard-coded feature list; presumably the non-target columns that
# survived the correlation filter -- confirm it stays in sync with dus_filtered.
feature_columns = [
    'USGOOD', 'USCONS', 'MANEMP', 'DMANEMP', 'NDMANEMP', 'USWTRADE',
    'USFIRE', 'PERMIT', 'UMCSENT', 'M2SL', 'M2REAL', 'TOTRESNS', 'CPIAUCSL',
    'CPIAPPSL', 'CPITRNSL', 'CUSR0000SAC', 'CPIULFSL', 'CUUR0000SA0L2',
    'PCEPI', 'DNDGRG3M086SBEA', 'MTSDS133FMS', 'GFDEGDQ188S',
    'IRLTLT01DEM156N', 'IRLTLT01JPM156N', 'IRLTLT01GBM156N',
    'IRLTLT01CAM156N', 'IRLTLT01AUM156N', 'IRLTLT01FRM156N', 'NASDAQCOM',
    'AAA', 'BAA', 'DEXCAUS', 'DEXUSAL', 'NFCI', 'FEDFUNDS', 'BOGMBASE',
    'WSHOSHO', 'T5YIE', 'T10YIE', 'log return gold', 'log return sp500',
    'DGS1MO_t-1', 'DGS3MO_t-1', 'DGS6MO_t-1', 'DGS1_t-1', 'DGS1MO_t-2',
    'DGS3MO_t-2', 'DGS6MO_t-2', 'DGS1MO_t-5', 'DGS3MO_t-5', 'DGS6MO_t-5',
    'DGS3MO_t-10', 'DGS6MO_t-10', 'DGS1_t-10', 'DGS1MO_t-15', 'DGS3MO_t-15',
    'DGS6MO_t-15', 'DGS1MO_t-20', 'DGS3MO_t-20', 'DGS6MO_t-20', 'DGS1_t-20',
    'DGS3MO_t-30', 'DGS6MO_t-40', 'DGS1MO_t-50', 'DGS2_t-1', 'DGS3_t-1',
    'DGS10_t-1', 'DGS20_t-1', 'DGS30_t-1', 'DGS7_t-50', 'DGS10_t-50',
    'DGS20_t-50', 'DGS30_t-50',
]
target_columns = ['Y_1MO', 'Y_3MO', 'Y_6MO', 'Y_1year', 'Y_2year', 'Y_3year',
                  'Y_5year', 'Y_7year', 'Y_10year', 'Y_20year', 'Y_30year']

X = dus_filtered[feature_columns]
Y = dus_filtered[target_columns]


In [None]:
# Exploratory PCA on the standardized feature matrix.
# Fit on X: X_train is only created later inside the walk-forward loops, so
# referring to it here fails on a fresh kernel (Restart & Run All). The
# correlation-circle cell below also labels the components with X.columns.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# A float n_components keeps the smallest number of components whose
# cumulative explained variance reaches 95%.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

print("Number of components kept:", pca.n_components_)
print("cumulative explained variance :", pca.explained_variance_ratio_.cumsum())
print("Composantes principales (coefficients sur les features originales) :")
print(pca.components_)

print("Exemple des nouvelles features transformées :")
print(X_pca[:5])

In [None]:
# Rank features by their combined absolute loading on PC1 and PC2.
pc1, pc2 = np.abs(pca.components_[:2])
importance = pc1 + pc2

# Keep the n features with the largest combined loading.
n = 30
top_idx = np.argsort(importance)[-n:]
top_labels = X.columns[top_idx]
top_components = pca.components_[:2, top_idx]

fig, ax = plt.subplots(figsize=(12, 12))
circle = plt.Circle((0, 0), 1, color='gray', fill=False)
ax.add_artist(circle)

# One arrow per selected feature, with its label placed slightly beyond the tip.
for label, x, y in zip(top_labels, top_components[0, :], top_components[1, :]):
    ax.arrow(0, 0, x, y, color='r', alpha=0.6, head_width=0.02)
    ax.text(x * 1.15, y * 1.15, label, color='b', ha='center', va='center', fontsize=9)

ax.set_xlim(-1.1, 1.1)
ax.set_ylim(-1.1, 1.1)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("Correlation circle (top features)")
ax.grid()
ax.axhline(0, color='black', linewidth=0.5)
ax.axvline(0, color='black', linewidth=0.5)
plt.show()


We can see that features are overall very correlated, so we'll train some models with PCA and some without.

### C) Training a ridge model

We do a walk forward cross validation:
- we train our model on 4 years of data (= approximately 1000 data points)
- we do prediction for the next month (21 days)
- we add a PCA to the pipeline to deal with correlated features. We'll also train a model without PCA to see how it changes the performance.

In [None]:


# Walk-forward evaluation of a multi-output ridge, with and without PCA.
window_train = 252 *4     # rows per training window (4 x 252)
window_pred = 21          # rows predicted right after each training window
alphas = np.logspace(-3, 3, 20)

# Time-ordered inner CV used by RidgeCV to choose alpha inside each training window.
tscv = TimeSeriesSplit(n_splits=4)

pipe = Pipeline([
    ('scaler',StandardScaler()),
    ('pca',PCA(n_components=0.95)),
    ('ridge', MultiOutputRegressor(RidgeCV(fit_intercept=False,alphas=alphas, cv=tscv)))
])

pipe_nopca = Pipeline([
    ('scaler',StandardScaler()),
    ('ridge', MultiOutputRegressor(RidgeCV(fit_intercept=False,alphas=alphas, cv=tscv)))
])

# Per-fold results for the PCA pipeline.
preds = []
dates_pred = []
r2_is_list = []
r2_os_list=[]
hit_rate_list = []

# Per-fold results for the pipeline without PCA.
preds_nopca = []
dates_pred_nopca = []
r2_is_list_nopca = []
r2_os_list_nopca=[]
hit_rate_list_nopca = []

# The window advances by window_pred rows, so test windows do not overlap.
for start in tqdm(range(0, len(X) - window_train - window_pred + 1, window_pred)):

    end_train = start + window_train
    end_pred = end_train + window_pred

    X_train = X.iloc[start:end_train]
    Y_train = Y.iloc[start:end_train]

    X_test = X.iloc[end_train:end_pred]
    Y_test = Y.iloc[end_train:end_pred]


    # model with PCA
    pipe.fit(X_train, Y_train)

    Y_pred = pipe.predict(X_test)

    preds.append(Y_pred)
    dates_pred.append(X.index[end_train:end_pred])

    mse = mean_squared_error(Y_test, Y_pred, multioutput='raw_values')
    r2_is = r2_score(Y_train, pipe.predict(X_train), multioutput='raw_values')
    r2_oos = r2_score(Y_test, Y_pred, multioutput='raw_values')
    # Hit rate: share of test rows where prediction and target have the same sign.
    hit_rate = np.mean(np.sign(Y_test.values) == np.sign(Y_pred), axis=0)
    r2_is_list.append(r2_is)
    r2_os_list.append(r2_oos)
    hit_rate_list.append(hit_rate)


    # model without PCA
    pipe_nopca.fit(X_train, Y_train)

    Y_pred_nopca = pipe_nopca.predict(X_test)

    # Store this model's own predictions (previously the PCA model's Y_pred
    # was appended here by mistake).
    preds_nopca.append(Y_pred_nopca)
    dates_pred_nopca.append(X.index[end_train:end_pred])

    mse = mean_squared_error(Y_test, Y_pred_nopca, multioutput='raw_values')
    r2_is = r2_score(Y_train, pipe_nopca.predict(X_train), multioutput='raw_values')
    r2_oos = r2_score(Y_test, Y_pred_nopca, multioutput='raw_values')
    hit_rate = np.mean(np.sign(Y_test.values) == np.sign(Y_pred_nopca), axis=0)
    r2_is_list_nopca.append(r2_is)
    r2_os_list_nopca.append(r2_oos)
    hit_rate_list_nopca.append(hit_rate)





In [None]:
# One row per walk-forward fold, labelled by the first index value of the
# fold's prediction window; one column per target.
fold_labels = [date[0] for date in dates_pred]

r2_is_df = pd.DataFrame(r2_is_list, columns=Y.columns, index=fold_labels)
r2_is_df_nopca = pd.DataFrame(r2_is_list_nopca, columns=Y.columns, index=fold_labels)

r2_os_df = pd.DataFrame(r2_os_list, columns=Y.columns, index=fold_labels)
r2_os_df_nopca = pd.DataFrame(r2_os_list_nopca, columns=Y.columns, index=fold_labels)

hr = pd.DataFrame(hit_rate_list, columns=Y.columns, index=fold_labels)
hr_nopca = pd.DataFrame(hit_rate_list_nopca, columns=Y.columns, index=fold_labels)


plt.close('all')  # close every existing figure

fig, ax = plt.subplots(3, 2, figsize=(28, 28))

# (PCA frame, no-PCA frame, title template, extra plot keyword arguments)
panels = [
    (r2_is_df, r2_is_df_nopca,
     'Evolution of in sample R2 per model {}, per sample', {}),
    (r2_os_df, r2_os_df_nopca,
     'Evolution of out-sample R2 per model {}, per sample', {'ylim': (-0.3, 0.3)}),
    (hr, hr_nopca,
     'Evolution of out-sample hit rate per model {}, per sample', {}),
]

# Left column: pipeline with PCA; right column: pipeline without PCA.
for panel_row, (frame_pca, frame_nopca, title_template, plot_kwargs) in enumerate(panels):
    variants = ((frame_pca, 'with PCA'), (frame_nopca, 'without PCA'))
    for panel_col, (frame, variant) in enumerate(variants):
        frame.plot(ax=ax[panel_row, panel_col], title=title_template.format(variant), **plot_kwargs)
        ax[panel_row, panel_col].grid(True, axis='y', linestyle='--', linewidth=0.5)

- When looking at the in sample R2 of our models, we see that the R2 is overall low, but that it is better on short term yields, ie from 1 month to 1 year, and significantly lower for long-term yields. 
- we see significant variations in R2 in 2011 and 2020, probably because of outliers. 

- The out of sample R2 is close to zero or even negative so there is no predictive power in our model.

- Adding a PCA to the pipeline does not change the in-sample R2. However, the hit rate is slightly less variable when adding the PCA, suggesting a bit less overfitting (although the out-of-sample results are just as bad).
- The hit rate seems to be close to 0.5 on average for all models and all samples. So it appears the model does not do anything better than predicting at random - it completely overfits the data.



## Training Lasso

In [None]:
# Preview 20 log-spaced values from 1e-5 to 1e2 (same arguments as the Lasso alpha grid).
np.logspace(-5, 2, 20)

In [None]:
# Walk-forward evaluation of a multi-task Lasso, with and without PCA.
window_train = 252 *4     # rows per training window (4 x 252)
window_pred = 21          # rows predicted right after each training window
alphas = np.logspace(-5, 2, 20)

# Time-ordered inner CV used by MultiTaskLassoCV to choose alpha.
tscv = TimeSeriesSplit(n_splits=4)

pipe = Pipeline([
    ('scaler',StandardScaler()),
    ('pca',PCA(n_components=0.95)),
    ('lasso', MultiTaskLassoCV(fit_intercept=False,alphas=alphas, cv=tscv))
])

pipe_nopca = Pipeline([
    ('scaler',StandardScaler()),
    ('lasso', MultiTaskLassoCV(fit_intercept=False,alphas=alphas, cv=tscv))
])

# Per-fold results for the PCA pipeline.
preds = []
dates_pred = []
r2_is_list = []
r2_os_list=[]
hit_rate_list = []
alpha,selected_features = [],[]

# Per-fold results for the pipeline without PCA.
preds_nopca = []
dates_pred_nopca = []
r2_is_list_nopca = []
r2_os_list_nopca=[]
hit_rate_list_nopca = []
alpha_nopca,selected_features_nopca = [],[]

# The window advances by window_pred rows, so test windows do not overlap.
for start in tqdm(range(0, len(X) - window_train - window_pred + 1, window_pred)):

    end_train = start + window_train
    end_pred = end_train + window_pred

    X_train = X.iloc[start:end_train]
    Y_train = Y.iloc[start:end_train]

    X_test = X.iloc[end_train:end_pred]
    Y_test = Y.iloc[end_train:end_pred]


    # model with PCA
    pipe.fit(X_train, Y_train)

    Y_pred = pipe.predict(X_test)

    preds.append(Y_pred)
    dates_pred.append(X.index[end_train:end_pred])

    mse = mean_squared_error(Y_test, Y_pred, multioutput='raw_values')
    r2_is = r2_score(Y_train, pipe.predict(X_train), multioutput='raw_values')
    r2_oos = r2_score(Y_test, Y_pred, multioutput='raw_values')
    # Hit rate: share of test rows where prediction and target have the same sign.
    hit_rate = np.mean(np.sign(Y_test.values) == np.sign(Y_pred), axis=0)
    r2_is_list.append(r2_is)
    r2_os_list.append(r2_oos)
    hit_rate_list.append(hit_rate)
    alpha.append(pipe.named_steps['lasso'].alpha_)
    # coef_ is (n_targets, n_inputs): count inputs with a non-zero weight for any target.
    selected_features.append(np.sum(np.any(pipe.named_steps['lasso'].coef_!=0,axis=0)))


    # model without PCA
    pipe_nopca.fit(X_train, Y_train)

    Y_pred_nopca = pipe_nopca.predict(X_test)

    # Store this model's own predictions (previously the PCA model's Y_pred
    # was appended here by mistake).
    preds_nopca.append(Y_pred_nopca)
    dates_pred_nopca.append(X.index[end_train:end_pred])

    mse = mean_squared_error(Y_test, Y_pred_nopca, multioutput='raw_values')
    r2_is = r2_score(Y_train, pipe_nopca.predict(X_train), multioutput='raw_values')
    r2_oos = r2_score(Y_test, Y_pred_nopca, multioutput='raw_values')
    hit_rate = np.mean(np.sign(Y_test.values) == np.sign(Y_pred_nopca), axis=0)
    r2_is_list_nopca.append(r2_is)
    r2_os_list_nopca.append(r2_oos)
    hit_rate_list_nopca.append(hit_rate)
    alpha_nopca.append(pipe_nopca.named_steps['lasso'].alpha_)
    selected_features_nopca.append(np.sum(np.any(pipe_nopca.named_steps['lasso'].coef_!=0,axis=0)))






In [None]:
# One row per walk-forward fold, labelled by the first index value of the
# fold's prediction window.
fold_labels = [date[0] for date in dates_pred]

# Per-target metrics: one column per target.
r2_is_df = pd.DataFrame(r2_is_list, columns=Y.columns, index=fold_labels)
r2_is_df_nopca = pd.DataFrame(r2_is_list_nopca, columns=Y.columns, index=fold_labels)

r2_os_df = pd.DataFrame(r2_os_list, columns=Y.columns, index=fold_labels)
r2_os_df_nopca = pd.DataFrame(r2_os_list_nopca, columns=Y.columns, index=fold_labels)

hr = pd.DataFrame(hit_rate_list, columns=Y.columns, index=fold_labels)
hr_nopca = pd.DataFrame(hit_rate_list_nopca, columns=Y.columns, index=fold_labels)

# Per-fold scalars: number of inputs kept by the Lasso and the alpha chosen by CV.
fs = pd.DataFrame(selected_features, index=fold_labels)
fs_nopca = pd.DataFrame(selected_features_nopca, index=fold_labels)

alphadf = pd.DataFrame(alpha, index=fold_labels)
alphadf_nopca = pd.DataFrame(alpha_nopca, index=fold_labels)


fig, ax = plt.subplots(5, 2, figsize=(28, 28))

# (PCA frame, no-PCA frame, title template, extra plot keyword arguments)
panels = [
    (r2_is_df, r2_is_df_nopca,
     'Evolution of in sample R2 per maturity {}, per sample', {}),
    (r2_os_df, r2_os_df_nopca,
     'Evolution of out-sample R2 per maturity {}, per sample', {'ylim': (-0.3, 0.3)}),
    (hr, hr_nopca,
     'Evolution of out-sample hit rate per maturity {}, per sample', {}),
    (fs, fs_nopca,
     'Evolution of number of selected features {}, per sample', {}),
    (alphadf, alphadf_nopca,
     'Evolution of alpha selected by cross validation - model {}, per sample', {}),
]

# Left column: pipeline with PCA; right column: pipeline without PCA.
for panel_row, (frame_pca, frame_nopca, title_template, plot_kwargs) in enumerate(panels):
    variants = ((frame_pca, 'with PCA'), (frame_nopca, 'without PCA'))
    for panel_col, (frame, variant) in enumerate(variants):
        frame.plot(ax=ax[panel_row, panel_col], title=title_template.format(variant), **plot_kwargs)
        ax[panel_row, panel_col].grid(True, axis='y', linestyle='--', linewidth=0.5)

Lasso performance is even worse than that of ridge because there are some periods for which the model selects no features. 

## Training Elastic net 

In [None]:
# Walk-forward evaluation of a multi-task elastic net, with and without PCA.
window_train = 252 *10    # rows per training window (10 x 252)
window_pred = 21          # rows predicted right after each training window
alphas = np.logspace(-3, 2, 20)
l1_ratios = [0.25,0.5,0.75]

# Time-ordered inner CV used by MultiTaskElasticNetCV to choose alpha and l1_ratio.
tscv = TimeSeriesSplit(n_splits=4)

# Both pipelines search the same l1_ratio grid. The PCA pipeline used to
# hard-code l1_ratio=0.5, which made its recorded "selected" L1 ratio a
# constant and the comparison with the no-PCA pipeline uneven.
pipe = Pipeline([
    ('scaler',StandardScaler()),
    ('pca',PCA(n_components=0.95)),
    ('elasticnet', MultiTaskElasticNetCV(alphas=alphas, cv=tscv, l1_ratio=l1_ratios, fit_intercept=False,max_iter=5000))
])

pipe_nopca = Pipeline([
    ('scaler',StandardScaler()),
    ('elasticnet', MultiTaskElasticNetCV(alphas=alphas, cv=tscv, l1_ratio=l1_ratios, fit_intercept=False,max_iter=5000))
])

# Per-fold results for the PCA pipeline.
preds = []
dates_pred = []
r2_is_list = []
r2_os_list=[]
hit_rate_list = []
alpha,selected_features,l1 = [],[],[]

# Per-fold results for the pipeline without PCA.
preds_nopca = []
dates_pred_nopca = []
r2_is_list_nopca = []
r2_os_list_nopca=[]
hit_rate_list_nopca = []
alpha_nopca,selected_features_nopca,l1_nopca = [],[],[]

# The window advances by window_pred rows, so test windows do not overlap.
for start in tqdm(range(0, len(X) - window_train - window_pred + 1, window_pred)):

    end_train = start + window_train
    end_pred = end_train + window_pred

    X_train = X.iloc[start:end_train]
    Y_train = Y.iloc[start:end_train]

    X_test = X.iloc[end_train:end_pred]
    Y_test = Y.iloc[end_train:end_pred]


    # model with PCA
    pipe.fit(X_train, Y_train)

    Y_pred = pipe.predict(X_test)

    preds.append(Y_pred)
    dates_pred.append(X.index[end_train:end_pred])

    mse = mean_squared_error(Y_test, Y_pred, multioutput='raw_values')
    r2_is = r2_score(Y_train, pipe.predict(X_train), multioutput='raw_values')
    r2_oos = r2_score(Y_test, Y_pred, multioutput='raw_values')
    # Hit rate: share of test rows where prediction and target have the same sign.
    hit_rate = np.mean(np.sign(Y_test.values) == np.sign(Y_pred), axis=0)
    r2_is_list.append(r2_is)
    r2_os_list.append(r2_oos)
    hit_rate_list.append(hit_rate)
    alpha.append(pipe.named_steps['elasticnet'].alpha_)
    # coef_ is (n_targets, n_inputs): count inputs with a non-zero weight for any target.
    selected_features.append(np.sum(np.any(pipe.named_steps['elasticnet'].coef_!=0,axis=0)))
    l1.append(pipe.named_steps['elasticnet'].l1_ratio_)


    # model without PCA
    pipe_nopca.fit(X_train, Y_train)

    Y_pred_nopca = pipe_nopca.predict(X_test)

    # Store this model's own predictions (previously the PCA model's Y_pred
    # was appended here by mistake).
    preds_nopca.append(Y_pred_nopca)
    dates_pred_nopca.append(X.index[end_train:end_pred])

    mse = mean_squared_error(Y_test, Y_pred_nopca, multioutput='raw_values')
    r2_is = r2_score(Y_train, pipe_nopca.predict(X_train), multioutput='raw_values')
    r2_oos = r2_score(Y_test, Y_pred_nopca, multioutput='raw_values')
    hit_rate = np.mean(np.sign(Y_test.values) == np.sign(Y_pred_nopca), axis=0)
    r2_is_list_nopca.append(r2_is)
    r2_os_list_nopca.append(r2_oos)
    hit_rate_list_nopca.append(hit_rate)
    alpha_nopca.append(pipe_nopca.named_steps['elasticnet'].alpha_)
    selected_features_nopca.append(np.sum(np.any(pipe_nopca.named_steps['elasticnet'].coef_!=0,axis=0)))
    l1_nopca.append(pipe_nopca.named_steps['elasticnet'].l1_ratio_)






In [None]:
# One row per walk-forward fold, labelled by the first index value of the
# fold's prediction window.
fold_labels = [date[0] for date in dates_pred]

# Per-target metrics: one column per target.
r2_is_df = pd.DataFrame(r2_is_list, columns=Y.columns, index=fold_labels)
r2_is_df_nopca = pd.DataFrame(r2_is_list_nopca, columns=Y.columns, index=fold_labels)

r2_os_df = pd.DataFrame(r2_os_list, columns=Y.columns, index=fold_labels)
r2_os_df_nopca = pd.DataFrame(r2_os_list_nopca, columns=Y.columns, index=fold_labels)

hr = pd.DataFrame(hit_rate_list, columns=Y.columns, index=fold_labels)
hr_nopca = pd.DataFrame(hit_rate_list_nopca, columns=Y.columns, index=fold_labels)

# Per-fold scalars: inputs kept, plus the alpha and L1 ratio recorded after each fit.
fs = pd.DataFrame(selected_features, index=fold_labels)
fs_nopca = pd.DataFrame(selected_features_nopca, index=fold_labels)

alphadf = pd.DataFrame(alpha, index=fold_labels)
alphadf_nopca = pd.DataFrame(alpha_nopca, index=fold_labels)

l1df = pd.DataFrame(l1, index=fold_labels)
l1df_nopca = pd.DataFrame(l1_nopca, index=fold_labels)


fig, ax = plt.subplots(6, 2, figsize=(28, 28))

# (PCA frame, no-PCA frame, title template, extra plot keyword arguments)
panels = [
    (r2_is_df, r2_is_df_nopca,
     'Evolution of in sample R2 per maturity {}, per sample', {}),
    (r2_os_df, r2_os_df_nopca,
     'Evolution of out-sample R2 per maturity {}, per sample', {'ylim': (-0.3, 0.3)}),
    (hr, hr_nopca,
     'Evolution of out-sample hit rate per maturity {}, per sample', {}),
    (fs, fs_nopca,
     'Evolution of number of selected features {}, per sample', {}),
    (alphadf, alphadf_nopca,
     'Evolution of alpha selected by cross validation - model {}, per sample', {}),
    (l1df, l1df_nopca,
     'Evolution of L1 ratio selected by cross validation - model {}, per sample', {}),
]

# Left column: pipeline with PCA; right column: pipeline without PCA.
for panel_row, (frame_pca, frame_nopca, title_template, plot_kwargs) in enumerate(panels):
    variants = ((frame_pca, 'with PCA'), (frame_nopca, 'without PCA'))
    for panel_col, (frame, variant) in enumerate(variants):
        frame.plot(ax=ax[panel_row, panel_col], title=title_template.format(variant), **plot_kwargs)
        ax[panel_row, panel_col].grid(True, axis='y', linestyle='--', linewidth=0.5)

Overall all these linear models work very poorly, which is not surprising given the low correlation between features and past yields. We can now try to work on weekly data, which may be a bit less noisy: hopefully our model will be able to better identify some trends and patterns. 

# II) Training linear models on weekly data 

In [None]:
# Load the weekly US dataset; this rebinds `dus`, replacing the daily frame
# used in part I. The first CSV column is used as the index.
datapath = os.path.join('data', 'US', 'us_data_weekly.csv')
dus = pd.read_csv(datapath, index_col=0)

In [None]:
# Keep rows whose index is strictly after '2003-01-03'.
# NOTE(review): read_csv was not asked to parse dates, so this is presumably a
# string comparison that relies on ISO-formatted (YYYY-MM-DD) labels -- confirm.
dus = dus[dus.index>'2003-01-03']

### A) Creating new features

In [None]:

# ACF and PACF of every weekly yield series, side by side, up to 50 lags.
yield_columns = ['DGS1MO', 'DGS3MO', 'DGS6MO', 'DGS1', 'DGS2', 'DGS3','DGS5', 'DGS7', 'DGS10', 'DGS20', 'DGS30']

for USyield in yield_columns:
    series = dus[USyield]
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    acf_ax, pacf_ax = axes
    plot_acf(series, lags=50, title=f'ACF {USyield}', ax=acf_ax)
    plot_pacf(series, lags=50, title=f'PACF {USyield}', ax=pacf_ax)

    for ax in axes:
        ax.set_xlim(0.5, 50)    # start the x-axis just past lag 0
        ax.set_ylim(-0.2, 0.2)  # zoom in on small correlation values

    plt.show()


- We can see that on short term yields, there is much more autocorrelation in the data, up to 40 lags. Returns in the past few weeks (and so on the past year) are highly correlated to returns in the next week. 
- However, on long term yields, there is much less autocorrelation and returns in the past 2 weeks are only very slightly correlated to next week's return. Surprisingly we see some persistent autocorrelation between returns at day t and t-25 and t-50. 

For maturities less than 1y, we'll add the following lags:
- t-1,t-2,t-5,t-10,t-15,t-20,t-25,t-30,t-40,t-50

For maturities more than 1y, we will add:
- t-1,t-2,t-10

We will probably need to do some PCA to combine features as they will be very correlated.

In [None]:
# Lagged yield features. A column named '<series>_t-<lag>' holds the series
# shifted by (lag - 1) rows, so '_t-1' is the value on the same row.
lag_plan = (
    # (lags, source columns) -- short maturities get the denser lag grid
    ([1, 2, 5, 10, 15, 20, 25, 30, 40, 50],
     ['DGS1MO', 'DGS3MO', 'DGS6MO', 'DGS1']),
    # DGS1 appears in both groups; the second pass rewrites identical values.
    ([1, 2, 10],
     ['DGS1', 'DGS2', 'DGS3', 'DGS5', 'DGS7', 'DGS10', 'DGS20', 'DGS30']),
)

for lag_values, source_columns in lag_plan:
    for lag in lag_values:
        for source in source_columns:
            dus[f'{source}_t-{lag}'] = dus[source].shift(lag - 1)

In [None]:
# Variables to forecast: each target is the next row's value of a yield series.
target_sources = {
    'Y_1MO': 'DGS1MO',
    'Y_3MO': 'DGS3MO',
    'Y_6MO': 'DGS6MO',
    'Y_1year': 'DGS1',
    'Y_2year': 'DGS2',
    'Y_3year': 'DGS3',
    'Y_5year': 'DGS5',
    'Y_7year': 'DGS7',
    'Y_10year': 'DGS10',
    'Y_20year': 'DGS20',
    'Y_30year': 'DGS30',
}

for target_name, source in target_sources.items():
    dus[target_name] = dus[source].shift(-1)

We'll also add statistical features like the mean, variance, autocorrelation, quantiles of the time series to forecast.

In [None]:
def add_ts_features(df, cols, max_lag=30, windows=(20, 60)):
    """Build rolling statistical features for the given columns of `df`.

    For every column in `cols` and every window size `w` in `windows`, the
    result contains the rolling mean, standard deviation, 25% / 75% / 5% / 90%
    quantiles and range (max - min), named ``{col}_{stat}_{w}``. It also
    contains, for every lag from 1 to `max_lag`, the rolling autocorrelation
    computed over the largest window, named ``{col}_autocorr_{lag}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; only the columns listed in `cols` are read.
    cols : iterable of str
        Names of the columns to summarise.
    max_lag : int, default 30
        Largest autocorrelation lag; no autocorrelation column is produced
        when it is below 1.
    windows : sequence of int, default (20, 60)
        Rolling window sizes, in rows.

    Returns
    -------
    pandas.DataFrame
        Feature frame sharing `df`'s index. Rows without a full window are NaN.
    """
    windows = list(windows)
    # Collect all columns first and build the frame once: inserting several
    # hundred columns one at a time fragments the DataFrame.
    columns = {}

    for col in cols:
        y = df[col]

        # --- Rolling statistics ---
        for w in windows:
            roll = y.rolling(w)
            columns[f'{col}_mean_{w}'] = roll.mean()
            columns[f'{col}_std_{w}'] = roll.std()
            columns[f'{col}_q25_{w}'] = roll.quantile(0.25)
            columns[f'{col}_q75_{w}'] = roll.quantile(0.75)
            # The column is named q05, so compute the 5% quantile
            # (it previously computed the 10% quantile).
            columns[f'{col}_q05_{w}'] = roll.quantile(0.05)
            columns[f'{col}_q90_{w}'] = roll.quantile(0.9)
            columns[f'{col}_range_{w}'] = roll.max() - roll.min()

        # --- Local autocorrelations over the largest window ---
        if max_lag >= 1:
            autocorr_roll = y.rolling(window=max(windows))
            for lag in range(1, max_lag + 1):
                # Bind `lag` as a default so the lambda never depends on the
                # loop variable's later values.
                columns[f'{col}_autocorr_{lag}'] = autocorr_roll.apply(
                    lambda x, lag=lag: x.autocorr(lag=lag), raw=False
                )

    return pd.DataFrame(columns, index=df.index)

# Example usage: rolling statistics and autocorrelations for every yield series.
cols = ['DGS1MO', 'DGS3MO', 'DGS6MO', 'DGS1', 'DGS2', 'DGS3',
        'DGS5', 'DGS7', 'DGS10', 'DGS20', 'DGS30']

dus_features = add_ts_features(dus, cols)


In [None]:
# Display the feature rows without missing values; the result is not
# assigned, so dus_features itself is unchanged.
dus_features.dropna()

In [None]:
# Attach the rolling features to the main frame, joining on the index.
dus = dus.merge(dus_features, how = 'inner', left_index=True, right_index=True)

In [None]:
# Drop every row with at least one missing value, then display the frame.
dus = dus.dropna()
dus

In [None]:
# We can now remove the original yield columns from `dus`.
dus = dus.drop(columns=['DGS1MO', 'DGS3MO', 'DGS6MO', 'DGS1', 'DGS2', 'DGS3','DGS5', 'DGS7', 'DGS10', 'DGS20', 'DGS30'])

Many features are only weakly correlated with the targets, so we remove every feature whose absolute correlation is below 0.1 with all of the targets.

In [None]:
# Target columns that the candidate features are screened against.
Y = dus[['Y_1MO', 'Y_3MO', 'Y_6MO', 'Y_1year', 'Y_2year', 'Y_3year', 'Y_5year', 'Y_7year', 'Y_10year', 'Y_20year', 'Y_30year']]

# Absolute correlation of every column of `dus` with each target
# (rows: columns of `dus`, columns: targets). The targets are themselves
# columns of `dus`, so they appear in this table too.
corrs = pd.DataFrame({
    target: dus.corrwith(Y[target]) for target in Y.columns
}).abs()  

# Flag the columns whose absolute correlation is below 0.1 for ALL targets.
mask = (corrs < 0.1).all(axis=1)
low_corr_features = corrs.index[mask]

# Drop the flagged columns.
# NOTE(review): the correlations are computed on the whole of `dus`, i.e. also
# on rows that the rolling evaluation later uses as test windows - confirm
# that this look-ahead in the feature screening is acceptable.
dus_filtered = dus.drop(columns=low_corr_features)

print(f"{len(low_corr_features)} features were deleted")

In [None]:
# Names of the columns removed by the correlation filter.
print(low_corr_features)

In [None]:
# Correlation matrix of the remaining columns, drawn on a diverging colour map centred at 0.
plt.figure(figsize = (25,25))
sns.heatmap(dus_filtered.corr(), cmap='seismic', center=0)

In [None]:
# (rows, columns) of the frame left after the correlation filter.
print(dus_filtered.shape)

### B) Training a ridge model 

We can now try to train a ridge.

In [None]:
# Names of the columns to forecast; everything else in the filtered frame is a feature.
target_columns = [
    'Y_1MO', 'Y_3MO', 'Y_6MO', 'Y_1year', 'Y_2year', 'Y_3year',
    'Y_5year', 'Y_7year', 'Y_10year', 'Y_20year', 'Y_30year',
]

Y = dus_filtered[target_columns]
X = dus_filtered.drop(columns=target_columns)

In [None]:
# Walk-forward evaluation of a ridge regression, with and without a PCA step.
# Each iteration fits on `window_train` consecutive rows, predicts the next
# `window_pred` rows, then slides both windows forward by `window_pred` rows.
window_train = 52*5 #we train our model on 5 years of data and test it on the next month 
window_pred = 4          
alphas = np.logspace(-1, 3, 20)

# Inner cross-validation used by RidgeCV to pick alpha; the splits are ordered
# in time, so a validation fold never precedes its training fold.
tscv = TimeSeriesSplit(n_splits=4)

pipe = Pipeline([
    ('scaler',StandardScaler()),
    ('pca',PCA(n_components=0.95)),   # keep the components explaining 95% of the variance
    ('ridge', MultiOutputRegressor(RidgeCV(fit_intercept=False,alphas=alphas, cv=tscv)))
])

pipe_nopca = Pipeline([
    ('scaler',StandardScaler()),   
    ('ridge', MultiOutputRegressor(RidgeCV(fit_intercept=False,alphas=alphas, cv=tscv)))
])

# Per-window results of the PCA model.
preds = []
dates_pred = []
r2_is_list = []
r2_os_list=[]
hit_rate_list = []

# Per-window results of the model without PCA.
preds_nopca = []
dates_pred_nopca = []
r2_is_list_nopca = []
r2_os_list_nopca=[]
hit_rate_list_nopca = []

for start in tqdm(range(0, len(X) - window_train - window_pred + 1, window_pred)):
  
    end_train = start + window_train
    end_pred = end_train + window_pred

    X_train = X.iloc[start:end_train]
    Y_train = Y.iloc[start:end_train]

    # The test rows immediately follow the training rows.
    X_test = X.iloc[end_train:end_pred]
    Y_test = Y.iloc[end_train:end_pred]


    # model with PCA 
    pipe.fit(X_train, Y_train)

    Y_pred = pipe.predict(X_test)

    preds.append(Y_pred)
    dates_pred.append(X.index[end_train:end_pred])

    # multioutput='raw_values' returns one score per target column.
    mse = mean_squared_error(Y_test, Y_pred, multioutput='raw_values')
    r2_is = r2_score(Y_train, pipe.predict(X_train), multioutput='raw_values')
    r2_oos = r2_score(Y_test, Y_pred, multioutput='raw_values')
    # Share of test rows, per target, where prediction and realisation have the same sign.
    hit_rate = np.mean(np.sign(Y_test.values) == np.sign(Y_pred), axis=0)
    r2_is_list.append(r2_is)
    r2_os_list.append(r2_oos)
    hit_rate_list.append(hit_rate)


    # model without PCA 
    pipe_nopca.fit(X_train, Y_train)

    Y_pred_nopca = pipe_nopca.predict(X_test)

    # Store this model's own predictions (the PCA model's `Y_pred` used to be
    # appended here by mistake).
    preds_nopca.append(Y_pred_nopca)
    dates_pred_nopca.append(X.index[end_train:end_pred])

    mse = mean_squared_error(Y_test, Y_pred_nopca, multioutput='raw_values')
    r2_is = r2_score(Y_train, pipe_nopca.predict(X_train), multioutput='raw_values')
    r2_oos = r2_score(Y_test, Y_pred_nopca, multioutput='raw_values')
    hit_rate = np.mean(np.sign(Y_test.values) == np.sign(Y_pred_nopca), axis=0)
    r2_is_list_nopca.append(r2_is)
    r2_os_list_nopca.append(r2_oos)
    hit_rate_list_nopca.append(hit_rate)





In [None]:
# Every rolling window is labelled with the first date of its forecast horizon.
window_labels = [date[0] for date in dates_pred]


def _as_metric_frame(rows):
    """Stack per-window score vectors into a frame: one row per window, one column per target."""
    frame = pd.DataFrame(rows)
    frame.columns = Y.columns
    frame.index = list(window_labels)
    return frame


# In-sample R2, out-of-sample R2 and hit rate, for the PCA and the no-PCA model.
r2_is_df = _as_metric_frame(r2_is_list)
r2_is_df_nopca = _as_metric_frame(r2_is_list_nopca)
r2_os_df = _as_metric_frame(r2_os_list)
r2_os_df_nopca = _as_metric_frame(r2_os_list_nopca)
hr = _as_metric_frame(hit_rate_list)
hr_nopca = _as_metric_frame(hit_rate_list_nopca)

plt.close('all')  # close every figure that is still open

fig, ax = plt.subplots(3,2,figsize=(28,28))

# (frame, grid position, keyword arguments for DataFrame.plot): left column is
# the PCA model, right column the model without PCA.
panels = [
    (r2_is_df, (0, 0), {'title': 'Evolution of in sample R2 per model with PCA, per sample'}),
    (r2_is_df_nopca, (0, 1), {'title': 'Evolution of in sample R2 per model without PCA, per sample'}),
    (r2_os_df, (1, 0), {'ylim': (-0.3,0.3), 'title': 'Evolution of out-sample R2 per model with PCA, per sample'}),
    (r2_os_df_nopca, (1, 1), {'ylim': (-0.3,0.3), 'title': 'Evolution of out-sample R2 per model without PCA, per sample'}),
    (hr, (2, 0), {'title': 'Evolution of out-sample hit rate per model with PCA, per sample'}),
    (hr_nopca, (2, 1), {'title': 'Evolution of out-sample hit rate per model without PCA, per sample'}),
]

for frame, (r, c), plot_kwargs in panels:
    frame.plot(ax = ax[r, c], **plot_kwargs)
    ax[r, c].grid(True, axis='y', linestyle='--', linewidth=0.5)

Results are slightly better than when working on daily data, but we still only learn noise and completely overfit the data.

### C) Training an elastic net model

In [None]:
# Walk-forward evaluation of a multi-task elastic net, with and without a PCA
# step. Each iteration fits on `window_train` consecutive rows, predicts the
# next `window_pred` rows, then slides both windows forward by `window_pred` rows.
window_train = 52*5
window_pred = 4         
alphas = np.logspace(-2, 2, 20)
l1_ratios = [0.25,0.5,0.75]

# Inner cross-validation used to pick the hyper-parameters; the splits are
# ordered in time, so a validation fold never precedes its training fold.
tscv = TimeSeriesSplit(n_splits=4)

# NOTE(review): this pipeline fixes l1_ratio at 0.5 while `pipe_nopca` searches
# over `l1_ratios`; the recorded `l1_ratio_` of the PCA model is therefore
# always 0.5 - confirm whether the grid was meant to be used here as well.
pipe = Pipeline([
    ('scaler',StandardScaler()),
    ('pca',PCA(n_components=0.95)),   # keep the components explaining 95% of the variance
    ('elasticnet', MultiTaskElasticNetCV(alphas=alphas, cv=tscv, l1_ratio=0.5, fit_intercept=False,max_iter=5000))
])

pipe_nopca = Pipeline([
    ('scaler',StandardScaler()),   
    ('elasticnet', MultiTaskElasticNetCV(alphas=alphas, cv=tscv, l1_ratio=l1_ratios, fit_intercept=False,max_iter=5000))
])

# Per-window results of the PCA model.
preds = []
dates_pred = []
r2_is_list = []
r2_os_list=[]
hit_rate_list = []
alpha,selected_features,l1 = [],[],[]

# Per-window results of the model without PCA.
preds_nopca = []
dates_pred_nopca = []
r2_is_list_nopca = []
r2_os_list_nopca=[]
hit_rate_list_nopca = []
alpha_nopca,selected_features_nopca,l1_nopca = [],[],[]

for start in tqdm(range(0, len(X) - window_train - window_pred + 1, window_pred)):
  
    end_train = start + window_train
    end_pred = end_train + window_pred

    X_train = X.iloc[start:end_train]
    Y_train = Y.iloc[start:end_train]

    # The test rows immediately follow the training rows.
    X_test = X.iloc[end_train:end_pred]
    Y_test = Y.iloc[end_train:end_pred]


    # model with PCA 
    pipe.fit(X_train, Y_train)

    Y_pred = pipe.predict(X_test)

    preds.append(Y_pred)
    dates_pred.append(X.index[end_train:end_pred])

    # multioutput='raw_values' returns one score per target column.
    mse = mean_squared_error(Y_test, Y_pred, multioutput='raw_values')
    r2_is = r2_score(Y_train, pipe.predict(X_train), multioutput='raw_values')
    r2_oos = r2_score(Y_test, Y_pred, multioutput='raw_values')
    # Share of test rows, per target, where prediction and realisation have the same sign.
    hit_rate = np.mean(np.sign(Y_test.values) == np.sign(Y_pred), axis=0)
    r2_is_list.append(r2_is)
    r2_os_list.append(r2_oos)
    hit_rate_list.append(hit_rate)
    alpha.append(pipe.named_steps['elasticnet'].alpha_)
    # Number of inputs of the elastic net (principal components here) with a
    # non-zero coefficient for at least one target.
    selected_features.append(np.sum(np.any(pipe.named_steps['elasticnet'].coef_!=0,axis=0)))
    l1.append(pipe.named_steps['elasticnet'].l1_ratio_)


    # model without PCA 
    pipe_nopca.fit(X_train, Y_train)

    Y_pred_nopca = pipe_nopca.predict(X_test)

    # Store this model's own predictions (the PCA model's `Y_pred` used to be
    # appended here by mistake).
    preds_nopca.append(Y_pred_nopca)
    dates_pred_nopca.append(X.index[end_train:end_pred])

    mse = mean_squared_error(Y_test, Y_pred_nopca, multioutput='raw_values')
    r2_is = r2_score(Y_train, pipe_nopca.predict(X_train), multioutput='raw_values')
    r2_oos = r2_score(Y_test, Y_pred_nopca, multioutput='raw_values')
    hit_rate = np.mean(np.sign(Y_test.values) == np.sign(Y_pred_nopca), axis=0)
    r2_is_list_nopca.append(r2_is)
    r2_os_list_nopca.append(r2_oos)
    hit_rate_list_nopca.append(hit_rate)
    alpha_nopca.append(pipe_nopca.named_steps['elasticnet'].alpha_)
    selected_features_nopca.append(np.sum(np.any(pipe_nopca.named_steps['elasticnet'].coef_!=0,axis=0)))
    l1_nopca.append(pipe_nopca.named_steps['elasticnet'].l1_ratio_)






In [None]:
# Every rolling window is labelled with the first date of its forecast horizon.
window_labels = [date[0] for date in dates_pred]


def _as_window_frame(rows, columns=None):
    """Stack per-window values into a frame indexed by window; optionally name the columns."""
    frame = pd.DataFrame(rows)
    if columns is not None:
        frame.columns = columns
    frame.index = list(window_labels)
    return frame


# Per-target scores: one column per maturity.
r2_is_df = _as_window_frame(r2_is_list, Y.columns)
r2_is_df_nopca = _as_window_frame(r2_is_list_nopca, Y.columns)
r2_os_df = _as_window_frame(r2_os_list, Y.columns)
r2_os_df_nopca = _as_window_frame(r2_os_list_nopca, Y.columns)
hr = _as_window_frame(hit_rate_list, Y.columns)
hr_nopca = _as_window_frame(hit_rate_list_nopca, Y.columns)

# Per-window scalars: number of non-zero inputs, selected alpha, selected L1 ratio.
fs = _as_window_frame(selected_features)
fs_nopca = _as_window_frame(selected_features_nopca)
alphadf = _as_window_frame(alpha)
alphadf_nopca = _as_window_frame(alpha_nopca)
l1df = _as_window_frame(l1)
l1df_nopca = _as_window_frame(l1_nopca)

fig, ax = plt.subplots(6,2,figsize=(28,28))

# (frame, grid position, keyword arguments for DataFrame.plot): left column is
# the PCA model, right column the model without PCA.
panels = [
    (r2_is_df, (0, 0), {'title': 'Evolution of in sample R2 per maturity with PCA, per sample'}),
    (r2_is_df_nopca, (0, 1), {'title': 'Evolution of in sample R2 per maturity without PCA, per sample'}),
    (r2_os_df, (1, 0), {'ylim': (-0.3,0.3), 'title': 'Evolution of out-sample R2 per maturity with PCA, per sample'}),
    (r2_os_df_nopca, (1, 1), {'ylim': (-0.3,0.3), 'title': 'Evolution of out-sample R2 per maturity without PCA, per sample'}),
    (hr, (2, 0), {'title': 'Evolution of out-sample hit rate per maturity with PCA, per sample'}),
    (hr_nopca, (2, 1), {'title': 'Evolution of out-sample hit rate per maturity without PCA, per sample'}),
    (fs, (3, 0), {'title': 'Evolution of number of selected features with PCA, per sample'}),
    (fs_nopca, (3, 1), {'title': 'Evolution of number of selected features without PCA, per sample'}),
    (alphadf, (4, 0), {'title': 'Evolution of alpha selected by cross validation - model with PCA, per sample'}),
    (alphadf_nopca, (4, 1), {'title': 'Evolution of alpha selected by cross validation - model without PCA, per sample'}),
    (l1df, (5, 0), {'title': 'Evolution of L1 ratio selected by cross validation - model with PCA, per sample'}),
    (l1df_nopca, (5, 1), {'title': 'Evolution of L1 ratio selected by cross validation - model without PCA, per sample'}),
]

for frame, (r, c), plot_kwargs in panels:
    frame.plot(ax = ax[r, c], **plot_kwargs)
    ax[r, c].grid(True, axis='y', linestyle='--', linewidth=0.5)

The results are also disappointing. Overall this is not surprising, we've seen that our features have a low correlation with the yield so the predictive power of these models is logically very low. We'll now try to do binary classification, maybe it could work better. Moreover, we'll focus on more complex models to see if they can extract non linear relationships between features and yields.

# III - Binary classification models

In [288]:
# Weekly US dataset; rows with any missing value are discarded right after loading.
datapath = os.path.join('data', 'US', 'us_data_weekly.csv')
dus_weekly = pd.read_csv(datapath, index_col=0).dropna()

### A) Creating new features

In [289]:

def add_ts_features(df, cols, max_lag=20, windows=[10,20,50]):
    """Derive time-series features from each column of ``df`` listed in ``cols``.

    Per column, in this order:

    * for every window ``w`` in ``windows``: rolling mean, std, 25% / 75% /
      5% / 90% quantiles, range (max - min), z-score of the current value
      against the rolling mean/std, and momentum ``y - y.shift(w)``;
    * the ratio of the 10-row to the 50-row rolling mean;
    * the 20-row rolling std scaled by ``sqrt(52)``;
    * raw lags: ``{col}_lag_k`` holds ``y.shift(k - 1)``, so ``lag_1`` is the
      unshifted series;
    * ``{col}_autocorr_k`` for ``k = 1..max_lag``: the autocorrelation of the
      whole (NaN-free) series, i.e. a single constant repeated on every row.

    Returns a DataFrame indexed like ``df`` with one column per feature.
    """
    raw_lags = [1,2,3,4,5,10,15,20,25,30,40,50]
    feature_columns = {}

    for col in cols:
        series = df[col]

        # Rolling-window statistics, one set per window length.
        for w in windows:
            window = series.rolling(w)
            rolling_mean = window.mean()
            rolling_std = window.std()

            feature_columns[f'{col}_mean_{w}'] = rolling_mean
            feature_columns[f'{col}_std_{w}'] = rolling_std
            feature_columns[f'{col}_q25_{w}'] = window.quantile(0.25)
            feature_columns[f'{col}_q75_{w}'] = window.quantile(0.75)
            feature_columns[f'{col}_q05_{w}'] = window.quantile(0.05)
            feature_columns[f'{col}_q90_{w}'] = window.quantile(0.9)
            feature_columns[f'{col}_range_{w}'] = window.max() - window.min()

            # Z-score and momentum over the same window.
            feature_columns[f'{col}_zscore_{w}'] = (series - rolling_mean) / rolling_std
            feature_columns[f'{col}_momentum_{w}'] = series - series.shift(w)

        # Fast (10) over slow (50) rolling mean.
        fast_mean = series.rolling(10).mean()
        slow_mean = series.rolling(50).mean()
        feature_columns[f'{col}_ratio_10_50'] = fast_mean / slow_mean

        # Approximate annualised volatility.
        feature_columns[f'{col}_vol_20'] = series.rolling(20).std() * np.sqrt(52)

        # Raw lagged values.
        feature_columns.update(
            {f'{col}_lag_{lag}': series.shift(lag-1) for lag in raw_lags}
        )

        # Full-sample autocorrelations: each value is a scalar that the final
        # DataFrame construction repeats on every row.
        acf_vals = acf(series.dropna(), nlags=max_lag, fft=True)
        feature_columns.update(
            {f'{col}_autocorr_{lag}': acf_vals[lag] for lag in range(1, max_lag + 1)}
        )

    # Assemble all columns at once on the index of the input frame.
    return pd.DataFrame(feature_columns, index=df.index)


In [290]:
# Yield columns, from the shortest to the longest maturity.
cols = [
    'DGS1MO', 'DGS3MO', 'DGS6MO', 'DGS1', 'DGS2', 'DGS3',
    'DGS5', 'DGS7', 'DGS10', 'DGS20', 'DGS30',
]

# Keep only the feature rows that are fully populated, then join them onto the
# weekly frame by index; the inner join discards the rows dropped above.
dus_features = add_ts_features(dus_weekly, cols).dropna()
dus_weekly = dus_weekly.merge(dus_features, left_index=True, right_index=True, how='inner')

In [291]:
# Binary targets: 1 when the next row's value of the yield column is > 0, else 0.
# shift(-1) leaves a NaN on the last row, which the dropna() below removes.
for col in cols:
    is_positive = dus_weekly[col] > 0
    dus_weekly[f'Y_{col}'] = is_positive.astype(int).shift(-1)

dus_weekly = (
    dus_weekly
    .dropna()                              # rows with a missing value
    .replace([np.inf, -np.inf], np.nan)    # infinities become missing values ...
    .ffill()                               # ... filled from the previous row
    .drop(columns=cols)                    # the original yield columns are no longer needed
)

In [292]:
# Names of the binary target columns, one per yield.
target_names = [f'Y_{col}' for col in cols]

Yw = dus_weekly[target_names]
Xw = dus_weekly.drop(columns=target_names)

print(f"there are {Xw.shape[1]} features in the dataset")

there are 738 features in the dataset


We augmented the dataset by adding a very large number of features created from the yield time series. However, many of these features are not informative, so we need to remove them before training our models.

We'll filter features by keeping only those with a mutual information score above a given threshold. We'll create several feature datasets, one for each of the thresholds 0.03, 0.035, 0.04 and 0.05, and train models on each of them.

In [None]:
initial_number_of_features = Xw.shape[1]

threshold_list = [0.03,0.035,0.04,0.05]

# The mutual information between a feature and a target does not depend on the
# threshold, so it is estimated once per target and reused for every threshold.
# The estimator is randomised; a fixed random_state makes the selection
# reproducible from one run to the next.
# NOTE(review): the scores are computed on the whole sample (Xw, Yw), i.e. also
# on rows that the rolling evaluation later uses as test windows - confirm that
# this look-ahead in the feature screening is acceptable.
mi_scores = pd.DataFrame(
    {col: mutual_info_classif(Xw, Yw[col], random_state=0) for col in tqdm(Yw.columns)},
    index=Xw.columns,
)

# A feature is kept when it exceeds the threshold for at least one target,
# which is the same as its best score exceeding the threshold.
best_mi = mi_scores.max(axis=1)

datasets = {}

for threshold_mi in threshold_list:
    # Boolean selection keeps the columns in their original order, so the
    # layout of each dataset is the same on every run.
    keep = (best_mi > threshold_mi).to_numpy()
    datasets[threshold_mi] = Xw.loc[:, keep]
    selected_features_w = set(datasets[threshold_mi].columns)

    print(f'we removed {initial_number_of_features-len(selected_features_w)} features in the weekly dataset for threshold {threshold_mi}')
    print(f'Number of variables in the dataset with MI threshold {threshold_mi}:',datasets[threshold_mi].shape[1])
    print(f'Macro and market variables in the dataset with MI threshold {threshold_mi}:', [col for col in datasets[threshold_mi].columns if 'DGS' not in col])
    

  0%|          | 0/4 [00:00<?, ?it/s]

Notice that many of the macro and market variables that we extracted from the FRED website do not share much mutual information with the target since many of these variables are removed with the threshold we used. 


### B) Logistic regression 

We define the following pipelines for our logistic regression models.
We train our model on a rolling window of 15 years of data (with cross-validation that does not leak future information) and then predict the next 4 weeks. We then move the rolling window forward, train another model, predict the next 4 weeks, and so on.

In [None]:
# Rolling-window geometry, in rows of the weekly frame.
window_train = 52 * 15
window_pred = 4

# Candidate values passed to LogisticRegressionCV as `Cs`.
alphas = np.logspace(-5, 2, 30)
# Time-ordered inner cross-validation used to choose among them.
tscv = TimeSeriesSplit(n_splits=4)

# Settings shared by the logistic regressions of both pipelines.
logreg_settings = dict(
    penalty='l2',
    cv=tscv,
    Cs=alphas,
    fit_intercept=False,
    scoring='accuracy',
    max_iter=5000,
)

# Standardise, then fit one cross-validated logistic regression per target.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logreg', MultiOutputClassifier(LogisticRegressionCV(**logreg_settings))),
])

# Same model, with a PCA keeping 99% of the variance inserted after scaling.
pipepca = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.99)),
    ('logreg', MultiOutputClassifier(LogisticRegressionCV(**logreg_settings))),
])

We train logistic regression models for the 4 datasets containing features with mutual information larger than 0.03, 0.035, 0.04, 0.05 respectively. Then we save the results in datasets. 

The dataset with variables with mutual information > 0.03 contains 266 features, which is very high compared to the number of points in the training datasets (52*15 rows), so we'll also train a model that performs a PCA before applying the logistic regression, in order to reduce the number of features and see whether it improves performance.

In [None]:
def _rolling_logreg(model, X_feat, Y_targets, window_train, window_pred):
    """Walk-forward evaluation of a multi-output logistic-regression pipeline.

    The pipeline is fitted on ``window_train`` consecutive rows and used to
    predict the next ``window_pred`` rows; both windows then slide forward by
    ``window_pred`` rows until the data is exhausted.

    Parameters
    ----------
    model : sklearn Pipeline
        Must end with a step named ``'logreg'`` holding a fitted-per-target
        ``MultiOutputClassifier`` of ``LogisticRegressionCV``.
    X_feat, Y_targets : pandas.DataFrame
        Features and binary targets, row-aligned.
    window_train, window_pred : int
        Number of training rows and of predicted rows per window.

    Returns
    -------
    acc, params, feature_imp, ytrue, ypred : pandas.DataFrame
        ``acc``, ``params`` (selected ``C``) and ``feature_imp`` (absolute
        coefficients, one array per cell) have one row per window and one
        column per target. ``ytrue`` and ``ypred`` have one row per predicted
        date, indexed by the dates that were actually predicted.
    """
    targets = list(Y_targets.columns)
    acc_by_target = {col: [] for col in targets}
    c_by_target = {col: [] for col in targets}
    importance_by_target = {col: [] for col in targets}
    true_blocks, pred_blocks = [], []

    for start in tqdm(range(0, len(X_feat) - window_train - window_pred + 1, window_pred)):
        end_train = start + window_train
        end_pred = end_train + window_pred

        X_train, Y_train = X_feat.iloc[start:end_train], Y_targets.iloc[start:end_train]
        # The test rows immediately follow the training rows.
        X_test, Y_test = X_feat.iloc[end_train:end_pred], Y_targets.iloc[end_train:end_pred]

        model.fit(X_train, Y_train)
        Y_hat = model.predict(X_test)

        true_blocks.append(Y_test)
        # Keep the test dates with the predictions so the two always line up.
        pred_blocks.append(pd.DataFrame(Y_hat, index=Y_test.index, columns=targets))

        fitted = model.named_steps['logreg'].estimators_
        for i, col in enumerate(targets):
            acc_by_target[col].append(accuracy_score(Y_test[col], Y_hat[:, i]))
            # Regularisation strength selected by the inner cross-validation.
            c_by_target[col].append(fitted[i].C_[0])
            # Absolute coefficients; with a PCA step these refer to the
            # principal components, not to the original features.
            importance_by_target[col].append(np.abs(fitted[i].coef_).flatten())

    acc = pd.DataFrame(acc_by_target, columns=targets)
    params = pd.DataFrame(c_by_target, columns=targets)
    feature_imp = pd.DataFrame(importance_by_target, columns=targets)
    ytrue = pd.concat(true_blocks) if true_blocks else pd.DataFrame(columns=targets)
    ypred = pd.concat(pred_blocks) if pred_blocks else pd.DataFrame(columns=targets)
    return acc, params, feature_imp, ytrue, ypred


def _save_logreg_results(file_tag, acc, params, feature_imp, ypred, ytrue):
    """Write the five result frames of one run to 'results of models/' as CSV files."""
    out_dir = 'results of models'
    os.makedirs(out_dir, exist_ok=True)  # to_csv does not create missing folders
    named_frames = [
        ('accuracies', acc),
        ('params', params),
        ('feature importance', feature_imp),
        ('forecast', ypred),
        ('true values', ytrue),
    ]
    for prefix, frame in named_frames:
        frame.to_csv(os.path.join(out_dir, f'{prefix} logistic regression{file_tag}, 15y train test.csv'))


for threshold in threshold_list:

    print(f'training model on dataset with features with mutual information above {threshold}')

    # A dedicated name is used so that the full feature frame `Xw` is not
    # overwritten and the earlier cells stay re-runnable.
    Xw_selected = datasets[threshold]

    # For the dataset containing features with mutual information above 0.03
    # we train the logistic model with and without PCA, since that dataset has
    # many features. For the other datasets no PCA is applied.
    # Each run is (pipeline, tag inserted in the file names, suffix of the printed message).
    if threshold == 0.03:
        runs = [(pipe, '', ''), (pipepca, ' pca', ' and PCA')]
    else:
        # The threshold is part of the file name so that the results of the
        # different thresholds do not overwrite each other.
        runs = [(pipe, f', MI {threshold}', '')]

    for model, file_tag, label in runs:
        acc, params, feature_imp, ytrue, ypred = _rolling_logreg(
            model, Xw_selected, Yw, window_train, window_pred
        )

        print(f'For each yield, the average out-of-sample accuracy over the {len(acc)} rolling windows on the dataset with mutual information > {threshold}{label} is:')
        print(acc.mean())

        _save_logreg_results(file_tag, acc, params, feature_imp, ypred, ytrue)


training model on dataset with features with mutual information above 0.03


  0%|          | 0/79 [00:00<?, ?it/s]

100%|██████████| 79/79 [15:03<00:00, 11.44s/it]


For each yield, the average out-of-sample accuracy over the 79 rolling windows on the dataset with mutual information > 0.03 is:
Y_DGS1MO    0.607595
Y_DGS3MO    0.658228
Y_DGS6MO    0.664557
Y_DGS1      0.582278
Y_DGS2      0.518987
Y_DGS3      0.563291
Y_DGS5      0.553797
Y_DGS7      0.518987
Y_DGS10     0.503165
Y_DGS20     0.477848
Y_DGS30     0.560127
dtype: float64
For each yield, the average out-of-sample accuracy over the 79 rolling windows on the dataset with mutual information > 0.03 and PCA is:
Y_DGS1MO    0.636076
Y_DGS3MO    0.680380
Y_DGS6MO    0.655063
Y_DGS1      0.572785
Y_DGS2      0.537975
Y_DGS3      0.575949
Y_DGS5      0.522152
Y_DGS7      0.506329
Y_DGS10     0.569620
Y_DGS20     0.496835
Y_DGS30     0.512658
dtype: float64
training model on dataset with features with mutual information above 0.035


100%|██████████| 79/79 [11:14<00:00,  8.53s/it]


For each yield, the average out-of-sample accuracy over the 79 rolling windows on the dataset with mutual information > 0.035 is:
Y_DGS1MO    0.607595
Y_DGS3MO    0.658228
Y_DGS6MO    0.664557
Y_DGS1      0.582278
Y_DGS2      0.518987
Y_DGS3      0.563291
Y_DGS5      0.553797
Y_DGS7      0.518987
Y_DGS10     0.503165
Y_DGS20     0.477848
Y_DGS30     0.560127
dtype: float64
training model on dataset with features with mutual information above 0.04


100%|██████████| 79/79 [12:30<00:00,  9.50s/it]


For each yield, the average out-of-sample accuracy over the 79 rolling windows on the dataset with mutual information > 0.04 is:
Y_DGS1MO    0.607595
Y_DGS3MO    0.658228
Y_DGS6MO    0.664557
Y_DGS1      0.582278
Y_DGS2      0.518987
Y_DGS3      0.563291
Y_DGS5      0.553797
Y_DGS7      0.518987
Y_DGS10     0.503165
Y_DGS20     0.477848
Y_DGS30     0.560127
dtype: float64
training model on dataset with features with mutual information above 0.05


 71%|███████   | 56/79 [09:45<04:00, 10.46s/it]


KeyboardInterrupt: 

Logistic regression without PCA, for the dataset containing features with a mutual information above 0.035. 

In [None]:
# accuraciesl2 = {col: [] for col in Y.columns}
# alphal2 = {col: [] for col in Y.columns}
# feature_importancel2 = {col: [] for col in Y.columns}
# y_true = []
# y_predl2 = []


# for start in tqdm(range(0, len(Xw) - window_train - window_pred + 1, window_pred)):
#     end_train = start + window_train
#     end_pred = end_train + window_pred

#     Xw_train = Xw35.iloc[start:end_train]
#     Yw_train = Yw.iloc[start:end_train]
#     Xw_test = Xw35.iloc[end_train:end_pred]
#     Yw_test = Yw.iloc[end_train:end_pred]

#     y_true.append(Yw_test)

 
#     pipe.fit(Xw_train, Yw_train)



#     Yw_pred = pipe.predict(Xw_test)


#     y_predl2.append(Yw_pred)


#     for i, col in enumerate(Yw_train.columns):
#         acc = accuracy_score(Yw_test[col], Yw_pred[:, i])
#         accuraciesl2[col].append(acc)


#         # alpha optimal choisi
#         best_C = pipe.named_steps['logreg'].estimators_[i].C_[0]
#         alphal2[col].append(best_C)

#         # importance des features = moyenne absolue des coefficients
#         coefs = np.abs(pipe.named_steps['logreg'].estimators_[i].coef_).flatten()
#         feature_importancel2[col].append(coefs)

# acc = pd.DataFrame(accuraciesl2,columns = Yw.columns)
# params = pd.DataFrame(alphal2, columns = Yw.columns) 
# feature_imp = pd.DataFrame(feature_importancel2, columns = Yw.columns)
# Y_true_flat = np.array(y_true).reshape(-1, np.array(y_true).shape[2])  # aplati les 2 premières dimensions en une seule
# ytrue = pd.DataFrame(Y_true_flat, index = Yw.iloc[780:Yw.shape[0]-1].index,columns=Yw.columns)
# Y_pred_flat = np.array(y_predl2).reshape(-1, np.array(y_predl2).shape[2])  # aplati les 2 premières dimensions en une seule
# ypred = pd.DataFrame(Y_pred_flat,index = Yw.iloc[780:Yw.shape[0]-1].index, columns=Yw.columns)


# acc.to_csv('accuracies logistic regression, MI 0.035, 15y train test.csv')
# params.to_csv('params logistic regression, MI 0.035, 15y train test.csv')
# feature_imp.to_csv('feature importance logistic regression, MI 0.035, 15y train test.csv')
# ypred.to_csv('forecast logistic regression, MI 0.035, 15y train test.csv')
# ytrue.to_csv('true values logistic regression, MI 0.035, 15y train test.csv')

# acc.mean()



100%|██████████| 11/11 [00:14<00:00,  1.34s/it]
100%|██████████| 79/79 [15:33<00:00, 11.81s/it]


Logistic regression without PCA, for the dataset containing features with a mutual information above 0.04. 

In [None]:
# accuraciesl2 = {col: [] for col in Y.columns}
# alphal2 = {col: [] for col in Y.columns}
# feature_importancel2 = {col: [] for col in Y.columns}
# y_true = []
# y_predl2 = []


# for start in tqdm(range(0, len(Xw) - window_train - window_pred + 1, window_pred)):
#     end_train = start + window_train
#     end_pred = end_train + window_pred

#     Xw_train = Xw4.iloc[start:end_train]
#     Yw_train = Yw.iloc[start:end_train]
#     Xw_test = Xw4.iloc[end_train:end_pred]
#     Yw_test = Yw.iloc[end_train:end_pred]

#     y_true.append(Yw_test)

 
#     pipe.fit(Xw_train, Yw_train)



#     Yw_pred = pipe.predict(Xw_test)


#     y_predl2.append(Yw_pred)


#     for i, col in enumerate(Yw_train.columns):
#         acc = accuracy_score(Yw_test[col], Yw_pred[:, i])
#         accuraciesl2[col].append(acc)


#         # alpha optimal choisi
#         best_C = pipe.named_steps['logreg'].estimators_[i].C_[0]
#         alphal2[col].append(best_C)

#         # importance des features = moyenne absolue des coefficients
#         coefs = np.abs(pipe.named_steps['logreg'].estimators_[i].coef_).flatten()
#         feature_importancel2[col].append(coefs)

# acc = pd.DataFrame(accuraciesl2,columns = Yw.columns)
# params = pd.DataFrame(alphal2, columns = Yw.columns) 
# feature_imp = pd.DataFrame(feature_importancel2, columns = Yw.columns)
# Y_true_flat = np.array(y_true).reshape(-1, np.array(y_true).shape[2])  # aplati les 2 premières dimensions en une seule
# ytrue = pd.DataFrame(Y_true_flat, index = Yw.iloc[780:Yw.shape[0]-1].index,columns=Yw.columns)
# Y_pred_flat = np.array(y_predl2).reshape(-1, np.array(y_predl2).shape[2])  # aplati les 2 premières dimensions en une seule
# ypred = pd.DataFrame(Y_pred_flat,index = Yw.iloc[780:Yw.shape[0]-1].index, columns=Yw.columns)


# acc.to_csv('accuracies logistic regression, MI 0.04, 15y train test.csv')
# params.to_csv('params logistic regression, MI 0.04, 15y train test.csv')
# feature_imp.to_csv('feature importance logistic regression, MI 0.04, 15y train test.csv')
# ypred.to_csv('forecast logistic regression, MI 0.04, 15y train test.csv')
# ytrue.to_csv('true values logistic regression, MI 0.04, 15y train test.csv')

# acc.mean()



Logistic regression without PCA, for the dataset containing features with a mutual information above 0.05.

In [None]:
# accuraciesl2 = {col: [] for col in Y.columns}
# alphal2 = {col: [] for col in Y.columns}
# feature_importancel2 = {col: [] for col in Y.columns}
# y_true = []
# y_predl2 = []


# for start in tqdm(range(0, len(Xw) - window_train - window_pred + 1, window_pred)):
#     end_train = start + window_train
#     end_pred = end_train + window_pred

#     Xw_train = Xw5.iloc[start:end_train]
#     Yw_train = Yw.iloc[start:end_train]
#     Xw_test = Xw5.iloc[end_train:end_pred]
#     Yw_test = Yw.iloc[end_train:end_pred]

#     y_true.append(Yw_test)

 
#     pipe.fit(Xw_train, Yw_train)



#     Yw_pred = pipe.predict(Xw_test)


#     y_predl2.append(Yw_pred)


#     for i, col in enumerate(Yw_train.columns):
#         acc = accuracy_score(Yw_test[col], Yw_pred[:, i])
#         accuraciesl2[col].append(acc)


#         # alpha optimal choisi
#         best_C = pipe.named_steps['logreg'].estimators_[i].C_[0]
#         alphal2[col].append(best_C)

#         # importance des features = moyenne absolue des coefficients
#         coefs = np.abs(pipe.named_steps['logreg'].estimators_[i].coef_).flatten()
#         feature_importancel2[col].append(coefs)

# acc = pd.DataFrame(accuraciesl2,columns = Yw.columns)
# params = pd.DataFrame(alphal2, columns = Yw.columns) 
# feature_imp = pd.DataFrame(feature_importancel2, columns = Yw.columns)
# Y_true_flat = np.array(y_true).reshape(-1, np.array(y_true).shape[2])  # aplati les 2 premières dimensions en une seule
# ytrue = pd.DataFrame(Y_true_flat, index = Yw.iloc[780:Yw.shape[0]-1].index,columns=Yw.columns)
# Y_pred_flat = np.array(y_predl2).reshape(-1, np.array(y_predl2).shape[2])  # aplati les 2 premières dimensions en une seule
# ypred = pd.DataFrame(Y_pred_flat,index = Yw.iloc[780:Yw.shape[0]-1].index, columns=Yw.columns)


# acc.to_csv('accuracies logistic regression, MI 0.05, 15y train test.csv')
# params.to_csv('params logistic regression, MI 0.05, 15y train test.csv')
# feature_imp.to_csv('feature importance logistic regression, MI 0.05, 15y train test.csv')
# ypred.to_csv('forecast logistic regression, MI 0.05, 15y train test.csv')
# ytrue.to_csv('true values logistic regression, MI 0.05, 15y train test.csv')

# acc.mean()



## C) XGBoost

We'll now train XGBoost models to see whether they can capture non-linearities in the data and improve the accuracy of out-of-sample predictions. Once again, we'll train models on the 4 datasets that we created.

For the first dataset (features with MI > 0.03), we'll also train a model that applies a PCA to reduce the number of features before fitting XGBoost.

The pipelines that we'll use are the following:

In [None]:
# Rolling-window configuration shared by every XGBoost experiment below.
window_train = 52 * 15   # training window length in rows (52 * 15 = 780 weekly observations)
window_pred = 4          # out-of-sample window length in rows; also the step between windows
tscv = TimeSeriesSplit(n_splits=4)   # order-preserving folds used inside each training window


# Hyper-parameters explored by the grid search: 2 values for each of the
# 8 parameters, i.e. 256 combinations per rolling window.
# The 'xgb__estimator__' prefix addresses the XGBClassifier wrapped by the
# MultiOutputClassifier registered under the 'xgb' pipeline step.
param_grid = {
    'xgb__estimator__n_estimators': [100, 200],
    'xgb__estimator__learning_rate': [0.01, 0.05],
    'xgb__estimator__max_depth': [4, 7],
    'xgb__estimator__subsample': [0.5, 0.7],
    'xgb__estimator__colsample_bytree': [0.4, 0.8],
    'xgb__estimator__min_child_weight': [5, 10],
    'xgb__estimator__reg_alpha': [0.5, 1.0],
    'xgb__estimator__reg_lambda': [5, 10],
}


def make_xgb_pipeline(use_pca=False):
    """Build the scaler -> (optional PCA) -> multi-output XGBoost pipeline.

    Parameters
    ----------
    use_pca : bool, default False
        When True, a ``PCA(n_components=0.99)`` step named ``'pca'`` is
        inserted between the scaler and the classifier.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps ``'scaler'``, optionally ``'pca'``, and ``'xgb'``
        (a ``MultiOutputClassifier`` wrapping one ``XGBClassifier`` per target).
    """
    steps = [('scaler', StandardScaler())]
    if use_pca:
        steps.append(('pca', PCA(n_components=0.99)))
    steps.append(
        ('xgb', MultiOutputClassifier(
            XGBClassifier(
                objective='binary:logistic',
                # NOTE(review): `use_label_encoder` is deprecated in recent
                # XGBoost releases; confirm against the installed version and
                # drop it if it is no longer accepted.
                use_label_encoder=False,
                # Single-threaded on purpose: the grid search already runs its
                # fits in parallel (n_jobs=-1), and asking every booster for
                # all cores as well creates thread contention.
                n_jobs=1,
                random_state=42,
                verbosity=0,
            )
        ))
    )
    return Pipeline(steps)


# Pipeline without PCA
pipe = make_xgb_pipeline()

# Pipeline with PCA
pipepca = make_xgb_pipeline(use_pca=True)


# Illustration of how the pipelines are cross-validated (use `pipe` or `pipepca`).
# Note: with a 2-D binary target, scoring='accuracy' is the subset accuracy
# (a row counts as correct only when every target is predicted correctly).
GridSearchCV(pipepca,
             param_grid,
             cv=tscv,
             scoring='accuracy',
             n_jobs=-1,
             verbose=0)


We now train XGBoost models on the datasets:

**Note for Audric:** please run the code cell below (and the cell just above it as well, since it defines the objects the cell below needs).

In [None]:
def mean_label_accuracy(estimator, X, y):
    """Score a multi-output classifier by its accuracy averaged over all labels.

    scikit-learn's built-in ``'accuracy'`` scorer computes the *subset*
    accuracy on 2-D indicator targets (a row is correct only if every label
    is correct). The metric reported in this notebook is the accuracy of each
    yield taken separately, so model selection uses the matching criterion:
    the fraction of individual (row, label) predictions that are correct.

    Parameters
    ----------
    estimator : fitted estimator exposing ``predict``
    X : array-like of shape (n_samples, n_features)
    y : array-like of shape (n_samples, n_targets)

    Returns
    -------
    float
        Share of correctly predicted (row, label) cells.
    """
    return float(np.mean(np.asarray(estimator.predict(X)) == np.asarray(y)))


def run_rolling_xgb(X, Y, pipeline, *, train_size, test_size, grid, cv,
                    file_tag, description, out_dir='results of models',
                    scoring=mean_label_accuracy, verbose=False):
    """Grid-search, refit and evaluate ``pipeline`` on successive rolling windows.

    For each window the pipeline is tuned with ``GridSearchCV`` on
    ``train_size`` consecutive rows, then the best estimator predicts the next
    ``test_size`` rows. The window then moves forward by ``test_size`` rows.
    Results are written to CSV files in ``out_dir`` and also returned.

    Parameters
    ----------
    X, Y : pandas.DataFrame
        Features and targets, row-aligned. One classifier is fitted per
        column of ``Y``.
    pipeline : sklearn.pipeline.Pipeline
        Must expose a ``'xgb'`` step whose ``estimators_`` holds one fitted
        classifier per target.
    train_size, test_size : int
        Number of rows in the training and prediction windows.
    grid : dict
        Parameter grid handed to ``GridSearchCV``.
    cv : cross-validation splitter
        Splitter used inside each training window.
    file_tag : str
        Text inserted in the CSV file names between ``'xgboost, '`` and
        ``'15y train test.csv'`` (e.g. ``''``, ``'pca, '``, ``'MI 0.04, '``).
    description : str
        Human-readable dataset description used in the summary message.
    out_dir : str
        Output directory; created if it does not exist.
    scoring : str or callable
        Model-selection criterion for the grid search.
    verbose : bool
        When True, print the running mean accuracy of every target after each
        window (this is very chatty: one line per target per window).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(acc, params, feature_imp, ytrue, ypred)``: per-window accuracies,
        per-window estimator parameters, per-window feature importances, and
        the true / predicted labels indexed like the rows of ``Y`` they cover.

    Raises
    ------
    ValueError
        If the data are too short to form a single train/test window.
    """
    starts = range(0, len(X) - train_size - test_size + 1, test_size)
    if len(starts) == 0:
        raise ValueError(
            f'need at least {train_size + test_size} rows to build one '
            f'rolling window, got {len(X)}'
        )

    # Create the output folder before training: a missing directory would
    # otherwise only surface in to_csv(), after hours of computation.
    os.makedirs(out_dir, exist_ok=True)

    accuracies = {col: [] for col in Y.columns}
    params_by_target = {col: [] for col in Y.columns}
    feature_importance = {col: [] for col in Y.columns}
    y_true = []
    y_pred = []

    progress = tqdm(starts)
    for start in progress:
        end_train = start + train_size
        end_pred = end_train + test_size

        X_train, Y_train = X.iloc[start:end_train], Y.iloc[start:end_train]
        X_test, Y_test = X.iloc[end_train:end_pred], Y.iloc[end_train:end_pred]

        search = GridSearchCV(pipeline,
                              grid,
                              cv=cv,
                              scoring=scoring,
                              n_jobs=-1,
                              verbose=0)
        search.fit(X_train, Y_train)
        best_model = search.best_estimator_

        Y_hat = np.asarray(best_model.predict(X_test))

        # Keep the test index with the values so that the output frames are
        # labelled with the rows that were actually predicted.
        y_true.append(Y_test)
        y_pred.append(pd.DataFrame(Y_hat, index=Y_test.index, columns=Y.columns))

        for i, col in enumerate(Y.columns):
            accuracies[col].append(accuracy_score(Y_test[col], Y_hat[:, i]))

            # fitted classifier for the i-th target
            est = best_model.named_steps['xgb'].estimators_[i]
            feature_importance[col].append(est.feature_importances_)
            params_by_target[col].append(est.get_params())

            if verbose:
                tqdm.write(f'{col} {np.mean(accuracies[col])}')

        overall = np.mean([np.mean(scores) for scores in accuracies.values()])
        progress.set_postfix(mean_accuracy=f'{overall:.3f}')

    acc = pd.DataFrame(accuracies, columns=Y.columns)
    params = pd.DataFrame(params_by_target, columns=Y.columns)
    feature_imp = pd.DataFrame(feature_importance, columns=Y.columns)
    ytrue = pd.concat(y_true)
    ypred = pd.concat(y_pred)

    print(f'For each yield, the average out-of-sample accuracy over the {len(acc)} rolling windows on the dataset with {description} is:')
    print(acc.mean())

    acc.to_csv(f'{out_dir}/accuracies xgboost, {file_tag}15y train test.csv')
    params.to_csv(f'{out_dir}/params xgboost, {file_tag}15y train test.csv')
    feature_imp.to_csv(f'{out_dir}/feature importance xgboost, {file_tag}15y train test.csv')
    ypred.to_csv(f'{out_dir}/forecast xgboost, {file_tag}15y train test.csv')
    ytrue.to_csv(f'{out_dir}/true values xgboost, {file_tag}15y train test.csv')

    return acc, params, feature_imp, ytrue, ypred


for threshold in threshold_list:

    print(f'training model on dataset with features with mutual information above {threshold}')

    Xw = datasets[threshold]

    if math.isclose(threshold, 0.03):
        # This dataset has many features, so XGBoost is trained twice:
        # once directly, and once with a PCA step in front of it.
        runs = [
            (pipe, '', f'mutual information > {threshold}'),
            (pipepca, 'pca, ', f'mutual information > {threshold} and PCA'),
        ]
    else:
        # Datasets with a larger mutual-information threshold are trained
        # directly, without PCA.
        runs = [
            (pipe, f'MI {threshold}, ', f'mutual information > {threshold}'),
        ]

    for pipeline, file_tag, description in runs:
        acc, params, feature_imp, ytrue, ypred = run_rolling_xgb(
            Xw, Yw, pipeline,
            train_size=window_train,
            test_size=window_pred,
            grid=param_grid,
            cv=tscv,
            file_tag=file_tag,
            description=description,
        )


In [None]:
# accuracies = {col: [] for col in Yw.columns}
# params_by_target = {col: [] for col in Yw.columns}
# feature_importance = {col: [] for col in Yw.columns}
# y_true = []
# y_pred = []

# for start in tqdm(range(0, len(Xw) - window_train - window_pred + 1, window_pred)):
#     end_train = start + window_train
#     end_pred = end_train + window_pred

#     Xw_train = Xw.iloc[start:end_train]
#     Yw_train = Yw.iloc[start:end_train]
#     Xw_test = Xw.iloc[end_train:end_pred]
#     Yw_test = Yw.iloc[end_train:end_pred]

#     y_true.append(Yw_test)

#     grid = GridSearchCV(pipe,
#                         param_grid,
#                         cv=tscv,
#                         scoring='accuracy',
#                         n_jobs=-1,
#                         verbose=0)

#     grid.fit(Xw_train, Yw_train)
#     best_model = grid.best_estimator_

#     Yw_pred = best_model.predict(Xw_test)
#     y_pred.append(Yw_pred)

#     for i, col in enumerate(Yw_train.columns):
#         acc = accuracy_score(Yw_test[col], Yw_pred[:, i])
#         accuracies[col].append(acc)
#         print(col, np.mean(accuracies[col]))

#         # feature importance pour la i-ème sortie
#         est = best_model.named_steps['xgb'].estimators_[i]   # XGBClassifier
#         importance = est.feature_importances_
#         feature_importance[col].append(importance)

#         # hyperparamètres de l'estimateur i
#         params = est.get_params()
#         params_by_target[col].append(params)



# acc = pd.DataFrame(accuracies,columns = Yw.columns)
# params = pd.DataFrame(params_by_target, columns = Yw.columns) 
# feature_imp = pd.DataFrame(feature_importance, columns = Yw.columns)
# Y_true_flat = np.array(y_true).reshape(-1, np.array(y_true).shape[2])  # aplati les 2 premières dimensions en une seule
# ytrue = pd.DataFrame(Y_true_flat, index = Yw.iloc[780:Yw.shape[0]-1].index,columns=Yw.columns)
# Y_pred_flat = np.array(y_pred).reshape(-1, np.array(y_pred).shape[2])  # aplati les 2 premières dimensions en une seule
# ypred = pd.DataFrame(Y_pred_flat,index = Yw.iloc[780:Yw.shape[0]-1].index, columns=Yw.columns)


# acc.to_csv('accuracies xgboost, 15y train test.csv')
# params.to_csv('params xgboost, 15y train test.csv')
# feature_imp.to_csv('feature importance xgboost, 15y train test.csv')
# ypred.to_csv('forecast xgboost, 15y train test.csv')
# ytrue.to_csv('true values xgboost, 15y train test.csv')

# print(acc.mean())


We now train XGBoost models for the dataset containing features with a mutual information above 0.03, but this time we include a PCA to reduce the number of features. 

In [None]:
# accuracies = {col: [] for col in Yw.columns}
# params_by_target = {col: [] for col in Yw.columns}
# feature_importance = {col: [] for col in Yw.columns}
# y_true = []
# y_pred = []

# for start in tqdm(range(0, len(Xw) - window_train - window_pred + 1, window_pred)):
#     end_train = start + window_train
#     end_pred = end_train + window_pred

#     Xw_train = Xw.iloc[start:end_train]
#     Yw_train = Yw.iloc[start:end_train]
#     Xw_test = Xw.iloc[end_train:end_pred]
#     Yw_test = Yw.iloc[end_train:end_pred]

#     y_true.append(Yw_test)

#     grid = GridSearchCV(pipepca,
#                         param_grid,
#                         cv=tscv,
#                         scoring='accuracy',
#                         n_jobs=-1,
#                         verbose=0)

#     grid.fit(Xw_train, Yw_train)
#     best_model = grid.best_estimator_

#     Yw_pred = best_model.predict(Xw_test)
#     y_pred.append(Yw_pred)

#     for i, col in enumerate(Yw_train.columns):
#         acc = accuracy_score(Yw_test[col], Yw_pred[:, i])
#         accuracies[col].append(acc)
#         print(col, np.mean(accuracies[col]))

#         # feature importance pour la i-ème sortie
#         est = best_model.named_steps['xgb'].estimators_[i]   # XGBClassifier
#         importance = est.feature_importances_
#         feature_importance[col].append(importance)

#         # hyperparamètres de l'estimateur i
#         params = est.get_params()
#         params_by_target[col].append(params)



# acc = pd.DataFrame(accuracies,columns = Yw.columns)
# params = pd.DataFrame(params_by_target, columns = Yw.columns) 
# feature_imp = pd.DataFrame(feature_importance, columns = Yw.columns)
# Y_true_flat = np.array(y_true).reshape(-1, np.array(y_true).shape[2])  # aplati les 2 premières dimensions en une seule
# ytrue = pd.DataFrame(Y_true_flat, index = Yw.iloc[780:Yw.shape[0]-1].index,columns=Yw.columns)
# Y_pred_flat = np.array(y_pred).reshape(-1, np.array(y_pred).shape[2])  # aplati les 2 premières dimensions en une seule
# ypred = pd.DataFrame(Y_pred_flat,index = Yw.iloc[780:Yw.shape[0]-1].index, columns=Yw.columns)


# acc.to_csv('accuracies xgboost, pca, 15y train test.csv')
# params.to_csv('params xgboost, pca, 15y train test.csv')
# feature_imp.to_csv('feature importance xgboost, pca, 15y train test.csv')
# ypred.to_csv('forecast xgboost, pca, 15y train test.csv')
# ytrue.to_csv('true values xgboost, pca, 15y train test.csv')

# print(acc.mean())


Surprisingly, the results for XGBoost are very disappointing. Looking at the predictions made by this model, we see that the prediction vectors are almost always 0: there are very few ones. Since the classes are well balanced, the resulting performance is close to that of predicting the direction of yields at random.

We will now run this model on the other feature sets (those whose mutual information is above 0.035, 0.04 or 0.05) to see whether there is any improvement.

For the dataset with MI >0.035:

In [None]:
# accuracies = {col: [] for col in Yw.columns}
# params_by_target = {col: [] for col in Yw.columns}
# feature_importance = {col: [] for col in Yw.columns}
# y_true = []
# y_pred = []

# for start in tqdm(range(0, len(Xw) - window_train - window_pred + 1, window_pred)):
#     end_train = start + window_train
#     end_pred = end_train + window_pred

#     Xw_train = Xw35.iloc[start:end_train]
#     Yw_train = Yw.iloc[start:end_train]
#     Xw_test = Xw35.iloc[end_train:end_pred]
#     Yw_test = Yw.iloc[end_train:end_pred]

#     y_true.append(Yw_test)

#     grid = GridSearchCV(pipe,
#                         param_grid,
#                         cv=tscv,
#                         scoring='accuracy',
#                         n_jobs=-1,
#                         verbose=0)

#     grid.fit(Xw_train, Yw_train)
#     best_model = grid.best_estimator_

#     Yw_pred = best_model.predict(Xw_test)
#     y_pred.append(Yw_pred)

#     for i, col in enumerate(Yw_train.columns):
#         acc = accuracy_score(Yw_test[col], Yw_pred[:, i])
#         accuracies[col].append(acc)
#         print(col, np.mean(accuracies[col]))

#         # feature importance pour la i-ème sortie
#         est = best_model.named_steps['xgb'].estimators_[i]   # XGBClassifier
#         importance = est.feature_importances_
#         feature_importance[col].append(importance)

#         # hyperparamètres de l'estimateur i
#         params = est.get_params()
#         params_by_target[col].append(params)



# acc = pd.DataFrame(accuracies,columns = Yw.columns)
# params = pd.DataFrame(params_by_target, columns = Yw.columns) 
# feature_imp = pd.DataFrame(feature_importance, columns = Yw.columns)
# Y_true_flat = np.array(y_true).reshape(-1, np.array(y_true).shape[2])  # aplati les 2 premières dimensions en une seule
# ytrue = pd.DataFrame(Y_true_flat, index = Yw.iloc[780:Yw.shape[0]-1].index,columns=Yw.columns)
# Y_pred_flat = np.array(y_pred).reshape(-1, np.array(y_pred).shape[2])  # aplati les 2 premières dimensions en une seule
# ypred = pd.DataFrame(Y_pred_flat,index = Yw.iloc[780:Yw.shape[0]-1].index, columns=Yw.columns)

# acc.to_csv('accuracies xgboost, MI 0.035, 15y train test.csv')
# params.to_csv('params xgboost, MI 0.035, 15y train test.csv')
# feature_imp.to_csv('feature importance xgboost, MI 0.035, 15y train test.csv')
# ypred.to_csv('forecast xgboost, MI 0.035, 15y train test.csv')
# ytrue.to_csv('true values xgboost, MI 0.035, 15y train test.csv')


For the dataset with MI > 0.04:

In [None]:
# accuracies = {col: [] for col in Yw.columns}
# params_by_target = {col: [] for col in Yw.columns}
# feature_importance = {col: [] for col in Yw.columns}
# y_true = []
# y_pred = []

# for start in tqdm(range(0, len(Xw) - window_train - window_pred + 1, window_pred)):
#     end_train = start + window_train
#     end_pred = end_train + window_pred

#     Xw_train = Xw4.iloc[start:end_train]
#     Yw_train = Yw.iloc[start:end_train]
#     Xw_test = Xw4.iloc[end_train:end_pred]
#     Yw_test = Yw.iloc[end_train:end_pred]

#     y_true.append(Yw_test)

#     grid = GridSearchCV(pipe,
#                         param_grid,
#                         cv=tscv,
#                         scoring='accuracy',
#                         n_jobs=-1,
#                         verbose=0)

#     grid.fit(Xw_train, Yw_train)
#     best_model = grid.best_estimator_

#     Yw_pred = best_model.predict(Xw_test)
#     y_pred.append(Yw_pred)

#     for i, col in enumerate(Yw_train.columns):
#         acc = accuracy_score(Yw_test[col], Yw_pred[:, i])
#         accuracies[col].append(acc)
#         print(col, np.mean(accuracies[col]))

#         # feature importance pour la i-ème sortie
#         est = best_model.named_steps['xgb'].estimators_[i]   # XGBClassifier
#         importance = est.feature_importances_
#         feature_importance[col].append(importance)

#         # hyperparamètres de l'estimateur i
#         params = est.get_params()
#         params_by_target[col].append(params)



# acc = pd.DataFrame(accuracies,columns = Yw.columns)
# params = pd.DataFrame(params_by_target, columns = Yw.columns) 
# feature_imp = pd.DataFrame(feature_importance, columns = Yw.columns)
# Y_true_flat = np.array(y_true).reshape(-1, np.array(y_true).shape[2])  # aplati les 2 premières dimensions en une seule
# ytrue = pd.DataFrame(Y_true_flat, index = Yw.iloc[780:Yw.shape[0]-1].index,columns=Yw.columns)
# Y_pred_flat = np.array(y_pred).reshape(-1, np.array(y_pred).shape[2])  # aplati les 2 premières dimensions en une seule
# ypred = pd.DataFrame(Y_pred_flat,index = Yw.iloc[780:Yw.shape[0]-1].index, columns=Yw.columns)

# acc.to_csv('accuracies xgboost, MI 0.04, 15y train test.csv')
# params.to_csv('params xgboost, MI 0.04, 15y train test.csv')
# feature_imp.to_csv('feature importance xgboost, MI 0.04, 15y train test.csv')
# ypred.to_csv('forecast xgboost, MI 0.04, 15y train test.csv')
# ytrue.to_csv('true values xgboost, MI 0.04, 15y train test.csv')

100%|██████████| 11/11 [00:16<00:00,  1.46s/it]
  1%|▏         | 1/79 [10:36<13:47:11, 636.30s/it]

Y_DGS1MO 0.5
Y_DGS3MO 0.75
Y_DGS6MO 0.5
Y_DGS1 0.0
Y_DGS2 0.0
Y_DGS3 0.25
Y_DGS5 0.25
Y_DGS7 0.0
Y_DGS10 0.5
Y_DGS20 0.5
Y_DGS30 0.75


  3%|▎         | 2/79 [20:35<13:08:54, 614.74s/it]

Y_DGS1MO 0.5
Y_DGS3MO 0.625
Y_DGS6MO 0.5
Y_DGS1 0.125
Y_DGS2 0.125
Y_DGS3 0.25
Y_DGS5 0.25
Y_DGS7 0.125
Y_DGS10 0.375
Y_DGS20 0.5
Y_DGS30 0.625


  4%|▍         | 3/79 [29:53<12:25:26, 588.51s/it]

Y_DGS1MO 0.5
Y_DGS3MO 0.6666666666666666
Y_DGS6MO 0.5833333333333334
Y_DGS1 0.16666666666666666
Y_DGS2 0.25
Y_DGS3 0.3333333333333333
Y_DGS5 0.25
Y_DGS7 0.25
Y_DGS10 0.3333333333333333
Y_DGS20 0.5
Y_DGS30 0.5833333333333334


  5%|▌         | 4/79 [39:36<12:13:06, 586.49s/it]

Y_DGS1MO 0.4375
Y_DGS3MO 0.75
Y_DGS6MO 0.4375
Y_DGS1 0.25
Y_DGS2 0.25
Y_DGS3 0.4375
Y_DGS5 0.375
Y_DGS7 0.375
Y_DGS10 0.375
Y_DGS20 0.4375
Y_DGS30 0.5625


  6%|▋         | 5/79 [48:28<11:38:59, 566.75s/it]

Y_DGS1MO 0.5
Y_DGS3MO 0.75
Y_DGS6MO 0.55
Y_DGS1 0.3
Y_DGS2 0.3
Y_DGS3 0.45
Y_DGS5 0.45
Y_DGS7 0.4
Y_DGS10 0.35
Y_DGS20 0.5
Y_DGS30 0.65


  8%|▊         | 6/79 [57:29<11:18:45, 557.89s/it]

Y_DGS1MO 0.5416666666666666
Y_DGS3MO 0.7083333333333334
Y_DGS6MO 0.5833333333333334
Y_DGS1 0.375
Y_DGS2 0.375
Y_DGS3 0.5
Y_DGS5 0.5
Y_DGS7 0.4166666666666667
Y_DGS10 0.3333333333333333
Y_DGS20 0.4583333333333333
Y_DGS30 0.5833333333333334


  9%|▉         | 7/79 [1:06:40<11:06:46, 555.64s/it]

Y_DGS1MO 0.6071428571428571
Y_DGS3MO 0.75
Y_DGS6MO 0.6071428571428571
Y_DGS1 0.42857142857142855
Y_DGS2 0.4642857142857143
Y_DGS3 0.5714285714285714
Y_DGS5 0.5714285714285714
Y_DGS7 0.5
Y_DGS10 0.42857142857142855
Y_DGS20 0.4642857142857143
Y_DGS30 0.5714285714285714


 10%|█         | 8/79 [1:16:33<11:11:52, 567.78s/it]

Y_DGS1MO 0.59375
Y_DGS3MO 0.71875
Y_DGS6MO 0.59375
Y_DGS1 0.46875
Y_DGS2 0.5
Y_DGS3 0.59375
Y_DGS5 0.5625
Y_DGS7 0.5
Y_DGS10 0.4375
Y_DGS20 0.46875
Y_DGS30 0.5625


 11%|█▏        | 9/79 [1:24:37<10:31:47, 541.54s/it]

Y_DGS1MO 0.6111111111111112
Y_DGS3MO 0.7222222222222222
Y_DGS6MO 0.6111111111111112
Y_DGS1 0.5
Y_DGS2 0.5277777777777778
Y_DGS3 0.6111111111111112
Y_DGS5 0.5833333333333334
Y_DGS7 0.5277777777777778
Y_DGS10 0.4722222222222222
Y_DGS20 0.5
Y_DGS30 0.5833333333333334


 13%|█▎        | 10/79 [1:32:55<10:07:19, 528.10s/it]

Y_DGS1MO 0.6
Y_DGS3MO 0.7
Y_DGS6MO 0.575
Y_DGS1 0.475
Y_DGS2 0.5
Y_DGS3 0.6
Y_DGS5 0.575
Y_DGS7 0.525
Y_DGS10 0.475
Y_DGS20 0.5
Y_DGS30 0.55


 14%|█▍        | 11/79 [1:41:47<9:59:46, 529.21s/it] 

Y_DGS1MO 0.6136363636363636
Y_DGS3MO 0.7272727272727273
Y_DGS6MO 0.5909090909090909
Y_DGS1 0.5
Y_DGS2 0.5227272727272727
Y_DGS3 0.6136363636363636
Y_DGS5 0.5909090909090909
Y_DGS7 0.5454545454545454
Y_DGS10 0.5
Y_DGS20 0.5227272727272727
Y_DGS30 0.5681818181818182


 15%|█▌        | 12/79 [1:50:53<9:56:30, 534.19s/it]

Y_DGS1MO 0.6458333333333334
Y_DGS3MO 0.7291666666666666
Y_DGS6MO 0.5833333333333334
Y_DGS1 0.5
Y_DGS2 0.5208333333333334
Y_DGS3 0.6041666666666666
Y_DGS5 0.5833333333333334
Y_DGS7 0.5416666666666666
Y_DGS10 0.5
Y_DGS20 0.5
Y_DGS30 0.5416666666666666


 16%|█▋        | 13/79 [2:01:52<10:29:31, 572.29s/it]

Y_DGS1MO 0.6346153846153846
Y_DGS3MO 0.6923076923076923
Y_DGS6MO 0.5769230769230769
Y_DGS1 0.5
Y_DGS2 0.5576923076923077
Y_DGS3 0.5961538461538461
Y_DGS5 0.5961538461538461
Y_DGS7 0.5384615384615384
Y_DGS10 0.5
Y_DGS20 0.5192307692307693
Y_DGS30 0.5576923076923077


 18%|█▊        | 14/79 [2:12:54<10:49:09, 599.22s/it]

Y_DGS1MO 0.625
Y_DGS3MO 0.6785714285714286
Y_DGS6MO 0.5714285714285714
Y_DGS1 0.5178571428571429
Y_DGS2 0.5714285714285714
Y_DGS3 0.6071428571428571
Y_DGS5 0.6071428571428571
Y_DGS7 0.5535714285714286
Y_DGS10 0.5178571428571429
Y_DGS20 0.5357142857142857
Y_DGS30 0.5714285714285714


 19%|█▉        | 15/79 [2:23:49<10:57:07, 616.06s/it]

Y_DGS1MO 0.6166666666666667
Y_DGS3MO 0.65
Y_DGS6MO 0.5833333333333334
Y_DGS1 0.5333333333333333
Y_DGS2 0.5666666666666667
Y_DGS3 0.6166666666666667
Y_DGS5 0.6166666666666667
Y_DGS7 0.5666666666666667
Y_DGS10 0.5166666666666667
Y_DGS20 0.5333333333333333
Y_DGS30 0.5666666666666667


 20%|██        | 16/79 [2:35:45<11:18:22, 646.07s/it]

Y_DGS1MO 0.609375
Y_DGS3MO 0.640625
Y_DGS6MO 0.59375
Y_DGS1 0.546875
Y_DGS2 0.5625
Y_DGS3 0.609375
Y_DGS5 0.609375
Y_DGS7 0.578125
Y_DGS10 0.53125
Y_DGS20 0.546875
Y_DGS30 0.578125


 22%|██▏       | 17/79 [2:44:29<10:29:41, 609.39s/it]

Y_DGS1MO 0.6323529411764706
Y_DGS3MO 0.6617647058823529
Y_DGS6MO 0.6176470588235294
Y_DGS1 0.5735294117647058
Y_DGS2 0.5882352941176471
Y_DGS3 0.6176470588235294
Y_DGS5 0.6176470588235294
Y_DGS7 0.5882352941176471
Y_DGS10 0.5441176470588235
Y_DGS20 0.5441176470588235
Y_DGS30 0.5882352941176471


 23%|██▎       | 18/79 [2:56:05<10:46:02, 635.46s/it]

Y_DGS1MO 0.625
Y_DGS3MO 0.6527777777777778
Y_DGS6MO 0.6111111111111112
Y_DGS1 0.5555555555555556
Y_DGS2 0.5972222222222222
Y_DGS3 0.6388888888888888
Y_DGS5 0.625
Y_DGS7 0.5972222222222222
Y_DGS10 0.5555555555555556
Y_DGS20 0.5555555555555556
Y_DGS30 0.5972222222222222


 24%|██▍       | 19/79 [3:04:48<10:01:33, 601.56s/it]

Y_DGS1MO 0.6447368421052632
Y_DGS3MO 0.6710526315789473
Y_DGS6MO 0.6052631578947368
Y_DGS1 0.5657894736842105
Y_DGS2 0.6052631578947368
Y_DGS3 0.6447368421052632
Y_DGS5 0.631578947368421
Y_DGS7 0.5921052631578947
Y_DGS10 0.5394736842105263
Y_DGS20 0.5394736842105263
Y_DGS30 0.5789473684210527


 25%|██▌       | 20/79 [3:13:21<9:25:28, 575.06s/it] 

Y_DGS1MO 0.6375
Y_DGS3MO 0.65
Y_DGS6MO 0.6125
Y_DGS1 0.575
Y_DGS2 0.6125
Y_DGS3 0.65
Y_DGS5 0.6375
Y_DGS7 0.6
Y_DGS10 0.55
Y_DGS20 0.5375
Y_DGS30 0.5625


 27%|██▋       | 21/79 [3:21:20<8:48:09, 546.37s/it]

Y_DGS1MO 0.6309523809523809
Y_DGS3MO 0.6666666666666666
Y_DGS6MO 0.6309523809523809
Y_DGS1 0.5952380952380952
Y_DGS2 0.6309523809523809
Y_DGS3 0.6666666666666666
Y_DGS5 0.6428571428571429
Y_DGS7 0.6071428571428571
Y_DGS10 0.5595238095238095
Y_DGS20 0.5476190476190477
Y_DGS30 0.5595238095238095


 28%|██▊       | 22/79 [3:29:16<8:18:57, 525.22s/it]

Y_DGS1MO 0.6363636363636364
Y_DGS3MO 0.6704545454545454
Y_DGS6MO 0.625
Y_DGS1 0.5909090909090909
Y_DGS2 0.625
Y_DGS3 0.6590909090909091
Y_DGS5 0.6363636363636364
Y_DGS7 0.6022727272727273
Y_DGS10 0.5568181818181818
Y_DGS20 0.5454545454545454
Y_DGS30 0.5568181818181818


 29%|██▉       | 23/79 [3:37:52<8:07:36, 522.43s/it]

Y_DGS1MO 0.6304347826086957
Y_DGS3MO 0.6739130434782609
Y_DGS6MO 0.6304347826086957
Y_DGS1 0.5978260869565217
Y_DGS2 0.6304347826086957
Y_DGS3 0.6630434782608695
Y_DGS5 0.6304347826086957
Y_DGS7 0.6086956521739131
Y_DGS10 0.5652173913043478
Y_DGS20 0.5543478260869565
Y_DGS30 0.5652173913043478


 30%|███       | 24/79 [3:46:17<7:54:06, 517.21s/it]

Y_DGS1MO 0.6354166666666666
Y_DGS3MO 0.6770833333333334
Y_DGS6MO 0.6354166666666666
Y_DGS1 0.6041666666666666
Y_DGS2 0.6145833333333334
Y_DGS3 0.65625
Y_DGS5 0.6145833333333334
Y_DGS7 0.59375
Y_DGS10 0.5520833333333334
Y_DGS20 0.5416666666666666
Y_DGS30 0.5520833333333334


 32%|███▏      | 25/79 [3:54:58<7:46:34, 518.41s/it]

Y_DGS1MO 0.64
Y_DGS3MO 0.67
Y_DGS6MO 0.65
Y_DGS1 0.61
Y_DGS2 0.61
Y_DGS3 0.65
Y_DGS5 0.61
Y_DGS7 0.59
Y_DGS10 0.55
Y_DGS20 0.54
Y_DGS30 0.55


 33%|███▎      | 26/79 [4:03:30<7:36:12, 516.45s/it]

Y_DGS1MO 0.6538461538461539
Y_DGS3MO 0.6730769230769231
Y_DGS6MO 0.6634615384615384
Y_DGS1 0.625
Y_DGS2 0.6153846153846154
Y_DGS3 0.6442307692307693
Y_DGS5 0.6057692307692307
Y_DGS7 0.5865384615384616
Y_DGS10 0.5384615384615384
Y_DGS20 0.5288461538461539
Y_DGS30 0.5384615384615384


 34%|███▍      | 27/79 [4:11:57<7:25:10, 513.66s/it]

Y_DGS1MO 0.6481481481481481
Y_DGS3MO 0.6759259259259259
Y_DGS6MO 0.6666666666666666
Y_DGS1 0.6296296296296297
Y_DGS2 0.6203703703703703
Y_DGS3 0.6481481481481481
Y_DGS5 0.6018518518518519
Y_DGS7 0.5925925925925926
Y_DGS10 0.5462962962962963
Y_DGS20 0.5370370370370371
Y_DGS30 0.5462962962962963


 35%|███▌      | 28/79 [4:20:28<7:15:50, 512.76s/it]

Y_DGS1MO 0.6607142857142857
Y_DGS3MO 0.6785714285714286
Y_DGS6MO 0.6696428571428571
Y_DGS1 0.6428571428571429
Y_DGS2 0.625
Y_DGS3 0.6517857142857143
Y_DGS5 0.5982142857142857
Y_DGS7 0.5892857142857143
Y_DGS10 0.5446428571428571
Y_DGS20 0.5357142857142857
Y_DGS30 0.5446428571428571


 37%|███▋      | 29/79 [4:29:12<7:09:58, 515.97s/it]

Y_DGS1MO 0.6551724137931034
Y_DGS3MO 0.6810344827586207
Y_DGS6MO 0.6724137931034483
Y_DGS1 0.6379310344827587
Y_DGS2 0.6206896551724138
Y_DGS3 0.6379310344827587
Y_DGS5 0.5775862068965517
Y_DGS7 0.5689655172413793
Y_DGS10 0.5258620689655172
Y_DGS20 0.5172413793103449
Y_DGS30 0.5258620689655172


 38%|███▊      | 30/79 [4:38:02<7:04:54, 520.29s/it]

Y_DGS1MO 0.6583333333333333
Y_DGS3MO 0.6833333333333333
Y_DGS6MO 0.6666666666666666
Y_DGS1 0.6416666666666667
Y_DGS2 0.625
Y_DGS3 0.6333333333333333
Y_DGS5 0.5666666666666667
Y_DGS7 0.5583333333333333
Y_DGS10 0.5166666666666667
Y_DGS20 0.5083333333333333
Y_DGS30 0.5166666666666667


 39%|███▉      | 31/79 [4:45:54<6:44:44, 505.92s/it]

Y_DGS1MO 0.6693548387096774
Y_DGS3MO 0.6854838709677419
Y_DGS6MO 0.6693548387096774
Y_DGS1 0.6370967741935484
Y_DGS2 0.6290322580645161
Y_DGS3 0.6370967741935484
Y_DGS5 0.5725806451612904
Y_DGS7 0.5645161290322581
Y_DGS10 0.5241935483870968
Y_DGS20 0.5241935483870968
Y_DGS30 0.532258064516129


 41%|████      | 32/79 [4:54:46<6:42:21, 513.64s/it]

Y_DGS1MO 0.6796875
Y_DGS3MO 0.6875
Y_DGS6MO 0.671875
Y_DGS1 0.640625
Y_DGS2 0.625
Y_DGS3 0.625
Y_DGS5 0.5625
Y_DGS7 0.5625
Y_DGS10 0.5234375
Y_DGS20 0.5234375
Y_DGS30 0.53125


 42%|████▏     | 33/79 [5:03:53<6:41:33, 523.77s/it]

Y_DGS1MO 0.6742424242424242
Y_DGS3MO 0.6742424242424242
Y_DGS6MO 0.6590909090909091
Y_DGS1 0.6363636363636364
Y_DGS2 0.6212121212121212
Y_DGS3 0.6212121212121212
Y_DGS5 0.5681818181818182
Y_DGS7 0.5681818181818182
Y_DGS10 0.5378787878787878
Y_DGS20 0.5378787878787878
Y_DGS30 0.5454545454545454


 43%|████▎     | 34/79 [5:11:42<6:20:31, 507.36s/it]

Y_DGS1MO 0.6764705882352942
Y_DGS3MO 0.6691176470588235
Y_DGS6MO 0.6691176470588235
Y_DGS1 0.6470588235294118
Y_DGS2 0.6176470588235294
Y_DGS3 0.6176470588235294
Y_DGS5 0.5735294117647058
Y_DGS7 0.5735294117647058
Y_DGS10 0.5441176470588235
Y_DGS20 0.5441176470588235
Y_DGS30 0.5514705882352942


 44%|████▍     | 35/79 [5:21:00<6:23:00, 522.29s/it]

Y_DGS1MO 0.6857142857142857
Y_DGS3MO 0.6714285714285714
Y_DGS6MO 0.6714285714285714
Y_DGS1 0.65
Y_DGS2 0.6142857142857143
Y_DGS3 0.6142857142857143
Y_DGS5 0.5714285714285714
Y_DGS7 0.5714285714285714
Y_DGS10 0.55
Y_DGS20 0.55
Y_DGS30 0.5571428571428572


 46%|████▌     | 36/79 [5:28:34<5:59:40, 501.88s/it]

Y_DGS1MO 0.6875
Y_DGS3MO 0.6805555555555556
Y_DGS6MO 0.6736111111111112
Y_DGS1 0.6458333333333334
Y_DGS2 0.6180555555555556
Y_DGS3 0.6111111111111112
Y_DGS5 0.5694444444444444
Y_DGS7 0.5694444444444444
Y_DGS10 0.5416666666666666
Y_DGS20 0.5486111111111112
Y_DGS30 0.5555555555555556


 47%|████▋     | 37/79 [5:37:35<5:59:33, 513.65s/it]

Y_DGS1MO 0.6891891891891891
Y_DGS3MO 0.6756756756756757
Y_DGS6MO 0.6756756756756757
Y_DGS1 0.6418918918918919
Y_DGS2 0.6148648648648649
Y_DGS3 0.6013513513513513
Y_DGS5 0.5608108108108109
Y_DGS7 0.5608108108108109
Y_DGS10 0.527027027027027
Y_DGS20 0.5405405405405406
Y_DGS30 0.5472972972972973


 48%|████▊     | 38/79 [5:47:05<6:02:33, 530.58s/it]

Y_DGS1MO 0.6776315789473685
Y_DGS3MO 0.6776315789473685
Y_DGS6MO 0.6776315789473685
Y_DGS1 0.631578947368421
Y_DGS2 0.6118421052631579
Y_DGS3 0.5986842105263158
Y_DGS5 0.5592105263157895
Y_DGS7 0.5592105263157895
Y_DGS10 0.5328947368421053
Y_DGS20 0.5460526315789473
Y_DGS30 0.5526315789473685


 49%|████▉     | 39/79 [5:55:51<5:52:46, 529.17s/it]

Y_DGS1MO 0.6794871794871795
Y_DGS3MO 0.6794871794871795
Y_DGS6MO 0.6794871794871795
Y_DGS1 0.6153846153846154
Y_DGS2 0.6089743589743589
Y_DGS3 0.5897435897435898
Y_DGS5 0.5641025641025641
Y_DGS7 0.5641025641025641
Y_DGS10 0.5384615384615384
Y_DGS20 0.5512820512820513
Y_DGS30 0.5576923076923077


 51%|█████     | 40/79 [6:05:41<5:55:44, 547.30s/it]

Y_DGS1MO 0.675
Y_DGS3MO 0.68125
Y_DGS6MO 0.66875
Y_DGS1 0.60625
Y_DGS2 0.6
Y_DGS3 0.5875
Y_DGS5 0.55625
Y_DGS7 0.55625
Y_DGS10 0.53125
Y_DGS20 0.55
Y_DGS30 0.55625


 52%|█████▏    | 41/79 [6:15:01<5:49:05, 551.19s/it]

Y_DGS1MO 0.6829268292682927
Y_DGS3MO 0.6646341463414634
Y_DGS6MO 0.6524390243902439
Y_DGS1 0.5914634146341463
Y_DGS2 0.5853658536585366
Y_DGS3 0.573170731707317
Y_DGS5 0.5487804878048781
Y_DGS7 0.5487804878048781
Y_DGS10 0.524390243902439
Y_DGS20 0.5426829268292683
Y_DGS30 0.5548780487804879


 53%|█████▎    | 42/79 [6:24:09<5:39:15, 550.15s/it]

Y_DGS1MO 0.6845238095238095
Y_DGS3MO 0.6547619047619048
Y_DGS6MO 0.6547619047619048
Y_DGS1 0.5892857142857143
Y_DGS2 0.5892857142857143
Y_DGS3 0.5654761904761905
Y_DGS5 0.5416666666666666
Y_DGS7 0.5416666666666666
Y_DGS10 0.5238095238095238
Y_DGS20 0.5357142857142857
Y_DGS30 0.5476190476190477


 54%|█████▍    | 43/79 [6:32:04<5:16:37, 527.71s/it]

Y_DGS1MO 0.6744186046511628
Y_DGS3MO 0.6511627906976745
Y_DGS6MO 0.6453488372093024
Y_DGS1 0.5813953488372093
Y_DGS2 0.5813953488372093
Y_DGS3 0.5581395348837209
Y_DGS5 0.5348837209302325
Y_DGS7 0.5348837209302325
Y_DGS10 0.5174418604651163
Y_DGS20 0.5290697674418605
Y_DGS30 0.5406976744186046


 56%|█████▌    | 44/79 [6:40:06<4:59:49, 513.99s/it]

Y_DGS1MO 0.6647727272727273
Y_DGS3MO 0.6534090909090909
Y_DGS6MO 0.6534090909090909
Y_DGS1 0.5738636363636364
Y_DGS2 0.5738636363636364
Y_DGS3 0.5511363636363636
Y_DGS5 0.5284090909090909
Y_DGS7 0.5284090909090909
Y_DGS10 0.5113636363636364
Y_DGS20 0.5227272727272727
Y_DGS30 0.5340909090909091


 57%|█████▋    | 45/79 [6:47:51<4:42:57, 499.34s/it]

Y_DGS1MO 0.6611111111111111
Y_DGS3MO 0.6555555555555556
Y_DGS6MO 0.6555555555555556
Y_DGS1 0.5666666666666667
Y_DGS2 0.5722222222222222
Y_DGS3 0.55
Y_DGS5 0.5333333333333333
Y_DGS7 0.5333333333333333
Y_DGS10 0.5166666666666667
Y_DGS20 0.5277777777777778
Y_DGS30 0.5333333333333333


 58%|█████▊    | 46/79 [6:57:19<4:45:56, 519.91s/it]

Y_DGS1MO 0.6467391304347826
Y_DGS3MO 0.6467391304347826
Y_DGS6MO 0.6630434782608695
Y_DGS1 0.5597826086956522
Y_DGS2 0.5652173913043478
Y_DGS3 0.5434782608695652
Y_DGS5 0.5271739130434783
Y_DGS7 0.5271739130434783
Y_DGS10 0.5108695652173914
Y_DGS20 0.5217391304347826
Y_DGS30 0.5271739130434783


 59%|█████▉    | 47/79 [7:04:59<4:27:38, 501.83s/it]

Y_DGS1MO 0.6382978723404256
Y_DGS3MO 0.648936170212766
Y_DGS6MO 0.6702127659574468
Y_DGS1 0.5585106382978723
Y_DGS2 0.5638297872340425
Y_DGS3 0.5478723404255319
Y_DGS5 0.5319148936170213
Y_DGS7 0.5319148936170213
Y_DGS10 0.5159574468085106
Y_DGS20 0.526595744680851
Y_DGS30 0.5319148936170213


 61%|██████    | 48/79 [7:12:22<4:10:13, 484.30s/it]

Y_DGS1MO 0.640625
Y_DGS3MO 0.6510416666666666
Y_DGS6MO 0.671875
Y_DGS1 0.5625
Y_DGS2 0.5625
Y_DGS3 0.5520833333333334
Y_DGS5 0.5364583333333334
Y_DGS7 0.53125
Y_DGS10 0.515625
Y_DGS20 0.5260416666666666
Y_DGS30 0.53125


 62%|██████▏   | 49/79 [7:19:46<3:56:10, 472.36s/it]

Y_DGS1MO 0.6428571428571429
Y_DGS3MO 0.6581632653061225
Y_DGS6MO 0.6785714285714286
Y_DGS1 0.5561224489795918
Y_DGS2 0.5561224489795918
Y_DGS3 0.5408163265306123
Y_DGS5 0.5255102040816326
Y_DGS7 0.5204081632653061
Y_DGS10 0.5051020408163265
Y_DGS20 0.5204081632653061
Y_DGS30 0.5255102040816326


 63%|██████▎   | 50/79 [7:27:05<3:43:23, 462.19s/it]

Y_DGS1MO 0.645
Y_DGS3MO 0.665
Y_DGS6MO 0.685
Y_DGS1 0.55
Y_DGS2 0.545
Y_DGS3 0.53
Y_DGS5 0.515
Y_DGS7 0.51
Y_DGS10 0.495
Y_DGS20 0.51
Y_DGS30 0.515


 65%|██████▍   | 51/79 [7:34:15<3:31:07, 452.41s/it]

Y_DGS1MO 0.6470588235294118
Y_DGS3MO 0.6715686274509803
Y_DGS6MO 0.6911764705882353
Y_DGS1 0.5441176470588235
Y_DGS2 0.5392156862745098
Y_DGS3 0.5245098039215687
Y_DGS5 0.5196078431372549
Y_DGS7 0.5098039215686274
Y_DGS10 0.4950980392156863
Y_DGS20 0.5098039215686274
Y_DGS30 0.5147058823529411


 66%|██████▌   | 52/79 [7:41:19<3:19:47, 443.98s/it]

Y_DGS1MO 0.6442307692307693
Y_DGS3MO 0.6730769230769231
Y_DGS6MO 0.6875
Y_DGS1 0.5432692307692307
Y_DGS2 0.5336538461538461
Y_DGS3 0.5288461538461539
Y_DGS5 0.5144230769230769
Y_DGS7 0.5048076923076923
Y_DGS10 0.4855769230769231
Y_DGS20 0.5096153846153846
Y_DGS30 0.5240384615384616


 67%|██████▋   | 53/79 [7:48:25<3:10:06, 438.71s/it]

Y_DGS1MO 0.6415094339622641
Y_DGS3MO 0.6698113207547169
Y_DGS6MO 0.6839622641509434
Y_DGS1 0.5377358490566038
Y_DGS2 0.5283018867924528
Y_DGS3 0.5235849056603774
Y_DGS5 0.5094339622641509
Y_DGS7 0.5
Y_DGS10 0.4811320754716981
Y_DGS20 0.5047169811320755
Y_DGS30 0.5235849056603774


 68%|██████▊   | 54/79 [7:55:27<3:00:39, 433.57s/it]

Y_DGS1MO 0.6435185185185185
Y_DGS3MO 0.6712962962962963
Y_DGS6MO 0.6851851851851852
Y_DGS1 0.5370370370370371
Y_DGS2 0.5324074074074074
Y_DGS3 0.5277777777777778
Y_DGS5 0.5046296296296297
Y_DGS7 0.49074074074074076
Y_DGS10 0.47685185185185186
Y_DGS20 0.5138888888888888
Y_DGS30 0.5277777777777778


 70%|██████▉   | 55/79 [8:02:32<2:52:26, 431.11s/it]

Y_DGS1MO 0.6409090909090909
Y_DGS3MO 0.6727272727272727
Y_DGS6MO 0.6909090909090909
Y_DGS1 0.5272727272727272
Y_DGS2 0.5227272727272727
Y_DGS3 0.5181818181818182
Y_DGS5 0.5
Y_DGS7 0.4863636363636364
Y_DGS10 0.4681818181818182
Y_DGS20 0.509090909090909
Y_DGS30 0.5227272727272727


 71%|███████   | 56/79 [8:09:34<2:44:09, 428.23s/it]

Y_DGS1MO 0.6383928571428571
Y_DGS3MO 0.6741071428571429
Y_DGS6MO 0.6875
Y_DGS1 0.5223214285714286
Y_DGS2 0.5223214285714286
Y_DGS3 0.5133928571428571
Y_DGS5 0.4955357142857143
Y_DGS7 0.48214285714285715
Y_DGS10 0.4642857142857143
Y_DGS20 0.5089285714285714
Y_DGS30 0.5267857142857143


 72%|███████▏  | 57/79 [8:16:33<2:36:03, 425.62s/it]

Y_DGS1MO 0.631578947368421
Y_DGS3MO 0.6754385964912281
Y_DGS6MO 0.6929824561403509
Y_DGS1 0.5175438596491229
Y_DGS2 0.5175438596491229
Y_DGS3 0.5087719298245614
Y_DGS5 0.49122807017543857
Y_DGS7 0.4780701754385965
Y_DGS10 0.4605263157894737
Y_DGS20 0.5043859649122807
Y_DGS30 0.5219298245614035


 73%|███████▎  | 58/79 [8:23:37<2:28:45, 425.02s/it]

Y_DGS1MO 0.625
Y_DGS3MO 0.6724137931034483
Y_DGS6MO 0.6939655172413793
Y_DGS1 0.5129310344827587
Y_DGS2 0.5172413793103449
Y_DGS3 0.5086206896551724
Y_DGS5 0.49137931034482757
Y_DGS7 0.4827586206896552
Y_DGS10 0.46120689655172414
Y_DGS20 0.5
Y_DGS30 0.5172413793103449


 75%|███████▍  | 59/79 [8:30:35<2:20:58, 422.94s/it]

Y_DGS1MO 0.6186440677966102
Y_DGS3MO 0.6694915254237288
Y_DGS6MO 0.690677966101695
Y_DGS1 0.5084745762711864
Y_DGS2 0.5169491525423728
Y_DGS3 0.5042372881355932
Y_DGS5 0.4915254237288136
Y_DGS7 0.4830508474576271
Y_DGS10 0.4576271186440678
Y_DGS20 0.5042372881355932
Y_DGS30 0.5169491525423728


 76%|███████▌  | 60/79 [8:37:33<2:13:29, 421.54s/it]

Y_DGS1MO 0.6125
Y_DGS3MO 0.675
Y_DGS6MO 0.6916666666666667
Y_DGS1 0.5041666666666667
Y_DGS2 0.5208333333333334
Y_DGS3 0.5041666666666667
Y_DGS5 0.49166666666666664
Y_DGS7 0.4875
Y_DGS10 0.4625
Y_DGS20 0.5
Y_DGS30 0.5125


 77%|███████▋  | 61/79 [8:44:20<2:05:08, 417.11s/it]

Y_DGS1MO 0.610655737704918
Y_DGS3MO 0.6762295081967213
Y_DGS6MO 0.6926229508196722
Y_DGS1 0.5
Y_DGS2 0.5245901639344263
Y_DGS3 0.5040983606557377
Y_DGS5 0.4959016393442623
Y_DGS7 0.4918032786885246
Y_DGS10 0.46311475409836067
Y_DGS20 0.4959016393442623
Y_DGS30 0.5081967213114754


 78%|███████▊  | 62/79 [8:51:05<1:57:08, 413.42s/it]

Y_DGS1MO 0.6088709677419355
Y_DGS3MO 0.6774193548387096
Y_DGS6MO 0.6895161290322581
Y_DGS1 0.5040322580645161
Y_DGS2 0.5282258064516129
Y_DGS3 0.5080645161290323
Y_DGS5 0.5
Y_DGS7 0.4959677419354839
Y_DGS10 0.4637096774193548
Y_DGS20 0.4959677419354839
Y_DGS30 0.5080645161290323


 80%|███████▉  | 63/79 [8:57:48<1:49:25, 410.32s/it]

Y_DGS1MO 0.6031746031746031
Y_DGS3MO 0.6746031746031746
Y_DGS6MO 0.6904761904761905
Y_DGS1 0.5079365079365079
Y_DGS2 0.5317460317460317
Y_DGS3 0.5079365079365079
Y_DGS5 0.503968253968254
Y_DGS7 0.5
Y_DGS10 0.45634920634920634
Y_DGS20 0.4880952380952381
Y_DGS30 0.5


 81%|████████  | 64/79 [9:04:29<1:41:54, 407.61s/it]

Y_DGS1MO 0.6015625
Y_DGS3MO 0.66796875
Y_DGS6MO 0.68359375
Y_DGS1 0.515625
Y_DGS2 0.52734375
Y_DGS3 0.5078125
Y_DGS5 0.50390625
Y_DGS7 0.5
Y_DGS10 0.4609375
Y_DGS20 0.4921875
Y_DGS30 0.5


 82%|████████▏ | 65/79 [9:11:08<1:34:31, 405.08s/it]

Y_DGS1MO 0.6038461538461538
Y_DGS3MO 0.6653846153846154
Y_DGS6MO 0.6807692307692308
Y_DGS1 0.5153846153846153
Y_DGS2 0.5269230769230769
Y_DGS3 0.5076923076923077
Y_DGS5 0.5038461538461538
Y_DGS7 0.5
Y_DGS10 0.46153846153846156
Y_DGS20 0.5
Y_DGS30 0.5


 84%|████████▎ | 66/79 [9:17:47<1:27:19, 403.05s/it]

Y_DGS1MO 0.6060606060606061
Y_DGS3MO 0.6666666666666666
Y_DGS6MO 0.6742424242424242
Y_DGS1 0.5189393939393939
Y_DGS2 0.5303030303030303
Y_DGS3 0.5113636363636364
Y_DGS5 0.5075757575757576
Y_DGS7 0.5037878787878788
Y_DGS10 0.4659090909090909
Y_DGS20 0.5037878787878788
Y_DGS30 0.5037878787878788


 85%|████████▍ | 67/79 [9:24:19<1:19:56, 399.69s/it]

Y_DGS1MO 0.6082089552238806
Y_DGS3MO 0.667910447761194
Y_DGS6MO 0.6716417910447762
Y_DGS1 0.5186567164179104
Y_DGS2 0.5298507462686567
Y_DGS3 0.5111940298507462
Y_DGS5 0.5074626865671642
Y_DGS7 0.503731343283582
Y_DGS10 0.4664179104477612
Y_DGS20 0.5
Y_DGS30 0.5


 86%|████████▌ | 68/79 [9:30:39<1:12:14, 394.03s/it]

Y_DGS1MO 0.6102941176470589
Y_DGS3MO 0.6654411764705882
Y_DGS6MO 0.6617647058823529
Y_DGS1 0.5110294117647058
Y_DGS2 0.5220588235294118
Y_DGS3 0.5073529411764706
Y_DGS5 0.5073529411764706
Y_DGS7 0.5036764705882353
Y_DGS10 0.46691176470588236
Y_DGS20 0.5
Y_DGS30 0.5


 87%|████████▋ | 69/79 [9:36:55<1:04:44, 388.44s/it]

Y_DGS1MO 0.6086956521739131
Y_DGS3MO 0.6666666666666666
Y_DGS6MO 0.6630434782608695
Y_DGS1 0.5144927536231884
Y_DGS2 0.5253623188405797
Y_DGS3 0.5108695652173914
Y_DGS5 0.5108695652173914
Y_DGS7 0.5072463768115942
Y_DGS10 0.47101449275362317
Y_DGS20 0.5036231884057971
Y_DGS30 0.5036231884057971


 89%|████████▊ | 70/79 [9:43:10<57:39, 384.35s/it]  

Y_DGS1MO 0.6071428571428571
Y_DGS3MO 0.6714285714285714
Y_DGS6MO 0.6642857142857143
Y_DGS1 0.5071428571428571
Y_DGS2 0.5214285714285715
Y_DGS3 0.5035714285714286
Y_DGS5 0.5035714285714286
Y_DGS7 0.5035714285714286
Y_DGS10 0.46785714285714286
Y_DGS20 0.5
Y_DGS30 0.5


 90%|████████▉ | 71/79 [9:49:24<50:50, 381.32s/it]

Y_DGS1MO 0.6091549295774648
Y_DGS3MO 0.6690140845070423
Y_DGS6MO 0.6690140845070423
Y_DGS1 0.5070422535211268
Y_DGS2 0.5246478873239436
Y_DGS3 0.5035211267605634
Y_DGS5 0.5035211267605634
Y_DGS7 0.5035211267605634
Y_DGS10 0.47183098591549294
Y_DGS20 0.5035211267605634
Y_DGS30 0.5035211267605634


 91%|█████████ | 72/79 [9:55:42<44:22, 380.41s/it]

Y_DGS1MO 0.6111111111111112
Y_DGS3MO 0.6701388888888888
Y_DGS6MO 0.6701388888888888
Y_DGS1 0.5104166666666666
Y_DGS2 0.5277777777777778
Y_DGS3 0.5069444444444444
Y_DGS5 0.5034722222222222
Y_DGS7 0.5034722222222222
Y_DGS10 0.4722222222222222
Y_DGS20 0.5069444444444444
Y_DGS30 0.5034722222222222


 92%|█████████▏| 73/79 [10:02:12<38:18, 383.14s/it]

Y_DGS1MO 0.6095890410958904
Y_DGS3MO 0.6746575342465754
Y_DGS6MO 0.6712328767123288
Y_DGS1 0.5136986301369864
Y_DGS2 0.5273972602739726
Y_DGS3 0.5068493150684932
Y_DGS5 0.5034246575342466
Y_DGS7 0.5034246575342466
Y_DGS10 0.4726027397260274
Y_DGS20 0.5068493150684932
Y_DGS30 0.5034246575342466


 94%|█████████▎| 74/79 [10:08:28<31:45, 381.15s/it]

Y_DGS1MO 0.6047297297297297
Y_DGS3MO 0.6756756756756757
Y_DGS6MO 0.668918918918919
Y_DGS1 0.5168918918918919
Y_DGS2 0.527027027027027
Y_DGS3 0.5067567567567568
Y_DGS5 0.5033783783783784
Y_DGS7 0.5033783783783784
Y_DGS10 0.47297297297297297
Y_DGS20 0.5067567567567568
Y_DGS30 0.5033783783783784


 95%|█████████▍| 75/79 [10:14:37<25:10, 377.53s/it]

Y_DGS1MO 0.61
Y_DGS3MO 0.68
Y_DGS6MO 0.6733333333333333
Y_DGS1 0.52
Y_DGS2 0.5266666666666666
Y_DGS3 0.5066666666666667
Y_DGS5 0.5066666666666667
Y_DGS7 0.5066666666666667
Y_DGS10 0.4766666666666667
Y_DGS20 0.51
Y_DGS30 0.5066666666666667


 96%|█████████▌| 76/79 [10:20:49<18:47, 375.68s/it]

Y_DGS1MO 0.6085526315789473
Y_DGS3MO 0.680921052631579
Y_DGS6MO 0.6743421052631579
Y_DGS1 0.5230263157894737
Y_DGS2 0.5296052631578947
Y_DGS3 0.5032894736842105
Y_DGS5 0.5032894736842105
Y_DGS7 0.5032894736842105
Y_DGS10 0.47368421052631576
Y_DGS20 0.506578947368421
Y_DGS30 0.5032894736842105


 97%|█████████▋| 77/79 [10:26:51<12:23, 371.66s/it]

Y_DGS1MO 0.6136363636363636
Y_DGS3MO 0.685064935064935
Y_DGS6MO 0.672077922077922
Y_DGS1 0.5227272727272727
Y_DGS2 0.525974025974026
Y_DGS3 0.4967532467532468
Y_DGS5 0.5
Y_DGS7 0.5
Y_DGS10 0.4707792207792208
Y_DGS20 0.5032467532467533
Y_DGS30 0.5


 99%|█████████▊| 78/79 [10:32:58<06:10, 370.23s/it]

Y_DGS1MO 0.6121794871794872
Y_DGS3MO 0.6826923076923077
Y_DGS6MO 0.6698717948717948
Y_DGS1 0.5192307692307693
Y_DGS2 0.5224358974358975
Y_DGS3 0.4967948717948718
Y_DGS5 0.5
Y_DGS7 0.5
Y_DGS10 0.47115384615384615
Y_DGS20 0.5032051282051282
Y_DGS30 0.5


100%|██████████| 79/79 [10:39:00<00:00, 485.32s/it]

Y_DGS1MO 0.6139240506329114
Y_DGS3MO 0.6867088607594937
Y_DGS6MO 0.6740506329113924
Y_DGS1 0.5189873417721519
Y_DGS2 0.5189873417721519
Y_DGS3 0.4936708860759494
Y_DGS5 0.49683544303797467
Y_DGS7 0.49683544303797467
Y_DGS10 0.46835443037974683
Y_DGS20 0.5
Y_DGS30 0.49683544303797467





Y_DGS1MO    0.613924
Y_DGS3MO    0.686709
Y_DGS6MO    0.674051
Y_DGS1      0.518987
Y_DGS2      0.518987
Y_DGS3      0.493671
Y_DGS5      0.496835
Y_DGS7      0.496835
Y_DGS10     0.468354
Y_DGS20     0.500000
Y_DGS30     0.496835
dtype: float64


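And for the dataset restricted to features with mutual information (MI) > 0.05. Note that the values printed after each window are the running mean accuracy per target over all windows processed so far, not the accuracy of that window alone: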

In [None]:
# -----------------------------------------------------------------------------
# DISABLED CELL: every statement below is commented out, so running this cell
# does nothing. The output shown under the cell presumably comes from an
# earlier run made before the code was commented out -- TODO confirm.
#
# Purpose (when re-enabled): rolling-window evaluation on the Xw5 feature set.
# For each window the cell
#   1. slices `window_train` rows of Xw5 / Yw for training and the following
#      `window_pred` rows for testing,
#   2. fits GridSearchCV(pipe, param_grid, cv=tscv, scoring='accuracy'),
#   3. predicts the test rows with the best estimator,
#   4. records, per target column, the test accuracy, the feature importances
#      and the parameters of the corresponding sub-estimator,
# and finally writes accuracies, parameters, feature importances, forecasts
# and true values to CSV files and prints the mean accuracy per target.
#
# Relies on names defined outside this cell: Xw, Xw5, Yw, window_train,
# window_pred, pipe, param_grid, tscv.
#
# NOTE(review): the loop bound uses len(Xw) while the slices are taken from
# Xw5 -- presumably both have the same number of rows; confirm.
# NOTE(review): Yw is multi-column (see Yw_pred[:, i]); check what
# scoring='accuracy' computes for such a target in GridSearchCV.
# -----------------------------------------------------------------------------
# accuracies = {col: [] for col in Yw.columns}
# params_by_target = {col: [] for col in Yw.columns}
# feature_importance = {col: [] for col in Yw.columns}
# y_true = []
# y_pred = []

# for start in tqdm(range(0, len(Xw) - window_train - window_pred + 1, window_pred)):
#     end_train = start + window_train
#     end_pred = end_train + window_pred

#     Xw_train = Xw5.iloc[start:end_train]
#     Yw_train = Yw.iloc[start:end_train]
#     Xw_test = Xw5.iloc[end_train:end_pred]
#     Yw_test = Yw.iloc[end_train:end_pred]

#     y_true.append(Yw_test)

#     grid = GridSearchCV(pipe,
#                         param_grid,
#                         cv=tscv,
#                         scoring='accuracy',
#                         n_jobs=-1,
#                         verbose=0)

#     grid.fit(Xw_train, Yw_train)
#     best_model = grid.best_estimator_

#     Yw_pred = best_model.predict(Xw_test)
#     y_pred.append(Yw_pred)

#     for i, col in enumerate(Yw_train.columns):
#         acc = accuracy_score(Yw_test[col], Yw_pred[:, i])
#         accuracies[col].append(acc)
          # The printed value is the running mean over all windows so far,
          # not the accuracy of the current window.
#         print(col, np.mean(accuracies[col]))

#         # feature importance for the i-th output
#         est = best_model.named_steps['xgb'].estimators_[i]   # XGBClassifier
#         importance = est.feature_importances_
#         feature_importance[col].append(importance)

#         # hyperparameters of estimator i
          # NOTE(review): get_params() returns every parameter of the fitted
          # sub-estimator; grid.best_params_ would give only the values chosen
          # by the search -- confirm which one is wanted.
#         params = est.get_params()
#         params_by_target[col].append(params)



# acc = pd.DataFrame(accuracies,columns = Yw.columns)
# params = pd.DataFrame(params_by_target, columns = Yw.columns) 
# feature_imp = pd.DataFrame(feature_importance, columns = Yw.columns)


# NOTE(review): the reshape below requires every window to have the same
# shape, and the hard-coded 780 in the index slice presumably equals
# window_train -- confirm, and check that the slice length matches the number
# of flattened rows.
# Y_true_flat = np.array(y_true).reshape(-1, np.array(y_true).shape[2])  # flattens the first 2 dimensions into one
# ytrue = pd.DataFrame(Y_true_flat, index = Yw.iloc[780:Yw.shape[0]-1].index,columns=Yw.columns)


# Y_pred_flat = np.array(y_pred).reshape(-1, np.array(y_pred).shape[2])  # flattens the first 2 dimensions into one
# ypred = pd.DataFrame(Y_pred_flat,index = Yw.iloc[780:Yw.shape[0]-1].index, columns=Yw.columns)



# acc.to_csv('accuracies xgboost, MI 0.05, 15y train test.csv')
# params.to_csv('params xgboost, MI 0.05, 15y train test.csv')
# feature_imp.to_csv('feature importance xgboost, MI 0.05, 15y train test.csv')
# ypred.to_csv('forecast xgboost, MI 0.05, 15y train test.csv')
# ytrue.to_csv('true values xgboost, MI 0.05, 15y train test.csv')

# print(acc.mean())


100%|██████████| 11/11 [00:16<00:00,  1.52s/it]
  1%|▏         | 1/79 [04:08<5:23:14, 248.64s/it]

Y_DGS1MO 0.75
Y_DGS3MO 0.75
Y_DGS6MO 0.5
Y_DGS1 0.5
Y_DGS2 0.25
Y_DGS3 0.5
Y_DGS5 0.75
Y_DGS7 0.75
Y_DGS10 0.5
Y_DGS20 0.75
Y_DGS30 0.75


  3%|▎         | 2/79 [07:38<4:49:39, 225.71s/it]

Y_DGS1MO 0.625
Y_DGS3MO 0.625
Y_DGS6MO 0.375
Y_DGS1 0.5
Y_DGS2 0.25
Y_DGS3 0.5
Y_DGS5 0.625
Y_DGS7 0.625
Y_DGS10 0.5
Y_DGS20 0.625
Y_DGS30 0.625


  4%|▍         | 3/79 [11:00<4:32:13, 214.91s/it]

Y_DGS1MO 0.5833333333333334
Y_DGS3MO 0.5833333333333334
Y_DGS6MO 0.25
Y_DGS1 0.5833333333333334
Y_DGS2 0.3333333333333333
Y_DGS3 0.5
Y_DGS5 0.5833333333333334
Y_DGS7 0.5833333333333334
Y_DGS10 0.5
Y_DGS20 0.5
Y_DGS30 0.5


  5%|▌         | 4/79 [14:23<4:22:58, 210.38s/it]

Y_DGS1MO 0.5
Y_DGS3MO 0.5625
Y_DGS6MO 0.3125
Y_DGS1 0.625
Y_DGS2 0.5
Y_DGS3 0.625
Y_DGS5 0.6875
Y_DGS7 0.6875
Y_DGS10 0.625
Y_DGS20 0.5
Y_DGS30 0.5625


  6%|▋         | 5/79 [17:37<4:12:03, 204.37s/it]

Y_DGS1MO 0.55
Y_DGS3MO 0.6
Y_DGS6MO 0.35
Y_DGS1 0.6
Y_DGS2 0.5
Y_DGS3 0.6
Y_DGS5 0.65
Y_DGS7 0.65
Y_DGS10 0.55
Y_DGS20 0.5
Y_DGS30 0.55


  8%|▊         | 6/79 [20:49<4:03:28, 200.12s/it]

Y_DGS1MO 0.625
Y_DGS3MO 0.625
Y_DGS6MO 0.4583333333333333
Y_DGS1 0.6666666666666666
Y_DGS2 0.5416666666666666
Y_DGS3 0.625
Y_DGS5 0.6666666666666666
Y_DGS7 0.6666666666666666
Y_DGS10 0.5833333333333334
Y_DGS20 0.5416666666666666
Y_DGS30 0.5833333333333334


  9%|▉         | 7/79 [24:01<3:57:00, 197.51s/it]

Y_DGS1MO 0.6785714285714286
Y_DGS3MO 0.6785714285714286
Y_DGS6MO 0.5
Y_DGS1 0.6785714285714286
Y_DGS2 0.6071428571428571
Y_DGS3 0.6785714285714286
Y_DGS5 0.7142857142857143
Y_DGS7 0.7142857142857143
Y_DGS10 0.6428571428571429
Y_DGS20 0.5714285714285714
Y_DGS30 0.6071428571428571


 10%|█         | 8/79 [27:12<3:51:25, 195.57s/it]

Y_DGS1MO 0.65625
Y_DGS3MO 0.65625
Y_DGS6MO 0.5
Y_DGS1 0.6875
Y_DGS2 0.625
Y_DGS3 0.6875
Y_DGS5 0.6875
Y_DGS7 0.6875
Y_DGS10 0.625
Y_DGS20 0.5625
Y_DGS30 0.625


 11%|█▏        | 9/79 [30:23<3:46:21, 194.02s/it]

Y_DGS1MO 0.6666666666666666
Y_DGS3MO 0.6666666666666666
Y_DGS6MO 0.5277777777777778
Y_DGS1 0.6944444444444444
Y_DGS2 0.6388888888888888
Y_DGS3 0.6944444444444444
Y_DGS5 0.6944444444444444
Y_DGS7 0.6944444444444444
Y_DGS10 0.6388888888888888
Y_DGS20 0.5833333333333334
Y_DGS30 0.6388888888888888


 13%|█▎        | 10/79 [33:33<3:41:46, 192.85s/it]

Y_DGS1MO 0.65
Y_DGS3MO 0.65
Y_DGS6MO 0.5
Y_DGS1 0.65
Y_DGS2 0.6
Y_DGS3 0.675
Y_DGS5 0.675
Y_DGS7 0.675
Y_DGS10 0.625
Y_DGS20 0.575
Y_DGS30 0.6


 14%|█▍        | 11/79 [36:47<3:38:43, 192.99s/it]

Y_DGS1MO 0.6590909090909091
Y_DGS3MO 0.6818181818181818
Y_DGS6MO 0.5227272727272727
Y_DGS1 0.6590909090909091
Y_DGS2 0.6136363636363636
Y_DGS3 0.6818181818181818
Y_DGS5 0.6818181818181818
Y_DGS7 0.6818181818181818
Y_DGS10 0.6363636363636364
Y_DGS20 0.5909090909090909
Y_DGS30 0.6136363636363636


 15%|█▌        | 12/79 [40:00<3:35:34, 193.05s/it]

Y_DGS1MO 0.6875
Y_DGS3MO 0.6875
Y_DGS6MO 0.5208333333333334
Y_DGS1 0.6458333333333334
Y_DGS2 0.6041666666666666
Y_DGS3 0.6666666666666666
Y_DGS5 0.6666666666666666
Y_DGS7 0.6666666666666666
Y_DGS10 0.625
Y_DGS20 0.5625
Y_DGS30 0.5833333333333334


 16%|█▋        | 13/79 [43:16<3:33:18, 193.92s/it]

Y_DGS1MO 0.6730769230769231
Y_DGS3MO 0.6538461538461539
Y_DGS6MO 0.5192307692307693
Y_DGS1 0.6346153846153846
Y_DGS2 0.6346153846153846
Y_DGS3 0.6538461538461539
Y_DGS5 0.6730769230769231
Y_DGS7 0.6538461538461539
Y_DGS10 0.6153846153846154
Y_DGS20 0.5769230769230769
Y_DGS30 0.5961538461538461


 18%|█▊        | 14/79 [46:33<3:31:13, 194.97s/it]

Y_DGS1MO 0.6607142857142857
Y_DGS3MO 0.6428571428571429
Y_DGS6MO 0.5178571428571429
Y_DGS1 0.6428571428571429
Y_DGS2 0.6428571428571429
Y_DGS3 0.6607142857142857
Y_DGS5 0.6785714285714286
Y_DGS7 0.6607142857142857
Y_DGS10 0.625
Y_DGS20 0.5892857142857143
Y_DGS30 0.6071428571428571


 19%|█▉        | 15/79 [49:58<3:31:11, 197.99s/it]

Y_DGS1MO 0.65
Y_DGS3MO 0.6166666666666667
Y_DGS6MO 0.5333333333333333
Y_DGS1 0.65
Y_DGS2 0.6333333333333333
Y_DGS3 0.6666666666666666
Y_DGS5 0.6833333333333333
Y_DGS7 0.6666666666666666
Y_DGS10 0.6166666666666667
Y_DGS20 0.5833333333333334
Y_DGS30 0.6


 20%|██        | 16/79 [53:18<3:28:28, 198.54s/it]

Y_DGS1MO 0.640625
Y_DGS3MO 0.609375
Y_DGS6MO 0.546875
Y_DGS1 0.65625
Y_DGS2 0.625
Y_DGS3 0.65625
Y_DGS5 0.671875
Y_DGS7 0.671875
Y_DGS10 0.625
Y_DGS20 0.59375
Y_DGS30 0.609375


 22%|██▏       | 17/79 [56:40<3:26:15, 199.60s/it]

Y_DGS1MO 0.6617647058823529
Y_DGS3MO 0.6323529411764706
Y_DGS6MO 0.5735294117647058
Y_DGS1 0.6764705882352942
Y_DGS2 0.6470588235294118
Y_DGS3 0.6617647058823529
Y_DGS5 0.6764705882352942
Y_DGS7 0.6764705882352942
Y_DGS10 0.6323529411764706
Y_DGS20 0.5882352941176471
Y_DGS30 0.6176470588235294


 23%|██▎       | 18/79 [59:56<3:21:48, 198.50s/it]

Y_DGS1MO 0.6527777777777778
Y_DGS3MO 0.625
Y_DGS6MO 0.5694444444444444
Y_DGS1 0.6527777777777778
Y_DGS2 0.6527777777777778
Y_DGS3 0.6805555555555556
Y_DGS5 0.6805555555555556
Y_DGS7 0.6805555555555556
Y_DGS10 0.6388888888888888
Y_DGS20 0.5972222222222222
Y_DGS30 0.625


 24%|██▍       | 19/79 [1:03:13<3:18:03, 198.06s/it]

Y_DGS1MO 0.6710526315789473
Y_DGS3MO 0.6447368421052632
Y_DGS6MO 0.5657894736842105
Y_DGS1 0.6578947368421053
Y_DGS2 0.6578947368421053
Y_DGS3 0.6842105263157895
Y_DGS5 0.6842105263157895
Y_DGS7 0.6710526315789473
Y_DGS10 0.618421052631579
Y_DGS20 0.5789473684210527
Y_DGS30 0.6052631578947368


 25%|██▌       | 20/79 [1:06:29<3:14:04, 197.36s/it]

Y_DGS1MO 0.6625
Y_DGS3MO 0.625
Y_DGS6MO 0.575
Y_DGS1 0.6625
Y_DGS2 0.6625
Y_DGS3 0.6875
Y_DGS5 0.6875
Y_DGS7 0.675
Y_DGS10 0.625
Y_DGS20 0.575
Y_DGS30 0.5875


 27%|██▋       | 21/79 [1:09:44<3:10:17, 196.85s/it]

Y_DGS1MO 0.6547619047619048
Y_DGS3MO 0.6428571428571429
Y_DGS6MO 0.5952380952380952
Y_DGS1 0.6785714285714286
Y_DGS2 0.6785714285714286
Y_DGS3 0.7023809523809523
Y_DGS5 0.6904761904761905
Y_DGS7 0.6785714285714286
Y_DGS10 0.6309523809523809
Y_DGS20 0.5833333333333334
Y_DGS30 0.5833333333333334


 28%|██▊       | 22/79 [1:13:01<3:06:55, 196.77s/it]

Y_DGS1MO 0.6590909090909091
Y_DGS3MO 0.6477272727272727
Y_DGS6MO 0.5909090909090909
Y_DGS1 0.6704545454545454
Y_DGS2 0.6704545454545454
Y_DGS3 0.6931818181818182
Y_DGS5 0.6818181818181818
Y_DGS7 0.6704545454545454
Y_DGS10 0.625
Y_DGS20 0.5795454545454546
Y_DGS30 0.5795454545454546


 29%|██▉       | 23/79 [1:16:15<3:02:57, 196.04s/it]

Y_DGS1MO 0.6521739130434783
Y_DGS3MO 0.6521739130434783
Y_DGS6MO 0.5978260869565217
Y_DGS1 0.6739130434782609
Y_DGS2 0.6739130434782609
Y_DGS3 0.6956521739130435
Y_DGS5 0.6739130434782609
Y_DGS7 0.6739130434782609
Y_DGS10 0.6304347826086957
Y_DGS20 0.5869565217391305
Y_DGS30 0.5869565217391305


 30%|███       | 24/79 [1:19:27<2:58:27, 194.68s/it]

Y_DGS1MO 0.65625
Y_DGS3MO 0.65625
Y_DGS6MO 0.6041666666666666
Y_DGS1 0.6770833333333334
Y_DGS2 0.65625
Y_DGS3 0.6875
Y_DGS5 0.65625
Y_DGS7 0.65625
Y_DGS10 0.6145833333333334
Y_DGS20 0.5729166666666666
Y_DGS30 0.5729166666666666


 32%|███▏      | 25/79 [1:22:38<2:54:17, 193.65s/it]

Y_DGS1MO 0.66
Y_DGS3MO 0.65
Y_DGS6MO 0.62
Y_DGS1 0.68
Y_DGS2 0.65
Y_DGS3 0.68
Y_DGS5 0.65
Y_DGS7 0.65
Y_DGS10 0.61
Y_DGS20 0.57
Y_DGS30 0.57


 33%|███▎      | 26/79 [1:25:51<2:50:49, 193.38s/it]

Y_DGS1MO 0.6730769230769231
Y_DGS3MO 0.6538461538461539
Y_DGS6MO 0.6346153846153846
Y_DGS1 0.6923076923076923
Y_DGS2 0.6538461538461539
Y_DGS3 0.6730769230769231
Y_DGS5 0.6442307692307693
Y_DGS7 0.6442307692307693
Y_DGS10 0.5961538461538461
Y_DGS20 0.5576923076923077
Y_DGS30 0.5576923076923077


 34%|███▍      | 27/79 [1:29:06<2:47:58, 193.81s/it]

Y_DGS1MO 0.6666666666666666
Y_DGS3MO 0.6574074074074074
Y_DGS6MO 0.6388888888888888
Y_DGS1 0.6944444444444444
Y_DGS2 0.6574074074074074
Y_DGS3 0.6759259259259259
Y_DGS5 0.6481481481481481
Y_DGS7 0.6481481481481481
Y_DGS10 0.6018518518518519
Y_DGS20 0.5648148148148148
Y_DGS30 0.5648148148148148


 35%|███▌      | 28/79 [1:32:25<2:46:07, 195.44s/it]

Y_DGS1MO 0.6785714285714286
Y_DGS3MO 0.6607142857142857
Y_DGS6MO 0.6428571428571429
Y_DGS1 0.7053571428571429
Y_DGS2 0.6607142857142857
Y_DGS3 0.6785714285714286
Y_DGS5 0.6428571428571429
Y_DGS7 0.6428571428571429
Y_DGS10 0.5982142857142857
Y_DGS20 0.5625
Y_DGS30 0.5625


 37%|███▋      | 29/79 [1:35:54<2:46:11, 199.42s/it]

Y_DGS1MO 0.6724137931034483
Y_DGS3MO 0.6637931034482759
Y_DGS6MO 0.646551724137931
Y_DGS1 0.6982758620689655
Y_DGS2 0.6551724137931034
Y_DGS3 0.6637931034482759
Y_DGS5 0.6206896551724138
Y_DGS7 0.6206896551724138
Y_DGS10 0.5775862068965517
Y_DGS20 0.5431034482758621
Y_DGS30 0.5517241379310345


 38%|███▊      | 30/79 [1:39:17<2:43:46, 200.54s/it]

Y_DGS1MO 0.675
Y_DGS3MO 0.6666666666666666
Y_DGS6MO 0.6416666666666667
Y_DGS1 0.7
Y_DGS2 0.6583333333333333
Y_DGS3 0.6583333333333333
Y_DGS5 0.6083333333333333
Y_DGS7 0.6083333333333333
Y_DGS10 0.5666666666666667
Y_DGS20 0.5333333333333333
Y_DGS30 0.5416666666666666


 39%|███▉      | 31/79 [1:42:49<2:43:22, 204.22s/it]

Y_DGS1MO 0.6854838709677419
Y_DGS3MO 0.6693548387096774
Y_DGS6MO 0.6451612903225806
Y_DGS1 0.6935483870967742
Y_DGS2 0.6612903225806451
Y_DGS3 0.6612903225806451
Y_DGS5 0.6129032258064516
Y_DGS7 0.6129032258064516
Y_DGS10 0.5725806451612904
Y_DGS20 0.5483870967741935
Y_DGS30 0.5564516129032258


 41%|████      | 32/79 [1:46:12<2:39:39, 203.81s/it]

Y_DGS1MO 0.6953125
Y_DGS3MO 0.671875
Y_DGS6MO 0.6484375
Y_DGS1 0.6953125
Y_DGS2 0.65625
Y_DGS3 0.6484375
Y_DGS5 0.6015625
Y_DGS7 0.609375
Y_DGS10 0.5703125
Y_DGS20 0.546875
Y_DGS30 0.5546875


 42%|████▏     | 33/79 [1:49:32<2:35:16, 202.54s/it]

Y_DGS1MO 0.6893939393939394
Y_DGS3MO 0.6590909090909091
Y_DGS6MO 0.6363636363636364
Y_DGS1 0.6893939393939394
Y_DGS2 0.6515151515151515
Y_DGS3 0.6439393939393939
Y_DGS5 0.5984848484848485
Y_DGS7 0.6136363636363636
Y_DGS10 0.5833333333333334
Y_DGS20 0.5606060606060606
Y_DGS30 0.5681818181818182


 43%|████▎     | 34/79 [1:52:50<2:30:50, 201.12s/it]

Y_DGS1MO 0.6911764705882353
Y_DGS3MO 0.6544117647058824
Y_DGS6MO 0.6470588235294118
Y_DGS1 0.6985294117647058
Y_DGS2 0.6470588235294118
Y_DGS3 0.6397058823529411
Y_DGS5 0.6029411764705882
Y_DGS7 0.6176470588235294
Y_DGS10 0.5882352941176471
Y_DGS20 0.5661764705882353
Y_DGS30 0.5735294117647058


 44%|████▍     | 35/79 [1:56:08<2:26:46, 200.14s/it]

Y_DGS1MO 0.7
Y_DGS3MO 0.6571428571428571
Y_DGS6MO 0.65
Y_DGS1 0.7
Y_DGS2 0.6428571428571429
Y_DGS3 0.6357142857142857
Y_DGS5 0.6
Y_DGS7 0.6142857142857143
Y_DGS10 0.5928571428571429
Y_DGS20 0.5714285714285714
Y_DGS30 0.5785714285714286


 46%|████▌     | 36/79 [1:59:28<2:23:23, 200.08s/it]

Y_DGS1MO 0.7013888888888888
Y_DGS3MO 0.6666666666666666
Y_DGS6MO 0.6527777777777778
Y_DGS1 0.6944444444444444
Y_DGS2 0.6458333333333334
Y_DGS3 0.6319444444444444
Y_DGS5 0.5972222222222222
Y_DGS7 0.6111111111111112
Y_DGS10 0.5833333333333334
Y_DGS20 0.5694444444444444
Y_DGS30 0.5763888888888888


 47%|████▋     | 37/79 [2:02:57<2:21:55, 202.75s/it]

Y_DGS1MO 0.7027027027027027
Y_DGS3MO 0.6621621621621622
Y_DGS6MO 0.6554054054054054
Y_DGS1 0.6891891891891891
Y_DGS2 0.6418918918918919
Y_DGS3 0.6216216216216216
Y_DGS5 0.5878378378378378
Y_DGS7 0.6013513513513513
Y_DGS10 0.5675675675675675
Y_DGS20 0.5608108108108109
Y_DGS30 0.5675675675675675


 48%|████▊     | 38/79 [2:06:12<2:17:05, 200.62s/it]

Y_DGS1MO 0.6907894736842105
Y_DGS3MO 0.6644736842105263
Y_DGS6MO 0.6578947368421053
Y_DGS1 0.6776315789473685
Y_DGS2 0.6381578947368421
Y_DGS3 0.618421052631579
Y_DGS5 0.5855263157894737
Y_DGS7 0.5986842105263158
Y_DGS10 0.5723684210526315
Y_DGS20 0.5657894736842105
Y_DGS30 0.5723684210526315


 49%|████▉     | 39/79 [2:09:29<2:13:03, 199.59s/it]

Y_DGS1MO 0.6923076923076923
Y_DGS3MO 0.6666666666666666
Y_DGS6MO 0.6602564102564102
Y_DGS1 0.6602564102564102
Y_DGS2 0.6346153846153846
Y_DGS3 0.6089743589743589
Y_DGS5 0.5897435897435898
Y_DGS7 0.6025641025641025
Y_DGS10 0.5769230769230769
Y_DGS20 0.5705128205128205
Y_DGS30 0.5769230769230769


 51%|█████     | 40/79 [2:12:44<2:08:52, 198.26s/it]

Y_DGS1MO 0.6875
Y_DGS3MO 0.66875
Y_DGS6MO 0.65
Y_DGS1 0.65
Y_DGS2 0.625
Y_DGS3 0.60625
Y_DGS5 0.58125
Y_DGS7 0.59375
Y_DGS10 0.56875
Y_DGS20 0.56875
Y_DGS30 0.575


 52%|█████▏    | 41/79 [2:15:56<2:04:13, 196.15s/it]

Y_DGS1MO 0.6951219512195121
Y_DGS3MO 0.6524390243902439
Y_DGS6MO 0.6341463414634146
Y_DGS1 0.6341463414634146
Y_DGS2 0.6097560975609756
Y_DGS3 0.5914634146341463
Y_DGS5 0.573170731707317
Y_DGS7 0.5853658536585366
Y_DGS10 0.5609756097560976
Y_DGS20 0.5609756097560976
Y_DGS30 0.573170731707317


 53%|█████▎    | 42/79 [2:19:09<2:00:27, 195.33s/it]

Y_DGS1MO 0.6964285714285714
Y_DGS3MO 0.6488095238095238
Y_DGS6MO 0.6369047619047619
Y_DGS1 0.6309523809523809
Y_DGS2 0.6011904761904762
Y_DGS3 0.5833333333333334
Y_DGS5 0.5654761904761905
Y_DGS7 0.5773809523809523
Y_DGS10 0.5595238095238095
Y_DGS20 0.5535714285714286
Y_DGS30 0.5654761904761905


 54%|█████▍    | 43/79 [2:22:50<2:01:51, 203.10s/it]

Y_DGS1MO 0.686046511627907
Y_DGS3MO 0.6569767441860465
Y_DGS6MO 0.622093023255814
Y_DGS1 0.6162790697674418
Y_DGS2 0.5930232558139535
Y_DGS3 0.5697674418604651
Y_DGS5 0.5581395348837209
Y_DGS7 0.5697674418604651
Y_DGS10 0.5523255813953488
Y_DGS20 0.5465116279069767
Y_DGS30 0.5581395348837209


 56%|█████▌    | 44/79 [2:26:08<1:57:34, 201.56s/it]

Y_DGS1MO 0.6761363636363636
Y_DGS3MO 0.6590909090909091
Y_DGS6MO 0.6306818181818182
Y_DGS1 0.6022727272727273
Y_DGS2 0.5852272727272727
Y_DGS3 0.5625
Y_DGS5 0.5511363636363636
Y_DGS7 0.5625
Y_DGS10 0.5454545454545454
Y_DGS20 0.5397727272727273
Y_DGS30 0.5511363636363636


 57%|█████▋    | 45/79 [2:29:25<1:53:26, 200.20s/it]

Y_DGS1MO 0.6722222222222223
Y_DGS3MO 0.6555555555555556
Y_DGS6MO 0.6333333333333333
Y_DGS1 0.6
Y_DGS2 0.5833333333333334
Y_DGS3 0.5666666666666667
Y_DGS5 0.5555555555555556
Y_DGS7 0.5666666666666667
Y_DGS10 0.55
Y_DGS20 0.5444444444444444
Y_DGS30 0.55


 58%|█████▊    | 46/79 [2:32:43<1:49:43, 199.50s/it]

Y_DGS1MO 0.657608695652174
Y_DGS3MO 0.6630434782608695
Y_DGS6MO 0.6413043478260869
Y_DGS1 0.5978260869565217
Y_DGS2 0.5760869565217391
Y_DGS3 0.5597826086956522
Y_DGS5 0.5489130434782609
Y_DGS7 0.5597826086956522
Y_DGS10 0.5434782608695652
Y_DGS20 0.5380434782608695
Y_DGS30 0.5434782608695652


 59%|█████▉    | 47/79 [2:36:00<1:45:56, 198.63s/it]

Y_DGS1MO 0.6436170212765957
Y_DGS3MO 0.6648936170212766
Y_DGS6MO 0.648936170212766
Y_DGS1 0.5957446808510638
Y_DGS2 0.574468085106383
Y_DGS3 0.5638297872340425
Y_DGS5 0.5531914893617021
Y_DGS7 0.5638297872340425
Y_DGS10 0.5478723404255319
Y_DGS20 0.5425531914893617
Y_DGS30 0.5478723404255319


 61%|██████    | 48/79 [2:39:14<1:41:55, 197.28s/it]

Y_DGS1MO 0.6458333333333334
Y_DGS3MO 0.6666666666666666
Y_DGS6MO 0.6510416666666666
Y_DGS1 0.5989583333333334
Y_DGS2 0.5729166666666666
Y_DGS3 0.5677083333333334
Y_DGS5 0.5572916666666666
Y_DGS7 0.5625
Y_DGS10 0.546875
Y_DGS20 0.5416666666666666
Y_DGS30 0.546875


 62%|██████▏   | 49/79 [2:42:34<1:39:00, 198.00s/it]

Y_DGS1MO 0.6479591836734694
Y_DGS3MO 0.673469387755102
Y_DGS6MO 0.6581632653061225
Y_DGS1 0.5918367346938775
Y_DGS2 0.5663265306122449
Y_DGS3 0.5561224489795918
Y_DGS5 0.5459183673469388
Y_DGS7 0.5510204081632653
Y_DGS10 0.5357142857142857
Y_DGS20 0.5357142857142857
Y_DGS30 0.5408163265306123


 63%|██████▎   | 50/79 [2:45:49<1:35:18, 197.19s/it]

Y_DGS1MO 0.65
Y_DGS3MO 0.68
Y_DGS6MO 0.665
Y_DGS1 0.585
Y_DGS2 0.555
Y_DGS3 0.545
Y_DGS5 0.535
Y_DGS7 0.54
Y_DGS10 0.525
Y_DGS20 0.525
Y_DGS30 0.53


 65%|██████▍   | 51/79 [2:49:02<1:31:29, 196.06s/it]

Y_DGS1MO 0.6519607843137255
Y_DGS3MO 0.6862745098039216
Y_DGS6MO 0.6715686274509803
Y_DGS1 0.5784313725490197
Y_DGS2 0.553921568627451
Y_DGS3 0.5441176470588235
Y_DGS5 0.5392156862745098
Y_DGS7 0.5441176470588235
Y_DGS10 0.5294117647058824
Y_DGS20 0.5196078431372549
Y_DGS30 0.5245098039215687


 66%|██████▌   | 52/79 [2:52:15<1:27:44, 194.97s/it]

Y_DGS1MO 0.6490384615384616
Y_DGS3MO 0.6875
Y_DGS6MO 0.6682692307692307
Y_DGS1 0.5769230769230769
Y_DGS2 0.5480769230769231
Y_DGS3 0.5384615384615384
Y_DGS5 0.5336538461538461
Y_DGS7 0.5336538461538461
Y_DGS10 0.5192307692307693
Y_DGS20 0.5192307692307693
Y_DGS30 0.5144230769230769


 67%|██████▋   | 53/79 [2:55:29<1:24:25, 194.84s/it]

Y_DGS1MO 0.6462264150943396
Y_DGS3MO 0.6839622641509434
Y_DGS6MO 0.6650943396226415
Y_DGS1 0.5707547169811321
Y_DGS2 0.5424528301886793
Y_DGS3 0.5330188679245284
Y_DGS5 0.5283018867924528
Y_DGS7 0.5283018867924528
Y_DGS10 0.5141509433962265
Y_DGS20 0.5141509433962265
Y_DGS30 0.5141509433962265


 68%|██████▊   | 54/79 [2:58:42<1:20:55, 194.22s/it]

Y_DGS1MO 0.6481481481481481
Y_DGS3MO 0.6851851851851852
Y_DGS6MO 0.6666666666666666
Y_DGS1 0.5787037037037037
Y_DGS2 0.5462962962962963
Y_DGS3 0.5370370370370371
Y_DGS5 0.5231481481481481
Y_DGS7 0.5277777777777778
Y_DGS10 0.5185185185185185
Y_DGS20 0.5231481481481481
Y_DGS30 0.5185185185185185


 70%|██████▉   | 55/79 [3:01:54<1:17:23, 193.47s/it]

Y_DGS1MO 0.6454545454545455
Y_DGS3MO 0.6863636363636364
Y_DGS6MO 0.6727272727272727
Y_DGS1 0.5681818181818182
Y_DGS2 0.5363636363636364
Y_DGS3 0.5272727272727272
Y_DGS5 0.5136363636363637
Y_DGS7 0.5181818181818182
Y_DGS10 0.509090909090909
Y_DGS20 0.5181818181818182
Y_DGS30 0.5136363636363637


 71%|███████   | 56/79 [3:05:04<1:13:44, 192.37s/it]

Y_DGS1MO 0.6473214285714286
Y_DGS3MO 0.6875
Y_DGS6MO 0.6696428571428571
Y_DGS1 0.5625
Y_DGS2 0.5357142857142857
Y_DGS3 0.5267857142857143
Y_DGS5 0.5133928571428571
Y_DGS7 0.5178571428571429
Y_DGS10 0.5089285714285714
Y_DGS20 0.5178571428571429
Y_DGS30 0.5178571428571429


 72%|███████▏  | 57/79 [3:08:16<1:10:31, 192.35s/it]

Y_DGS1MO 0.6359649122807017
Y_DGS3MO 0.6885964912280702
Y_DGS6MO 0.6754385964912281
Y_DGS1 0.5570175438596491
Y_DGS2 0.5307017543859649
Y_DGS3 0.5219298245614035
Y_DGS5 0.5087719298245614
Y_DGS7 0.5131578947368421
Y_DGS10 0.5043859649122807
Y_DGS20 0.5131578947368421
Y_DGS30 0.5131578947368421


 73%|███████▎  | 58/79 [3:11:29<1:07:21, 192.44s/it]

Y_DGS1MO 0.6293103448275862
Y_DGS3MO 0.6853448275862069
Y_DGS6MO 0.6767241379310345
Y_DGS1 0.5517241379310345
Y_DGS2 0.5301724137931034
Y_DGS3 0.521551724137931
Y_DGS5 0.5043103448275862
Y_DGS7 0.5129310344827587
Y_DGS10 0.5043103448275862
Y_DGS20 0.5086206896551724
Y_DGS30 0.5086206896551724


 75%|███████▍  | 59/79 [3:14:38<1:03:52, 191.60s/it]

Y_DGS1MO 0.6228813559322034
Y_DGS3MO 0.6822033898305084
Y_DGS6MO 0.673728813559322
Y_DGS1 0.5550847457627118
Y_DGS2 0.5296610169491526
Y_DGS3 0.5169491525423728
Y_DGS5 0.5042372881355932
Y_DGS7 0.5127118644067796
Y_DGS10 0.5
Y_DGS20 0.5127118644067796
Y_DGS30 0.5084745762711864


 76%|███████▌  | 60/79 [3:17:49<1:00:34, 191.28s/it]

Y_DGS1MO 0.6166666666666667
Y_DGS3MO 0.6875
Y_DGS6MO 0.675
Y_DGS1 0.5625
Y_DGS2 0.5333333333333333
Y_DGS3 0.5166666666666667
Y_DGS5 0.5041666666666667
Y_DGS7 0.5125
Y_DGS10 0.5041666666666667
Y_DGS20 0.5125
Y_DGS30 0.5083333333333333


 77%|███████▋  | 61/79 [3:21:00<57:25, 191.41s/it]  

Y_DGS1MO 0.6229508196721312
Y_DGS3MO 0.6885245901639344
Y_DGS6MO 0.6762295081967213
Y_DGS1 0.5614754098360656
Y_DGS2 0.5368852459016393
Y_DGS3 0.5122950819672131
Y_DGS5 0.4959016393442623
Y_DGS7 0.5122950819672131
Y_DGS10 0.4959016393442623
Y_DGS20 0.5081967213114754
Y_DGS30 0.5040983606557377


 78%|███████▊  | 62/79 [3:24:08<53:55, 190.35s/it]

Y_DGS1MO 0.6209677419354839
Y_DGS3MO 0.6895161290322581
Y_DGS6MO 0.6733870967741935
Y_DGS1 0.5604838709677419
Y_DGS2 0.5403225806451613
Y_DGS3 0.5120967741935484
Y_DGS5 0.49193548387096775
Y_DGS7 0.5040322580645161
Y_DGS10 0.4959677419354839
Y_DGS20 0.5080645161290323
Y_DGS30 0.5040322580645161


 80%|███████▉  | 63/79 [3:27:17<50:37, 189.86s/it]

Y_DGS1MO 0.6150793650793651
Y_DGS3MO 0.6865079365079365
Y_DGS6MO 0.6746031746031746
Y_DGS1 0.5595238095238095
Y_DGS2 0.5436507936507936
Y_DGS3 0.5119047619047619
Y_DGS5 0.48412698412698413
Y_DGS7 0.5079365079365079
Y_DGS10 0.4880952380952381
Y_DGS20 0.5
Y_DGS30 0.49603174603174605


 81%|████████  | 64/79 [3:30:22<47:05, 188.39s/it]

Y_DGS1MO 0.609375
Y_DGS3MO 0.6796875
Y_DGS6MO 0.66796875
Y_DGS1 0.55859375
Y_DGS2 0.54296875
Y_DGS3 0.515625
Y_DGS5 0.48828125
Y_DGS7 0.5078125
Y_DGS10 0.4921875
Y_DGS20 0.50390625
Y_DGS30 0.5


 82%|████████▏ | 65/79 [3:33:30<43:57, 188.37s/it]

Y_DGS1MO 0.6115384615384616
Y_DGS3MO 0.676923076923077
Y_DGS6MO 0.6653846153846154
Y_DGS1 0.5576923076923077
Y_DGS2 0.5423076923076923
Y_DGS3 0.5153846153846153
Y_DGS5 0.48846153846153845
Y_DGS7 0.5076923076923077
Y_DGS10 0.49230769230769234
Y_DGS20 0.5115384615384615
Y_DGS30 0.5038461538461538


 84%|████████▎ | 66/79 [3:36:34<40:31, 187.03s/it]

Y_DGS1MO 0.6136363636363636
Y_DGS3MO 0.678030303030303
Y_DGS6MO 0.6590909090909091
Y_DGS1 0.5606060606060606
Y_DGS2 0.5454545454545454
Y_DGS3 0.5189393939393939
Y_DGS5 0.49242424242424243
Y_DGS7 0.5113636363636364
Y_DGS10 0.4962121212121212
Y_DGS20 0.5151515151515151
Y_DGS30 0.5075757575757576


 85%|████████▍ | 67/79 [3:39:39<37:15, 186.26s/it]

Y_DGS1MO 0.6156716417910447
Y_DGS3MO 0.6791044776119403
Y_DGS6MO 0.664179104477612
Y_DGS1 0.5597014925373134
Y_DGS2 0.5447761194029851
Y_DGS3 0.5186567164179104
Y_DGS5 0.4925373134328358
Y_DGS7 0.5111940298507462
Y_DGS10 0.4962686567164179
Y_DGS20 0.5111940298507462
Y_DGS30 0.503731343283582


 86%|████████▌ | 68/79 [3:42:44<34:06, 186.02s/it]

Y_DGS1MO 0.6176470588235294
Y_DGS3MO 0.6764705882352942
Y_DGS6MO 0.6544117647058824
Y_DGS1 0.5514705882352942
Y_DGS2 0.5367647058823529
Y_DGS3 0.5147058823529411
Y_DGS5 0.49264705882352944
Y_DGS7 0.5110294117647058
Y_DGS10 0.4963235294117647
Y_DGS20 0.5110294117647058
Y_DGS30 0.5036764705882353


 87%|████████▋ | 69/79 [3:45:49<30:56, 185.68s/it]

Y_DGS1MO 0.6159420289855072
Y_DGS3MO 0.6702898550724637
Y_DGS6MO 0.6557971014492754
Y_DGS1 0.5543478260869565
Y_DGS2 0.5398550724637681
Y_DGS3 0.5181159420289855
Y_DGS5 0.4963768115942029
Y_DGS7 0.5144927536231884
Y_DGS10 0.5
Y_DGS20 0.5144927536231884
Y_DGS30 0.5072463768115942


 89%|████████▊ | 70/79 [3:48:53<27:47, 185.26s/it]

Y_DGS1MO 0.6142857142857143
Y_DGS3MO 0.6607142857142857
Y_DGS6MO 0.6571428571428571
Y_DGS1 0.5464285714285714
Y_DGS2 0.5357142857142857
Y_DGS3 0.5107142857142857
Y_DGS5 0.48928571428571427
Y_DGS7 0.5107142857142857
Y_DGS10 0.49642857142857144
Y_DGS20 0.5107142857142857
Y_DGS30 0.5035714285714286


 90%|████████▉ | 71/79 [3:51:57<24:38, 184.85s/it]

Y_DGS1MO 0.6161971830985915
Y_DGS3MO 0.6584507042253521
Y_DGS6MO 0.6584507042253521
Y_DGS1 0.5492957746478874
Y_DGS2 0.5387323943661971
Y_DGS3 0.5105633802816901
Y_DGS5 0.4894366197183099
Y_DGS7 0.5105633802816901
Y_DGS10 0.5
Y_DGS20 0.5140845070422535
Y_DGS30 0.5070422535211268


 91%|█████████ | 72/79 [3:55:03<21:35, 185.08s/it]

Y_DGS1MO 0.6180555555555556
Y_DGS3MO 0.6597222222222222
Y_DGS6MO 0.65625
Y_DGS1 0.5520833333333334
Y_DGS2 0.5416666666666666
Y_DGS3 0.5138888888888888
Y_DGS5 0.4930555555555556
Y_DGS7 0.5104166666666666
Y_DGS10 0.5
Y_DGS20 0.5173611111111112
Y_DGS30 0.5069444444444444


 92%|█████████▏| 73/79 [3:58:06<18:27, 184.51s/it]

Y_DGS1MO 0.6164383561643836
Y_DGS3MO 0.6643835616438356
Y_DGS6MO 0.6575342465753424
Y_DGS1 0.5547945205479452
Y_DGS2 0.541095890410959
Y_DGS3 0.5136986301369864
Y_DGS5 0.4931506849315068
Y_DGS7 0.5102739726027398
Y_DGS10 0.5
Y_DGS20 0.5171232876712328
Y_DGS30 0.5068493150684932


 94%|█████████▎| 74/79 [4:01:08<15:18, 183.62s/it]

Y_DGS1MO 0.6114864864864865
Y_DGS3MO 0.6655405405405406
Y_DGS6MO 0.6554054054054054
Y_DGS1 0.5574324324324325
Y_DGS2 0.5405405405405406
Y_DGS3 0.5135135135135135
Y_DGS5 0.49324324324324326
Y_DGS7 0.5101351351351351
Y_DGS10 0.5
Y_DGS20 0.5168918918918919
Y_DGS30 0.5067567567567568


 95%|█████████▍| 75/79 [4:04:09<12:12, 183.00s/it]

Y_DGS1MO 0.6166666666666667
Y_DGS3MO 0.67
Y_DGS6MO 0.66
Y_DGS1 0.56
Y_DGS2 0.54
Y_DGS3 0.5133333333333333
Y_DGS5 0.49666666666666665
Y_DGS7 0.5133333333333333
Y_DGS10 0.5033333333333333
Y_DGS20 0.52
Y_DGS30 0.51


 96%|█████████▌| 76/79 [4:07:11<09:07, 182.60s/it]

Y_DGS1MO 0.6151315789473685
Y_DGS3MO 0.6710526315789473
Y_DGS6MO 0.6611842105263158
Y_DGS1 0.5625
Y_DGS2 0.5427631578947368
Y_DGS3 0.5098684210526315
Y_DGS5 0.4934210526315789
Y_DGS7 0.5098684210526315
Y_DGS10 0.5
Y_DGS20 0.5164473684210527
Y_DGS30 0.506578947368421


 97%|█████████▋| 77/79 [4:10:14<06:05, 182.64s/it]

Y_DGS1MO 0.6201298701298701
Y_DGS3MO 0.6753246753246753
Y_DGS6MO 0.6590909090909091
Y_DGS1 0.5616883116883117
Y_DGS2 0.538961038961039
Y_DGS3 0.5032467532467533
Y_DGS5 0.4902597402597403
Y_DGS7 0.5064935064935064
Y_DGS10 0.4967532467532468
Y_DGS20 0.512987012987013
Y_DGS30 0.5032467532467533


 99%|█████████▊| 78/79 [4:13:18<03:03, 183.13s/it]

Y_DGS1MO 0.6185897435897436
Y_DGS3MO 0.6730769230769231
Y_DGS6MO 0.657051282051282
Y_DGS1 0.5576923076923077
Y_DGS2 0.5352564102564102
Y_DGS3 0.5032051282051282
Y_DGS5 0.4935897435897436
Y_DGS7 0.5096153846153846
Y_DGS10 0.5
Y_DGS20 0.5160256410256411
Y_DGS30 0.5064102564102564


100%|██████████| 79/79 [4:16:19<00:00, 194.67s/it]

Y_DGS1MO 0.620253164556962
Y_DGS3MO 0.6772151898734177
Y_DGS6MO 0.6550632911392406
Y_DGS1 0.5569620253164557
Y_DGS2 0.5316455696202531
Y_DGS3 0.5
Y_DGS5 0.49050632911392406
Y_DGS7 0.5063291139240507
Y_DGS10 0.49683544303797467
Y_DGS20 0.5126582278481012
Y_DGS30 0.5031645569620253





Y_DGS1MO    0.620253
Y_DGS3MO    0.677215
Y_DGS6MO    0.655063
Y_DGS1      0.556962
Y_DGS2      0.531646
Y_DGS3      0.500000
Y_DGS5      0.490506
Y_DGS7      0.506329
Y_DGS10     0.496835
Y_DGS20     0.512658
Y_DGS30     0.503165
dtype: float64


## D) Random Forests

We'll proceed as for the logistic regression and XGBoost models when training random forests. The pipelines (with and without PCA) are defined below:

In [None]:
# Rolling-window sizes, in rows of the feature/target frames.
# 52 * 15 = 780 rows; presumably 15 years of weekly observations -- TODO confirm.
window_train = 52 * 15
window_pred = 4

# Time-ordered folds for the inner hyper-parameter search (no shuffling).
tscv = TimeSeriesSplit(n_splits=4)

# The `rf__estimator__` prefix routes each value through the pipeline step 'rf'
# and the MultiOutputClassifier wrapper down to the underlying forest.
param_grid = {
    'rf__estimator__n_estimators': [150],
    'rf__estimator__max_depth': [4, 7],
    'rf__estimator__min_samples_leaf': [5, 10],
}


def _multi_output_forest():
    """Return a fresh, seeded random forest wrapped for multi-column targets."""
    forest = RandomForestClassifier(n_jobs=-1, random_state=42)
    return MultiOutputClassifier(forest)


# Pipeline without dimensionality reduction: standardise, then fit the forests.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', _multi_output_forest()),
])

# Same pipeline with a PCA step keeping 99% of the variance before the forests.
pipepca = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.99)),
    ('rf', _multi_output_forest()),
])

# Template of the cross-validated search used for every training window
# (swap `pipe` for `pipepca` to include the PCA step).
grid = GridSearchCV(
    pipe,
    param_grid,
    cv=tscv,
    scoring='accuracy',
    n_jobs=-1,
)

In [None]:
def rolling_forest_backtest(X, Y, pipeline, param_grid, cv, window_train, window_pred):
    """Walk-forward evaluation of a grid-searched random-forest pipeline.

    For every window the pipeline is tuned with ``GridSearchCV`` on
    ``window_train`` consecutive rows and then used to predict the next
    ``window_pred`` rows; the window then advances by ``window_pred`` rows.

    Parameters
    ----------
    X, Y : pandas.DataFrame
        Features and targets, sliced by the same row positions.
    pipeline : sklearn Pipeline
        Must contain a step named ``'rf'`` exposing ``estimators_`` (one fitted
        estimator per column of ``Y``).
    param_grid : dict
        Grid handed to ``GridSearchCV``.
    cv : cross-validation splitter
        Splitter used inside each training window.
    window_train, window_pred : int
        Number of rows used for training / predicted per window.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(acc, params, feature_imp, ytrue, ypred)``. The first three have one
        row per window and one column per target; ``ytrue`` and ``ypred`` have
        one row per predicted observation, indexed by the rows actually predicted.

    Raises
    ------
    ValueError
        If ``X`` is too short to hold a single train + prediction window.
    """
    starts = range(0, len(X) - window_train - window_pred + 1, window_pred)
    if len(starts) == 0:
        raise ValueError(
            f'need at least {window_train + window_pred} rows for one rolling window, got {len(X)}'
        )

    accuracies = {col: [] for col in Y.columns}
    params_by_target = {col: [] for col in Y.columns}
    feature_importance = {col: [] for col in Y.columns}
    y_true = []
    y_pred = []

    for start in tqdm(starts):
        end_train = start + window_train
        end_pred = end_train + window_pred

        X_train, Y_train = X.iloc[start:end_train], Y.iloc[start:end_train]
        X_test, Y_test = X.iloc[end_train:end_pred], Y.iloc[end_train:end_pred]
        y_true.append(Y_test)

        # Hyper-parameters are re-tuned from scratch inside every training window.
        search = GridSearchCV(pipeline,
                              param_grid,
                              cv=cv,
                              scoring='accuracy',
                              n_jobs=-1,
                              verbose=0)
        search.fit(X_train, Y_train)
        best_model = search.best_estimator_

        Y_hat = best_model.predict(X_test)
        y_pred.append(Y_hat)

        for i, col in enumerate(Y.columns):
            accuracies[col].append(accuracy_score(Y_test[col], Y_hat[:, i]))
            # running mean of the out-of-sample accuracy for this target
            print(col, np.mean(accuracies[col]))

            # fitted forest of the i-th output
            est = best_model.named_steps['rf'].estimators_[i]
            # NOTE: when the pipeline has a 'pca' step before 'rf', these importances
            # refer to the principal components, not to the original features.
            feature_importance[col].append(est.feature_importances_)
            # hyper-parameters of the i-th forest
            params_by_target[col].append(est.get_params())

    acc = pd.DataFrame(accuracies, columns=Y.columns)
    params = pd.DataFrame(params_by_target, columns=Y.columns)
    feature_imp = pd.DataFrame(feature_importance, columns=Y.columns)

    # Stack the per-window test slices. The index comes from the rows that were
    # actually predicted, so it always matches the number of forecasts instead of
    # relying on a hard-coded `780 : len - 1` slice.
    ytrue = pd.concat(y_true)
    ypred = pd.DataFrame(np.vstack(y_pred), index=ytrue.index, columns=Y.columns)

    return acc, params, feature_imp, ytrue, ypred


def save_forest_results(frames, tag, out_dir='results of models'):
    """Write each result frame to '<out_dir>/<kind> random forest<tag>, 15y train test.csv'.

    ``frames`` maps the file-name prefix (e.g. 'accuracies') to the DataFrame to save.
    The directory is created first so a long run cannot fail at the very last step.
    """
    os.makedirs(out_dir, exist_ok=True)
    for kind, frame in frames.items():
        frame.to_csv(f'{out_dir}/{kind} random forest{tag}, 15y train test.csv')


for threshold in threshold_list:

    print(f'training model on dataset with features with mutual information above {threshold}')

    Xw = datasets[threshold]

    # Each run is (pipeline, file-name tag, suffix of the summary message).
    if threshold == 0.03:
        # This dataset has many features, so the forest is trained both without and with PCA.
        runs = [(pipe, '', ''), (pipepca, ', pca', ' and PCA')]
    else:
        # For larger mutual-information thresholds, no PCA is applied.
        runs = [(pipe, f', MI {threshold}', '')]

    for pipeline, file_tag, label in runs:
        acc, params, feature_imp, ytrue, ypred = rolling_forest_backtest(
            Xw, Yw, pipeline, param_grid, tscv, window_train, window_pred
        )

        print(f'For each yield, the average out-of-sample accuracy over the {len(acc)} rolling windows on the dataset with mutual information > {threshold}{label} is:')
        print(acc.mean())

        save_forest_results(
            {
                'accuracies': acc,
                'params': params,
                'feature importance': feature_imp,
                'forecast': ypred,
                'true values': ytrue,
            },
            file_tag,
        )


We now train a random forest model on the dataset that keeps only the features whose mutual information is higher than 0.03:

In [None]:
# NOTE(review): this whole cell is commented out. It repeats the no-PCA rolling-window
# random-forest run performed by the `for threshold in threshold_list` cell, but writes
# its CSVs to the working directory under ', no pca' file names.
# If it is ever re-enabled: `param[col].append(param)` appends the dict `param` to itself;
# `params` (the estimator's hyper-parameters) was presumably intended.
# accuracies = {col: [] for col in Yw.columns}
# param = {col: [] for col in Yw.columns}
# feature_importance = {col: [] for col in Yw.columns}
# y_true = []
# y_pred = []


# for start in tqdm(range(0, len(Xw) - window_train - window_pred + 1, window_pred)):
#     end_train = start + window_train
#     end_pred = end_train + window_pred

#     Xw_train = Xw.iloc[start:end_train]
#     Yw_train = Yw.iloc[start:end_train]
#     Xw_test = Xw.iloc[end_train:end_pred]
#     Yw_test = Yw.iloc[end_train:end_pred]

#     y_true.append(Yw_test)


#     grid = GridSearchCV(pipe,
#                         param_grid,
#                         cv = tscv,
#                         scoring = 'accuracy',
#                         n_jobs=-1
#                         )

#     grid.fit(Xw_train,Yw_train)
#     best_model = grid.best_estimator_

#     Yw_pred = best_model.predict(Xw_test)

#     y_pred.append(Yw_pred)

#     for i, col in enumerate(Yw_train.columns):
#         acc = accuracy_score(Yw_test[col], Yw_pred[:, i])
#         accuracies[col].append(acc)
#         # feature importance
#         importance = best_model.named_steps['rf'].estimators_[i].feature_importances_
#         feature_importance[col].append(importance)
#         # hyperparameters
#         params = best_model.named_steps['rf'].estimators_[i].get_params()
#         param[col].append(param)

# acc = pd.DataFrame(accuracies,columns = Yw.columns)
# params = pd.DataFrame(param, columns = Yw.columns)
# feature_imp = pd.DataFrame(feature_importance, columns = Yw.columns)
# Y_true_flat = np.array(y_true).reshape(-1, np.array(y_true).shape[2])  # flattens the first 2 dimensions into one
# ytrue = pd.DataFrame(Y_true_flat, index = Yw.iloc[780:Yw.shape[0]-1].index,columns=Yw.columns)
# Y_pred_flat = np.array(y_pred).reshape(-1, np.array(y_pred).shape[2])  # flattens the first 2 dimensions into one
# ypred = pd.DataFrame(Y_pred_flat,index = Yw.iloc[780:Yw.shape[0]-1].index, columns=Yw.columns)

# acc.to_csv('accuracies random forest, no pca, 15y train test.csv')
# params.to_csv('params random forest, no pca, 15y train test.csv')
# feature_imp.to_csv('feature importance random forest, no pca, 15y train test.csv')
# ypred.to_csv('forecast random forest, no pca, 15y train test.csv')
# ytrue.to_csv('true values random forest, no pca, 15y train test.csv')

For the dataset with MI >0.03, and applying a PCA before training the random forest: 

In [None]:
# NOTE(review): this whole cell is commented out. It repeats the PCA (`pipepca`)
# rolling-window random-forest run performed by the `for threshold in threshold_list`
# cell, but writes its CSVs to the working directory under 'random forest pca' file names.
# If it is ever re-enabled: `param[col].append(param)` appends the dict `param` to itself;
# `params` (the estimator's hyper-parameters) was presumably intended.
# accuracies = {col: [] for col in Yw.columns}
# param = {col: [] for col in Yw.columns}
# feature_importance = {col: [] for col in Yw.columns}
# y_true = []
# y_pred = []


# for start in tqdm(range(0, len(Xw) - window_train - window_pred + 1, window_pred)):
#     end_train = start + window_train
#     end_pred = end_train + window_pred

#     Xw_train = Xw.iloc[start:end_train]
#     Yw_train = Yw.iloc[start:end_train]
#     Xw_test = Xw.iloc[end_train:end_pred]
#     Yw_test = Yw.iloc[end_train:end_pred]

#     y_true.append(Yw_test)


#     grid = GridSearchCV(pipepca,
#                         param_grid,
#                         cv = tscv,
#                         scoring = 'accuracy',
#                         n_jobs=-1
#                         )

#     grid.fit(Xw_train,Yw_train)
#     best_model = grid.best_estimator_

#     Yw_pred = best_model.predict(Xw_test)

#     y_pred.append(Yw_pred)

#     for i, col in enumerate(Yw_train.columns):
#         acc = accuracy_score(Yw_test[col], Yw_pred[:, i])
#         accuracies[col].append(acc)
#         # feature importance
#         importance = best_model.named_steps['rf'].estimators_[i].feature_importances_
#         feature_importance[col].append(importance)
#         # hyperparameters
#         params = best_model.named_steps['rf'].estimators_[i].get_params()
#         param[col].append(param)

# acc = pd.DataFrame(accuracies,columns = Yw.columns)
# params = pd.DataFrame(param, columns = Yw.columns)
# feature_imp = pd.DataFrame(feature_importance, columns = Yw.columns)
# Y_true_flat = np.array(y_true).reshape(-1, np.array(y_true).shape[2])  # flattens the first 2 dimensions into one
# ytrue = pd.DataFrame(Y_true_flat, index = Yw.iloc[780:Yw.shape[0]-1].index,columns=Yw.columns)
# Y_pred_flat = np.array(y_pred).reshape(-1, np.array(y_pred).shape[2])  # flattens the first 2 dimensions into one
# ypred = pd.DataFrame(Y_pred_flat,index = Yw.iloc[780:Yw.shape[0]-1].index, columns=Yw.columns)

# acc.to_csv('accuracies random forest pca, 15y train test.csv')
# params.to_csv('params random forest pca, 15y train test.csv')
# feature_imp.to_csv('feature importance random forest pca, 15y train test.csv')
# ypred.to_csv('forecast random forest pca, 15y train test.csv')
# ytrue.to_csv('true values random forest pca, 15y train test.csv')

For the dataset with MI>0.035:

In [None]:
# NOTE(review): this whole cell is commented out. It is the rolling-window random-forest
# run on `Xw35` (not defined in the visible part of the notebook), writing 'MI 0.035'
# CSVs to the working directory; the `for threshold in threshold_list` cell performs the
# same steps per threshold.
# If it is ever re-enabled: the window range uses len(Xw) while the slices use Xw35, and
# `param[col].append(param)` appends the dict `param` to itself (`params` was presumably intended).
# accuracies = {col: [] for col in Yw.columns}
# param = {col: [] for col in Yw.columns}
# feature_importance = {col: [] for col in Yw.columns}
# y_true = []
# y_pred = []


# for start in tqdm(range(0, len(Xw) - window_train - window_pred + 1, window_pred)):
#     end_train = start + window_train
#     end_pred = end_train + window_pred

#     Xw_train = Xw35.iloc[start:end_train]
#     Yw_train = Yw.iloc[start:end_train]
#     Xw_test = Xw35.iloc[end_train:end_pred]
#     Yw_test = Yw.iloc[end_train:end_pred]

#     y_true.append(Yw_test)


#     grid = GridSearchCV(pipe,
#                         param_grid,
#                         cv = tscv,
#                         scoring = 'accuracy',
#                         n_jobs=-1
#                         )

#     grid.fit(Xw_train,Yw_train)
#     best_model = grid.best_estimator_

#     Yw_pred = best_model.predict(Xw_test)

#     y_pred.append(Yw_pred)

#     for i, col in enumerate(Yw_train.columns):
#         acc = accuracy_score(Yw_test[col], Yw_pred[:, i])
#         accuracies[col].append(acc)
#         # feature importance
#         importance = best_model.named_steps['rf'].estimators_[i].feature_importances_
#         feature_importance[col].append(importance)
#         # hyperparameters
#         params = best_model.named_steps['rf'].estimators_[i].get_params()
#         param[col].append(param)





# acc = pd.DataFrame(accuracies,columns = Yw.columns)
# params = pd.DataFrame(param, columns = Yw.columns)
# feature_imp = pd.DataFrame(feature_importance, columns = Yw.columns)


# Y_true_flat = np.array(y_true).reshape(-1, np.array(y_true).shape[2])  # flattens the first 2 dimensions into one
# ytrue = pd.DataFrame(Y_true_flat, index = Yw.iloc[780:Yw.shape[0]-1].index,columns=Yw.columns)


# Y_pred_flat = np.array(y_pred).reshape(-1, np.array(y_pred).shape[2])  # flattens the first 2 dimensions into one
# ypred = pd.DataFrame(Y_pred_flat,index = Yw.iloc[780:Yw.shape[0]-1].index, columns=Yw.columns)



# acc.to_csv('accuracies random forest, MI 0.035, 15y train test.csv')
# params.to_csv('params random forest, MI 0.035, 15y train test.csv')
# feature_imp.to_csv('feature importance random forest, MI 0.035, 15y train test.csv')
# ypred.to_csv('forecast random forest, MI 0.035, 15y train test.csv')
# ytrue.to_csv('true values random forest, MI 0.035, 15y train test.csv')

100%|██████████| 11/11 [00:32<00:00,  2.92s/it]
100%|██████████| 79/79 [39:07<00:00, 29.71s/it] 


For the dataset with MI > 0.04:

In [None]:
# DISABLED CELL - every line is intentionally left commented out.
# Rolling-window evaluation of the random-forest pipeline on the Xw4 feature set
# (presumably the MI > 0.04 selection announced in the heading above - confirm).
# For each window: tune `pipe` with GridSearchCV on `window_train` rows, predict the
# next `window_pred` rows, then record per-target accuracy, feature importances and
# hyperparameters. Results are written to CSV at the end.
# Relies on names defined in earlier cells: Xw, Xw4, Yw, window_train, window_pred,
# pipe, param_grid, tscv.
# NOTE(review): `named_steps['rf'].estimators_[i]` is treated as the model of the
# i-th target, which presumably requires the 'rf' step to be a MultiOutputClassifier;
# verify against the cell that builds `pipe`.
# NOTE(review): this loop is copy-pasted for each MI threshold; a helper taking the
# feature frame and a file-name tag would remove the duplication.

# accuracies = {col: [] for col in Yw.columns}
# param = {col: [] for col in Yw.columns}
# feature_importance = {col: [] for col in Yw.columns}
# y_true = []
# y_pred = []


# for start in tqdm(range(0, len(Xw) - window_train - window_pred + 1, window_pred)):
#     end_train = start + window_train
#     end_pred = end_train + window_pred

#     Xw_train = Xw4.iloc[start:end_train]
#     Yw_train = Yw.iloc[start:end_train]
#     Xw_test = Xw4.iloc[end_train:end_pred]
#     Yw_test = Yw.iloc[end_train:end_pred]

#     y_true.append(Yw_test)

#     grid = GridSearchCV(pipe,
#                         param_grid,
#                         cv = tscv,
#                         scoring = 'accuracy',
#                         n_jobs=-1
#                         )

#     grid.fit(Xw_train,Yw_train)
#     best_model = grid.best_estimator_

#     Yw_pred = best_model.predict(Xw_test)
#     y_pred.append(Yw_pred)

#     for i, col in enumerate(Yw_train.columns):
#         accuracies[col].append(accuracy_score(Yw_test[col], Yw_pred[:, i]))
#         estimator = best_model.named_steps['rf'].estimators_[i]
#         # feature importance
#         feature_importance[col].append(estimator.feature_importances_)
#         # hyperparameters: store the estimator's own parameters (the previous
#         # version appended the `param` dict to itself instead)
#         param[col].append(estimator.get_params())


# acc = pd.DataFrame(accuracies,columns = Yw.columns)
# params = pd.DataFrame(param, columns = Yw.columns)
# feature_imp = pd.DataFrame(feature_importance, columns = Yw.columns)

# # Stack the per-window test slices into one frame; their own index labels the rows,
# # so no hard-coded start offset (previously 780) is needed.
# ytrue = pd.concat(y_true)
# Y_true_flat = ytrue.to_numpy()
# # Stack the per-window predictions row-wise (merges the first two dimensions into one).
# Y_pred_flat = np.vstack(y_pred)
# ypred = pd.DataFrame(Y_pred_flat, index = ytrue.index, columns=Yw.columns)



# acc.to_csv('accuracies random forest, MI 0.04, 15y train test.csv')
# params.to_csv('params random forest, MI 0.04, 15y train test.csv')
# feature_imp.to_csv('feature importance random forest, MI 0.04, 15y train test.csv')
# ypred.to_csv('forecast random forest, MI 0.04, 15y train test.csv')
# ytrue.to_csv('true values random forest, MI 0.04, 15y train test.csv')

For the dataset with MI > 0.05:

In [None]:
# DISABLED CELL - every line is intentionally left commented out.
# Rolling-window evaluation of the random-forest pipeline on the Xw5 feature set
# (presumably the MI > 0.05 selection announced in the heading above - confirm).
# For each window: tune `pipe` with GridSearchCV on `window_train` rows, predict the
# next `window_pred` rows, then record per-target accuracy, feature importances and
# hyperparameters. Results are written to CSV at the end.
# Relies on names defined in earlier cells: Xw, Xw5, Yw, window_train, window_pred,
# pipe, param_grid, tscv.
# NOTE(review): `named_steps['rf'].estimators_[i]` is treated as the model of the
# i-th target, which presumably requires the 'rf' step to be a MultiOutputClassifier;
# verify against the cell that builds `pipe`.
# NOTE(review): this loop is copy-pasted for each MI threshold; a helper taking the
# feature frame and a file-name tag would remove the duplication.

# accuracies = {col: [] for col in Yw.columns}
# param = {col: [] for col in Yw.columns}
# feature_importance = {col: [] for col in Yw.columns}
# y_true = []
# y_pred = []


# for start in tqdm(range(0, len(Xw) - window_train - window_pred + 1, window_pred)):
#     end_train = start + window_train
#     end_pred = end_train + window_pred

#     Xw_train = Xw5.iloc[start:end_train]
#     Yw_train = Yw.iloc[start:end_train]
#     Xw_test = Xw5.iloc[end_train:end_pred]
#     Yw_test = Yw.iloc[end_train:end_pred]

#     y_true.append(Yw_test)

#     grid = GridSearchCV(pipe,
#                         param_grid,
#                         cv = tscv,
#                         scoring = 'accuracy',
#                         n_jobs=-1
#                         )

#     grid.fit(Xw_train,Yw_train)
#     best_model = grid.best_estimator_

#     Yw_pred = best_model.predict(Xw_test)
#     y_pred.append(Yw_pred)

#     for i, col in enumerate(Yw_train.columns):
#         accuracies[col].append(accuracy_score(Yw_test[col], Yw_pred[:, i]))
#         estimator = best_model.named_steps['rf'].estimators_[i]
#         # feature importance
#         feature_importance[col].append(estimator.feature_importances_)
#         # hyperparameters: store the estimator's own parameters (the previous
#         # version appended the `param` dict to itself instead)
#         param[col].append(estimator.get_params())


# acc = pd.DataFrame(accuracies,columns = Yw.columns)
# params = pd.DataFrame(param, columns = Yw.columns)
# feature_imp = pd.DataFrame(feature_importance, columns = Yw.columns)

# # Stack the per-window test slices into one frame; their own index labels the rows,
# # so no hard-coded start offset (previously 780) is needed.
# ytrue = pd.concat(y_true)
# Y_true_flat = ytrue.to_numpy()
# # Stack the per-window predictions row-wise (merges the first two dimensions into one).
# Y_pred_flat = np.vstack(y_pred)
# ypred = pd.DataFrame(Y_pred_flat, index = ytrue.index, columns=Yw.columns)



# acc.to_csv('accuracies random forest, MI 0.05, 15y train test.csv')
# params.to_csv('params random forest, MI 0.05, 15y train test.csv')
# feature_imp.to_csv('feature importance random forest, MI 0.05, 15y train test.csv')
# ypred.to_csv('forecast random forest, MI 0.05, 15y train test.csv')
# ytrue.to_csv('true values random forest, MI 0.05, 15y train test.csv')