In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import copy
from datetime import datetime, timedelta
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import make_scorer
from sklearn.metrics import log_loss
from sklearn.utils.class_weight import compute_class_weight
import sys
import os
import joblib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import glob 


from sklearn import datasets, ensemble
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.utils import class_weight
import json

import xgboost as xgb
from bayes_opt import BayesianOptimization
from bayes_opt import acquisition
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_train_val_test_periods(full_df):
    dic_train_val = {}
    dic_test = {}
    
    start_of_test_periods = np.arange(1981,2021,10)
    end_of_test_periods = start_of_test_periods + 9
    
    for iperiod in range(len(start_of_test_periods)):
        df_test_temp = full_df[str(start_of_test_periods[iperiod]):str(end_of_test_periods[iperiod])]
        df_trainval_temp = full_df.drop(df_test_temp.index)
        
        dic_train_val[start_of_test_periods[iperiod]] = df_trainval_temp
        dic_test[start_of_test_periods[iperiod]] = df_test_temp
    return dic_train_val, dic_test

In [3]:

def generate_random_forecast(df_week_0, seed_value=42):
    # Set the random seed for reproducibility
    np.random.seed(seed_value)
    
    # Step 1: Get unique classes and their frequencies
    values = df_week_0[df_week_0.keys()[0]].value_counts()
    
    # Step 2: Calculate the probabilities for each class
    classes = values.index  # Unique classes
    probabilities = values / values.sum()  # Normalize to get probability distribution
    
    # Step 3: Generate a random forecast based on the probabilities
    random_forecast = np.random.choice(classes, size=len(df_week_0), p=probabilities)
    
    # Step 4: Return the random forecast as a DataFrame or Series
    forecast_df = pd.DataFrame(random_forecast, index=df_week_0.index, columns=['y_predicted'])
    
    return forecast_df

def generate_random_forecast_probabilities(df_week_0, seed_value=42):
    # Set the random seed for reproducibility
    np.random.seed(seed_value)
    # Step 1: Get unique classes and their frequencies
    values = df_week_0[df_week_0.keys()[0]].value_counts()
    
    # Step 2: Calculate the probabilities for each class
    classes = values.index  # Unique classes
    probabilities = values / values.sum()  # Normalize to get probability distribution
    
    # Step 3: Create a probability forecast for each sample
    # Create a 2D array where each row is the same probability distribution
    prob_matrix = np.tile(probabilities.values, (len(df_week_0), 1))
    
    # Step 4: Return the probability matrix as a DataFrame
    forecast_df = pd.DataFrame(prob_matrix, index=df_week_0.index, columns=classes)[np.arange(len(classes))]
    
    return forecast_df

def generate_random_forecast_with_monthly_probabilities(df_week_0, seed_value=42):
    # Set the random seed for reproducibility
    np.random.seed(seed_value)
    
    # Extract the month from the index (assuming the index is a datetime index)
    df_week_0['month'] = df_week_0.index.month
    
    # Prepare an empty list to store the random forecast
    forecasts = []
    
    # Loop through each month
    for month in range(1, 13):  # Loop through months 1 to 12
        # Filter data for the current month
        month_data = df_week_0[df_week_0['month'] == month]
        
        # Step 1: Get unique classes and their frequencies for the current month
        values = month_data[df_week_0.keys()[0]].value_counts()
        
        # Step 2: Calculate the probabilities for each class in the current month
        classes = values.index  # Unique classes
        probabilities = values / values.sum()  # Normalize to get probability distribution
        
        # Step 3: Generate random forecasts for the current month based on the probabilities
        month_forecast = np.random.choice(classes, size=len(month_data), p=probabilities)
        
        # Store the forecast for the current month
        forecasts.append(pd.Series(month_forecast, index=month_data.index))
    
    # Combine all monthly forecasts into one DataFrame
    forecast_df = pd.concat(forecasts)
    forecast_df = forecast_df.sort_index()  # Sort the index to preserve the original order
    forecast_df = pd.DataFrame(forecast_df,columns=['y_predicted'])
    return forecast_df
    
def generate_probability_forecast_with_monthly_probabilities(df_week_0, seed_value=42):
    # Set the random seed for reproducibility
    np.random.seed(seed_value)
    
    # Extract the month from the index (assuming the index is a datetime index)
    df_week_0['month'] = df_week_0.index.month
    
    # Prepare an empty DataFrame to store the probability forecasts
    all_probabilities = pd.DataFrame(index=df_week_0.index)
    
    # Loop through each month
    for month in range(1, 13):  # Loop through months 1 to 12
        # Filter data for the current month
        month_data = df_week_0[df_week_0['month'] == month]
        
        if month_data.empty:
            continue  # Skip if there's no data for the month
        
        # Step 1: Get unique classes and their frequencies for the current month
        values = month_data[df_week_0.keys()[0]].value_counts()
        
        # Step 2: Calculate the probabilities for each class in the current month
        classes = values.index  # Unique classes
        probabilities = values / values.sum()  # Normalize to get probability distribution
        
        # Step 3: Create a probability matrix for the current month
        prob_matrix = np.tile(probabilities.values, (len(month_data), 1))
        
        # Create a DataFrame for this month's probabilities with appropriate columns
        month_prob_df = pd.DataFrame(prob_matrix, index=month_data.index, columns=classes)
        
        # Append this month's DataFrame to the overall probability DataFrame
        all_probabilities = pd.concat([all_probabilities, month_prob_df])
    
    # Sort the index to match the original order
    all_probabilities = all_probabilities.sort_index()
    
    # Fill missing columns with zeros for months that do not include certain classes
    all_classes = df_week_0[df_week_0.keys()[0]].unique()
    all_probabilities = all_probabilities.reindex(columns=all_classes, fill_value=0).dropna()[np.arange(len(classes))]
    
    return all_probabilities

In [4]:
import os

# Run nvidia-smi to get GPU information
os.system('nvidia-smi')

Wed Dec 25 20:51:37 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:81:00.0 Off |                    0 |
| N/A   32C    P0             61W /  500W |       9MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          On  |   00

0

In [5]:
gpu_id = 0

# Explore ranges weeks 6 and 3

In [6]:
wr_original_series = pd.read_csv('/glade/work/jhayron/Data4Predictability/WR_Series_vSLtest.csv',\
                index_col=0,names=['week0','dist'],skiprows=1,parse_dates=True)
# Rolling window for mode
rolling_mode = (
    wr_original_series.rolling('7d', center=True,min_periods=7)
    .apply(lambda x: x.mode()[0] if not x.mode().empty else float('nan'))
).shift(-3)

# Rolling window for the count of the mode
rolling_mode_count = (
    wr_original_series.rolling('7d', center=True,min_periods=7)
    .apply(lambda x: (x == x.mode()[0]).sum() if not x.mode().empty else 0)
).shift(-3)

# If duration of WR during week was less than 4, assing NO WR class
rolling_mode.loc[rolling_mode_count['week0']<4,'week0'] = 4
wr_series_mode = copy.deepcopy(rolling_mode)
time_index = pd.to_datetime(wr_series_mode.index).dayofweek
wr_series_mode = wr_series_mode.iloc[time_index.isin([0,3])].dropna()
wr_series = copy.deepcopy(wr_series_mode)

for wk in range(2,10):
    series_temp = copy.deepcopy(wr_series["week0"])
    series_temp.index = series_temp.index - timedelta(weeks = wk-1)
    series_temp.name = f'week{wk-1}'
    if wk==2:
        df_shifts = pd.concat([pd.DataFrame(wr_series["week0"]),pd.DataFrame(series_temp)],axis=1)  
    else:
        df_shifts = pd.concat([df_shifts,pd.DataFrame(series_temp)],axis=1)

In [7]:
list_files_anoms = np.sort(glob.glob('/glade/derecho/scratch/jhayron/Data4Predictability/WeeklyAnoms_DetrendedStd_v3_2dg/*.nc'))
list_vars = [list_files_anoms[i].split('/')[-1][:-3] for i in range(len(list_files_anoms))]

In [8]:
for ivar,var in enumerate(list_vars):
    print(ivar,var)

0 IC_SODA
1 IT_SODA
2 MLD_SODA
3 OHC100_SODA
4 OHC200_SODA
5 OHC300_SODA
6 OHC50_SODA
7 OHC700_SODA
8 OLR_ERA5
9 SD_ERA5
10 SSH_SODA
11 SST_OISSTv2
12 SST_SODA
13 STL_1m_ERA5
14 STL_28cm_ERA5
15 STL_7cm_ERA5
16 STL_full_ERA5
17 SWVL_1m_ERA5
18 SWVL_28cm_ERA5
19 SWVL_7cm_ERA5
20 SWVL_full_ERA5
21 U10_ERA5
22 U200_ERA5
23 Z500_ERA5


In [9]:
ivar = 3

In [10]:
def f1_eval(preds, dtrain):
    labels = dtrain.get_label()
    preds = np.argmax(preds, axis=1)  # Get the predicted class
    f1 = f1_score(labels, preds, average='micro')
    return 'f1_eval', f1

In [11]:
print(list_vars[ivar])

path_weekly_anoms = '/glade/derecho/scratch/jhayron/Data4Predictability/WeeklyAnoms_DetrendedStd_v3_2dg/'
path_nc_anoms = f'{path_weekly_anoms}{list_vars[ivar]}.nc'
anoms = xr.open_dataset(path_nc_anoms)
anoms = anoms.assign_coords(time=pd.DatetimeIndex(anoms.time).normalize())
var_name_nc = list(anoms.data_vars.keys())[0]

# Define a boxcar filter function
def boxcar_filter(data, size):
    kernel = np.ones((size, size)) / (size * size)
    from scipy.signal import convolve2d
    return convolve2d(data, kernel, mode="same", boundary="fill", fillvalue=np.nan)

# Apply boxcar filter
smoothed_anoms = xr.apply_ufunc(
    boxcar_filter,
    anoms,
    kwargs={"size": 3},  # Adjust window size (e.g., 5x5 grid cells)
    input_core_dims=[["lat", "lon"]],
    output_core_dims=[["lat", "lon"]],
    vectorize=True,
)

anoms_flattened = smoothed_anoms[var_name_nc].stack(flat_spatial=('lat', 'lon'))
anoms_flattened_og = copy.deepcopy(anoms_flattened)
# anoms_flattened_og.data[:,anoms_flattened.columns] = anoms_flattened.values WITH THIS LINES I CAN COME BACK
# anoms_flattened_og.unstack('flat_spatial') WITH THIS LINES I CAN COME BACK
anoms_flattened = pd.DataFrame(anoms_flattened,index = anoms_flattened.time)
anoms_flattened = anoms_flattened.dropna(axis=1, how='any')

combined_df = copy.deepcopy(anoms_flattened)
combined_df['day_sin'] = np.sin(2 * np.pi * combined_df.index.day_of_year / 365)
combined_df['day_cos'] = np.cos(2 * np.pi * combined_df.index.day_of_year / 365)

OHC100_SODA


In [12]:
f1s_test2 = []
f1s_random2 = []
f1s_persistence2 = []
f1s_climatology2 = []

# for week_out in range(0,9):
for week_out in [3,6]:
    print(f'WEEK: {week_out}')
    week_out_str = f'week{week_out}'

    fully_combined_df = pd.concat([combined_df,df_shifts[week_out_str]],axis=1)
    fully_combined_df = fully_combined_df.dropna()

    dic_trainval, dic_test = get_train_val_test_periods(fully_combined_df)
    start_of_test_periods = np.arange(1981,2021,10)
    
    df_week_forecast = df_shifts[[week_out_str]].dropna()
    
    random_forecast = generate_random_forecast(df_week_forecast,
                                               seed_value=42)
    climatology_forecast = generate_random_forecast_with_monthly_probabilities(df_week_forecast, 
                                                                               seed_value=42)
    random_forecast_probs = generate_random_forecast_probabilities(df_week_forecast)
    climatology_forecast_probs = generate_probability_forecast_with_monthly_probabilities(df_week_forecast)

    list_results = []
    list_results_probs = []
    
    for iperiod in range(len(start_of_test_periods)):
        print(iperiod)
        X_trainval = dic_trainval[start_of_test_periods[iperiod]].iloc[:,:-1].values
        y_trainval = dic_trainval[start_of_test_periods[iperiod]].iloc[:,-1]
        
        X_test = dic_test[start_of_test_periods[iperiod]].iloc[:,:-1].values
        y_test = dic_test[start_of_test_periods[iperiod]].iloc[:,-1]
        #### HERE DEFINITION OF THE MODEL ####
        hyperparams = {
            "n_estimators": 20,       # Number of boosting rounds, balances speed and depth
            "max_depth": 3,            # Controls tree depth; deeper trees model complexity better but risk overfitting
            "learning_rate": 10**(-1),      # Step size shrinkage for reducing overfitting risk
            "subsample": 0.85,          # Fraction of training samples used per tree, helps generalize
            "colsample_bytree": 0.7,   # Fraction of features used per tree, improves robustness
            "colsample_bylevel": 1,   # Fraction of features used per tree, improves robustness
            "gamma": 2.5,                # Minimum loss reduction to make a split, prevents overfitting
            "min_child_weight": 10,     # Minimum sum of instance weight needed in a leaf, controls complexity
            "reg_alpha": 10**(1.1),            # L1 regularization term for weights, often fine at 0 unless high sparsity is needed
            "reg_lambda": 10**(2)            # L2 regularization term for weights, controls overfitting
        }
        cw = class_weight.compute_sample_weight(
            class_weight='balanced',
            y=y_trainval
        )
        cw = cw**0.8
        model = xgb.XGBClassifier(n_estimators=hyperparams['n_estimators'],
                            max_depth=hyperparams['max_depth'],
                            learning_rate=hyperparams['learning_rate'],
                            subsample=hyperparams['subsample'],
                            colsample_bytree=hyperparams['colsample_bytree'],
                            colsample_bylevel=hyperparams['colsample_bylevel'],
                            gamma=hyperparams['gamma'],
                            reg_alpha=hyperparams['reg_alpha'],
                            reg_lambda=hyperparams['reg_lambda'],
                            num_class=5,
                            objective = "multi:softprob",
                            tree_method='hist',
                            device = f'cuda:{gpu_id}')
        model.fit(X_trainval, y_trainval, sample_weight=cw)
        y_predicted = model.predict(X_test)
        print(f1_score(y_test,y_predicted,average='micro'))
        y_predicted_probs = model.predict_proba(X_test)
        y_predicted_probs = pd.DataFrame(y_predicted_probs,index=y_test.index)
        df_results_temp = pd.DataFrame(np.array([y_test.values,y_predicted]).T,
                                       index=y_test.index,
                                       columns=['y_true','y_predicted'])
        list_results.append(df_results_temp)
        list_results_probs.append(y_predicted_probs)
        
    df_results_full = pd.concat(list_results,axis=0)
    df_results_probs_full = pd.concat(list_results_probs,axis=0)
    print('**** Micro results ****')
    f1_results = f1_score(df_results_full['y_true'],df_results_full['y_predicted'],average='micro')
    
    f1_random = f1_score(df_results_full['y_true'],
             random_forecast['y_predicted'].loc[df_results_full['y_true'].index],average='micro')
    df_week_0 = df_shifts[['week0']].dropna()
    persistence_forecast = df_week_0.loc[df_results_full['y_true'].index]
    f1_persistence = f1_score(df_results_full['y_true'],
             persistence_forecast['week0'].loc[df_results_full['y_true'].index],average='micro')
    
    f1_climatology = f1_score(df_results_full['y_true'],
             climatology_forecast['y_predicted'].loc[df_results_full['y_true'].index],average='micro')
    print(f'F1 XGBoost:', f1_results)
    print('F1 Random:',f1_random)
    print('F1 Persistence:',f1_persistence)
    print('F1 Climatology:',f1_climatology)
    f1s_test2.append(f1_results)
    f1s_random2.append(f1_random)
    f1s_persistence2.append(f1_persistence)
    f1s_climatology2.append(f1_climatology)
    print(f'F1 XGBoost:', f1_results)
    print('F1 Random:',f1_random)
    print('F1 Persistence:',f1_persistence)
    print('F1 Climatology:',f1_climatology)
    print('**** Frequencies ****')
    print('True distribution:')
    print(np.bincount(df_results_full['y_true'])/np.sum(np.bincount(df_results_full['y_true'])))
    print('Forecasted distribution:')
    print(np.bincount(df_results_full['y_predicted'])/np.sum(np.bincount(df_results_full['y_predicted'])))

WEEK: 3
0


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




0.17641418983700863
1
0.25311601150527324
2
0.22413793103448276
3
0.19922630560928434
**** Micro results ****
F1 XGBoost: 0.2132564841498559
F1 Random: 0.20292987512007685
F1 Persistence: 0.22478386167146974
F1 Climatology: 0.21805955811719502
**** Macro results ****
F1 XGBoost: 0.20601898701039528
F1 Random: 0.19645202516507373
F1 Persistence: 0.21859992915184262
F1 Climatology: 0.21241307411836083
**** Frequencies ****
True distribution:
[0.25336215 0.2240634  0.19164265 0.15465898 0.17627281]
Forecasted distribution:
[0.25216138 0.2660903  0.19116234 0.15682037 0.13376561]
WEEK: 6
0
0.23873441994247363
1
0.24544582933844677
2
0.22126436781609196
3
0.23689320388349513
**** Micro results ****
F1 XGBoost: 0.23557692307692307
F1 Random: 0.203125
F1 Persistence: 0.21322115384615384
F1 Climatology: 0.20721153846153847
**** Macro results ****
F1 XGBoost: 0.21617468756924235
F1 Random: 0.19664654104730958
F1 Persistence: 0.20614731171892373
F1 Climatology: 0.20120935238977591
**** Frequenci

# do hyperparameter search

In [13]:
def optimize_xgboost(X_trainval,y_trainval,path_save = None):
    ## Apply Bayesian optimization to XGBoost parameters

    def crossval_xgboost(max_depth,
                         log10_learning_rate,
                         subsample,
                         colsample_bytree,
                         colsample_bylevel,
                         gamma,
                         min_child_weight,
                         log10_reg_alpha,
                         log10_reg_lambda,
                         beta_class_weights):
        
        max_depth = int(max_depth)
        learning_rate = 10 ** log10_learning_rate
        reg_alpha = 10 ** log10_reg_alpha
        reg_lambda = 10 ** log10_reg_lambda
        
        # Instantiate the XGBoost model
        clf = xgb.XGBClassifier(
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            colsample_bylevel=colsample_bylevel,
            gamma=gamma,
            min_child_weight = min_child_weight,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
            num_class=5,
            device=f'cuda:{gpu_id}',
            tree_method='hist',
            objective='multi:softprob',
            random_state=42
        )
        
        dic_params_cv = clf.get_xgb_params()
        
        # Custom cross-validation with TimeSeriesSplit
        tscv = TimeSeriesSplit(n_splits=3)  # Adjust number of splits as needed
        scores = []
        for train_index, test_index in tscv.split(X_trainval):
            X_train, X_test = X_trainval[train_index], X_trainval[test_index]
            y_train, y_test = y_trainval.iloc[train_index], y_trainval.iloc[test_index]

            class_weights_arr = compute_class_weight('balanced', 
                                                     classes=np.unique(y_train), y=y_train)
            class_weight_dict = dict(zip(np.unique(y_train), class_weights_arr))
            train_weight = np.array([class_weight_dict[label] for label in y_train])**beta_class_weights
            
            dtrain = xgb.DMatrix(X_train, y_train, weight=train_weight)
            dtest = xgb.DMatrix(X_test, y_test)
            
            # Train the model with early stopping
            clf = xgb.train(
                    dic_params_cv,
                    dtrain,
                    num_boost_round=20  # Use the best boosting rounds
                )
            # Predict and evaluate
            preds = clf.predict(dtest)
            ###### WITH F1 SCORE ########
            score = f1_score(y_test, np.argmax(preds, axis=1), average='micro')  # Use your chosen metric
            if (np.bincount(np.argmax(preds, axis=1))\
                /np.sum(np.bincount(np.argmax(preds, axis=1)))).max() > 0.4:
                score = score * 0.5
            scores.append(score)
        # print(scores)
        # print(np.mean(scores))
        return np.mean(scores)

    pbounds = {
        # Tree-specific hyperparameters
        'max_depth': (2, 20),  # Moderate depth to prevent overfitting
        'min_child_weight': (1, 20),  # Prevent overly small leaves
        'subsample': (0.7, 0.9),  # Balance between under- and over-sampling
        'colsample_bytree': (0.6, 0.9),  # Use a subset of features to reduce variance
        'colsample_bylevel': (0.75, 1),  # Similar to colsample_bytree but at each split
    
        # Learning task-specific hyperparameters
        'log10_learning_rate': (-4, -1),  # Learning rate in log10 space to explore lower values
        'gamma': (0, 5),  # Regularization term to prevent over-complex trees
        'log10_reg_lambda': (0, 2.5),  # L2 regularization
        'log10_reg_alpha': (0.6, 1.6),  # L1 regularization
    
        # General
        'beta_class_weights': (0, 2.5),  # Use class weights if needed for imbalanced data
    }
    
    # acq = acquisition.UpperConfidenceBound(kappa=0.1)
    acq = acquisition.ExpectedImprovement(xi=0.) ## CHOSEN ONE xi->0 full exploitation
    # acq = acquisition.ProbabilityOfImprovement(xi=1e-4)
    # acq = GreedyAcquisition(random_state=42)
    optimizer = BayesianOptimization(
        f=crossval_xgboost,
        pbounds=pbounds,
        random_state=42,
        verbose=1,
        acquisition_function=acq)
    
    # optimizer.maximize(init_points=10, n_iter=50)
    optimizer.maximize(
        init_points=50,
        n_iter=20,
        # acq='pi'  # Options: 'ei', 'pi', 'ucb'
    )
    
    best_params = optimizer.max['params']
    if path_save:
        results_df = pd.DataFrame(optimizer.res)
        params_df = pd.json_normalize(results_df['params'])
        final_df = pd.concat([params_df, results_df['target']], axis=1)
        final_df.to_csv(path_save)
    return best_params

In [14]:
f1s_test2 = []
f1s_random2 = []
f1s_persistence2 = []
f1s_climatology2 = []

# for week_out in range(0,9):
for week_out in [3,6]:
    start_time = datetime.now()
    print(f'WEEK: {week_out}')
    week_out_str = f'week{week_out}'

    fully_combined_df = pd.concat([combined_df,df_shifts[week_out_str]],axis=1)
    fully_combined_df = fully_combined_df.dropna()

    dic_trainval, dic_test = get_train_val_test_periods(fully_combined_df)
    start_of_test_periods = np.arange(1981,2021,10)
    
    df_week_forecast = df_shifts[[week_out_str]].dropna()
    
    random_forecast = generate_random_forecast(df_week_forecast,
                                               seed_value=42)
    climatology_forecast = generate_random_forecast_with_monthly_probabilities(df_week_forecast, 
                                                                               seed_value=42)
    random_forecast_probs = generate_random_forecast_probabilities(df_week_forecast)
    climatology_forecast_probs = generate_probability_forecast_with_monthly_probabilities(df_week_forecast)

    list_results = []
    list_results_probs = []
    
    for iperiod in range(len(start_of_test_periods)):
        print(iperiod)
        X_trainval = dic_trainval[start_of_test_periods[iperiod]].iloc[:,:-1].values
        y_trainval = dic_trainval[start_of_test_periods[iperiod]].iloc[:,-1]
        
        X_test = dic_test[start_of_test_periods[iperiod]].iloc[:,:-1].values
        y_test = dic_test[start_of_test_periods[iperiod]].iloc[:,-1]
        best_params = optimize_xgboost(X_trainval,
               y_trainval,
               f'ResultsTests_Hyperparams_5Classes/df_hyperparams_{list_vars[ivar]}_{week_out_str}_{iperiod}.csv')
        with open(f'ResultsTests_Hyperparams_5Classes/besthyperparams_{list_vars[ivar]}_{week_out_str}_{iperiod}.json', 'w') as json_file:
            json.dump(best_params, json_file)
            
        #### HERE DEFINITION OF THE MODEL ####
        
        cw = class_weight.compute_sample_weight(
            class_weight='balanced',
            y=y_trainval
        )
        cw = cw**best_params['beta_class_weights']
        
        model = xgb.XGBClassifier(n_estimators=20,
                            max_depth=int(best_params['max_depth']),
                            learning_rate=10**best_params['log10_learning_rate'],
                            subsample=best_params['subsample'],
                            colsample_bytree=best_params['colsample_bytree'],
                            colsample_bylevel=best_params['colsample_bylevel'],
                            gamma=best_params['gamma'],
                            reg_alpha=10**best_params['log10_reg_alpha'],
                            reg_lambda=10**best_params['log10_reg_lambda'],
                            num_class=5,
                            objective = "multi:softprob",
                            tree_method='hist',
                            device = f'cuda:{gpu_id}')
        
        model.fit(X_trainval, y_trainval, sample_weight=cw)
        y_predicted = model.predict(X_test)
        print(f1_score(y_test,y_predicted,average='micro'))
        y_predicted_probs = model.predict_proba(X_test)
        y_predicted_probs = pd.DataFrame(y_predicted_probs,index=y_test.index)
        df_results_temp = pd.DataFrame(np.array([y_test.values,y_predicted]).T,
                                       index=y_test.index,
                                       columns=['y_true','y_predicted'])
        list_results.append(df_results_temp)
        list_results_probs.append(y_predicted_probs)
        
    df_results_full = pd.concat(list_results,axis=0)
    df_results_probs_full = pd.concat(list_results_probs,axis=0)
    print('**** Micro results ****')
    f1_results = f1_score(df_results_full['y_true'],df_results_full['y_predicted'],average='micro')
    
    f1_random = f1_score(df_results_full['y_true'],
             random_forecast['y_predicted'].loc[df_results_full['y_true'].index],average='micro')
    df_week_0 = df_shifts[['week0']].dropna()
    persistence_forecast = df_week_0.loc[df_results_full['y_true'].index]
    f1_persistence = f1_score(df_results_full['y_true'],
             persistence_forecast['week0'].loc[df_results_full['y_true'].index],average='micro')
    
    f1_climatology = f1_score(df_results_full['y_true'],
             climatology_forecast['y_predicted'].loc[df_results_full['y_true'].index],average='micro')
    print(f'F1 XGBoost:', f1_results)
    print('F1 Random:',f1_random)
    print('F1 Persistence:',f1_persistence)
    print('F1 Climatology:',f1_climatology)
    f1s_test2.append(f1_results)
    f1s_random2.append(f1_random)
    f1s_persistence2.append(f1_persistence)
    f1s_climatology2.append(f1_climatology)
    print('**** Frequencies ****')
    print('True distribution:')
    print(np.bincount(df_results_full['y_true'])/np.sum(np.bincount(df_results_full['y_true'])))
    print('Forecasted distribution:')
    print(np.bincount(df_results_full['y_predicted'])/np.sum(np.bincount(df_results_full['y_predicted'])))
    end_time = datetime.now()
    print('---------------> Running Time:',(end_time-start_time).seconds/60,' minutes.')

WEEK: 3
0
|   iter    |  target   | beta_c... | colsam... | colsam... |   gamma   | log10_... | log10_... | log10_... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------
| [35m7        [39m | [35m0.169    [39m | [35m0.9717   [39m | [35m0.8178   [39m | [35m0.8486   [39m | [35m1.784    [39m | [35m-3.157   [39m | [35m1.143    [39m | [35m0.3523   [39m | [35m16.44    [39m | [35m2.416    [39m | [35m0.8974   [39m |
| [35m14       [39m | [35m0.1991   [39m | [35m1.044    [39m | [35m0.8055   [39m | [35m0.636    [39m | [35m1.688    [39m | [35m-1.171   [39m | [35m0.9232   [39m | [35m1.297    [39m | [35m14.65    [39m | [35m7.909    [39m | [35m0.8944   [39m |
| [35m34       [39m | [35m0.206    [39m | [35m0.8899   [39m | [35m0.9395   [39m | [35m0.6043   [39m | [35m0.5804   [39m | [35m-3.862   [39m | [35m0.640

KeyboardInterrupt: 