In [1]:
import xarray as xr
import cfgrib
import numpy as np
import optuna

from functools import partial
from sklearn.metrics import mean_squared_error
import sys
sys.path.append("..")

from baselines.data_processor import DataProcessor
from baselines.linear_reg.linear_regressor import LinearRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Open every dataset cfgrib can extract from the 2022 GRIB file.
grib_data = cfgrib.open_datasets('../data2022.grib')
# NOTE(review): relies on the order cfgrib returns the datasets in — index 0 is
# used below for t2m/sp/tcc/u10/v10 and index 1 for tp; confirm if the file changes.
surface, hybrid = grib_data[0], grib_data[1]

In [3]:
# Feature names, in the same order as the last axis of `data` built below.
feature_list = ['t2m', 'sp', 'tcc', 'u10', 'v10', 'tp']

# Surface variables, with unit conversions applied where needed.
t2m = surface.t2m.to_numpy() - 273.15  # Kelvin -> degrees Celsius
sp = surface.sp.to_numpy() / 100       # Pa -> hPa
tcc = surface.tcc.to_numpy()
u10 = surface.u10.to_numpy()
v10 = surface.v10.to_numpy()

# tp comes from the second dataset; its two leading axes are merged into one
# so that it can be stacked alongside the surface variables.
tp = hybrid.tp.to_numpy()
tp = tp.reshape(-1, *tp.shape[2:])

# Stack all features along a new trailing axis.
data = np.stack([t2m, sp, tcc, u10, v10, tp], axis=-1)

In [4]:
def objective(trial, data, feature_list):
    """Optuna objective: fit a linear baseline and return its mean RMSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters ``s``,
            ``alpha`` and ``regressor_type``.
        data: Array handed to ``DataProcessor``.
        feature_list: Feature names handed to ``LinearRegressor``.

    Returns:
        The mean of the RMSE values reported by ``LinearRegressor.get_rmse``
        for the predictions on the test split.
    """
    # Search space. The forecast horizon is pinned to 1 for now; it used to be
    # sampled with trial.suggest_int('fh', 1, 3).
    input_window = trial.suggest_int('s', 3, 8)
    forecast_horizon = 1
    reg_strength = trial.suggest_float('alpha', 0.1, 2.1, step=0.2)
    reg_kind = trial.suggest_categorical(
        'regressor_type', ['lasso', 'ridge', 'elastic_net']
    )

    # Build samples for the sampled window and split them into train/test.
    processor = DataProcessor(data)
    X, y = processor.preprocess(input_window, forecast_horizon, use_neighbours=False)
    X_train, X_test, y_train, y_test = processor.train_test_split(X, y)

    # Fit the regressor on the training split and predict on the test split.
    model = LinearRegressor(
        X.shape,
        forecast_horizon,
        feature_list,
        regressor_type=reg_kind,
        alpha=reg_strength,
    )
    model.train(X_train, y_train, normalize=True)
    predictions = model.predict_(X_test, y_test)

    # Collapse the RMSE values into the single scalar Optuna minimises.
    return np.mean(model.get_rmse(predictions, y_test, normalize=True))

In [5]:
#create an optuna study and objective function
study = optuna.create_study(direction='minimize')
objective_func = partial(objective, data=data, feature_list=feature_list)

[I 2023-11-10 17:25:20,613] A new study created in memory with name: no-name-9edfb2a5-b0bf-4935-a5dc-2eff45ee8b05


In [6]:
# How many hyperparameter configurations the study evaluates.
n_trials = 5
study.optimize(func=objective_func, n_trials=n_trials)

[I 2023-11-10 17:25:54,777] Trial 0 finished with value: 0.08449999999999998 and parameters: {'s': 4, 'alpha': 1.7000000000000002, 'regressor_type': 'elastic_net'}. Best is trial 0 with value: 0.08449999999999998.
[I 2023-11-10 17:26:06,607] Trial 1 finished with value: 0.06449999999999999 and parameters: {'s': 5, 'alpha': 1.9000000000000001, 'regressor_type': 'ridge'}. Best is trial 1 with value: 0.06449999999999999.
[I 2023-11-10 17:26:15,953] Trial 2 finished with value: 0.06449999999999999 and parameters: {'s': 5, 'alpha': 0.30000000000000004, 'regressor_type': 'ridge'}. Best is trial 1 with value: 0.06449999999999999.
[I 2023-11-10 17:27:28,921] Trial 3 finished with value: 0.08299999999999999 and parameters: {'s': 6, 'alpha': 1.1, 'regressor_type': 'elastic_net'}. Best is trial 1 with value: 0.06449999999999999.
[I 2023-11-10 17:28:02,388] Trial 4 finished with value: 0.08366666666666665 and parameters: {'s': 4, 'alpha': 1.1, 'regressor_type': 'elastic_net'}. Best is trial 1 with

In [7]:
# Read the winning configuration once, then pull out the individual values.
# There is no 'fh' entry: the forecast horizon is fixed inside `objective`
# rather than sampled, so it never appears in the trial parameters.
best_params = study.best_params
best_s = best_params['s']
best_regressor_type = best_params['regressor_type']
best_alpha = best_params['alpha']

In [8]:
# Report the selected hyperparameters, one per line.
print(
    'Best hyperparameters:',
    f"Best input window: {best_s}",
    f"Best regressor type: {best_regressor_type}",
    f"Best regularization constant: {best_alpha}",
    sep="\n",
)

Best hyperparameters:
Best input window: 5
Best regressor type: ridge
Best regularization constant: 1.9000000000000001
