In [None]:
import os
from pathlib import Path

import xarray as xr
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Data lives in a "data" folder that sits next to the current working
# directory's parent, i.e. <parent of cwd>/data.
_PARENT_OF_CWD = Path(os.getcwd()).parent.resolve()

DIR_DATA = _PARENT_OF_CWD.joinpath("data")
DIR_SOURCE_O = DIR_DATA.joinpath("final")       # source of the oceanographic dataset
DIR_SOURCE_P = DIR_DATA.joinpath("processed")   # source of the TOPP tables

In [None]:
# --- Reproducibility / evaluation -------------------------------------------
SEED = 0          # random_state for the train/test split and the XGBoost models
CV = 5            # number of cross-validation folds passed to GridSearchCV

# --- Optimisation settings --------------------------------------------------
LR = 1e-3         # base learning rate; the search grid is built around it
L2 = 1e-3         # not referenced in the cells visible here -- TODO confirm usage
BATCH_SIZE = 4    # not referenced in the cells visible here -- TODO confirm usage
EPOCHS = 20       # not referenced in the cells visible here -- TODO confirm usage

In [None]:
# Features: the oceanographic NetCDF file, opened as an xarray Dataset.
X = xr.open_dataset(DIR_SOURCE_O.joinpath("oceanographic_data.nc"))

# Targets: the SHARKS table read from the TOPP HDF5 store.
y = pd.read_hdf(DIR_SOURCE_P.joinpath("TOPP", "SHARKS.h5"))

In [None]:
# Hold out 30% of the samples for testing; shuffling is seeded so the split
# is reproducible across runs.
# NOTE(review): X comes from xr.open_dataset -- confirm that train_test_split
# can index it row-wise and that its length matches y (a conversion to a
# DataFrame/array aligned with y may be required).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=SEED,
    shuffle=True,
)

In [None]:
# Exhaustive hyperparameter search for an XGBoost regressor, scored by
# cross-validated RMSE on the training split and then evaluated on the
# held-out test split.
parameter_grid_xgb = {
    "n_estimators": [100, 500, 750],
    "max_depth": [3, 5, 10],
    # The base learning rate plus one decade below and one decade above it.
    "learning_rate": [LR, LR * 0.1, LR * 10],
    "min_child_weight": [1, 2, 3],
    # Single-element list: every candidate model uses the same seed.
    "random_state": [SEED],
}

xgb = XGBRegressor()

# scikit-learn has no scorer called "rmse"; RMSE is registered as
# "neg_root_mean_squared_error" (negated so that larger is better, which is
# what GridSearchCV maximises).
grid_search_xgb = GridSearchCV(
    xgb,
    parameter_grid_xgb,
    cv=CV,
    scoring="neg_root_mean_squared_error",
    verbose=3,
)
grid_search_xgb.fit(X_train, y_train)

print("Gradient Boosting Regressor")
print("Best hyperparameters: ", grid_search_xgb.best_params_)
# best_score_ is the negated RMSE; flip the sign to report a real RMSE.
print("Best CV RMSE: ", -grid_search_xgb.best_score_)

# .score() uses the same (negated) scorer as the search, so negate again.
rmse_xgb = -grid_search_xgb.score(X_test, y_test)
print('RMSE score: {:.2f}'.format(rmse_xgb))

# r2_xgb now holds an actual coefficient of determination on the test split.
r2_xgb = r2_score(y_test, grid_search_xgb.predict(X_test))
print('R2 score: {:.2f}'.format(r2_xgb))

# GridSearchCV refits the best configuration on the whole training split by
# default (refit=True), so reuse that fitted model instead of training again.
xgb = grid_search_xgb.best_estimator_