In [1]:
import os
from copy import deepcopy
from tqdm import tqdm
from datetime import date, datetime
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import optuna
from optuna.samplers import RandomSampler
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("NMXLNT_df.csv")
df["datetime"] = pd.to_datetime(df["datetime"])
print(df["Location"].unique())
df.head()

['BAY MAU.csv' 'BTLVT.csv' 'CAU NGA.csv' 'TU SON.csv' 'YENSO.csv']


Unnamed: 0,datetime,flow_in,flow_out1,flow_out2,flow_out3,temp,ph,tss,do,cod,...,do_prev_8,cod_prev_8,bod_prev_8,toc_prev_8,no3_prev_8,nh4_prev_8,po4_prev_8,total_n_prev_8,total_p_prev_8,Location_prev_8
0,2024-01-01 08:00:00,542.08,129.61,116.77,515.14,26.99,7.19,7.23,,25.36,...,,27.16,,,,0.89,1.0,,,BAY MAU.csv
1,2024-01-01 08:05:00,538.07,127.68,114.95,510.16,26.99,7.19,7.21,,25.33,...,,27.23,,,,0.89,1.01,,,BAY MAU.csv
2,2024-01-01 08:10:00,537.03,126.99,113.83,531.9,26.99,7.19,7.18,,25.36,...,,27.27,,,,0.89,1.01,,,BAY MAU.csv
3,2024-01-01 08:15:00,537.77,125.35,113.12,530.93,26.99,7.2,7.19,,25.37,...,,27.36,,,,0.89,1.0,,,BAY MAU.csv
4,2024-01-01 08:20:00,537.07,125.23,113.72,531.85,26.99,7.19,7.18,,25.36,...,,27.38,,,,0.88,1.01,,,BAY MAU.csv


Make CV split & compare with baseline

In [3]:
def create_cv_split(df, features_used, cv = 5):
    X_train_arr = []
    X_test_arr = []
    Y_train_arr = []
    Y_test_arr = []
    start_month = 13 - cv
    for i in range(cv):
        train = deepcopy(df[df["datetime"].dt.month < start_month + i].reset_index().drop("index", axis = 1))
        test = deepcopy(df[(df["datetime"].dt.month >= start_month + i) & (df["datetime"].dt.month < start_month + 1 + i)].reset_index().drop("index", axis = 1))
        X_train_arr.append(train[features_used])
        X_test_arr.append(test[features_used])
        Y_train_arr.append(train["cod"])
        Y_test_arr.append(test["cod"])
    return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr

In [4]:
def create_cv_split_location(df, features_used, loc, cv = 5):
    df = df[df["Location"] == loc].reset_index().drop("index", axis = 1)
    X_train_arr = []
    X_test_arr = []
    Y_train_arr = []
    Y_test_arr = []
    start_month = 13 - cv
    for i in range(cv):
        train = deepcopy(df[df["datetime"].dt.month < start_month + i].reset_index().drop("index", axis = 1))
        test = deepcopy(df[(df["datetime"].dt.month >= start_month + i) & (df["datetime"].dt.month < start_month + 1 + i)].reset_index().drop("index", axis = 1))
        X_train_arr.append(train[features_used])
        X_test_arr.append(test[features_used])
        Y_train_arr.append(train["cod"])
        Y_test_arr.append(test["cod"])
    return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr

In [5]:
def create_cv_split_diff(df, features_used, time_diff = 4, cv = 5):
    X_train_arr = []
    X_test_arr = []
    Y_train_arr = []
    Y_test_arr = []
    start_month = 13 - cv
    for i in range(cv):
        train = deepcopy(df[df["datetime"].dt.month < start_month + i].reset_index().drop("index", axis = 1))
        test = deepcopy(df[(df["datetime"].dt.month >= start_month + i) & (df["datetime"].dt.month < start_month + 1 + i)].reset_index().drop("index", axis = 1))
        X_train_arr.append(train[features_used])
        X_test_arr.append(test[features_used])
        Y_train_arr.append(train[f"cod_diff_{time_diff}"])
        Y_test_arr.append(test[f"cod_diff_{time_diff}"])
    return X_train_arr, X_test_arr, Y_train_arr, Y_test_arr

Make the model and finetune

In [6]:
# Finetuning XGBoost
def objective_xgboost(trial):
    params = {
        "n_estimators": 100,
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log = True),
        "verbosity": 0,
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0, 1),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "enable_categorical": True,
        "random_state": 101
    }

    xgbr = XGBRegressor(**params)
    cv_rmse = 0

    for i in range(5):
        X_train, X_test = X_train_arr[i], X_test_arr[i]
        Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
        xgbr.fit(X_train, Y_train)
        Y_pred = xgbr.predict(X_test)
        cv_rmse += root_mean_squared_error(Y_test, Y_pred)
    
    return cv_rmse / 5

def objective_lightgbm(trial):
    params = {
        "n_estimators": 100,
        "verbosity": -1,
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "random_state": 101
    }

    lgbr = LGBMRegressor(**params)
    cv_rmse = 0

    for i in range(5):
        X_train, X_test = X_train_arr[i], X_test_arr[i]
        Y_train, Y_test = Y_train_arr[i], Y_test_arr[i]
        lgbr.fit(X_train, Y_train)
        Y_pred = lgbr.predict(X_test)
        cv_rmse += root_mean_squared_error(Y_test, Y_pred)
    
    return cv_rmse / 5

In [7]:
def optimize_xgboost(study_name, storage_name, objective_function=objective_xgboost, n_trials = 50):
    print("Conduct hyperparam opt for XGBoost")
    study = optuna.create_study(
        study_name = study_name,
        direction ='minimize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = RandomSampler(seed = 101),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=-1)
    print('Best hyperparameters:', study.best_params)
    print('Best RMSE:', study.best_value)
    return study.best_params

def optimize_lightgbm(study_name, storage_name, objective_function=objective_lightgbm, n_trials = 50):
    print("Conduct hyperparam opt for LightGBM")
    study = optuna.create_study(
        study_name = study_name,
        direction='minimize',
        storage = f"sqlite:///{storage_name}.db",
        sampler = RandomSampler(seed = 101),
        load_if_exists=True
    )
    study.optimize(objective_function, n_trials=n_trials, n_jobs=-1)
    print('Best hyperparameters:', study.best_params)
    print('Best MSE:', study.best_value)
    return study.best_params

In [8]:
# use 1 month for test and previous montsh for predict
# Take 1: only use features appear in all data
features_used = [f"cod_prev_{i}" for i in range(4, 9)] + \
                [f"temp_prev_{i}" for i in range(4, 9)] + \
                [f"ph_prev_{i}" for i in range(4, 9)] + \
                [f"tss_prev_{i}" for i in range(4, 9)] 
#                [f"nh4_prev_{i}" for i in range(4, 9)]
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv_split(df, features_used)

# baseline: using last k hours to predict
baseline_score = {
    i: 0 for i in range(4, 9)
}
for j in range(4, 9):
    for i in range(5):
        baseline_score[j] += np.sqrt(np.mean((Y_test_arr[i] - X_test_arr[i][f"cod_prev_{j}"])**2))
    baseline_score[j] /= 5
baseline_score

{4: np.float64(3.2628565506763394),
 5: np.float64(3.4706192048817934),
 6: np.float64(3.628221882592013),
 7: np.float64(3.7476215562206967),
 8: np.float64(3.839878002858179)}

In [9]:
best_params_xgboost = optimize_xgboost(
    f"xgboost_study_{str(date.today())}", 
    f"xgboost_study_{str(date.today())}"
)

Conduct hyperparam opt for XGBoost


[I 2025-04-23 23:05:13,230] A new study created in RDB with name: xgboost_study_2025-04-23
[I 2025-04-23 23:05:41,200] Trial 3 finished with value: 5.862001733142196 and parameters: {'max_depth': 2, 'learning_rate': 0.001208945601585532, 'subsample': 0.2250882289279471, 'colsample_bytree': 0.6434277655210398, 'min_child_weight': 6}. Best is trial 3 with value: 5.862001733142196.
[I 2025-04-23 23:05:57,268] Trial 2 finished with value: 3.602968483867875 and parameters: {'max_depth': 4, 'learning_rate': 0.012039187427908764, 'subsample': 0.5816754324104184, 'colsample_bytree': 0.3723099927909129, 'min_child_weight': 5}. Best is trial 2 with value: 3.602968483867875.
[I 2025-04-23 23:06:17,399] Trial 1 finished with value: 5.938270149758241 and parameters: {'max_depth': 6, 'learning_rate': 0.0010670607287452452, 'subsample': 0.7636085127789584, 'colsample_bytree': 0.37868296915944744, 'min_child_weight': 10}. Best is trial 2 with value: 3.602968483867875.
[I 2025-04-23 23:06:25,878] Trial

Best hyperparameters: {'max_depth': 2, 'learning_rate': 0.05244989750226547, 'subsample': 0.24394226277371367, 'colsample_bytree': 0.477401488428844, 'min_child_weight': 7}
Best RMSE: 3.0486235193616094


In [10]:
best_params_lightgbm = optimize_lightgbm(
    f"lightgbm_study_{str(date.today())}", 
    f"lightgbm_study_{str(date.today())}"
)

Conduct hyperparam opt for LightGBM


[I 2025-04-23 23:17:02,255] A new study created in RDB with name: lightgbm_study_2025-04-23
[I 2025-04-23 23:17:16,317] Trial 0 finished with value: 4.6815832221873155 and parameters: {'learning_rate': 0.007885783133805738, 'num_leaves': 13, 'subsample': 0.15693313865647068, 'colsample_bytree': 0.1713134659252067, 'min_data_in_leaf': 73}. Best is trial 0 with value: 4.6815832221873155.
[I 2025-04-23 23:17:37,472] Trial 1 finished with value: 3.21730690705608 and parameters: {'learning_rate': 0.03966977160887868, 'num_leaves': 86, 'subsample': 0.6079466170277738, 'colsample_bytree': 0.8988273143956002, 'min_data_in_leaf': 43}. Best is trial 1 with value: 3.21730690705608.
[I 2025-04-23 23:17:55,875] Trial 2 finished with value: 3.7078286641480553 and parameters: {'learning_rate': 0.025223110968389743, 'num_leaves': 306, 'subsample': 0.5046030013773114, 'colsample_bytree': 0.20308669603250434, 'min_data_in_leaf': 98}. Best is trial 1 with value: 3.21730690705608.
[I 2025-04-23 23:18:35,2

Best hyperparameters: {'learning_rate': 0.03966977160887868, 'num_leaves': 86, 'subsample': 0.6079466170277738, 'colsample_bytree': 0.8988273143956002, 'min_data_in_leaf': 43}
Best MSE: 3.21730690705608


Testing on best configuration

In [11]:
params = {
    "n_estimators": 100,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": 101
}
for p in best_params_xgboost:
    params[p] = best_params_xgboost[p]

feature_importances = {}

xgbr = XGBRegressor(**params)
for i in range(5):
    X_train = X_train_arr[i]
    Y_train = Y_train_arr[i]
    xgbr.fit(X_train, Y_train)
    features = xgbr.feature_names_in_.tolist()
    features_i = xgbr.feature_importances_.tolist()
    for inx, feat in enumerate(features):
        feature_importances[feat] = feature_importances.get(feat, 0) + features_i[inx]

feature_importances

{'cod_prev_4': 0.8652815669775009,
 'cod_prev_5': 1.7632245123386383,
 'cod_prev_6': 0.7793352752923965,
 'cod_prev_7': 0.560568280518055,
 'cod_prev_8': 0.25911475345492363,
 'temp_prev_4': 0.04172888211905956,
 'temp_prev_5': 0.06516173854470253,
 'temp_prev_6': 0.029207467567175627,
 'temp_prev_7': 0.03486428735777736,
 'temp_prev_8': 0.05589774437248707,
 'ph_prev_4': 0.04341968148946762,
 'ph_prev_5': 0.09082755912095308,
 'ph_prev_6': 0.06169927411247045,
 'ph_prev_7': 0.05203904211521149,
 'ph_prev_8': 0.03799473727121949,
 'tss_prev_4': 0.060369422659277916,
 'tss_prev_5': 0.06045685335993767,
 'tss_prev_6': 0.05391111923381686,
 'tss_prev_7': 0.03782812086865306,
 'tss_prev_8': 0.04706973722204566}

In [12]:
params = {
    "n_estimators": 100,
    "verbosity": -1,
    "random_state": 101
}
for p in best_params_lightgbm:
    params[p] = best_params_lightgbm[p]

feature_importances = {}

lgbr = LGBMRegressor(**params)
for i in range(5):
    X_train = X_train_arr[i]
    Y_train = Y_train_arr[i]
    lgbr.fit(X_train, Y_train)
    features = lgbr.feature_names_in_.tolist()
    features_i = lgbr.feature_importances_.tolist()
    for inx, feat in enumerate(features):
        feature_importances[feat] = feature_importances.get(feat, 0) + features_i[inx]

feature_importances

{'cod_prev_4': 4984,
 'cod_prev_5': 1695,
 'cod_prev_6': 1334,
 'cod_prev_7': 1322,
 'cod_prev_8': 3603,
 'temp_prev_4': 2438,
 'temp_prev_5': 1400,
 'temp_prev_6': 1363,
 'temp_prev_7': 1520,
 'temp_prev_8': 2533,
 'ph_prev_4': 3297,
 'ph_prev_5': 1908,
 'ph_prev_6': 1502,
 'ph_prev_7': 1348,
 'ph_prev_8': 2515,
 'tss_prev_4': 3042,
 'tss_prev_5': 1999,
 'tss_prev_6': 1305,
 'tss_prev_7': 1472,
 'tss_prev_8': 1920}

Try to train with only previous CODs insteads

In [13]:
features_used = [f"cod_prev_{i}" for i in range(4, 9)]
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv_split(df, features_used)

In [14]:
best_params_xgboost_only_cod = optimize_xgboost(
    f"xgboost_study_only_cod_{str(date.today())}",
    f"xgboost_study_only_cod_{str(date.today())}",
)

Conduct hyperparam opt for XGBoost


[I 2025-04-23 23:33:24,572] A new study created in RDB with name: xgboost_study_only_cod_2025-04-23
[I 2025-04-23 23:33:44,086] Trial 0 finished with value: 5.888435618413108 and parameters: {'max_depth': 2, 'learning_rate': 0.0011546592160210291, 'subsample': 0.42678063100026403, 'colsample_bytree': 0.5935492448873017, 'min_child_weight': 4}. Best is trial 0 with value: 5.888435618413108.
[I 2025-04-23 23:33:49,712] Trial 3 finished with value: 3.693286319141863 and parameters: {'max_depth': 3, 'learning_rate': 0.011027500056010713, 'subsample': 0.40919916267706463, 'colsample_bytree': 0.37211431959102925, 'min_child_weight': 4}. Best is trial 3 with value: 3.693286319141863.
[I 2025-04-23 23:34:00,494] Trial 2 finished with value: 3.176998688311221 and parameters: {'max_depth': 5, 'learning_rate': 0.03578482641148419, 'subsample': 0.7675985633177862, 'colsample_bytree': 0.09327273984634199, 'min_child_weight': 1}. Best is trial 2 with value: 3.176998688311221.
[I 2025-04-23 23:34:14,

Best hyperparameters: {'max_depth': 4, 'learning_rate': 0.04106252559374539, 'subsample': 0.8253836496043956, 'colsample_bytree': 0.6570325084686885, 'min_child_weight': 8}
Best RMSE: 2.986193485263522


In [15]:
params = {
    "n_estimators": 100,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": 101
}
for p in best_params_xgboost_only_cod:
    params[p] = best_params_xgboost_only_cod[p]

feature_importances = {}

xgbr = XGBRegressor(**params)
for i in range(5):
    X_train = X_train_arr[i]
    Y_train = Y_train_arr[i]
    xgbr.fit(X_train, Y_train)
    features = xgbr.feature_names_in_.tolist()
    features_i = xgbr.feature_importances_.tolist()
    for inx, feat in enumerate(features):
        feature_importances[feat] = feature_importances.get(feat, 0) + features_i[inx]

feature_importances

{'cod_prev_4': 2.3614585995674133,
 'cod_prev_5': 1.4944610893726349,
 'cod_prev_6': 0.7436188161373138,
 'cod_prev_7': 0.1704165916889906,
 'cod_prev_8': 0.2300451174378395}

Try to using only features from last 4 hours, not using earlier features

In [16]:
# use 1 month for test and previous montsh for predict
# Take 1: only use features appear in all data
features_used = ["cod_prev_4", "temp_prev_4", "ph_prev_4", "tss_prev_4"]
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv_split(df, features_used)

In [17]:
best_params_lightgbm_only_4h = optimize_lightgbm(
    f"lightgbm_study_only_4h_{str(date.today())}",
    f"lightgbm_study_only_4h_{str(date.today())}"
)

Conduct hyperparam opt for LightGBM


[I 2025-04-23 23:43:47,888] A new study created in RDB with name: lightgbm_study_only_4h_2025-04-23
[I 2025-04-23 23:44:17,697] Trial 2 finished with value: 3.5341175815157326 and parameters: {'learning_rate': 0.05779532690220248, 'num_leaves': 241, 'subsample': 0.6357279193292777, 'colsample_bytree': 0.20239070318581792, 'min_data_in_leaf': 88}. Best is trial 2 with value: 3.5341175815157326.
[I 2025-04-23 23:44:20,421] Trial 0 finished with value: 5.190133393887054 and parameters: {'learning_rate': 0.010442644143315872, 'num_leaves': 984, 'subsample': 0.7574435903300136, 'colsample_bytree': 0.11323349666768429, 'min_data_in_leaf': 94}. Best is trial 2 with value: 3.5341175815157326.
[I 2025-04-23 23:44:21,087] Trial 3 finished with value: 6.029625298798715 and parameters: {'learning_rate': 0.00255921674064164, 'num_leaves': 815, 'subsample': 0.527350775081116, 'colsample_bytree': 0.2782711075339377, 'min_data_in_leaf': 68}. Best is trial 2 with value: 3.5341175815157326.
[I 2025-04-2

Best hyperparameters: {'learning_rate': 0.06372289493829554, 'num_leaves': 726, 'subsample': 0.7361205237020157, 'colsample_bytree': 0.15596739919834596, 'min_data_in_leaf': 75}
Best MSE: 3.478896693589154


In [18]:
params = {
    "n_estimators": 100,
    "verbosity": -1,
    "random_state": 101
}
for p in best_params_lightgbm_only_4h:
    params[p] = best_params_lightgbm_only_4h[p]

feature_importances = {}

lgbr = LGBMRegressor(**params)
for i in range(5):
    X_train = X_train_arr[i]
    Y_train = Y_train_arr[i]
    lgbr.fit(X_train, Y_train)
    features = lgbr.feature_names_in_.tolist()
    features_i = lgbr.feature_importances_.tolist()
    for inx, feat in enumerate(features):
        feature_importances[feat] = feature_importances.get(feat, 0) + features_i[inx]

feature_importances

{'cod_prev_4': 31750,
 'temp_prev_4': 31625,
 'ph_prev_4': 28242,
 'tss_prev_4': 31750}

Idea: try to use LGBM specifically on difference from 1 cod to the other due to the leaf growth of lgbm can handle this

In [19]:
df["cod_diff_4"] = df["cod"] - df["cod_prev_4"]
features_used = ["ph_prev_4", "tss_prev_4", "temp_prev_4", "nh4_prev_4"]
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv_split_diff(df, features_used)

# baseline
baseline_score = 0
for i in range(5):
    baseline_score += np.sqrt(np.mean(Y_test_arr[i]**2))
baseline_score / 5

np.float64(3.2628565506763394)

In [20]:
best_params = optimize_lightgbm(
    f"lightgbm_study_diff_4_{str(date.today())}",
    f"lightgbm_study_diff_4_{str(date.today())}",
) 

Conduct hyperparam opt for LightGBM


[I 2025-04-23 23:58:37,841] A new study created in RDB with name: lightgbm_study_diff_4_2025-04-23
[I 2025-04-23 23:58:42,541] Trial 0 finished with value: 3.269485173747525 and parameters: {'learning_rate': 0.05178649238948697, 'num_leaves': 3, 'subsample': 0.6911526225996422, 'colsample_bytree': 0.46559941707321767, 'min_data_in_leaf': 68}. Best is trial 0 with value: 3.269485173747525.
[I 2025-04-23 23:59:04,005] Trial 1 finished with value: 3.262880530212185 and parameters: {'learning_rate': 0.0011253913145589203, 'num_leaves': 205, 'subsample': 0.16950093940830574, 'colsample_bytree': 0.6415555833246097, 'min_data_in_leaf': 22}. Best is trial 1 with value: 3.262880530212185.
[I 2025-04-23 23:59:06,904] Trial 2 finished with value: 3.514967505906623 and parameters: {'learning_rate': 0.05955822034174013, 'num_leaves': 216, 'subsample': 0.5119559242557542, 'colsample_bytree': 0.47177545799503795, 'min_data_in_leaf': 96}. Best is trial 1 with value: 3.262880530212185.
[I 2025-04-23 23

Best hyperparameters: {'learning_rate': 0.003180087028423869, 'num_leaves': 64, 'subsample': 0.6134094820495632, 'colsample_bytree': 0.2648641353141975, 'min_data_in_leaf': 62}
Best MSE: 3.2610113178545097


Try to look at specific region

In [21]:
baymau_df = df[df["Location"] == "BAY MAU.csv"].reset_index().drop("index", axis = 1)

In [22]:
features_used = [f"cod_prev_{i}" for i in range(4, 9)]
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv_split(baymau_df, features_used)

# baseline
baseline_score = 0
for i in range(5):
    baseline_score += np.sqrt(np.mean((Y_test_arr[i] - X_test_arr[i][f"cod_prev_4"])**2))
baseline_score / 5

np.float64(2.744651352208893)

In [23]:
best_params_xgboost_only_cod_baymau = optimize_xgboost(
    f"xgboost_study_only_cod_baymau_{str(date.today())}",
    f"xgboost_study_only_cod_baymau_{str(date.today())}",
)

Conduct hyperparam opt for XGBoost


[I 2025-04-24 00:05:54,987] A new study created in RDB with name: xgboost_study_only_cod_baymau_2025-04-24
[I 2025-04-24 00:06:01,605] Trial 3 finished with value: 2.7059896830176355 and parameters: {'max_depth': 4, 'learning_rate': 0.020531424268661596, 'subsample': 0.3122913048836874, 'colsample_bytree': 0.38858669511539023, 'min_child_weight': 5}. Best is trial 3 with value: 2.7059896830176355.
[I 2025-04-24 00:06:01,961] Trial 2 finished with value: 2.5701873448538266 and parameters: {'max_depth': 4, 'learning_rate': 0.0832651530658168, 'subsample': 0.1552719985865663, 'colsample_bytree': 0.8661047111537575, 'min_child_weight': 1}. Best is trial 2 with value: 2.5701873448538266.
[I 2025-04-24 00:06:04,926] Trial 1 finished with value: 4.15674962810595 and parameters: {'max_depth': 6, 'learning_rate': 0.0011251283180771982, 'subsample': 0.9866719944502651, 'colsample_bytree': 0.4643543031080264, 'min_child_weight': 6}. Best is trial 2 with value: 2.5701873448538266.
[I 2025-04-24 00

Best hyperparameters: {'max_depth': 4, 'learning_rate': 0.06916388237937654, 'subsample': 0.38968037525812527, 'colsample_bytree': 0.8072223250551891, 'min_child_weight': 10}
Best RMSE: 2.5367407480329787


In [24]:
btlvt_df = df[df["Location"] == "BTLVT.csv"].reset_index().drop("index", axis = 1)

In [25]:
features_used = [f"cod_prev_{i}" for i in range(4, 9)]
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv_split(btlvt_df, features_used)

# baseline
baseline_score = 0
for i in range(5):
    baseline_score += np.sqrt(np.mean((Y_test_arr[i] - X_test_arr[i][f"cod_prev_4"])**2))
baseline_score / 5

np.float64(2.3756942826068537)

In [26]:
best_params_xgboost_only_cod_btlvt = optimize_xgboost(
    f"xgboost_study_only_cod_btlvt_{str(date.today())}",
    f"xgboost_study_only_cod_btlvt_{str(date.today())}",
)

Conduct hyperparam opt for XGBoost


[I 2025-04-24 00:08:02,059] A new study created in RDB with name: xgboost_study_only_cod_btlvt_2025-04-24
[I 2025-04-24 00:08:07,929] Trial 3 finished with value: 1.852482534023587 and parameters: {'max_depth': 4, 'learning_rate': 0.00592905167573085, 'subsample': 0.829776803366005, 'colsample_bytree': 0.5795259287251285, 'min_child_weight': 9}. Best is trial 3 with value: 1.852482534023587.
[I 2025-04-24 00:08:09,584] Trial 2 finished with value: 1.9012281413965 and parameters: {'max_depth': 5, 'learning_rate': 0.0015086311330384542, 'subsample': 0.15732194835487834, 'colsample_bytree': 0.7248578950580455, 'min_child_weight': 4}. Best is trial 3 with value: 1.852482534023587.
[I 2025-04-24 00:08:16,861] Trial 5 finished with value: 1.8657700652970206 and parameters: {'max_depth': 4, 'learning_rate': 0.003562928693756486, 'subsample': 0.7606584009424804, 'colsample_bytree': 0.9372667665766923, 'min_child_weight': 8}. Best is trial 3 with value: 1.852482534023587.
[I 2025-04-24 00:08:17

Best hyperparameters: {'max_depth': 2, 'learning_rate': 0.03873415390648434, 'subsample': 0.984743826401194, 'colsample_bytree': 0.4704691594653049, 'min_child_weight': 6}
Best RMSE: 1.802203177165166


In [27]:
caunga_df = df[df["Location"] == "CAU NGA.csv"].reset_index().drop("index", axis = 1)

In [28]:
features_used = [f"cod_prev_{i}" for i in range(4, 9)]
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv_split(caunga_df, features_used)

# baseline
baseline_score = 0
for i in range(5):
    baseline_score += np.sqrt(np.mean((Y_test_arr[i] - X_test_arr[i][f"cod_prev_4"])**2))
baseline_score / 5

np.float64(3.4733060948212384)

In [29]:
best_params_xgboost_only_cod_caunga = optimize_xgboost(
    f"xgboost_study_only_cod_caunga_{str(date.today())}",
    f"xgboost_study_only_cod_caunga_{str(date.today())}",
)

Conduct hyperparam opt for XGBoost


[I 2025-04-24 00:10:03,753] A new study created in RDB with name: xgboost_study_only_cod_caunga_2025-04-24
[I 2025-04-24 00:10:06,873] Trial 2 finished with value: 3.8128656857411003 and parameters: {'max_depth': 1, 'learning_rate': 0.07369375931334861, 'subsample': 0.234304999183157, 'colsample_bytree': 0.3626635681960755, 'min_child_weight': 6}. Best is trial 2 with value: 3.8128656857411003.
[I 2025-04-24 00:10:11,704] Trial 1 finished with value: 3.4717979073954885 and parameters: {'max_depth': 5, 'learning_rate': 0.0244454389558755, 'subsample': 0.613274555264901, 'colsample_bytree': 0.5039247118954308, 'min_child_weight': 2}. Best is trial 1 with value: 3.4717979073954885.
[I 2025-04-24 00:10:12,001] Trial 4 finished with value: 3.687768439405359 and parameters: {'max_depth': 3, 'learning_rate': 0.052197416676250684, 'subsample': 0.4277505699909304, 'colsample_bytree': 0.12775850795279653, 'min_child_weight': 3}. Best is trial 1 with value: 3.4717979073954885.
[I 2025-04-24 00:10

Best hyperparameters: {'max_depth': 2, 'learning_rate': 0.05342931709957374, 'subsample': 0.5302279061353488, 'colsample_bytree': 0.8877678962147231, 'min_child_weight': 9}
Best RMSE: 3.3749010776733868


Try to combine 4 hour features + pastcods

In [30]:
features_used = [f"cod_prev_{i}" for i in range(4, 9)] + ["tss_prev_4", "temp_prev_4", "ph_prev_4"]
X_train_arr, X_test_arr, Y_train_arr, Y_test_arr = create_cv_split(df, features_used)

In [31]:
best_params_xgboost_only_cod_and_4h = optimize_xgboost(
    f"xgboost_study_only_cod_and_4h_{str(date.today())}",
    f"xgboost_study_only_cod_and_4h_{str(date.today())}",
)

Conduct hyperparam opt for XGBoost


[I 2025-04-24 00:29:23,746] A new study created in RDB with name: xgboost_study_only_cod_and_4h_2025-04-24
[I 2025-04-24 00:29:49,530] Trial 0 finished with value: 3.212559309586676 and parameters: {'max_depth': 3, 'learning_rate': 0.09947730437578066, 'subsample': 0.6219620362216319, 'colsample_bytree': 0.037627881734031865, 'min_child_weight': 1}. Best is trial 0 with value: 3.212559309586676.
[I 2025-04-24 00:29:56,557] Trial 2 finished with value: 5.8461335219769115 and parameters: {'max_depth': 4, 'learning_rate': 0.0013043325862234089, 'subsample': 0.2552504519794591, 'colsample_bytree': 0.334954140726881, 'min_child_weight': 4}. Best is trial 0 with value: 3.212559309586676.
[I 2025-04-24 00:30:03,295] Trial 1 finished with value: 3.393527314728393 and parameters: {'max_depth': 5, 'learning_rate': 0.013935972746705669, 'subsample': 0.7826469542705068, 'colsample_bytree': 0.80633075259131, 'min_child_weight': 5}. Best is trial 0 with value: 3.212559309586676.
[I 2025-04-24 00:30:

Best hyperparameters: {'max_depth': 3, 'learning_rate': 0.04392763980967041, 'subsample': 0.9160986578677838, 'colsample_bytree': 0.4672474311855018, 'min_child_weight': 2}
Best RMSE: 3.017517712708816


In [33]:
params = {
    "n_estimators": 100,
    "verbosity": 0,
    "enable_categorical": True,
    "random_state": 101
}
for p in best_params_xgboost_only_cod_and_4h:
    params[p] = best_params_xgboost_only_cod_and_4h[p]

feature_importances = {}

xgbr = XGBRegressor(**params)
for i in range(5):
    X_train = X_train_arr[i]
    Y_train = Y_train_arr[i]
    xgbr.fit(X_train, Y_train)
    features = xgbr.feature_names_in_.tolist()
    features_i = xgbr.feature_importances_.tolist()
    for inx, feat in enumerate(features):
        feature_importances[feat] = feature_importances.get(feat, 0) + features_i[inx]

feature_importances

{'cod_prev_4': 1.3036236017942429,
 'cod_prev_5': 1.4741715490818024,
 'cod_prev_6': 1.023155465722084,
 'cod_prev_7': 0.5316119939088821,
 'cod_prev_8': 0.4792364090681076,
 'tss_prev_4': 0.057925707660615444,
 'temp_prev_4': 0.05643754079937935,
 'ph_prev_4': 0.07383780740201473}