In [None]:
import pandas as pd
import numpy as np
import os
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error


# Load one dataset per site from the local `energy/` directory.
# (The stray `_df` alias that two of these assignments used to create has been
# dropped; nothing in the notebook reads it.)
adelaide_df = pd.read_csv('energy/Adelaide_Data.csv')
perth_df = pd.read_csv('energy/Perth_Data.csv')
sydney_df = pd.read_csv('energy/Sydney_Data.csv')
tasmania_df = pd.read_csv('energy/Tasmania_Data.csv')

In [None]:
# Check every dataset for obviously wrong values: schema/dtypes, missing
# values, duplicated rows and summary statistics.
site_frames = (
    ("Adelaide", adelaide_df),
    ("Perth", perth_df),
    ("Sydney", sydney_df),
    ("Tasmania", tasmania_df),
)

for position, (site, frame) in enumerate(site_frames):
    # The very first header is printed flush; later sites are separated from
    # the previous report by two blank lines.
    separator = "" if position == 0 else "\n\n"
    print(f"{separator}--- {site} DataFrame Info ---")
    frame.info()
    print(f"\n--- {site} DataFrame Null Counts ---")
    print(frame.isnull().sum())
    print(f"\n--- {site} DataFrame Duplicate Rows: {frame.duplicated().sum()} ---")
    print(f"\n--- {site} DataFrame Descriptive Statistics ---")
    display(frame.describe())


In [None]:
# Metrics of every trained model, keyed by the run name given to the
# training function.
results = dict()

# Dataset layout, as described by the notebook author:
#   columns 1-16  -> X coordinates
#   columns 17-32 -> Y coordinates
#   columns 33-48 -> power of the single containers
#   column 49     -> sum of power in the farm
# The goal is to predict Powerall from the X and Y coordinates.

X_cols = list(map("X{}".format, range(1, 17)))
Y_cols = list(map("Y{}".format, range(1, 17)))
powerall_col = ["Powerall"]

def split_data(subset, powerall):
    """Split features and target into train / validation / test partitions.

    40% of the rows are held out first and that hold-out is then cut in half,
    which yields a 60/20/20 split. Both steps use ``random_state=44``.

    subset: feature table
    powerall: target values aligned row-for-row with ``subset``

    Returns the tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    seed = 44
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        subset, powerall, test_size=0.4, random_state=seed
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=seed
    )
    return X_train, X_val, X_test, y_train, y_val, y_test


def prepare_subsets(dataframe):
    """Select the model inputs/target from ``dataframe`` and split them.

    dataframe: table containing the columns listed in ``X_cols``, ``Y_cols``
        and ``powerall_col``

    Returns whatever ``split_data`` returns for the selected features and
    target, i.e. ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    subset = dataframe[X_cols + Y_cols]
    powerall = dataframe[powerall_col]
    # The split result must be returned; previously it was computed and
    # discarded, so callers received None.
    return split_data(subset, powerall)


In [None]:
def plot_predicted_vs_actual(y_test, y_predict, name, r2, rmse, mape, show_plot=False):
    """
    Generates a scatter plot of predicted vs. actual values and saves it as a PNG.

    y_test: Actual values (DataFrame, Series or array; flattened internally)
    y_predict: Predicted values (array-like; flattened internally)
    name: Run name; used in the plot title and, with spaces replaced by "_",
          as the file name
    r2, rmse, mape: Metric values written into the annotation box
                    (mape is printed with a trailing "%")
    show_plot: If True, the figure is also displayed after it has been saved
    """
    # Flatten both inputs so that single-column DataFrames, Series and
    # 1-D / 2-D arrays are all handled the same way.
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_predict).ravel()

    plt.figure(figsize=(8, 6))
    plt.scatter(actual, predicted, alpha=0.5, label='Predicted vs Actual')

    # Ideal line (y=x) spanning the joint range of both series
    min_val = min(actual.min(), predicted.min())
    max_val = max(actual.max(), predicted.max())
    plt.plot([min_val, max_val], [min_val, max_val], 'k--', lw=2, label='Ideal Fit (y=x)')

    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")
    plt.title(f"Predicted vs Actual for {name}")
    plt.legend()
    plt.grid(True)

    # Display R2, RMSE, and MAPE values on the plot
    textstr = f"R2: {r2:.4f}\nRMSE: {rmse:.2f}\nMAPE: {mape:.2f}%"
    plt.gca().text(0.05, 0.95, textstr, transform=plt.gca().transAxes, fontsize=10,
                   verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))

    # NOTE(review): the directory name is spelled "actualy"; it is kept as-is
    # so that existing output locations do not change.
    plots_dir = 'plots/predicted_vs_actualy'
    os.makedirs(plots_dir, exist_ok=True)
    filepath = os.path.join(plots_dir, name.replace(' ', '_') + '.png')

    # Save before showing: after plt.show() the current figure may already be
    # gone, in which case savefig would write an empty image.
    plt.savefig(filepath)
    if show_plot:
        plt.show()
    plt.close()


def plot_residuals(y_test, y_predict, name, r2, rmse, mape, show_plot=False):
    """
    Generates a scatter plot of residuals vs. predicted values and saves it as a PNG.

    y_test: Actual values (DataFrame, Series or array; flattened internally)
    y_predict: Predicted values (array-like; flattened internally)
    name: Run name; used in the plot title and, with spaces replaced by "_",
          as the prefix of the saved file name
    r2, rmse, mape: Metric values written into the annotation box
                    (mape is printed with a trailing "%")
    show_plot: If True, the figure is also displayed after it has been saved
    """
    # Flatten both inputs so pandas objects and numpy arrays of either
    # (n,) or (n, 1) shape subtract element-wise.
    predicted = np.asarray(y_predict).ravel()
    residuals = np.asarray(y_test).ravel() - predicted

    plt.figure(figsize=(8, 6))
    plt.scatter(predicted, residuals, alpha=0.5)

    # Horizontal line at y=0
    plt.axhline(y=0, color='k', linestyle='--', lw=2)

    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals (Actual - Predicted)")
    plt.title(f"Residuals vs Predicted Values for {name}")
    plt.grid(True)

    # Display R2, RMSE, and MAPE values on the plot
    textstr = f"R2: {r2:.4f}\nRMSE: {rmse:.2f}\nMAPE: {mape:.2f}%"
    plt.gca().text(0.05, 0.95, textstr, transform=plt.gca().transAxes, fontsize=10,
                   verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))

    plots_dir = 'plots/residuals'
    os.makedirs(plots_dir, exist_ok=True)
    filepath = os.path.join(plots_dir, name.replace(' ', '_') + '_residuals.png')

    # Save before showing: after plt.show() the current figure may already be
    # gone, in which case savefig would write an empty image.
    plt.savefig(filepath)
    if show_plot:
        plt.show()
    plt.close()

def plot_residuals_distribution(y_test, y_predict, name, r2, rmse, mape, show_plot=False):
    """
    Generates a histogram of the residuals with a fitted normal curve and saves it as a PNG.

    y_test: Actual values (DataFrame, Series or array; flattened internally)
    y_predict: Predicted values (array-like; flattened internally)
    name: Run name; used in the plot title and, with spaces replaced by "_",
          as the prefix of the saved file name
    r2, rmse, mape: Metric values written into the annotation box
                    (mape is printed with a trailing "%")
    show_plot: If True, the figure is also displayed after it has been saved
    """
    # Flatten both inputs so pandas objects and numpy arrays of either
    # (n,) or (n, 1) shape subtract element-wise.
    residuals = np.asarray(y_test).ravel() - np.asarray(y_predict).ravel()

    plt.figure(figsize=(8, 6))

    plt.hist(residuals, bins=50, density=True, alpha=0.6, color='g', label='Residuals Histogram')

    # Fit a normal distribution to the residuals and draw its density over
    # the x-range the histogram ended up with.
    mu, std = stats.norm.fit(residuals)
    xmin, xmax = plt.xlim()
    x = np.linspace(xmin, xmax, 100)
    p = stats.norm.pdf(x, mu, std)
    plt.plot(x, p, 'k', linewidth=2, label='Normal Distribution Fit')

    plt.axvline(x=0, color='r', linestyle='--', lw=2, label='Zero Residuals')

    plt.xlabel("Residuals")
    plt.ylabel("Density")
    plt.title(f"Residuals Distribution for {name}")
    plt.legend()
    plt.grid(True)

    # Display R2, RMSE, and MAPE values on the plot
    textstr = f"R2: {r2:.4f}\nRMSE: {rmse:.2f}\nMAPE: {mape:.2f}%"
    plt.gca().text(0.05, 0.95, textstr, transform=plt.gca().transAxes, fontsize=10,
                   verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))

    plots_dir = 'plots/residuals_distribution'
    os.makedirs(plots_dir, exist_ok=True)
    filepath = os.path.join(plots_dir, name.replace(' ', '_') + '_residuals_dist.png')

    # Save before showing: after plt.show() the current figure may already be
    # gone, in which case savefig would write an empty image.
    plt.savefig(filepath)
    if show_plot:
        plt.show()
    plt.close()

def make_plots(y_test, y_predict, name, r2, rmse, mape):
    """Create and save the three diagnostic plots for one model run.

    The plots are produced in this order: predicted-vs-actual scatter,
    residuals-vs-predicted scatter, residuals histogram. Every plotting
    function receives the same arguments and keeps its default
    ``show_plot=False``.
    """
    plotters = (
        plot_predicted_vs_actual,
        plot_residuals,
        plot_residuals_distribution,
    )
    for draw in plotters:
        draw(y_test, y_predict, name, r2, rmse, mape)

    




In [None]:
def train_poly_regression(dataframe, degree, name):
    """Fit a polynomial regression of the given degree and record its test metrics.

    The coordinate columns are expanded with ``PolynomialFeatures`` (no bias
    column), split 80/20 into train and test data with ``random_state=44``,
    and fitted with ordinary least squares. R2, RMSE and MAPE on the test
    part are printed, stored in ``results[name]`` and drawn by ``make_plots``.

    dataframe: table containing the ``X_cols``, ``Y_cols`` and ``powerall_col`` columns
    degree: polynomial degree (1 gives plain linear regression)
    name: run name used as the ``results`` key and in plot titles / file names
    """
    # Select inputs and target directly; the previous version relied on
    # names (`subset`, `powerall`) that were never defined in this scope.
    subset = dataframe[X_cols + Y_cols]
    powerall = dataframe[powerall_col]

    poly_features = PolynomialFeatures(degree=degree, include_bias=False)
    poly = poly_features.fit_transform(subset)

    X_train, X_test, y_train, y_test = train_test_split(poly, powerall, test_size=0.2, random_state=44)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)

    r2 = model.score(X_test, y_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100

    print(f"{name}\nScore: {r2}")
    # `rmse` is already a root; it used to be square-rooted a second time here.
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}%\n")

    results[name] = {
        "score": r2,
        "rmse": rmse,
        "mape": mape
    }

    make_plots(y_test, y_predict, name, r2, rmse, mape)

In [None]:
# Fit the degree-1 model for every site first, then the degree-2 model,
# in the order Adelaide, Perth, Sydney, Tasmania.
site_frames = (
    ("Adelaide", adelaide_df),
    ("Perth", perth_df),
    ("Sydney", sydney_df),
    ("Tasmania", tasmania_df),
)

for degree, label in ((1, "Linear Regression"), (2, "Polynomial Regression Degree 2")):
    for site, frame in site_frames:
        train_poly_regression(frame, degree, f"{site} - {label}")



In [None]:
import xgboost as xgb

def train_xgb_model(dataframe, name):
    """Fit an XGBoost regressor with fixed hyperparameters and record its test metrics.

    The data is split 80/20 into train and test parts (``random_state=44``).
    R2, RMSE and MAPE on the test part are printed, stored in
    ``results[name]`` and drawn by ``make_plots``.

    dataframe: table containing the ``X_cols``, ``Y_cols`` and ``powerall_col`` columns
    name: run name used as the ``results`` key and in plot titles / file names
    """
    features = dataframe[X_cols + Y_cols]
    target = dataframe[powerall_col]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=44
    )

    regressor = xgb.XGBRegressor(
        objective="reg:squarederror",
        n_estimators=1500,
        random_state=44,
        subsample=0.8,
        learning_rate=0.07,
    )
    regressor.fit(X_train, y_train)
    y_predict = regressor.predict(X_test)

    # Test-set metrics; MAPE is expressed as a percentage.
    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100
    metrics = {"score": r2, "rmse": rmse, "mape": mape}

    print(f"{name}\nScore: {r2}")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}%\n")

    results[name] = metrics
    make_plots(y_test, y_predict, name, r2, rmse, mape)

In [None]:
# Train the fixed-hyperparameter XGBoost model once per site.
for site, frame in (
    ("Adelaide", adelaide_df),
    ("Perth", perth_df),
    ("Sydney", sydney_df),
    ("Tasmania", tasmania_df),
):
    train_xgb_model(frame, f"{site} - XGBoost Regressor")

In [None]:
import lightgbm as lgb

def train_lgb_model(dataframe, name):
    """Fit a LightGBM regressor with fixed hyperparameters and record its test metrics.

    The data is split 80/20 into train and test parts (``random_state=44``).
    R2, RMSE and MAPE on the test part are printed, stored in
    ``results[name]`` and drawn by ``make_plots``.

    dataframe: table containing the ``X_cols``, ``Y_cols`` and ``powerall_col`` columns
    name: run name used as the ``results`` key and in plot titles / file names
    """
    features = dataframe[X_cols + Y_cols]
    target = dataframe[powerall_col]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=44
    )

    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq`
    # is greater than 0, and that is left at its default here — confirm
    # whether row subsampling was actually intended.
    hyperparameters = dict(
        objective="regression",
        n_estimators=1500,
        random_state=44,
        subsample=0.8,
        learning_rate=0.07,
    )
    regressor = lgb.LGBMRegressor(**hyperparameters)
    regressor.fit(X_train, y_train)
    y_predict = regressor.predict(X_test)

    # Test-set metrics; MAPE is expressed as a percentage.
    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100
    metrics = {"score": r2, "rmse": rmse, "mape": mape}

    print(f"{name}\nScore: {r2}")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}%\n")

    results[name] = metrics
    make_plots(y_test, y_predict, name, r2, rmse, mape)

In [None]:
# Train the fixed-hyperparameter LightGBM model once per site.
for site, frame in (
    ("Adelaide", adelaide_df),
    ("Perth", perth_df),
    ("Sydney", sydney_df),
    ("Tasmania", tasmania_df),
):
    train_lgb_model(frame, f"{site} - LightGBM Regressor")

In [None]:
import catboost as cb

def train_catboost_model(dataframe, name):
    """Fit a CatBoost regressor with early stopping and record its test metrics.

    The data is split 60/20/20 into train, validation and test parts
    (``random_state=44`` for both splitting steps). The validation part is
    passed as ``eval_set``; R2, RMSE and MAPE on the test part are printed,
    stored in ``results[name]`` and drawn by ``make_plots``.

    dataframe: table containing columns X1..X16, Y1..Y16 and Powerall
    name: run name used as the ``results`` key and in plot titles / file names
    """
    # X1..X16 followed by Y1..Y16, the same order as X_cols + Y_cols.
    feature_names = [f"{axis}{idx}" for axis in ("X", "Y") for idx in range(1, 17)]
    features = dataframe[feature_names]
    target = dataframe[["Powerall"]]

    # 60% train, then the remaining 40% is halved into validation and test.
    X_train, X_rest, y_train, y_rest = train_test_split(
        features, target, test_size=0.4, random_state=44
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=44
    )

    regressor = cb.CatBoostRegressor(
        iterations=3000,
        learning_rate=0.05,
        depth=7,
        l2_leaf_reg=3,
        random_seed=44,
        verbose=False,
        early_stopping_rounds=100,
    )
    regressor.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)

    y_predict = regressor.predict(X_test)

    # Test-set metrics; MAPE is expressed as a percentage.
    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100
    metrics = {"score": r2, "rmse": rmse, "mape": mape}

    print(f"{name}\nScore: {r2:.4f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}%\n")

    results[name] = metrics
    make_plots(y_test, y_predict, name, r2, rmse, mape)

In [None]:
# Train the fixed-hyperparameter CatBoost model once per site.
for site, frame in (
    ("Adelaide", adelaide_df),
    ("Perth", perth_df),
    ("Sydney", sydney_df),
    ("Tasmania", tasmania_df),
):
    train_catboost_model(frame, f"{site} - CatBoost Regressor")

In [None]:
def train_ensemble_model(dataframe, name):
    """Train XGBoost, LightGBM and CatBoost and evaluate their weighted average.

    The data is split 60/20/20 into train, validation and test parts via
    ``split_data``. Every member is fitted on the train part and uses the
    validation part for early stopping (patience 100 rounds). The test
    predictions are blended with fixed weights 0.36 / 0.36 / 0.28; R2, RMSE
    and MAPE of the blend are printed, stored in ``results[name]`` and drawn
    by ``make_plots``.

    dataframe: table containing the ``X_cols``, ``Y_cols`` and ``powerall_col`` columns
    name: run name used as the ``results`` key and in plot titles / file names
    """
    subset = dataframe[X_cols + Y_cols]
    powerall = dataframe[powerall_col]
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(subset, powerall)

    stopping_rounds = 100

    # All three members now stop early on the validation set. Previously only
    # LightGBM did, while XGBoost received an eval_set it never acted on.
    xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=3000,
                                 learning_rate=0.05, max_depth=7, subsample=0.8,
                                 random_state=44, colsample_bytree=0.8,
                                 early_stopping_rounds=stopping_rounds)

    lgb_model = lgb.LGBMRegressor(objective="regression", n_estimators=3000,
                                  learning_rate=0.05, max_depth=7, subsample=0.8,
                                  random_state=44, colsample_bytree=0.8, verbose=-1)

    catboost_model = cb.CatBoostRegressor(iterations=3000, learning_rate=0.05,
                                          depth=7, random_seed=44, verbose=False,
                                          early_stopping_rounds=stopping_rounds)

    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    # verbose=False keeps the early-stopping callback as quiet as the model itself.
    lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                  callbacks=[lgb.early_stopping(stopping_rounds=stopping_rounds, verbose=False)])
    catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)

    pred_xgb = xgb_model.predict(X_test)
    pred_lgb = lgb_model.predict(X_test)
    pred_cat = catboost_model.predict(X_test)

    # Fixed blending weights (they sum to 1).
    y_predict = 0.36 * pred_xgb + 0.36 * pred_lgb + 0.28 * pred_cat

    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100

    print(f"{name}\nScore: {r2:.4f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}%\n")

    results[name] = {
        "score": r2,
        "rmse": rmse,
        "mape": mape
    }

    make_plots(y_test, y_predict, name, r2, rmse, mape)


In [None]:
# Train and evaluate the three-model ensemble once per site.
for site, frame in (
    ("Adelaide", adelaide_df),
    ("Perth", perth_df),
    ("Sydney", sydney_df),
    ("Tasmania", tasmania_df),
):
    train_ensemble_model(frame, f"{site} - Ensemble (XGB+LGB+Cat)")

In [None]:
# Print every recorded run, each followed by its metrics on indented lines.
for run_name in results:
    print(f"\n{run_name}:")
    for metric_name, metric_value in results[run_name].items():
        print(f"  {metric_name}: {metric_value}")

In [None]:
import optuna
import warnings

# Ignore FutureWarning messages from this point on.
warnings.simplefilter(action='ignore', category=FutureWarning)



def objective(trial, X_train, y_train, X_val, y_val):
    """Optuna objective for XGBoost: validation RMSE of one sampled configuration.

    Samples the tunable hyperparameters from ``trial``, fits a model with up
    to 2000 trees and early stopping (patience 50) on the validation set,
    stores the model's ``best_iteration`` as the trial user attribute
    ``"best_iteration"`` and returns the RMSE on the validation set.
    """
    # Sampling order is kept fixed: learning_rate, max_depth, subsample,
    # colsample_bytree, gamma, reg_alpha, reg_lambda.
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
    }
    fixed = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "n_estimators": 2000,
        "random_state": 44,
        "n_jobs": -1,
    }

    model = xgb.XGBRegressor(**fixed, **sampled, early_stopping_rounds=50)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Remember where early stopping found the best score so the caller can
    # size the final model.
    trial.set_user_attr("best_iteration", model.best_iteration)

    val_predictions = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, val_predictions))

def train_xgb_with_optuna(dataframe, name, n_trials=50):
    """Tune XGBoost with Optuna, refit on train+validation and evaluate on the test set.

    The data is split 60/20/20 via ``split_data``. Optuna minimises the
    validation RMSE returned by ``objective``. The best hyperparameters are
    then used to train a final model on the combined train and validation
    parts, with the number of trees taken from the best trial's early-stopping
    point. Test R2, RMSE and MAPE are printed, stored in
    ``results[f"{name}_Optuna_R2"]`` and drawn by ``make_plots``.

    dataframe: table containing the ``X_cols``, ``Y_cols`` and ``powerall_col`` columns
    name: run name used in log output, the ``results`` key and plot names
    n_trials: number of Optuna trials to run
    """
    print(f"--- Optuna tuning for: {name} ---")

    subset = dataframe[X_cols + Y_cols]
    powerall = dataframe[powerall_col]

    # 60% train, 20% validation, 20% test
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(subset, powerall)

    # Seed the sampler so that repeated runs propose the same trials.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=44),
    )
    study.optimize(
        lambda trial: objective(trial, X_train, y_train, X_val, y_val),
        n_trials=n_trials
    )

    print(f"\n--- Tuning for: {name} ended ---")
    print(f"The best iteration (Validation RMSE): {study.best_value:.3f}")
    print("The best hiperparameters:")
    print(study.best_params)

    best_params = study.best_params
    best_iteration = study.best_trial.user_attrs["best_iteration"]

    # Combine train (60%) and validation (20%) sets for final training
    X_train_full = pd.concat([X_train, X_val])
    y_train_full = pd.concat([y_train, y_val])

    final_model = xgb.XGBRegressor(
        **best_params,
        # XGBoost's best_iteration is a 0-based index, so the number of trees
        # that produced the best score is best_iteration + 1.
        n_estimators=best_iteration + 1,
        random_state=44,
        n_jobs=4
    )

    final_model.fit(X_train_full, y_train_full)

    y_predict = final_model.predict(X_test)

    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100

    print(f"\n--- Results for {name} ---")
    print(f"Score (R2): {r2:.4f}")
    print(f"RMSE: {rmse:.3f}")
    # A real newline; this used to print a literal backslash followed by "n".
    print(f"MAPE: {mape:.3f}%\n")
    # NOTE(review): the key suffix "_Optuna_R2" is kept unchanged so existing
    # lookups keep working, although it names a metric rather than the model.
    results[f"{name}_Optuna_R2"] = {"score": r2, "rmse": rmse, "mape": mape}
    make_plots(y_test, y_predict, name + " - Optuna Tuning", r2, rmse, mape)

In [None]:
# Tune and evaluate XGBoost with 50 Optuna trials for every site.
for site, frame in (
    ("Adelaide", adelaide_df),
    ("Perth", perth_df),
    ("Sydney", sydney_df),
    ("Tasmania", tasmania_df),
):
    train_xgb_with_optuna(frame, f"{site} - XGBoost with Optuna HPT", n_trials=50)

In [None]:
def objective_lgb(trial, X_train, y_train, X_val, y_val):
    """Optuna objective for LightGBM: validation RMSE of one sampled configuration.

    Samples the tunable hyperparameters from ``trial``, fits a model with up
    to 2000 trees and an early-stopping callback (patience 50) on the
    validation set, stores the model's ``best_iteration_`` as the trial user
    attribute ``"best_iteration"`` and returns the RMSE on the validation set.
    """
    # Sampling order is kept fixed: learning_rate, num_leaves, max_depth,
    # subsample, colsample_bytree, reg_alpha, reg_lambda.
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
    }
    fixed = {
        "objective": "regression_l2",
        "metric": "rmse",
        "n_estimators": 2000,
        "random_state": 44,
        "n_jobs": -1,
        "verbose": -1,
    }

    model = lgb.LGBMRegressor(**fixed, **sampled)
    stop_early = lgb.early_stopping(50, verbose=False)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="rmse",
        callbacks=[stop_early],
    )

    # Remember where early stopping found the best score so the caller can
    # size the final model.
    trial.set_user_attr("best_iteration", model.best_iteration_)

    val_predictions = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, val_predictions))

def train_lgb_with_optuna(dataframe, name, n_trials=50):
    """Tune LightGBM with Optuna, refit on train+validation and evaluate on the test set.

    The data is split 60/20/20 via ``split_data``. Optuna minimises the
    validation RMSE returned by ``objective_lgb``. The best hyperparameters
    are then used to train a final model on the combined train and validation
    parts, with ``n_estimators`` taken from the best trial's stored
    ``best_iteration``. Test R2, RMSE and MAPE are printed, stored in
    ``results[f"{name}_Optuna_LGB"]`` and drawn by ``make_plots``.

    dataframe: table containing the ``X_cols``, ``Y_cols`` and ``powerall_col`` columns
    name: run name used in log output, the ``results`` key and plot names
    n_trials: number of Optuna trials to run
    """
    print(f"--- Optuna tuning for: {name} ---")

    subset = dataframe[X_cols + Y_cols]
    powerall = dataframe[powerall_col]

    # 60% train, 20% validation, 20% test
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(subset, powerall)

    # Seed the sampler so that repeated runs propose the same trials.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=44),
    )
    study.optimize(
        lambda trial: objective_lgb(trial, X_train, y_train, X_val, y_val),
        n_trials=n_trials
    )

    print(f"\n--- Tuning for: {name} ended ---")
    print(f"The best iteration (Validation RMSE): {study.best_value:.3f}")
    print("The best hiperparameters:")
    print(study.best_params)

    best_params = study.best_params
    best_iteration = study.best_trial.user_attrs["best_iteration"]

    # Combine train (60%) and validation (20%) sets for final training
    X_train_full = pd.concat([X_train, X_val])
    y_train_full = pd.concat([y_train, y_val])

    final_model = lgb.LGBMRegressor(
        **best_params,
        n_estimators=best_iteration,
        random_state=44,
        n_jobs=-1,
        verbose=-1
    )

    final_model.fit(X_train_full, y_train_full)

    y_predict = final_model.predict(X_test)

    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100

    print(f"\n--- Results for {name} ---")
    print(f"Score (R2): {r2:.4f}")
    print(f"RMSE: {rmse:.3f}")
    # A real newline; this used to print a literal backslash followed by "n".
    print(f"MAPE: {mape:.3f}%\n")
    results[f"{name}_Optuna_LGB"] = {"score": r2, "rmse": rmse, "mape": mape}
    make_plots(y_test, y_predict, name + " - Optuna Tuning", r2, rmse, mape)

In [None]:
# Tune and evaluate LightGBM with 50 Optuna trials for every site.
for site, frame in (
    ("Adelaide", adelaide_df),
    ("Perth", perth_df),
    ("Sydney", sydney_df),
    ("Tasmania", tasmania_df),
):
    train_lgb_with_optuna(frame, f"{site} - LightGBM with Optuna HPT", n_trials=50)

In [None]:
import catboost as cb

def objective_cb(trial, X_train, y_train, X_val, y_val):
    """Optuna objective for CatBoost: validation RMSE of one sampled configuration.

    Samples the tunable hyperparameters from ``trial``, fits a model with up
    to 2000 iterations and early stopping (patience 50) on the validation
    set, stores ``model.get_best_iteration()`` as the trial user attribute
    ``"best_iteration"`` and returns the RMSE on the validation set.
    """
    # Sampling order is kept fixed: learning_rate, depth, l2_leaf_reg,
    # subsample, random_strength.
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 3, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 10.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
    }
    fixed = {
        "objective": "RMSE",
        "eval_metric": "RMSE",
        "iterations": 2000,
        "random_seed": 44,
        "thread_count": -1,
        "verbose": False,
    }

    model = cb.CatBoostRegressor(**fixed, **sampled, early_stopping_rounds=50)
    model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)

    # Remember where early stopping found the best score so the caller can
    # size the final model.
    trial.set_user_attr("best_iteration", model.get_best_iteration())

    val_predictions = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, val_predictions))

def train_cb_with_optuna(dataframe, name, n_trials=50):
    """Tune CatBoost with Optuna, refit on train+validation and evaluate on the test set.

    The data is split 60/20/20 via ``split_data``. Optuna minimises the
    validation RMSE returned by ``objective_cb``. The best hyperparameters
    are then used to train a final model on the combined train and validation
    parts, with the number of iterations taken from the best trial's
    early-stopping point. Test R2, RMSE and MAPE are printed, stored in
    ``results[f"{name}_Optuna_CB"]`` and drawn by ``make_plots``.

    dataframe: table containing the ``X_cols``, ``Y_cols`` and ``powerall_col`` columns
    name: run name used in log output, the ``results`` key and plot names
    n_trials: number of Optuna trials to run
    """
    print(f"--- Optuna tuning for: {name} ---")

    subset = dataframe[X_cols + Y_cols]
    powerall = dataframe[powerall_col]

    # 60% train, 20% validation, 20% test
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(subset, powerall)

    # Seed the sampler so that repeated runs propose the same trials.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=44),
    )
    study.optimize(
        lambda trial: objective_cb(trial, X_train, y_train, X_val, y_val),
        n_trials=n_trials
    )

    print(f"\n--- Tuning for: {name} ended ---")
    print(f"The best iteration (Validation RMSE): {study.best_value:.3f}")
    print("The best hiperparameters:")
    print(study.best_params)

    best_params = study.best_params
    best_iteration = study.best_trial.user_attrs["best_iteration"]

    # Combine train (60%) and validation (20%) sets for final training
    X_train_full = pd.concat([X_train, X_val])
    y_train_full = pd.concat([y_train, y_val])

    final_model = cb.CatBoostRegressor(
        **best_params,
        # CatBoost's best iteration is a 0-based index, so the number of
        # iterations that produced the best score is best_iteration + 1.
        iterations=best_iteration + 1,
        random_seed=44,
        thread_count=-1,
        verbose=False
    )

    final_model.fit(X_train_full, y_train_full, verbose=False)

    y_predict = final_model.predict(X_test)

    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100

    print(f"\n--- Results for {name} ---")
    print(f"Score (R2): {r2:.4f}")
    print(f"RMSE: {rmse:.3f}")
    # A real newline; this used to print a literal backslash followed by "n".
    print(f"MAPE: {mape:.3f}%\n")
    results[f"{name}_Optuna_CB"] = {"score": r2, "rmse": rmse, "mape": mape}
    make_plots(y_test, y_predict, name + " - Optuna Tuning", r2, rmse, mape)

In [None]:
# Tune and evaluate CatBoost on the Adelaide data with 50 Optuna trials.
train_cb_with_optuna(adelaide_df, "Adelaide - CatBoost with Optuna HPT", n_trials=50)

In [None]:
# Tune and evaluate CatBoost on the Perth data with 50 Optuna trials.
train_cb_with_optuna(perth_df, "Perth - CatBoost with Optuna HPT", n_trials=50)

In [None]:
# Tune and evaluate CatBoost on the Sydney data with 50 Optuna trials.
train_cb_with_optuna(sydney_df, "Sydney - CatBoost with Optuna HPT", n_trials=50)

In [None]:
# Tune and evaluate CatBoost on the Tasmania data with 50 Optuna trials.
train_cb_with_optuna(tasmania_df, "Tasmania - CatBoost with Optuna HPT", n_trials=50)

In [None]:
# Dump the raw `results` dictionary (run name -> metrics) collected above.
print(results)