In [1]:
import pandas as pd
import numpy as np
import os
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Create column names: X1-X16 for first 16 cols, Y1-Y16 for next 16 cols, power columns for rest
column_names = [f"X{i}" for i in range(1, 17)] + [f"Y{i}" for i in range(1, 17)] + [f"Power{i}" for i in range(1, 17)] + ["Powerall"]

adelaide_df = pd.read_csv('energy/Adelaide_Data.csv', header=None, names=column_names)
perth_df  = pd.read_csv('energy/Perth_Data.csv', header=None, names=column_names)
sydney_df = pd.read_csv('energy/Sydney_Data.csv', header=None, names=column_names)
tasmania_df = pd.read_csv('energy/Tasmania_Data.csv', header=None, names=column_names)

In [2]:
#check if dataset has any wrong values
print("--- Adelaide DataFrame Info ---")
adelaide_df.info()
print("\n--- Adelaide DataFrame Null Counts ---")
print(adelaide_df.isnull().sum())
print(f"\n--- Adelaide DataFrame Duplicate Rows: {adelaide_df.duplicated().sum()} ---")
print("\n--- Adelaide DataFrame Descriptive Statistics ---")
display(adelaide_df.describe())

print("\n\n--- Perth DataFrame Info ---")
perth_df.info()
print("\n--- Perth DataFrame Null Counts ---")
print(perth_df.isnull().sum())
print(f"\n--- Perth DataFrame Duplicate Rows: {perth_df.duplicated().sum()} ---")
print("\n--- Perth DataFrame Descriptive Statistics ---")
display(perth_df.describe())

print("\n\n--- Sydney DataFrame Info ---")
sydney_df.info()
print("\n--- Sydney DataFrame Null Counts ---")
print(sydney_df.isnull().sum())
print(f"\n--- Sydney DataFrame Duplicate Rows: {sydney_df.duplicated().sum()} ---")
print("\n--- Sydney DataFrame Descriptive Statistics ---")
display(sydney_df.describe())

print("\n\n--- Tasmania DataFrame Info ---")
tasmania_df.info()
print("\n--- Tasmania DataFrame Null Counts ---")
print(tasmania_df.isnull().sum())
print(f"\n--- Tasmania DataFrame Duplicate Rows: {tasmania_df.duplicated().sum()} ---")
print("\n--- Tasmania DataFrame Descriptive Statistics ---")
display(tasmania_df.describe())


--- Adelaide DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71999 entries, 0 to 71998
Data columns (total 49 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   X1        71999 non-null  float64
 1   X2        71999 non-null  float64
 2   X3        71999 non-null  float64
 3   X4        71999 non-null  float64
 4   X5        71999 non-null  float64
 5   X6        71999 non-null  float64
 6   X7        71999 non-null  float64
 7   X8        71999 non-null  float64
 8   X9        71999 non-null  float64
 9   X10       71999 non-null  float64
 10  X11       71999 non-null  float64
 11  X12       71999 non-null  float64
 12  X13       71999 non-null  float64
 13  X14       71999 non-null  float64
 14  X15       71999 non-null  float64
 15  X16       71999 non-null  float64
 16  Y1        71999 non-null  float64
 17  Y2        71999 non-null  float64
 18  Y3        71999 non-null  float64
 19  Y4        71999 non-null  float64
 

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,Power8,Power9,Power10,Power11,Power12,Power13,Power14,Power15,Power16,Powerall
count,71999.0,71999.0,71999.0,71999.0,71999.0,71999.0,71999.0,71999.0,71999.0,71999.0,...,71999.0,71999.0,71999.0,71999.0,71999.0,71999.0,71999.0,71999.0,71999.0,71999.0
mean,281.278924,279.31603,294.136617,263.82401,290.186913,246.810469,252.476041,322.610209,280.743361,288.407746,...,88436.529876,88332.518976,87564.244287,88660.640939,88424.979052,87185.488018,87703.940233,89191.14504,88471.467381,1410073.0
std,178.31954,178.036825,182.13773,194.870821,179.637096,191.377291,178.339336,178.435581,190.275108,178.158506,...,10108.67048,10156.721862,10174.872639,10515.790588,10489.965853,10565.601127,10430.102374,10442.279774,10572.633956,56007.3
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,51444.747,51304.0216,49409.8915,49814.0453,51660.4106,50205.3528,51141.5997,50628.5528,47273.9836,1191378.0
25%,117.00735,116.47795,116.2697,76.5733,123.0882,67.4957,89.1128,166.90775,103.62165,126.4304,...,81445.71865,81102.52895,80546.90715,80803.23445,80788.75625,79083.97535,79920.3957,81584.29825,80400.6157,1371208.0
50%,282.7396,280.6737,315.4656,247.2332,286.1849,214.9223,233.6756,356.3331,267.1704,288.2039,...,90310.572,90083.7654,88073.7867,91008.814,89940.0282,87663.5828,89003.8159,92114.119,90699.1335,1402170.0
75%,437.2938,444.1201,455.15265,444.6728,460.145,433.3104,408.797,481.6711,464.0141,448.3743,...,97418.7038,97448.10615,97336.08315,97813.29715,97720.3836,97475.99935,97501.5398,98073.00195,97663.05125,1446064.0
max,566.0,566.0,566.0,566.0,566.0,566.0,566.0,566.0,566.0,566.0,...,105790.3745,105526.2996,105420.1074,105395.2793,105539.9866,105447.76,105301.2025,104602.3188,105390.3271,1583052.0




--- Perth DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72000 entries, 0 to 71999
Data columns (total 49 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   X1        72000 non-null  float64
 1   X2        72000 non-null  float64
 2   X3        72000 non-null  float64
 3   X4        72000 non-null  float64
 4   X5        72000 non-null  float64
 5   X6        72000 non-null  float64
 6   X7        72000 non-null  float64
 7   X8        72000 non-null  float64
 8   X9        72000 non-null  float64
 9   X10       72000 non-null  float64
 10  X11       72000 non-null  float64
 11  X12       72000 non-null  float64
 12  X13       72000 non-null  float64
 13  X14       72000 non-null  float64
 14  X15       72000 non-null  float64
 15  X16       72000 non-null  float64
 16  Y1        72000 non-null  float64
 17  Y2        72000 non-null  float64
 18  Y3        72000 non-null  float64
 19  Y4        72000 non-null  float64
 2

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,Power8,Power9,Power10,Power11,Power12,Power13,Power14,Power15,Power16,Powerall
count,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,...,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0
mean,307.701005,263.069113,310.032429,280.009987,271.573004,271.825573,271.628205,265.36676,300.024726,265.180707,...,87172.791411,87227.400248,87479.702421,87259.608082,86416.545859,86879.938895,86110.179312,88026.263288,87450.093836,1394475.0
std,186.406373,182.58038,188.616824,180.758049,183.357609,181.907878,180.591345,182.81316,179.271019,183.327812,...,10574.754639,10196.63605,10087.831693,10470.692266,10574.289769,9942.085387,10371.908246,10471.593551,10477.559853,52250.27
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,46239.8045,50114.9291,47183.0478,49291.778,49549.9906,49465.6856,47027.6258,48982.4382,46881.5737,1177711.0
25%,132.556675,97.074,126.138925,117.146725,98.51905,102.462975,105.5516,94.703375,134.872,101.7844,...,79380.35215,79774.712375,79824.96795,79398.137825,78207.534625,80019.021325,78324.401825,80115.2517,79372.788325,1359069.0
50%,343.86665,243.84695,347.42045,282.05645,265.2438,267.37665,261.3124,251.31195,316.8587,253.3937,...,88613.66615,88460.8355,88768.45595,88450.9726,87221.24205,87720.77215,86479.809,90513.687,89574.0419,1388878.0
75%,477.28545,427.381725,480.3937,440.3533,441.640325,435.245675,436.602525,441.4543,461.0486,435.324425,...,96919.85915,96704.497125,96880.94575,96922.3661,96700.6237,96550.18855,96575.29235,97074.4805,96752.08935,1426946.0
max,566.0,566.0,566.0,566.0,566.0,566.0,566.0,566.0,566.0,566.0,...,107228.1549,105509.4367,104693.1169,104978.2208,103852.5658,104757.5832,104435.8903,105785.8064,104430.6447,1565836.0




--- Sydney DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72000 entries, 0 to 71999
Data columns (total 49 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   X1        72000 non-null  float64
 1   X2        72000 non-null  float64
 2   X3        72000 non-null  float64
 3   X4        72000 non-null  float64
 4   X5        72000 non-null  float64
 5   X6        72000 non-null  float64
 6   X7        72000 non-null  float64
 7   X8        72000 non-null  float64
 8   X9        72000 non-null  float64
 9   X10       72000 non-null  float64
 10  X11       72000 non-null  float64
 11  X12       72000 non-null  float64
 12  X13       72000 non-null  float64
 13  X14       72000 non-null  float64
 14  X15       72000 non-null  float64
 15  X16       72000 non-null  float64
 16  Y1        72000 non-null  float64
 17  Y2        72000 non-null  float64
 18  Y3        72000 non-null  float64
 19  Y4        72000 non-null  float64
 

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,Power8,Power9,Power10,Power11,Power12,Power13,Power14,Power15,Power16,Powerall
count,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,...,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0
mean,324.13065,318.476619,248.650127,273.804973,364.120788,270.715581,268.090715,251.211585,257.469299,263.833769,...,92460.889647,92400.867081,92359.874711,93528.558491,92533.020595,92866.782188,94515.422258,92124.902168,92812.671388,1486229.0
std,201.042023,200.450328,207.727345,202.496658,165.250467,221.881965,210.125347,191.412032,193.920143,203.080397,...,6351.75487,7314.166192,6617.035471,7108.750555,6631.046214,7179.076034,6828.345629,6978.090343,7128.152117,23083.64
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,67649.7211,67935.5715,67961.5017,67930.7228,68171.3261,68113.9289,68294.9618,66233.3428,64609.7685,1361962.0
25%,137.449825,131.146525,43.2815,63.6229,248.5869,38.906525,57.4634,79.83525,71.295925,48.945475,...,87871.42585,86957.3497,87505.072375,88246.974,87321.9664,87602.965375,90044.9756,86835.5877,87872.9067,1470987.0
50%,355.4917,355.1715,201.8835,274.8346,388.3721,250.3746,233.0067,213.82475,239.2413,265.37975,...,92484.55405,91374.23765,92113.28725,93441.4739,93067.3057,91552.4339,94274.4891,91203.03,91453.7259,1487282.0
75%,524.0015,518.242925,462.5056,468.289225,515.557,511.2093,485.902425,422.66535,432.7845,457.983675,...,97638.0519,97719.189525,97502.1613,98653.893875,97534.19705,98739.563075,99668.561125,97512.5155,98362.51305,1504180.0
max,566.0,566.0,566.0,566.0,566.0,566.0,566.0,566.0,566.0,566.0,...,110026.7539,109417.62,109240.703,109668.4487,109185.0888,109898.206,109237.3888,109087.1614,109253.6068,1536347.0




--- Tasmania DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72000 entries, 0 to 71999
Data columns (total 49 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   X1        72000 non-null  float64
 1   X2        72000 non-null  float64
 2   X3        72000 non-null  float64
 3   X4        72000 non-null  float64
 4   X5        72000 non-null  float64
 5   X6        72000 non-null  float64
 6   X7        72000 non-null  float64
 7   X8        72000 non-null  float64
 8   X9        72000 non-null  float64
 9   X10       72000 non-null  float64
 10  X11       72000 non-null  float64
 11  X12       72000 non-null  float64
 12  X13       72000 non-null  float64
 13  X14       72000 non-null  float64
 14  X15       72000 non-null  float64
 15  X16       72000 non-null  float64
 16  Y1        72000 non-null  float64
 17  Y2        72000 non-null  float64
 18  Y3        72000 non-null  float64
 19  Y4        72000 non-null  float64

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,Power8,Power9,Power10,Power11,Power12,Power13,Power14,Power15,Power16,Powerall
count,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,...,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0,72000.0
mean,300.334674,294.454049,289.066287,263.393313,307.688097,246.713427,278.580047,286.571972,276.4759,299.831906,...,232383.809106,235201.221458,239331.914935,235166.905914,234747.64522,233790.87416,235308.969514,238324.695375,236811.56324,3760135.0
std,191.473699,178.334355,180.893467,181.188155,192.543076,182.99667,182.148218,169.94966,179.503873,185.270428,...,28444.373453,29019.822451,28813.403078,28734.076347,28698.375147,28354.13899,28499.868118,28694.132104,28045.64135,112146.7
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,129775.9101,138563.9963,126897.2655,133149.9114,128318.1579,131102.3632,134838.9489,135212.6375,128026.0832,3235131.0
25%,121.019925,137.40945,126.750025,104.923275,125.937475,75.863775,114.66875,143.87975,122.933775,132.523025,...,211195.8858,213104.082425,217772.12395,213166.2458,212845.3707,211827.84055,213768.764875,217256.7625,216084.572525,3685857.0
50%,316.5647,303.4067,301.6357,244.31325,336.1389,223.47825,264.08635,297.62455,261.1735,314.0382,...,233918.81525,236770.0061,245712.065,237441.053,237238.1615,236434.5287,237548.6664,242743.0654,240613.48405,3755820.0
75%,479.768225,447.732125,446.484425,427.202575,484.349675,407.30515,451.786725,420.962525,442.2153,471.073825,...,259325.638375,265307.61875,265738.63535,264954.393975,263741.43595,261343.4153,264541.756425,265748.740125,265161.6425,3830819.0
max,566.0,566.0,566.0,566.0,566.0,566.0,566.0,566.0,566.0,566.0,...,282143.8044,279824.753,283548.5594,281457.3654,281789.4222,281968.7952,286279.1496,282189.7338,283875.5842,4241838.0


In [3]:
results = {}

# cols 1-16 represent X coordinates
# cols 17-32 represent Y coordinates
# cols 33 - 48 represent power of single containers
# col 49 represents sum of power in the farm
# We want to calculate Powerall depending on X, Y coordinates

X_cols = [f"X{i}" for i in range(1, 17)]
Y_cols = [f"Y{i}" for i in range(1, 17)]
powerall_col = ["Powerall"]

def split_data(subset, powerall):
    X_train, X_temp, y_train, y_temp = train_test_split(subset, powerall, test_size=0.4, random_state=44)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=44)
    return X_train, X_val, X_test, y_train, y_val, y_test


def prepare_subsets(dataframe):
    subset = dataframe[X_cols + Y_cols]
    powerall = dataframe[powerall_col]
    return split_data(subset, powerall)

In [4]:
def plot_predicted_vs_actual(y_test, y_predict, name, r2, rmse, mape, show_plot=False):
    """
    Generates a scatter plot of predicted vs. actual values.
    
    y_test: Actual values
    y_predict: Predicted values
    title: Title for the plot
    """
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_predict, alpha=0.5, label='Predicted vs Actual')
   
    # Ideal line (y=x)
    min_val = min(y_test.min().iloc[0], y_predict.min())
    max_val = max(y_test.max().iloc[0], y_predict.max())
    plt.plot([min_val, max_val], [min_val, max_val], 'k--', lw=2, label='Ideal Fit (y=x)')
   
    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")
    plt.title(f"Predicted vs Actual for {name}")
    plt.legend()
    plt.grid(True)

    # Display R2, RMSE, and MAPE values on the plot
    textstr = f"R2: {r2:.4f}\nRMSE: {rmse:.2f}\nMAPE: {mape:.2f}%"
    plt.gca().text(0.05, 0.95, textstr, transform=plt.gca().transAxes, fontsize=10,
                   verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))
    
    if show_plot:
        plt.show()
    plots_dir = 'plots/predicted_vs_actualy'
    if not os.path.exists(plots_dir):
        os.makedirs(plots_dir)
   
    filename = name.replace(' ', '_') + '.png'
    filepath = os.path.join(plots_dir, filename)
   
    plt.savefig(filepath)
    plt.close()


def plot_residuals(y_test, y_predict, name,  r2, rmse, mape, show_plot=False):
    """
    Generates a scatter plot of residuals vs. predicted values.
       
    y_test: Actual values
    y_predict: Predicted values
    title: Title for the plot
    name: Name for saving the file
    """
    residuals = y_test.values.flatten() - y_predict.flatten()
        
    plt.figure(figsize=(8, 6))
    plt.scatter(y_predict, residuals, alpha=0.5)
        
    # Horizontal line at y=0
    plt.axhline(y=0, color='k', linestyle='--', lw=2)
        
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals (Actual - Predicted)")
    plt.title(f"Residuals vs Predicted Values for {name}")
    plt.grid(True)

    # Display R2, RMSE, and MAPE values on the plot
    textstr = f"R2: {r2:.4f}\nRMSE: {rmse:.2f}\nMAPE: {mape:.2f}%"
    plt.gca().text(0.05, 0.95, textstr, transform=plt.gca().transAxes, fontsize=10,
                   verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))
    
        
    if show_plot:
        plt.show()
        
    plots_dir = 'plots/residuals'
    if not os.path.exists(plots_dir):
        os.makedirs(plots_dir)
        
    filename = name.replace(' ', '_') + '_residuals.png'
    filepath = os.path.join(plots_dir, filename)
        
    plt.savefig(filepath)
    plt.close()

def plot_residuals_distribution(y_test, y_predict, name,  r2, rmse, mape, show_plot=False):
    """
    Generates a histogram of the residuals to show their distribution.
        
    y_test: Actual values
    y_predict: Predicted values
    title: Title for the plot
    name: Name for saving the file
    """
    residuals = y_test.values.flatten() - y_predict.flatten()
        
    plt.figure(figsize=(8, 6))
        
    plt.hist(residuals, bins=50, density=True, alpha=0.6, color='g', label='Residuals Histogram')
        
    # Fit a normal distribution to the data
    mu, std = stats.norm.fit(residuals)
    xmin, xmax = plt.xlim()
    x = np.linspace(xmin, xmax, 100)
    p = stats.norm.pdf(x, mu, std)
    plt.plot(x, p, 'k', linewidth=2, label='Normal Distribution Fit')
        
    plt.axvline(x=0, color='r', linestyle='--', lw=2, label='Zero Residuals')

    plt.xlabel("Residuals")
    plt.ylabel("Density")
    plt.title(f"Residuals Distribution for {name}")
    plt.legend()
    plt.grid(True)

    # Display R2, RMSE, and MAPE values on the plot
    textstr = f"R2: {r2:.4f}\nRMSE: {rmse:.2f}\nMAPE: {mape:.2f}%"
    plt.gca().text(0.05, 0.95, textstr, transform=plt.gca().transAxes, fontsize=10,
                   verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))
        
    if show_plot:
        plt.show()
            
    plots_dir = 'plots/residuals_distribution'
    if not os.path.exists(plots_dir):
        os.makedirs(plots_dir)
            
    filename = name.replace(' ', '_') + '_residuals_dist.png'
    filepath = os.path.join(plots_dir, filename)
            
    plt.savefig(filepath)
    plt.close()

def make_plots(y_test, y_predict, name, r2, rmse, mape):
    plot_predicted_vs_actual(y_test, y_predict, name, r2, rmse, mape)
    plot_residuals(y_test, y_predict, name, r2, rmse, mape)
    plot_residuals_distribution(y_test, y_predict, name, r2, rmse, mape)

    




In [5]:
def train_poly_regression(dataframe, degree, name):
    
    X_train, X_val, X_test, y_train, y_val, y_test = prepare_subsets(dataframe)

    poly_features = PolynomialFeatures(degree=degree, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train)
    X_test_poly = poly_features.transform(X_test)

    model = LinearRegression()
    
    model.fit(X_train_poly, y_train)
    y_predict = model.predict(X_test_poly)
    r2 = model.score(X_test_poly, y_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100
    print(f"{name}\nScore: {r2}")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}%\n")

    results[name] = {
       "score": r2,
       "rmse": rmse,
       "mape": mape
    }

    make_plots(y_test, y_predict, name, r2, rmse, mape)

In [6]:
train_poly_regression(adelaide_df, 1, "Adelaide - Linear Regression")
train_poly_regression(perth_df, 1, "Perth - Linear Regression")
train_poly_regression(sydney_df, 1, "Sydney - Linear Regression")
train_poly_regression(tasmania_df, 1, "Tasmania - Linear Regression")

train_poly_regression(adelaide_df, 2, "Adelaide - Polynomial Regression Degree 2")
train_poly_regression(perth_df, 2, "Perth - Polynomial Regression Degree 2")
train_poly_regression(sydney_df, 2, "Sydney - Polynomial Regression Degree 2")
train_poly_regression(tasmania_df, 2, "Tasmania - Polynomial Regression Degree 2")



Adelaide - Linear Regression
Score: 0.1745709501418038
RMSE: 51144.787
MAPE: 2.906%

Perth - Linear Regression
Score: 0.14944002642095067
RMSE: 48611.310
MAPE: 2.751%

Sydney - Linear Regression
Score: 0.13260959255609428
RMSE: 21606.139
MAPE: 1.169%

Tasmania - Linear Regression
Score: 0.14371569409226093
RMSE: 104221.734
MAPE: 2.169%

Adelaide - Polynomial Regression Degree 2
Score: 0.8455546577463555
RMSE: 22123.249
MAPE: 1.245%

Perth - Polynomial Regression Degree 2
Score: 0.8185148398339247
RMSE: 22454.605
MAPE: 1.263%

Sydney - Polynomial Regression Degree 2
Score: 0.8247720650661557
RMSE: 9711.169
MAPE: 0.485%

Tasmania - Polynomial Regression Degree 2
Score: 0.7297021342418895
RMSE: 58555.893
MAPE: 1.217%



In [7]:
import xgboost as xgb

def train_xgb_model(dataframe, name):

    subset = dataframe[X_cols + Y_cols]
    powerall = dataframe[powerall_col]


    X_train, X_test, y_train, y_test = train_test_split(subset, powerall, test_size=0.2, random_state=44)

    xgb_model = xgb.XGBRegressor (objective="reg:squarederror", n_estimators = 1500, random_state=44, subsample= 0.8, learning_rate= 0.07)
    xgb_model.fit(X_train, y_train)
    y_predict = xgb_model.predict(X_test)

    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100

    print(f"{name}\nScore: {r2}")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}%\n")

    results[name] = {
        "score": r2,
        "rmse": rmse,
        "mape": mape
        }
    
    make_plots(y_test, y_predict, name, r2, rmse, mape)

In [None]:
train_xgb_model(adelaide_df, "Adelaide - XGBoost Regressor")
train_xgb_model(perth_df, "Perth - XGBoost Regressor")
train_xgb_model(sydney_df, "Sydney - XGBoost Regressor")
train_xgb_model(tasmania_df, "Tasmania - XGBoost Regressor")

In [None]:
import lightgbm as lgb

def train_lgb_model(dataframe, name):

    subset = dataframe[X_cols + Y_cols]
    powerall = dataframe[powerall_col]

    X_train, X_test, y_train, y_test = train_test_split(subset, powerall, test_size=0.2, random_state=44)

    lgb_model = lgb.LGBMRegressor(
        objective="regression", 
        n_estimators=1500, 
        random_state=44, 
        subsample=0.8, 
        learning_rate=0.07
    )
    
    lgb_model.fit(X_train, y_train)
    y_predict = lgb_model.predict(X_test)

    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100

    print(f"{name}\nScore: {r2}")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}%\n")

    results[name] = {
        "score": r2,
        "rmse": rmse,
        "mape": mape
    }

    make_plots(y_test, y_predict, name, r2, rmse, mape)

In [None]:
train_lgb_model(adelaide_df, "Adelaide - LightGBM Regressor")
train_lgb_model(perth_df, "Perth - LightGBM Regressor")
train_lgb_model(sydney_df, "Sydney - LightGBM Regressor")
train_lgb_model(tasmania_df, "Tasmania - LightGBM Regressor")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002987 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 57599, number of used features: 32
[LightGBM] [Info] Start training from score 1410128.507641
Adelaide - LightGBM Regressor
Score: 0.9017606010861957
RMSE: 17548.033
MAPE: 0.935%

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005468 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 32
[LightGBM] [Info] Start training from score 1394560.146675
Perth - LightGBM Regressor
Score: 0.8813326497717089
RMSE: 17930.987
MAPE: 0.974%

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005103 seconds.
You can set `force_col_wise=tru

In [None]:
import catboost as cb

def train_catboost_model(dataframe, name):
    X_cols = [f"X{i}" for i in range(1, 17)]
    Y_cols = [f"Y{i}" for i in range(1, 17)]
    powerall_col = ["Powerall"]
    subset = dataframe[X_cols + Y_cols]
    powerall = dataframe[powerall_col]

    X_train, X_temp, y_train, y_temp = train_test_split(subset, powerall, test_size=0.4, random_state=44)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=44)

    catboost_model = cb.CatBoostRegressor(
        iterations=3000,
        learning_rate=0.05,
        depth=7,
        l2_leaf_reg=3,
        random_seed=44,
        verbose=False,
        early_stopping_rounds=100
    )
    
    catboost_model.fit(
        X_train, 
        y_train,
        eval_set=(X_val, y_val),
        verbose=False
    )
    
    y_predict = catboost_model.predict(X_test)

    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100

    print(f"{name}\nScore: {r2:.4f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}%\n")

    results[name] = {
        "score": r2,
        "rmse": rmse,
        "mape": mape
    }

    make_plots(y_test, y_predict, name, r2, rmse, mape)

In [None]:
train_catboost_model(adelaide_df, "Adelaide - CatBoost Regressor")
train_catboost_model(perth_df, "Perth - CatBoost Regressor")
train_catboost_model(sydney_df, "Sydney - CatBoost Regressor")
train_catboost_model(tasmania_df, "Tasmania - CatBoost Regressor")

Adelaide - CatBoost Regressor
Score: 0.9090
RMSE: 16978.547
MAPE: 0.896%

Perth - CatBoost Regressor
Score: 0.8886
RMSE: 17595.356
MAPE: 0.949%

Sydney - CatBoost Regressor
Score: 0.9224
RMSE: 6462.188
MAPE: 0.297%

Tasmania - CatBoost Regressor
Score: 0.8338
RMSE: 45916.585
MAPE: 0.934%



In [None]:
def train_ensemble_model(dataframe, name):
    X_cols = [f"X{i}" for i in range(1, 17)]
    Y_cols = [f"Y{i}" for i in range(1, 17)]
    powerall_col = ["Powerall"]
    subset = dataframe[X_cols + Y_cols]
    powerall = dataframe[powerall_col]

    X_train, X_temp, y_train, y_temp = train_test_split(subset, powerall, test_size=0.4, random_state=44)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=44)


    xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=3000, 
                                  learning_rate=0.05, max_depth=7, subsample=0.8, 
                                  random_state=44, colsample_bytree=0.8)
    
    lgb_model = lgb.LGBMRegressor(objective="regression", n_estimators=3000, 
                                   learning_rate=0.05, max_depth=7, subsample=0.8, 
                                   random_state=44, colsample_bytree=0.8, verbose=-1)
    
    catboost_model = cb.CatBoostRegressor(iterations=3000, learning_rate=0.05, 
                                          depth=7, random_seed=44, verbose=False)

 
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], 
                  callbacks=[lgb.early_stopping(stopping_rounds=100)])
    catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)


    pred_xgb = xgb_model.predict(X_test)
    pred_lgb = lgb_model.predict(X_test)
    pred_cat = catboost_model.predict(X_test)


    y_predict = 0.36 * pred_xgb + 0.36 * pred_lgb + 0.28 * pred_cat

    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100

    print(f"{name}\nScore: {r2:.4f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}%\n")

    results[name] = {
        "score": r2,
        "rmse": rmse,
        "mape": mape
    }

    make_plots(y_test, y_predict, name, r2, rmse, mape)


In [None]:
train_ensemble_model(adelaide_df, "Adelaide - Ensemble (XGB+LGB+Cat)")
train_ensemble_model(perth_df, "Perth - Ensemble (XGB+LGB+Cat)")
train_ensemble_model(sydney_df, "Sydney - Ensemble (XGB+LGB+Cat)")
train_ensemble_model(tasmania_df, "Tasmania - Ensemble (XGB+LGB+Cat)")

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[3000]	valid_0's l2: 2.94498e+08
Adelaide - Ensemble (XGB+LGB+Cat)
Score: 0.9104
RMSE: 16847.497
MAPE: 0.885%

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[2997]	valid_0's l2: 3.18993e+08
Perth - Ensemble (XGB+LGB+Cat)
Score: 0.8896
RMSE: 17510.181
MAPE: 0.938%

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[3000]	valid_0's l2: 4.29638e+07
Sydney - Ensemble (XGB+LGB+Cat)
Score: 0.9271
RMSE: 6264.027
MAPE: 0.265%

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[2999]	valid_0's l2: 2.14339e+09
Tasmania - Ensemble (XGB+LGB+Cat)
Score: 0.8347
RMSE: 45795.159
MAPE: 0.927%



In [None]:
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value}")


Adelaide - Linear Regression:
  score: 0.1745709501418038
  rmse: 51144.786579729225
  mape: 2.9064190651831594

Perth - Linear Regression:
  score: 0.14944002642095067
  rmse: 48611.30996211733
  mape: 2.7505659369864275

Sydney - Linear Regression:
  score: 0.13260959255609428
  rmse: 21606.138651417088
  mape: 1.1691562311047736

Tasmania - Linear Regression:
  score: 0.14371569409226093
  rmse: 104221.73408520623
  mape: 2.1690888425474233

Adelaide - Polynomial Regression Degree 2:
  score: 0.8455546577463555
  rmse: 22123.24881646634
  mape: 1.2454148094950672

Perth - Polynomial Regression Degree 2:
  score: 0.8185148398339247
  rmse: 22454.60466406559
  mape: 1.2633278031244077

Sydney - Polynomial Regression Degree 2:
  score: 0.8247720650661557
  rmse: 9711.16923726056
  mape: 0.4848479106144605

Tasmania - Polynomial Regression Degree 2:
  score: 0.7297021342418895
  rmse: 58555.893335135246
  mape: 1.2166544054952246

Adelaide - XGBoost Regressor:
  score: 0.90413308143615

In [None]:
import optuna
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)



def objective(trial, X_train, y_train, X_val, y_val):

    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "n_estimators": 2000,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 44,
        "n_jobs": -1
    }
    
    model = xgb.XGBRegressor(
        **params,
        early_stopping_rounds=50
    )

    model.fit(
        X_train, 
        y_train, 
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    trial.set_user_attr("best_iteration", model.best_iteration)
    
    
    y_predict = model.predict(X_val)
    
    
    rmse = np.sqrt(mean_squared_error(y_val, y_predict))
    return rmse

def train_xgb_with_optuna(dataframe, name, n_trials=50):
    
    print(f"--- Optuna tuning for: {name} ---")
    
    X_cols = [f"X{i}" for i in range(1, 17)]
    Y_cols = [f"Y{i}" for i in range(1, 17)]
    powerall_col = ["Powerall"]
    subset = dataframe[X_cols + Y_cols]
    powerall = dataframe[powerall_col]


    # 60% train, 40% temp
    X_train, X_temp, y_train, y_temp = train_test_split(subset, powerall, test_size=0.4, random_state=44)
    # 50% of temp -> validation
    # 50% of temp -> test
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=44)


    study = optuna.create_study(direction="minimize")
    study.optimize(
        lambda trial: objective(trial, X_train, y_train, X_val, y_val), 
        n_trials=n_trials
    )

    print(f"\n--- Tuning for: {name} ended ---")
    print(f"The best iteration (Validation RMSE): {study.best_value:.3f}")
    print("The best hiperparameters:")
    print(study.best_params)

    best_params = study.best_params
    best_iteration = study.best_trial.user_attrs["best_iteration"]
    
    # Combine train (60%) and validation (20%) sets for final training
    X_train_full = pd.concat([X_train, X_val])
    y_train_full = pd.concat([y_train, y_val])
    
    final_model = xgb.XGBRegressor(
        **best_params,
        n_estimators=best_iteration,
        random_state=44,
        n_jobs=4
    )
    
    final_model.fit(X_train_full, y_train_full)
    
    y_predict = final_model.predict(X_test)
    
    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100

    print(f"\n--- Results for {name} ---")
    print(f"Score (R2): {r2:.4f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}%\\n")
    results[f"{name}_Optuna_R2"] = {"score": r2, "rmse": rmse, "mape": mape}
    make_plots(y_test, y_predict, name + " - Optuna Tuning", r2, rmse, mape)

In [None]:
train_xgb_with_optuna(adelaide_df, "Adelaide - XGBoost with Optuna HPT", n_trials=50)
train_xgb_with_optuna(perth_df, "Perth - XGBoost with Optuna HPT", n_trials=50)
train_xgb_with_optuna(sydney_df, "Sydney - XGBoost with Optuna HPT", n_trials=50)
train_xgb_with_optuna(tasmania_df, "Tasmania - XGBoost with Optuna HPT", n_trials=50)

[I 2025-11-25 19:24:09,179] A new study created in memory with name: no-name-5a75e1cd-3609-4a87-bc3a-4ffe153d6d07


--- Optuna tuning for: Adelaide - XGBoost with Optuna HPT ---


[I 2025-11-25 19:24:22,249] Trial 0 finished with value: 19910.818767695113 and parameters: {'learning_rate': 0.148943211492241, 'max_depth': 9, 'subsample': 0.9235052938044574, 'colsample_bytree': 0.9532706264009774, 'gamma': 0.09286707381119697, 'reg_alpha': 7.516854521316566e-07, 'reg_lambda': 1.1266969160904916e-07}. Best is trial 0 with value: 19910.818767695113.
[I 2025-11-25 19:24:30,043] Trial 1 finished with value: 18096.886804088706 and parameters: {'learning_rate': 0.01521478344333008, 'max_depth': 7, 'subsample': 0.9814707026012683, 'colsample_bytree': 0.8086820149691745, 'gamma': 0.0011704439275789116, 'reg_alpha': 7.58475459698496e-08, 'reg_lambda': 0.0014393412019140916}. Best is trial 1 with value: 18096.886804088706.
[I 2025-11-25 19:24:32,799] Trial 2 finished with value: 17880.78208580374 and parameters: {'learning_rate': 0.1259564093326749, 'max_depth': 4, 'subsample': 0.7984258683767347, 'colsample_bytree': 0.831797284417624, 'gamma': 0.011440338699832754, 'reg_alp


--- Tuning for: Adelaide - XGBoost with Optuna HPT ended ---
The best iteration (Validation RMSE): 17299.491
The best hiperparameters:
{'learning_rate': 0.054661819857065565, 'max_depth': 6, 'subsample': 0.6930187833511031, 'colsample_bytree': 0.8629562804087731, 'gamma': 2.0401212058123916e-05, 'reg_alpha': 0.017587446925232952, 'reg_lambda': 5.795348487278384e-07}

--- Results for Adelaide - XGBoost with Optuna HPT ---
Score (R2): 0.9065
RMSE: 17210.135
MAPE: 0.910%\n


[I 2025-11-25 19:30:15,795] A new study created in memory with name: no-name-5296a051-94e6-419f-8d38-907a66f73f97


--- Optuna tuning for: Perth - XGBoost with Optuna HPT ---


[I 2025-11-25 19:30:18,536] Trial 0 finished with value: 18486.82038642665 and parameters: {'learning_rate': 0.15495368417212518, 'max_depth': 3, 'subsample': 0.7721313792260174, 'colsample_bytree': 0.8622195992891464, 'gamma': 0.037910207213409736, 'reg_alpha': 0.03580079198531317, 'reg_lambda': 5.070606457799454e-07}. Best is trial 0 with value: 18486.82038642665.
[I 2025-11-25 19:30:21,679] Trial 1 finished with value: 18165.12350632387 and parameters: {'learning_rate': 0.05231488259597966, 'max_depth': 4, 'subsample': 0.826854127409946, 'colsample_bytree': 0.7399301744972612, 'gamma': 0.6983729647098038, 'reg_alpha': 0.015768876537286647, 'reg_lambda': 4.7222843340641765}. Best is trial 1 with value: 18165.12350632387.
[I 2025-11-25 19:30:26,602] Trial 2 finished with value: 18963.363414753196 and parameters: {'learning_rate': 0.12843953439647274, 'max_depth': 7, 'subsample': 0.9786960771655163, 'colsample_bytree': 0.7076956272529076, 'gamma': 0.1775775897829326, 'reg_alpha': 2.154

KeyboardInterrupt: 

In [None]:
def objective_lgb(trial, X_train, y_train, X_val, y_val):
    
    params = {
        "objective": "regression_l2",
        "metric": "rmse",
        "n_estimators": 2000,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 44,
        "n_jobs": -1,
        "verbose": -1
    }

    model = lgb.LGBMRegressor(**params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="rmse",
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )

    trial.set_user_attr("best_iteration", model.best_iteration_)
    
    y_predict = model.predict(X_val)
    
    rmse = np.sqrt(mean_squared_error(y_val, y_predict))
    return rmse

def train_lgb_with_optuna(dataframe, name, n_trials=50):
    
    print(f"--- Optuna tuning for: {name} ---")
    
    X_cols = [f"X{i}" for i in range(1, 17)]
    Y_cols = [f"Y{i}" for i in range(1, 17)]
    powerall_col = ["Powerall"]
    subset = dataframe[X_cols + Y_cols]
    powerall = dataframe[powerall_col]

    # 60% train, 40% temp
    X_train, X_temp, y_train, y_temp = train_test_split(subset, powerall, test_size=0.4, random_state=44)
    # 50% of temp -> validation
    # 50% of temp -> test
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=44)


    study = optuna.create_study(direction="minimize")
    study.optimize(
        lambda trial: objective_lgb(trial, X_train, y_train, X_val, y_val), 
        n_trials=n_trials
    )

    print(f"\n--- Tuning for: {name} ended ---")
    print(f"The best iteration (Validation RMSE): {study.best_value:.3f}")
    print("The best hiperparameters:")
    print(study.best_params)

    best_params = study.best_params
    best_iteration = study.best_trial.user_attrs["best_iteration"]
    
    # Combine train (60%) and validation (20%) sets for final training
    X_train_full = pd.concat([X_train, X_val])
    y_train_full = pd.concat([y_train, y_val])
    
    final_model = lgb.LGBMRegressor(
        **best_params,
        n_estimators=best_iteration,
        random_state=44,
        n_jobs=-1,
        verbose=-1
    )
    
    final_model.fit(X_train_full, y_train_full)
    
    y_predict = final_model.predict(X_test)
    
    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100

    print(f"\n--- Results for {name} ---")
    print(f"Score (R2): {r2:.4f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}%\\n")
    results[f"{name}_Optuna_LGB"] = {"score": r2, "rmse": rmse, "mape": mape}
    make_plots(y_test, y_predict, name + " - Optuna Tuning", r2, rmse, mape)

In [None]:
train_lgb_with_optuna(adelaide_df, "Adelaide - LightGBM with Optuna HPT", n_trials=50)
train_lgb_with_optuna(perth_df, "Perth - LightGBM with Optuna HPT", n_trials=50)
train_lgb_with_optuna(sydney_df, "Sydney - LightGBM with Optuna HPT", n_trials=50)
train_lgb_with_optuna(tasmania_df, "Tasmania - LightGBM with Optuna HPT", n_trials=50)

In [None]:
import catboost as cb

def objective_cb(trial, X_train, y_train, X_val, y_val):
    
    params = {
        "objective": "RMSE",
        "eval_metric": "RMSE",
        "iterations": 2000,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 3, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 10.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        "random_seed": 44,
        "thread_count": -1,
        "verbose": False
    }

    model = cb.CatBoostRegressor(
        **params,
        early_stopping_rounds=50
    )

    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        verbose=False
    )

    trial.set_user_attr("best_iteration", model.get_best_iteration())
    
    y_predict = model.predict(X_val)
    
    rmse = np.sqrt(mean_squared_error(y_val, y_predict))
    return rmse

def train_cb_with_optuna(dataframe, name, n_trials=50):
    
    print(f"--- Optuna tuning for: {name} ---")
    
    X_cols = [f"X{i}" for i in range(1, 17)]
    Y_cols = [f"Y{i}" for i in range(1, 17)]
    powerall_col = ["Powerall"]
    subset = dataframe[X_cols + Y_cols]
    powerall = dataframe[powerall_col]

    # 60% train, 40% temp
    X_train, X_temp, y_train, y_temp = train_test_split(subset, powerall, test_size=0.4, random_state=44)
    # 50% of temp -> validation
    # 50% of temp -> test
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=44)


    study = optuna.create_study(direction="minimize")
    study.optimize(
        lambda trial: objective_cb(trial, X_train, y_train, X_val, y_val), 
        n_trials=n_trials
    )

    print(f"\n--- Tuning for: {name} ended ---")
    print(f"The best iteration (Validation RMSE): {study.best_value:.3f}")
    print("The best hiperparameters:")
    print(study.best_params)

    best_params = study.best_params
    best_iteration = study.best_trial.user_attrs["best_iteration"]
    
    # Combine train (60%) and validation (20%) sets for final training
    X_train_full = pd.concat([X_train, X_val])
    y_train_full = pd.concat([y_train, y_val])
    
    final_model = cb.CatBoostRegressor(
        **best_params,
        iterations=best_iteration,
        random_seed=44,
        thread_count=-1,
        verbose=False
    )
    
    final_model.fit(X_train_full, y_train_full, verbose=False)
    
    y_predict = final_model.predict(X_test)
    
    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape = mean_absolute_percentage_error(y_test, y_predict) * 100

    print(f"\n--- Results for {name} ---")
    print(f"Score (R2): {r2:.4f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAPE: {mape:.3f}%\\n")
    results[f"{name}_Optuna_CB"] = {"score": r2, "rmse": rmse, "mape": mape}
    make_plots(y_test, y_predict, name + " - Optuna Tuning", r2, rmse, mape)

In [None]:
train_cb_with_optuna(adelaide_df, "Adelaide - CatBoost with Optuna HPT", n_trials=50)

In [None]:
train_cb_with_optuna(perth_df, "Perth - CatBoost with Optuna HPT", n_trials=50)

In [None]:
train_cb_with_optuna(sydney_df, "Sydney - CatBoost with Optuna HPT", n_trials=50)

In [None]:
train_cb_with_optuna(tasmania_df, "Tasmania - CatBoost with Optuna HPT", n_trials=50)

In [None]:
print(results)