In [1]:
import pandas as pd
from pathlib import Path
from src.config.paths import HAMARAT_DATA_DIR
from C_oblique_decision_tree_benchmark.evaluation.benchmark_runner import DepthSweepRunner

# Scenario inputs: read with pandas' default header handling, so the first
# row of experiments.csv supplies the column names.
experiments = pd.read_csv(HAMARAT_DATA_DIR / "experiments.csv")

# Outcome tables and the time vector are read without a header row
# (header=None), so their columns are labelled by integer position.
# Note that TIME.csv is read unconditionally: the read fails if the file
# is missing from HAMARAT_DATA_DIR.
fraction_renewables, carbon_emissions_reduction, time = (
    pd.read_csv(HAMARAT_DATA_DIR / csv_name, header=None)
    for csv_name in (
        "fraction renewables.csv",
        "carbon emissions reduction fraction.csv",
        "TIME.csv",
    )
)

In [3]:
# Sanity check: report the (rows, columns) shape of each loaded table.
for caption, table in (
    ("Experiments:", experiments),
    ("Fraction Renewables:", fraction_renewables),
    ("Carbon Emissions Reduction:", carbon_emissions_reduction),
):
    print(caption, table.shape)

Experiments: (5000, 48)
Fraction Renewables: (5000, 641)
Carbon Emissions Reduction: (5000, 641)


In [4]:
# Threshold on the final renewable share: runs that end below this value are
# treated as undesirable outcomes (label = 1), all others as acceptable (label = 0).
RENEWABLE_SHARE_THRESHOLD = 0.60

# Guard against silent mislabelling: the label is attached to `experiments`
# by pandas index alignment, so a row-count mismatch would not raise on its
# own; it would leave NaN labels (or drop surplus ones) instead.
if len(fraction_renewables) != len(experiments):
    raise ValueError(
        "fraction_renewables and experiments must have the same number of rows "
        f"(got {len(fraction_renewables)} and {len(experiments)})."
    )

# Take the last column of the outcome table, i.e. the renewable fraction at the
# final simulation timestep of each run. Each row is one scenario and each
# column one timestep.
# NOTE(review): this was previously described as quarterly steps from 2010 to
# 2050, but the table has 641 columns, whereas 40 years of quarterly steps
# would give 161. Confirm the actual step size against TIME.csv.
final_renewable_share = fraction_renewables.iloc[:, -1]

# A missing final value compares as False against the threshold and would
# therefore be silently labelled "acceptable" (0); fail loudly instead.
if final_renewable_share.isna().any():
    raise ValueError(
        "The final column of fraction_renewables contains missing values; "
        "cannot derive a reliable label for every scenario."
    )

# Binary classification target: 1 = final renewable share below the threshold
# (undesirable), 0 = otherwise (acceptable).
label = (final_renewable_share < RENEWABLE_SHARE_THRESHOLD).astype(int)

# Attach the label to the experiments table so each scenario row carries its
# outcome class. This adds (or overwrites) the "label" column in place.
experiments["label"] = label

# Columns excluded from the feature matrix:
# - "label" is the target and must not leak into the inputs.
# - "model", "policy" and "year" are treated here as metadata rather than
#   decision variables.
columns_to_drop = ["label", "model", "policy", "year"]

# Feature matrix (X): every remaining column of the experiments table.
X_df = experiments.drop(columns=columns_to_drop)

# Classification target (y) as a NumPy array.
y_array = experiments["label"].values

In [5]:
# Datasets are handed to DepthSweepRunner as a mapping of
#   dataset name -> (X, y)
# with X the feature DataFrame and y the NumPy array of binary labels.
# Only one dataset is registered here; further entries can be added to the
# same mapping.
datasets = {"hamarat_energy_transition": (X_df, y_array)}

# Algorithm to benchmark. Registry keys listed in the original notes:
# "hhcart_a", "hhcart_d", "cart", "randcart", "ridge_cart", "oc1", "co2", "wodt".
# NOTE(review): the original note describes "hhcart_d" as HHCART with a
# "diagonal reflection strategy"; the HHCART literature uses (D) for the
# dominant-eigenvector variant. Confirm which is meant.
algorithm = "hhcart_d"

# build_registry() returns the full name -> model-factory mapping; keep only
# the entry for the selected algorithm so the sweep runs a single model.
available_models = DepthSweepRunner.build_registry()
registry = {algorithm: available_models[algorithm]}

In [6]:
# Set up the depth sweep over the registered dataset(s).
# max_depth=4 is the deepest tree explored; the recorded run shows five sweep
# iterations, consistent with depths 0 through 4 inclusive.
runner = DepthSweepRunner(datasets=datasets, max_depth=4)

# Export settings for the sweep:
# - auto_export: write the results DataFrame to CSV automatically
# - filename: name of the results CSV
# - tree_dict_filename: name of the .pkl file holding the trained trees
export_options = {
    "auto_export": True,
    "filename": f"{algorithm}_hamarat.csv",
    "tree_dict_filename": f"{algorithm}_hamarat.pkl",
}

# Run the sweep for the selected algorithm only. Returns:
# - results_df: metrics for each (algorithm, depth) combination
# - tree_dict: the trained trees, kept for later inspection or visualisation
# The recorded run took about 34 minutes, so re-run this cell deliberately.
results_df, tree_dict = runner.run(registry=registry, **export_options)

Depth Sweeping: 100%|██████████| 5/5 [34:00<00:00, 408.10s/it]

[OK] Saved DataFrame to: C:\Users\jaspe\OneDrive\Desktop\Oblique-Decision-Tree-Algorithms-for-Scenario-Discovery\_data\depth_sweep_single_run_results\hhcart_d_hamarat.csv
[OK] Saved trees_dict to: C:\Users\jaspe\OneDrive\Desktop\Oblique-Decision-Tree-Algorithms-for-Scenario-Discovery\_data\depth_sweep_single_run_results\hhcart_d_hamarat.pkl



