In [None]:
import json
import os
from pathlib import Path

import GPy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from scipy.stats import qmc

from ModelSetting_new import Model
from UtilityFunctions import U_SMOCU
from OptimizeMethod import Multi_start_SGD

from safir_runner import SafirRunner
from al_plotting import plot_acquisition_values, plot_relative_acq_change, plot_log_likelihood, plot_misclassification, plot_gp_decision_boundary_2d, plot_gp_decision_boundary_3d, plot_flips_frac_values, plot_boundary_spread
from al_helpers import DesignVar, normalise_point, denormalise_point, df_to_training_data, save_model_state, load_model_state, relative_sequential_difference

In [None]:
# Active design variables, one tuple per variable:
# (name, lower bound, upper bound, unit).
# Commented-out entries are variables that are currently not part of the study.
_design_var_specs = [
    ("t_h", 1.0, 60.0, "min"),         # heating duration
    ("B", 0.250, 0.500, "m"),          # column width
    ("F", 300000, 1800000, "N"),       # applied force
    ("l", 3.00, 5.00, "m"),            # column length
    ("T_max", 600, 1200, "°C"),        # maximum temperature
    ("r_cool", 6, 12, "°C/min"),       # cooling rate
    # ("rho", 360, 490, "kg/m^3"),         # density
    # ("E", 9.44e9, 17.37e9, "N/m^2"),     # modulus of elasticity
    # ("f_c", 35.7e6, 45.8e6, "N/m^2"),    # compressive strength
    # ("e0", 0.013275, 0.026865, "m"),     # eccentricity
]

design_vars = [DesignVar(*spec) for spec in _design_var_specs]

# Number of active design variables (input dimension used by the cells below).
d = len(design_vars)

In [None]:
# Working directory of this active-learning run.
# The location can be overridden with the AL_BASE_PATH environment variable so
# the notebook runs on other machines without editing this cell. When the
# variable is unset (or empty) the original hard-coded location is used.
_DEFAULT_BASE_PATH = r"C:\Users\justu\Documents\Master_Thesis\SAFIR\Files\Active_Learning\TEST"
base_path = Path(os.environ.get("AL_BASE_PATH") or _DEFAULT_BASE_PATH)
base_path.mkdir(parents=True, exist_ok=True)

# Files kept inside the run directory.
data_path   = base_path / "Data_AL.csv"      # simulation results
kernel_path = base_path / "Kernel_AL.json"   # saved model state


# Column layout of the results table; it is handed to SafirRunner and is also
# used to create an empty DataFrame when no stored data exists.
column_names = [
    "Analysis", "e0", "rho", "E", "mu", "f_c", "f_t", "w", "h_ch", "h_cc", "eps",
    "B", "l", "F", "t_h", "T_max", "r_cool", "H", "t_end", "t_end_guess",
    "fine_size", "n_elements", "failure", "failure_time",
    "time_thermo", "time_mech", "time_tot", "stiffness",
]

runner = SafirRunner(base_dir=base_path, column_names=column_names)

In [None]:
# Likelihood and kernel of the GP classifier: a Bernoulli likelihood for the
# binary labels and an ARD RBF kernel (one lengthscale per input dimension).
lik = GPy.likelihoods.Bernoulli()
kernel = GPy.kern.RBF(
    input_dim=d,
    variance=3.0,
    lengthscale=0.4,
    ARD=True,
)

# Surrogate-model design space: the same interval along every dimension.
x_bounds = [0.0, 1.0]
xl = np.full(d, x_bounds[0])
xu = np.full(d, x_bounds[1])
xinterval = (xl, xu)

# Uniform prior over that box: constant density 1/volume and its logarithm.
volume = (xu - xl).prod()
px = 1.0 / volume
px_log = -np.log(volume)

# Binary classification -> two classes.
c_num = 2
parameters = [d, c_num, kernel, lik]

# Settings forwarded to Model as vinterval / linterval / prior_mean.
vinterval = None
linterval = (0.1, 1.0)
prior_mean = -2

In [None]:
# Read the stored dataset when the CSV exists; otherwise begin with an empty
# frame that has the expected column layout.
data_file_found = data_path.exists()
if data_file_found:
    df = pd.read_csv(data_path)
else:
    df = pd.DataFrame(columns=column_names)

# A warm start requires at least one stored row.
use_existing_data = data_file_found and len(df) > 0

if not use_existing_data:
    print("Cold start: no existing data found. Running initial SAFIR simulation.")
    # Physical coordinates of the single initial simulation point.
    phys0 = dict(
        t_h=50.0,      # [min]
        B=0.260,       # [m]
        F=500000,      # [N]
        l=3.2,         # [m]
        T_max=1150,    # [°C]
        r_cool=11.5,   # [°C/min]
        # rho=380,     # [kg/m^3]
        # E=10.0e9,    # [N/m^2]
        # f_c=38.0e6,  # [N/m^2]
        # e0=0.024,    # [m]
    )
    y0, df = runner.run(phys0, df)
    print("Initial failure label:", y0)

    # One training row (1 x d input) with its (1 x 1) label.
    X_init = normalise_point(phys0, design_vars)[None, :]
    Y_init = np.array([[y0]], dtype=float)
else:
    print(f"Warm start: using existing dataset with {len(df)} rows from {data_path}")
    X_init, Y_init = df_to_training_data(df, design_vars, y_col="failure")

    if kernel_path.exists():
        print(f"Found saved model state at {kernel_path}. Reloading.")
        kernel, saved_prior_mean = load_model_state(
            kernel_path=kernel_path,
            d=d,
            expected_design_vars=[dv.name for dv in design_vars],
            expected_xinterval=xinterval,
        )
        # Keep the configured prior mean when the saved state carries none.
        if saved_prior_mean is not None:
            prior_mean = float(saved_prior_mean)

    # Rebuild the parameter list so it refers to the (possibly reloaded) kernel.
    parameters = [d, c_num, kernel, lik]

# Build the GPC model from the initial training data.
model_settings = dict(
    X=X_init,
    Y=Y_init,
    parameters=parameters,
    xinterval=xinterval,
    optimize=False,
    kern_variance_fix=False,
    kern_lengthscale_fix=False,
    mean_fix=True,
    vinterval=vinterval,
    linterval=linterval,
    prior_mean=prior_mean,
    px_log=px_log,
)
model = Model(**model_settings)

# Summary of the starting state.
gpc_initial = model.gpc
print("Initial kernel:", gpc_initial.kern)
print("Initial lengthscales:", gpc_initial.kern.lengthscale)
print("Initial mean function:", gpc_initial.mean_function)
print("Initial training points:", gpc_initial.X.shape[0])

In [None]:
# Settings for the NR-SMOCU acquisition and its multi-start SGD search.
smocu_x_num = 4000             # points used to approximate the SMOCU integral
SGD_steps = 200                # number of gradient steps
mc_search_num = 1_000_000      # candidates for the initial Sobol search
mc_search_num_frac = 0.001     # fraction of the Sobol candidates kept: those scoring closest to the current decision boundary

# approx_label=True selects the NR-SMOCU variant.
acq = U_SMOCU(softtype=2, k=20, x_num=smocu_x_num, approx_label=True)

In [None]:
# Per-iteration active-learning log, stored next to the dataset.
log_path = base_path / "AL_Log.csv"

# Column order of the log; one row is written per active-learning iteration.
AL_LOG_COLUMNS = [
    # iteration index and per-iteration metrics
    "global_iter", "acq_value", "miscl", "log_likelihood",
    # hyperparameter-optimisation state
    "start_opt", "count", "optimisation_count",
    # decision-boundary monitoring
    "boundary_frac", "flips_frac", "flips_frac_count", "plot_thresh",
]

In [None]:
# Load or initialise the active-learning bookkeeping.
#
# The fresh-start defaults are assigned once, up front, and are only overridden
# when a log file with at least one row exists. This replaces two identical
# copies of the default block and guarantees every bookkeeping name (including
# plot_thresh) is defined after this cell, whatever state the kernel was in.
al_log = pd.DataFrame(columns=AL_LOG_COLUMNS)
acq_list = []
miscl_list = []
log_likelihood_list = []
boundary_frac_list = []
flips_frac_list = []
start_opt = False
count = 0
optimisation_count = 0
global_iter_start = 0
flips_frac_count = 0
# Not computed yet on a fresh start; the AL loop assigns it while count == 0.
plot_thresh = float("nan")

if log_path.exists():
    logged = pd.read_csv(log_path)
    if len(logged) > 0:
        # Resume: full metric histories come from the columns, scalar state
        # from the most recent row.
        al_log = logged
        last_row = al_log.iloc[-1]

        acq_list = al_log["acq_value"].astype(float).tolist()
        miscl_list = al_log["miscl"].astype(float).tolist()
        log_likelihood_list = al_log["log_likelihood"].astype(float).tolist()
        boundary_frac_list = al_log["boundary_frac"].astype(float).tolist()
        flips_frac_list = al_log["flips_frac"].astype(float).tolist()

        start_opt = bool(last_row["start_opt"])
        count = int(last_row["count"])
        optimisation_count = int(last_row["optimisation_count"])
        global_iter_start = int(last_row["global_iter"])
        flips_frac_count = int(last_row["flips_frac_count"])
        plot_thresh = float(last_row["plot_thresh"])

print(
    f"AL resume state: global_iter_start={global_iter_start}, "
    f"start_opt={start_opt}, count={count}, optimisation_count={optimisation_count}"
)

In [None]:
# Sanity check when resuming: compare the number of dataset rows with the
# number of log rows. The accepted state is exactly one more data row than log
# rows (presumably the initial evaluation, which is run before the loop writes
# any log row).
if use_existing_data and log_path.exists():
    n_data = len(df)
    n_log  = len(al_log)
    rows_ahead = n_data - n_log

    if rows_ahead == 2:
        # One evaluation reached the dataset but not the log.
        print(f"Warning: Detected one unlogged evaluation (Data rows {n_data}, AL log rows {n_log}).")
        print("Likely crash after SAFIR/data save but before AL_Log save.")
    elif rows_ahead != 1:
        # Any other offset cannot be explained by a single interrupted iteration.
        print(f"Warning: Data rows ({n_data}) and AL log rows ({n_log}) are inconsistent.")
        print("Investigate: multiple missing log rows or mismatched base_path.")

In [None]:
# Monitor set used by the hyperparameter-optimisation rule and the convergence
# checks: a fixed, seeded scrambled-Sobol sample of the unit hypercube.
n_monitor = 1_000_000

sampler = qmc.Sobol(d=d, scramble=True, seed=123)
X_monitor = sampler.random(n_monitor)

# Column 1 of predict_proba is used as the probability that is thresholded.
p_monitor = model.predict_proba(X_monitor)[:, 1]

# Hard labels at the 0.5 threshold; the "previous" labels start as a copy of
# the current ones so the first comparison in the loop has a baseline.
y_mon_now = (p_monitor >= 0.5).astype(int)
y_mon_prev = np.array(y_mon_now, copy=True)

In [None]:
# Size of one active-learning batch (number of loop iterations).
n_iterations = 200

# --- Hyperparameter-optimisation rule ---------------------------------------
# opt_frac:   fraction of the maximum boundary movement below which an
#             iteration counts towards starting hyperparameter optimisation
# opt_thresh: number of such iterations needed before optimisation starts
# opt_every:  once started, optimise every opt_every iterations
opt_frac, opt_thresh, opt_every = 0.25, 20, 10

# --- Convergence / stopping rule ---------------------------------------------
# (Convergence checks start after the first attempt at kernel hyperparameter
# optimisation.)
boundary_frac_threshold, boundary_frac_threshold_2 = 0.01, 0.05
diff_boundary_frac_threshold = 0.0005
diff_boundary_frac_window = 21
miscl_window, miscl_thresh = 11, 0.05

# Set to True by the loop when the stopping rule fires.
converged = False

In [None]:
# Active-learning loop.
#
# Each iteration: pick the next query point by optimising the acquisition,
# run SAFIR at that point, update the GP classifier (optionally re-optimising
# its hyperparameters), persist model state and log, refresh the diagnostic
# figures, and finally evaluate the stopping rule.
X_history = [X_init.copy()]
Y_history = [Y_init.copy()]

# All figures of this run go into one folder. The path is loop-invariant, and
# creating the folder up front means saving never depends on it already
# existing.
figures_dir = base_path / "Figures"
figures_dir.mkdir(parents=True, exist_ok=True)

for it in range(n_iterations):
    # Iteration numbering continues from a resumed log.
    global_iter = global_iter_start + it + 1
    print(f"\n=== Iteration {global_iter} ===")
    print("Mean function:", model.gpc.mean_function)

    # 1) Choose next query
    x_star_norm, acq_value = Multi_start_SGD(
        acq,
        model=model,
        mc_search_num=mc_search_num,
        learning_rate=0.001,
        n_starts=1,
        top_frac=0.01,
        n_sgd_steps=SGD_steps,
        frac_mc_search=mc_search_num_frac,
        return_field=False,
    )

    acq_list.append(acq_value)
    acq_arr = np.array(acq_list)
    acq_rel = relative_sequential_difference(acq_arr)

    # 2) Map to physical space and run SAFIR
    phys_star = denormalise_point(x_star_norm, design_vars)
    print("Next physical point:", phys_star)

    y_star, df = runner.run(phys_star, df)

    # 3) Update history (each entry is the full training set after this step)
    X_history.append(np.vstack([X_history[-1], x_star_norm]))
    Y_history.append(np.vstack([Y_history[-1], [[y_star]]]))

    # Snapshot used below to detect whether the update changed the lengthscales.
    lengthscales_before = np.asarray(model.gpc.kern.lengthscale).copy()

    # 4) Decide whether to optimise hyperparameters.
    # Once enough low-movement iterations were counted, optimisation is
    # switched on and then performed every `opt_every`-th iteration, starting
    # with the first one (count == 0). `count` advances on every iteration
    # after the switch-on.
    do_opt = False
    if flips_frac_count > opt_thresh:
        start_opt = True
        do_opt = (count % opt_every == 0)
        count += 1

    model.Update(
        x_star_norm,
        y_star,
        optimize=do_opt,
        kern_variance_fix=False,
        mean_fix=True,
        ll_tol=0.0,
    )

    # Persist the model state right after the update so a later resume can
    # reload it.
    save_model_state(model, kernel_path=kernel_path, prior_mean=prior_mean, xinterval=xinterval, design_vars=design_vars)

    log_likelihood = float(model.gpc.log_likelihood())
    log_likelihood_list.append(log_likelihood)

    # Count an optimisation only when the lengthscales actually changed.
    lengthscales_after = np.asarray(model.gpc.kern.lengthscale)
    if np.any(lengthscales_before != lengthscales_after):
        optimisation_count += 1

    # 5) Monitor AL data
    miscl = model.training_misclassification()
    miscl_list.append(miscl)
    miscl_arr = np.array(miscl_list)

    # Boundary spread: share of monitor points with probability in (0.3, 0.7).
    p_monitor = model.predict_proba(X_monitor)[:, 1]
    boundary_mask = (p_monitor > 0.3) & (p_monitor < 0.7)
    boundary_frac = boundary_mask.mean()

    y_mon_now = (p_monitor >= 0.5).astype(int)

    # Boundary movement: share of monitor points whose hard label changed
    # relative to the previous iteration.
    flips_frac = np.mean(y_mon_now != y_mon_prev)
    y_mon_prev = y_mon_now.copy()

    boundary_frac_list.append(boundary_frac)
    flips_frac_list.append(flips_frac)

    # Low-movement iterations feed the optimisation trigger of step 4.
    if flips_frac < opt_frac * np.max(flips_frac_list):
        flips_frac_count += 1

    # Only consumed by the alternative stopping rule that is commented out below.
    diff_boundary_frac = np.diff(np.array(boundary_frac_list))

    # The plotted threshold follows the running maximum until optimisation
    # starts; afterwards its last value is kept.
    if count == 0:
        plot_thresh = opt_frac * np.max(flips_frac_list)

    print("Next normalised point:", x_star_norm)
    print("Label (failure?):", y_star, "   Acquisition:", acq_value)
    print("Kernel variance:", model.gpc.kern.variance)
    print("Kernel lengthscales:", model.gpc.kern.lengthscale)
    print("Log-likelihood:", log_likelihood)
    if len(acq_rel) > 10:
        print("Relative acquisition values:", acq_rel[-11:])
    print("Count from moment of optimisation:", count)
    print("Misclassification metric:", miscl)
    print("Optimisation count:", optimisation_count)
    print("Flips_frac_count:", flips_frac_count)
    print("Boundary spread metric:", boundary_frac)

    # Append this iteration to the log and write it to disk immediately.
    al_log.loc[len(al_log)] = {
        "global_iter": int(global_iter),
        "acq_value": float(acq_value),
        "miscl": float(miscl),
        "log_likelihood": float(log_likelihood),
        "start_opt": bool(start_opt),
        "count": int(count),
        "optimisation_count": int(optimisation_count),
        "boundary_frac": float(boundary_frac),
        "flips_frac": float(flips_frac),
        "flips_frac_count":  int(flips_frac_count),
        "plot_thresh": float(plot_thresh)
    }

    al_log.to_csv(log_path, index=False)

    # Plotting of AL metrics
    global_iterations = np.arange(1, len(acq_list)+1)
    plot_flips_frac_values(global_iterations, np.array(flips_frac_list), plot_thresh, figures_dir / "Boundary_change_vs_iterations.png")
    plot_boundary_spread(global_iterations, np.array(boundary_frac_list), boundary_frac_threshold, figures_dir / "Boundary_spread_vs_iterations.png")
    plot_acquisition_values(global_iterations, acq_arr, figures_dir / "Acquisition_vs_iterations.png")
    plot_relative_acq_change(global_iterations, acq_rel, figures_dir / "RelAcqChange_vs_iterations.png")
    plot_log_likelihood(global_iterations, np.array(log_likelihood_list), figures_dir / "LogLikelihood_vs_iterations.png")
    plot_misclassification(global_iterations, miscl_arr, figures_dir / "Misclassification_vs_iterations.png",
                           miscl_threshold=miscl_thresh)

    # Decision-boundary plot, only available for a 2D problem.
    if d == 2:
        # Physical axis ranges for the plot. Separate names are used so the
        # surrogate-space `x_bounds` from the configuration cell is not
        # overwritten.
        phys_x_bounds = (design_vars[0].lower, design_vars[0].upper)
        phys_y_bounds = (design_vars[1].lower, design_vars[1].upper)

        plot_gp_decision_boundary_2d(
            model=model,
            X_norm=model.gpc.X,
            y=model.gpc.Y.ravel().astype(int),
            x_bounds=phys_x_bounds,
            y_bounds=phys_y_bounds,
            grid_res=150,
            x_label=rf"{design_vars[0].name} [{design_vars[0].unit}]",
            y_label=rf"{design_vars[1].name} [{design_vars[1].unit}]",
            title=f"GPC Decision Boundary – iteration {global_iter}",
            save_path= figures_dir / f"GPC_Decision_Boundary_it_{global_iter}.png",
            show=False,
        )


    # 6) Stopping rules
    # Stop once hyperparameter optimisation has been running (count > 1), the
    # boundary band is narrow enough, and the last `miscl_window`
    # misclassification values are all below the threshold.
    if count > 1 and boundary_frac_list[-1] < boundary_frac_threshold and np.all(miscl_arr[-miscl_window:] < miscl_thresh):
        print(
            f"\nStopping early at iteration {global_iter}: "
            f"Boundary band < {boundary_frac_threshold}."
        )
        converged = True
        break

    # Alternative stopping rule (disabled): looser band threshold combined with
    # a stagnating boundary-spread metric.
    # if count > 1 and boundary_frac_list[-1] < boundary_frac_threshold_2 and np.all(miscl_arr[-miscl_window:] < miscl_thresh) and np.all(diff_boundary_frac[-diff_boundary_frac_window:] < diff_boundary_frac_threshold):
    #     print(
    #         f"\nStopping early at iteration {global_iter}: "
    #         f"Boundary band < {boundary_frac_threshold_2} and no improvement anymore on boundary spread metric."
    #     )
    #     converged = True
    #     break

# Final training set as accumulated by this batch.
X_final = X_history[-1]
Y_final = Y_history[-1]
print("\nTotal evaluated points:", X_final.shape[0])
print("Final dataframe rows:", len(df))

In [None]:
# Final step: once the loop reported convergence, retrain with hyperparameter
# optimisation on everything the model currently holds and store the result.
if not converged:
    print("\nNot converged: skipping final hyperparameter optimisation.")
else:
    print("\nConverged: training final optimised model on full dataset")

    # Training data exactly as stored in the current GP classifier.
    X_current = np.asarray(model.gpc.X, float)
    Y_current = np.asarray(model.gpc.Y, float).reshape(-1, 1)

    # Kernel variance and lengthscales are free; the mean stays fixed.
    final_model = model.ModelTrain(
        X_current,
        Y_current,
        optimize=True,
        kern_lengthscale_fix=False,
        kern_variance_fix=False,
        mean_fix=True,
    )

    final_gpc = final_model.gpc
    print("Final kernel:")
    print("Final variance:", final_gpc.kern.variance)
    print("Final lengthscales:", final_gpc.kern.lengthscale)
    print("Final log-likelihood:", float(final_gpc.log_likelihood()))

    # Store the GPy model itself ...
    final_gpc.save_model(str(base_path / "Final_GPC"), compress=True)
    reload_hint = (
        'Final model is saved on base_path. '
        'Model can be reloaded with: '
        'gpc_loaded = GPy.core.GP.load_model(str(base_path / "Final_GPC.zip"))'
    )
    print(reload_hint)

    # ... and refresh the saved model state used for warm starts.
    save_model_state(
        final_model,
        kernel_path=kernel_path,
        prior_mean=prior_mean,
        xinterval=xinterval,
        design_vars=design_vars,
    )