# Physics-Informed Gaussian Process Model to Predict NTP Concentration During the IVT Reaction

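Using FTIR (Fourier-transform infrared) spectroscopy and a pH sensor, we can monitor the composition and concentration of NTPs (nucleoside triphosphates) during the IVT (in vitro transcription) reaction. The proposed setup is shown below: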

<div style="text-align: center;">
  <img src="../assets/ftir_ph_ntp_setup.png" alt="FTIR and pH sensor setup for NTP monitoring" width="400"/>
</div>

In [None]:
def train_model(X_ntps, y_ntps, pKa: float = 7.3):
    """Fit a Gaussian process with a Henderson-Hasselbalch kernel to NTP data.

    Args:
        X_ntps: Training inputs, passed unchanged to
            ``GaussianProcessRegressor.fit``.
        y_ntps: Training targets, passed unchanged to
            ``GaussianProcessRegressor.fit``.
        pKa: pKa value handed to ``HendersonHasselbalchKernel``. Defaults to
            7.3, the value that used to be hard-coded here, so existing
            two-argument calls behave exactly as before.

    Returns:
        The fitted ``GaussianProcessRegressor``.
    """
    # Base covariance: product of the `C` and `RBF` kernels, each starting at
    # 1.0 with bounds (1e-2, 1e5).
    # NOTE(review): `C` is presumably scikit-learn's ConstantKernel alias -
    # confirm against the file's imports.
    base_kernel = C(1.0, (1e-2, 1e5)) * RBF(1.0, (1e-2, 1e5))

    # Wrap the base kernel in the project's pH-aware kernel.
    custom_kernel = HendersonHasselbalchKernel(base_kernel, pKa=pKa)

    # Ten optimizer restarts for the kernel hyperparameters; alpha is the
    # value added to the diagonal of the training kernel matrix.
    gp_ntps_custom = GaussianProcessRegressor(
        kernel=custom_kernel, n_restarts_optimizer=10, alpha=1e-2
    )
    gp_ntps_custom.fit(X_ntps, y_ntps)
    return gp_ntps_custom


def predict_model(
    gp_ntps_custom: GaussianProcessRegressor,
    X_train: np.ndarray[typing.Any, typing.Any],
    y_train: np.ndarray[typing.Any, typing.Any],
    X_test: np.ndarray[typing.Any, typing.Any],
    y_test: np.ndarray[typing.Any, typing.Any],
) -> None:
    """Evaluate a fitted GP on test data and plot the results.

    Prints the model score, the predicted means and ``y_test``, then shows
    two figures: the prediction with a 95% band plotted against the first
    feature column, and a predicted-vs-actual scatter plot.

    Args:
        gp_ntps_custom: Already fitted Gaussian process regressor.
        X_train: Training inputs; only column 0 is plotted.
        y_train: Training targets, plotted against ``X_train[:, 0]``.
        X_test: Test inputs; passed whole to the model, column 0 is the
            x-axis of the first figure.
        y_test: Reference targets for the test inputs.
    """
    # Predict mean and standard deviation with the custom kernel
    predictions, sigma_predictions = gp_ntps_custom.predict(X_test, return_std=True)
    # For scikit-learn regressors, score() is the coefficient of
    # determination (R^2), not a classification accuracy.
    r2_score_value = gp_ntps_custom.score(X_test, y_test)
    print(r2_score_value)
    print(predictions)
    print(y_test)

    # A line plot and fill_between connect points in the order given, so the
    # test points must be ordered along the x-axis (column 0); otherwise the
    # curve zig-zags and the confidence band overlaps itself. A stable sort
    # keeps the original order of rows that share the same x value.
    order = np.argsort(X_test[:, 0], kind="stable")
    x_sorted = X_test[order, 0]
    mean_sorted = predictions[order]
    sigma_sorted = sigma_predictions[order]

    # Plot the results
    plt.figure(figsize=(10, 6))
    plt.scatter(X_train[:, 0], y_train, color="green", label="Observed Data (NTPs)")
    plt.plot(
        x_sorted,
        mean_sorted,
        color="purple",
        label="GP Prediction (Custom Kernel, pKa=7.3)",
    )
    # mean +/- 1.96 standard deviations: 95% interval of a normal distribution
    plt.fill_between(
        x_sorted,
        mean_sorted - 1.96 * sigma_sorted,
        mean_sorted + 1.96 * sigma_sorted,
        alpha=0.2,
        color="purple",
        label="95% Confidence Interval",
    )
    plt.title(
        "Gaussian Process Regression with Custom Kernel fit with pH and Time: NTPs vs pH"
    )
    plt.xlabel("pH")
    plt.ylabel("NTPs")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Plot predictions vs actual values
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, predictions, color="blue")
    # Identity line (perfect prediction), drawn over the fixed range 0 to 0.5.
    plt.plot([0, 0.5], [0, 0.5], color="red")
    plt.title("GP (pH + time) Predictions vs Actual Values")
    plt.xlabel("Actual Values")
    plt.ylabel("Predictions")
    plt.grid(True)
    plt.show()


if __name__ == "__main__":
    # Training set: pH/Time inputs and NTP targets read from an Excel workbook.
    train_data = load_data("step_response_data.xlsx", file_type="excel")

    y_train = train_data["NTPs"].values
    X_train = train_data[["pH", "Time"]].values.reshape(-1, 2)

    # Test set comes from a second workbook with the same input columns.
    test_data = load_data("pH data set.xlsx", file_type="excel")

    X_test = test_data[["pH", "Time"]].values.reshape(-1, 2)
    # Rescale the reference "ntp" column so that its maximum becomes 0.5.
    test_data["ntp"] = test_data["ntp"].div(test_data["ntp"].max()).mul(0.5)
    y_test = test_data["ntp"].values

    # Fit on the training set, then report and plot against the test set.
    gp = train_model(X_train, y_train)

    predict_model(
        gp_ntps_custom=gp,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test.reshape(-1, 1),
    )