In [None]:
# Gaussian Process classification with a probit likelihood (PyMC).
# Simulates a 1D binary dataset, fits a latent GP to it, and plots the predicted class probability.

import numpy as np
import matplotlib.pyplot as plt
import pymc as pm
import arviz as az
from scipy.stats import norm
from sklearn.metrics import accuracy_score

# For more informative PyTensor tracebacks during debugging (optional)
import pytensor
pytensor.config.exception_verbosity = "high"

# Seeded NumPy generator so the simulated labels (and plot jitter) are reproducible
RNG = np.random.default_rng(seed=42)

# Toy 1D binary-classification inputs: evenly spaced points stored as a column vector
n_train = 50
X_train = np.linspace(-3, 3, num=n_train).reshape(-1, 1)

# Ground-truth latent function; it is only used to simulate the labels
def true_latent(x):
    """Evaluate sin(1.5 * x) element-wise and return the result flattened to 1-D."""
    latent = np.sin(1.5 * x)
    return np.ravel(latent)

# Latent function values at the training inputs
f_true = true_latent(X_train.ravel())


# NumPy-side probit link: the standard normal CDF maps latent values to probabilities
def Phi_np(z):
    """Return the standard normal CDF evaluated at z."""
    return norm.cdf(z)


prob = Phi_np(f_true)

# One Bernoulli draw per training point, with success probability prob
y_train = RNG.binomial(n=1, p=prob)

# Fail early if the simulated data contains non-finite values
assert np.isfinite(X_train).all(), "X_train contains NaN or Inf"
assert np.isfinite(y_train).all(), "y_train contains NaN or Inf"

# Prediction grid; it extends one unit beyond the training range on both sides
X_test = np.linspace(-4, 4, num=200).reshape(-1, 1)

# Sampler settings. A 100-tune / 100-draw / 2-chain run is too short to give
# trustworthy rhat / ESS diagnostics, so use a longer warm-up, more draws and four chains.
N_TUNE = 1000
N_DRAWS = 500
N_CHAINS = 4
MCMC_SEED = 42

# Build the PyMC model: latent GP prior -> probit link -> Bernoulli likelihood
with pm.Model() as gp_probit_model:
    # Moderately informative hyperpriors keep the sampler away from extreme values
    ls = pm.Gamma("ls", alpha=3.0, beta=1.0)   # lengthscale
    eta = pm.HalfNormal("eta", sigma=0.5)      # amplitude

    # Kernel: scaled ExpQuad (RBF) plus a tiny WhiteNoise term, which adds a small
    # diagonal jitter to the covariance matrix for numerical stability.
    cov = eta**2 * pm.gp.cov.ExpQuad(input_dim=1, ls=ls) + pm.gp.cov.WhiteNoise(1e-6)

    # Latent GP prior over the function values at the training inputs
    gp = pm.gp.Latent(cov_func=cov)
    f = gp.prior("f", X=X_train)

    # Probit link: standard normal CDF of the latent function
    p = pm.math.invprobit(f)

    # Bernoulli likelihood on the observed binary labels
    y = pm.Bernoulli("y", p=p, observed=y_train)

    # Inference. The predictive variable f_pred is deliberately NOT part of the model yet:
    # if it were, NUTS would treat all of its test-point values as extra free parameters,
    # which is slow and unnecessary.
    # cores=1 avoids multiprocessing problems (EOFError) in notebook environments;
    # set cores>1 when running from a terminal to sample chains in parallel.
    trace = pm.sample(
        draws=N_DRAWS,
        tune=N_TUNE,
        chains=N_CHAINS,
        cores=1,
        target_accept=0.9,
        return_inferencedata=True,
        random_seed=MCMC_SEED,
    )

# Prediction: add the conditional GP only after sampling, then push every posterior draw
# through it. pm.draw(f_pred) would ignore `trace` and sample from the prior instead.
with gp_probit_model:
    f_pred = gp.conditional("f_pred", X_test, jitter=1e-6)
    ppc = pm.sample_posterior_predictive(
        trace,
        var_names=["f_pred"],
        random_seed=MCMC_SEED,
    )

# Latent predictive draws with dims (chain, draw, test point)
f_pred_draws = ppc.posterior_predictive["f_pred"].values

# Flattened view (n_samples x n_points) with chains and draws merged
f_pred_samples = f_pred_draws.reshape(-1, f_pred_draws.shape[-1])

# Convert latent draws to probabilities via the normal CDF (probit)
prob_pred_samples = norm.cdf(f_pred_samples)

# Posterior mean probability and 95% highest-density interval at every test point.
# az.hdi receives the (chain, draw, point) array so the sample axes are unambiguous;
# the result has shape (n_points, 2) holding the lower and upper bounds.
prob_mean = prob_pred_samples.mean(axis=0)
prob_hpd = az.hdi(norm.cdf(f_pred_draws), hdi_prob=0.95)

# Plot the training labels, the posterior mean probability and its credible band
fig, ax = plt.subplots(figsize=(8, 5))
x_grid = X_test.ravel()

# Small uniform vertical jitter so overlapping 0/1 labels stay distinguishable
jittered_labels = y_train - 0.03 + 0.06 * RNG.random(n_train)
ax.scatter(X_train.ravel(), jittered_labels, c=y_train, cmap="bwr", label="training labels")

ax.plot(x_grid, prob_mean, label="posterior mean prob", lw=2)
ax.fill_between(x_grid, prob_hpd[:, 0], prob_hpd[:, 1], alpha=0.3, label="95% credible interval")

ax.set_ylim(-0.1, 1.1)
ax.set_xlabel("X")
ax.set_ylabel("Prob(y=1)")
ax.set_title("Gaussian Process Classification with Probit Likelihood (PyMC)")
ax.legend()
plt.show()

# Posterior summary table for the kernel hyperparameters
hyper_summary = az.summary(trace, var_names=["ls", "eta"])
print(hyper_summary)

# Approximate training accuracy: each training point borrows the prediction of the
# closest test-grid point. Broadcasting the (n_train, 1) column against the (1, n_test)
# row gives the full matrix of absolute distances.
abs_gap = np.abs(X_train - X_test.T)
idx_closest = abs_gap.argmin(axis=1)
train_preds_at_trainloc = (prob_mean[idx_closest] >= 0.5).astype(int)
acc = accuracy_score(y_train, train_preds_at_trainloc)
print(f"Approximate training accuracy (at nearest test points): {acc:.3f}")


Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [ls, eta, f_rotated_, f_pred]


Output()

Sampling 2 chains for 100 tune and 100 draw iterations (200 + 200 draws total) took 257 seconds.
Chain 0 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
We recommend running at least 4 chains for robust computation of convergence diagnostics
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
Sampling: [f_pred]


Output()

KeyError: 'f_pred'