# Import packages and load data

In [1]:
import sys
import os
import time
sys.path.append(os.path.abspath('../..'))

import gpflow
import tensorflow as tf
from gpflow.optimizers import Scipy

from rcgp.morcgp import MOGPRegressor, MORCGPRegressor, MOGPRegressor_NC, MORCGPRegressor_NC, MORCGPRegressor_NC_fixed_weights, MORCGPRegressor_fixed_weights, MORCGPRegressor_PM, MORCGP, MORCGP_shared_noise
from rcgp.rcgp import RCGPRegressor
from rcgp.kernels import ConstantMean, RBFKernel, SineMean
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.covariance import MinCovDet
import pandas as pd

plt.rcParams.update({
    "text.usetex": True,         
    "font.family": "serif",       
    "text.latex.preamble": r"\usepackage{amsmath}",
    'font.size': 28,         
    'axes.labelsize': 28,    
    'axes.titlesize': 30,      # <-- Add this line for title size
    'xtick.labelsize': 24,   
    'ytick.labelsize': 24,  
    'legend.fontsize': 24,
    'lines.linewidth': 5,    
    'lines.markersize': 6   
})






  import pkg_resources


In [2]:
def generate_A(d, r=1, base_strength=1.0, noise_level=0.1, seed=None):
    if seed is not None:
        np.random.seed(seed)
    shared_component = base_strength * np.ones((d, r))
    noise = noise_level * np.random.randn(d, r)
    A = shared_component + noise
    return A

def calculate_rmse(y_true, y_pred):
    errors = y_true - y_pred
    squared_errors = errors ** 2
    mse = np.mean(squared_errors)
    rmse = np.sqrt(mse)
    return rmse

def nlpd(Y_true, mu_pred, var_pred):
    epsilon = 1e-10
    var_pred = np.maximum(var_pred, epsilon)
    
    nlpd_values = 0.5 * np.log(2 * np.pi * var_pred) + ((Y_true - mu_pred) ** 2) / (2 * var_pred)
    
    return np.mean(nlpd_values)

In [3]:
def uniform_outliers_c1(Y: np.ndarray, percent_outliers: float, start: float, end: float) -> np.ndarray:
    if not (0 <= percent_outliers <= 1):
        raise ValueError("percent_outliers must be between 0 and 1.")
    if start < 0 or end <= start:
        raise ValueError("Invalid range: ensure 0 <= start < end.")

    Y_outliers = Y.copy()
    N, D = Y.shape
    total_elements = N 
    num_outliers = int(np.round(percent_outliers * total_elements))

    row_indices = np.random.choice(N, num_outliers, replace=False)

    signs = np.random.choice([-1, 1], size=num_outliers)

    uniform_values = np.random.uniform(start, end, size=num_outliers) * signs

    Y_outliers[row_indices, 0] += uniform_values

    return Y_outliers

def asymmetric_outliers_c1(Y: np.ndarray, percent_outliers: float, start: float, end: float) -> np.ndarray:
    if not (0 <= percent_outliers <= 1):
        raise ValueError("percent_outliers must be between 0 and 1.")
    if start < 0 or end <= start:
        raise ValueError("Invalid range: ensure 0 <= start < end.")
    
    Y_outliers = Y.copy()
    N, D = Y.shape
    total_elements = N 
    num_outliers = int(np.round(percent_outliers * total_elements))

    row_indices = np.random.choice(N, num_outliers, replace=False)

    uniform_values = np.random.uniform(start, end, size=num_outliers)

    Y_outliers[row_indices, 0] += uniform_values

    return Y_outliers

def focused_outliers_c1(X, Y, percent_outliers, y_value, perturbation=0.1):
    def mad(X):
        medians = np.median(X, axis=0)
        deviations = np.abs(X - medians)
        return np.median(deviations, axis=0)

    X = X.copy()
    Y = Y.copy()

    n_samples, n_features = X.shape
    n_outliers = int(n_samples * percent_outliers)

    # Indices of outliers
    indices = np.random.choice(n_samples, size=n_outliers, replace=False)

    medians_2d = np.tile(np.median(X, axis=0), (n_outliers, 1))

    def mad(X, axis=0):
        """Compute Median Absolute Deviation (MAD)"""
        med = np.median(X, axis=axis)
        return np.median(np.abs(X - med), axis=axis)

    mads = mad(X)
    mads_2d = np.tile(mads, (n_outliers, 1))

    u = np.random.uniform(0, perturbation, size=medians_2d.shape)
    X_outliers = medians_2d + u * mads_2d

    # Create a 1D array of size n_outliers with all elements = y_value
    Y_outliers = np.full(shape=n_outliers, fill_value=y_value)

    first_column = Y[:, 0]
    median_y0 = np.median(first_column)
    mad_y0 = np.median(np.abs(first_column - median_y0))
    Y_mad_outliers = np.full(shape=n_outliers, fill_value=mad_y0)

    # Draw u independently for each element
    u = np.random.uniform(0, perturbation, size=Y_outliers.shape)

    # Compute the perturbed Y_outliers
    Y_outliers_perturbed = Y_outliers + u * Y_mad_outliers

    # Replace rows in X at the outlier indices
    X[indices, :] = X_outliers

    # Replace the first column in Y at the outlier indices
    Y[indices, 0] = Y_outliers_perturbed

    return X, Y

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Step 1: Generate a smooth dataset
np.random.seed(42)  # for reproducibility
n_samples = 100

# X is 1D but we make it 2D (n_samples, 1)
X = np.linspace(0, 10, n_samples).reshape(-1, 1)

# Y depends smoothly on X, add some noise
Y = np.sin(X) + 0.1 * np.random.randn(n_samples, 1)

# Step 2: Introduce focused outliers using your function
X_out, Y_out = focused_outliers_c1(X, Y, percent_outliers=0.1, y_value=5, perturbation=0.3)

# Step 3: Plot the original and outlier dataset
plt.figure(figsize=(10, 6))
plt.scatter(X, Y, color='blue', label='Original Data')
plt.scatter(X_out, Y_out, color='red', label='With Outliers')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Original Dataset and Dataset with Focused Outliers')
plt.legend()
plt.show()

In [118]:
np.random.seed(42) 
n_samples = 100 # Number of samples 
n_features = 5 # Number of features 
percent_outliers = 0.1 # 10% of samples are outliers 
y_value = 3 # Generate X as random numbers (normal distribution) 
X = np.random.randn(n_samples, n_features) # Generate Y as a linear combination of X plus some noise 
coefficients = np.random.randn(n_features) 
Y = X @ coefficients + 0.5 * np.random.randn(n_samples)
Y = Y.reshape(-1,1)

print('X shape:', X.shape)
print('Y shape:', Y.shape)

X_out, Y_out = focused_outliers_c1(X, Y, 0.1, 5, perturbation=0.1)
print('X_out shape:', X_out.shape)
print('Y_out shape:', Y_out.shape)

X shape: (100, 5)
Y shape: (100, 1)
X_out shape: (100, 5)
Y_out shape: (100, 1)


In [4]:
def make_X_multi(X, D=2):
    """
    X: shape (N, input_dim) - multi-dimensional input
    D: number of tasks
    """
    N, input_dim = X.shape
    X_multi = []
    
    for task in range(D):
        # Add task index as last column
        X_task = np.hstack([X, np.full((N, 1), task)])
        X_multi.append(X_task)
    
    return np.vstack(X_multi)  # Shape: (N*D, input_dim + 1)

In [5]:
def run_MOGP(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled):

    start_total = time.time()

    # --- 1. Prepare multi-task inputs ---
    X_multi_train = make_X_multi(X_train_scaled, D=2)
    X_multi_test = make_X_multi(X_test_scaled, D=2)
    Y_multi_train = Y_train_scaled.reshape(-1, 1, order='F')
    Y_multi_test = Y_test_scaled.reshape(-1, 1, order='F')

    input_dim = X_train_scaled.shape[1]  # number of features
    D = 2  # number of tasks

    # --- 2. Define kernel ---
    base_kernel = gpflow.kernels.RBF(
        lengthscales=1.0,
        variance=0.1,
        active_dims=list(range(input_dim))
    )

    coregion_kernel = gpflow.kernels.Coregion(
        output_dim=D,
        rank=D,
        active_dims=[input_dim]
    )

    # Fix the diagonal of coregion kernel
    gpflow.utilities.set_trainable(coregion_kernel.kappa, False)
    coregion_kernel.kappa.assign(tf.ones_like(coregion_kernel.kappa) * 1e-6)

    # Combine kernels
    kernel = base_kernel * coregion_kernel

    # --- 3. Build exact GP model ---
    model_gpr = gpflow.models.GPR(
        data=(X_multi_train, Y_multi_train),
        kernel=kernel,
        mean_function=None
    )

    # Optionally, you can fix the base kernel variance as before
    gpflow.utilities.set_trainable(base_kernel.variance, False)

    # --- 4. Optimize hyperparameters ---
    opt = Scipy()

    def objective_closure_gpr():
        return -model_gpr.log_marginal_likelihood()
    try:
        opt.minimize(objective_closure_gpr, model_gpr.trainable_variables, options=dict(maxiter=1000))
    except Exception as e:
        print(f"Optimization failed: {e}")
        print("Try reducing maxiter or checking data shapes")

    # --- 5. Predict on test data ---
    mean_pred_mogp, var_pred_mogp = model_gpr.predict_y(X_multi_test)
    mu_mogp, std_mogp = mean_pred_mogp.numpy().reshape(-1, D, order='F'), np.sqrt(var_pred_mogp.numpy()).reshape(-1, D, order='F')

    end_total = time.time()
    time_mogp = end_total - start_total

    rmse_mogp = calculate_rmse(Y_test_scaled, mu_mogp.reshape(-1, D, order='F'))
    nlpd_mogp = nlpd(Y_test_scaled, mu_mogp.reshape(-1, D, order='F'), std_mogp.reshape(-1, D, order='F')**2)
    
    return rmse_mogp, nlpd_mogp, time_mogp


def run_MORCGP(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, prop_outliers, k=1):
    # Measure total time
    start_total = time.time()

    mcd = MinCovDet(support_fraction=1-prop_outliers).fit(Y_train_scaled)
    robust_covariance = mcd.covariance_
    robust_init_A = np.linalg.cholesky(robust_covariance)

    morcgp = MORCGP_shared_noise(mean=0, length_scale=1, noise_var=0.1, A=robust_init_A)
    morcgp.fit(X_train_scaled, Y_train_scaled, epsilons=np.array([prop_outliers, 0]))
    morcgp.optimize_loo_cv(print_opt_param=False, print_iter_objective=False, k=k, init_cov=robust_covariance, fix_weights=True)

    mu_morcgp, var_morcgp = morcgp.predict(X_test_scaled)
    std_morcgp = np.sqrt(var_morcgp + morcgp.noise_var)
    end_total = time.time()

    time_morcgp = end_total - start_total
    rmse_morcgp = calculate_rmse(Y_test_scaled, mu_morcgp)
    nlpd_morcgp = nlpd(Y_test_scaled, mu_morcgp, std_morcgp**2)

    return rmse_morcgp, nlpd_morcgp, time_morcgp

def run_tMOGP(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, df):
    # Create multi-task inputs
    X_multi_train = make_X_multi(X_train_scaled, D=2)
    X_multi_test = make_X_multi(X_test_scaled, D=2)
    Y_multi_train = Y_train_scaled.reshape(-1, 1, order='F')
    Y_multi_test = Y_test_scaled.reshape(-1, 1, order='F')

    input_dim = X_train_scaled.shape[1]  # This is the key fix!
    N = X_train_scaled.shape[0]
    D = 2

    start_total = time.time()

    base_kernel = gpflow.kernels.RBF(
        lengthscales=1.0, 
        variance=0.1, 
        active_dims=list(range(input_dim)) ,
    )

    coregion_kernel = gpflow.kernels.Coregion(
        output_dim=D, 
        rank=D, 
        active_dims=[input_dim]  
    )

    gpflow.utilities.set_trainable(base_kernel.variance, False)

    gpflow.utilities.set_trainable(coregion_kernel.kappa, False)
    coregion_kernel.kappa.assign(tf.ones_like(coregion_kernel.kappa) * 1e-6)

    kernel = base_kernel * coregion_kernel

    likelihood_vgp = gpflow.likelihoods.StudentT(df=df)
    # gpflow.utilities.set_trainable(likelihood_vgp.scale, False)
    model_vgp = gpflow.models.VGP(
        data=(X_multi_train, Y_multi_train),
        kernel=kernel,
        likelihood=likelihood_vgp
    )

    opt = Scipy()
    def objective_closure_vgp():
        return -model_vgp.maximum_log_likelihood_objective()

    try:
        opt.minimize(objective_closure_vgp, model_vgp.trainable_variables, options=dict(maxiter=1000))
    except Exception as e:
        print(f"Optimization failed: {e}")
        print("Try reducing maxiter or checking data shapes")

    mean_pred_tmogp, var_pred_tmogp = model_vgp.predict_y(X_multi_test)
    mu_tmogp, std_tmogp = mean_pred_tmogp.numpy().reshape(-1, D, order='F'), np.sqrt(var_pred_tmogp.numpy()).reshape(-1, D, order='F')
    end_total = time.time()

    time_tmogp = end_total - start_total
    rmse_tmogp = calculate_rmse(Y_test_scaled, mu_tmogp.reshape(-1, D, order='F'))
    nlpd_tmogp = nlpd(Y_test_scaled, mu_tmogp.reshape(-1, D, order='F'), std_tmogp.reshape(-1, D, order='F')**2)
    
    return rmse_tmogp, nlpd_tmogp, time_tmogp

In [6]:
def run_MOGP_numpy(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, prop_outliers):

    # Measure total time
    start_total = time.time()

    mcd = MinCovDet(support_fraction=1-prop_outliers).fit(Y_train_scaled)
    robust_covariance = mcd.covariance_
    robust_init_A = np.linalg.cholesky(robust_covariance)

    mogp = MOGPRegressor_NC(mean = 0, length_scale=1, noise = 0.1, A=robust_init_A)
    mogp.fit(X_train_scaled, Y_train_scaled)
    mogp.optimize_hyperparameters()

    mu_mogp, var_mogp = mogp.predict(X_test_scaled)
    std_mogp = np.sqrt(var_mogp + mogp.noise)
    end_total = time.time()

    time_mogp = end_total - start_total
    rmse_mogp = calculate_rmse(Y_test_scaled, mu_mogp)
    nlpd_mogp = nlpd(Y_test_scaled, mu_mogp, std_mogp**2)

    return rmse_mogp, nlpd_mogp, time_mogp


In [7]:
# URL of the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx'

# Read Excel file directly from the URL
df = pd.read_excel(url)

# Extract covariates X (columns X1 to X8)
X = df.loc[:, 'X1':'X8'].to_numpy()

# Extract target variables Y (columns Y1 and Y2)
Y = df.loc[:, ['Y1', 'Y2']].to_numpy()

# No outliers

In [55]:
# Split data into train and test sets (default test size = 25%)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

scaler_Y = StandardScaler()
Y_train_scaled = scaler_Y.fit_transform(Y_train)
Y_test_scaled = scaler_Y.transform(Y_test)

print("X_train shape:", X_train_scaled.shape)
print("X_test shape:", X_test_scaled.shape)
print("Y_train shape:", Y_train_scaled.shape)
print("Y_test shape:", Y_test_scaled.shape)

X_train shape: (576, 8)
X_test shape: (192, 8)
Y_train shape: (576, 2)
Y_test shape: (192, 2)


In [None]:
rmses_mogp, rmses_morcgp, rmses_tmogp = [], [], []
nlpds_mogp, nlpds_morcgp, nlpds_tmogp = [], [], []
times_mogp, times_morcgp, times_tmogp = [], [], []

prop_outliers = 0
num_seeds = 20

run_mogp = True
run_morcgp = False
run_tmogp = False

for i in tqdm(range(num_seeds)):
    # Split data into train and test sets (default test size = 25%)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.25, random_state=i
    )

    scaler_X = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train)
    X_test_scaled = scaler_X.transform(X_test)

    scaler_Y = StandardScaler()
    Y_train_scaled = scaler_Y.fit_transform(Y_train)
    Y_test_scaled = scaler_Y.transform(Y_test)

    if run_mogp:
        rmse_mogp, nlpd_mogp, time_mogp = run_MOGP_numpy(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, prop_outliers=prop_outliers)
        print(f'No outliers: MOGP seed {i}: RMSE = {rmse_mogp}, NLPD = {nlpd_mogp}, Time = {time_mogp}')
        rmses_mogp.append(rmse_mogp)
        nlpds_mogp.append(nlpd_mogp)
        times_mogp.append(time_mogp)
    if run_morcgp:
        rmse_morcgp, nlpd_morcgp, time_morcgp = run_MORCGP(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, prop_outliers=prop_outliers, k=2)
        rmses_morcgp.append(rmse_morcgp)
        nlpds_morcgp.append(nlpd_morcgp)
        times_morcgp.append(time_morcgp)
    if run_tmogp:
        rmse_tmogp, nlpd_tmogp, time_tmogp = run_tMOGP(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, df=10)
        print(f'No outliers: t-MOGP seed {i}: RMSE = {rmse_tmogp}, NLPD = {nlpd_tmogp}, Time = {time_tmogp}')
        rmses_tmogp.append(rmse_tmogp)
        nlpds_tmogp.append(nlpd_tmogp)
        times_tmogp.append(time_tmogp)

if run_mogp:
    print("RMSE:", ", ".join(f"{x:.4f}" for x in rmses_mogp))
    print("NLPD:", ", ".join(f"{x:.4f}" for x in nlpds_mogp))
    print("Time:", ", ".join(f"{x:.4f}" for x in times_mogp))
    print(f'RMSE MOGP: {np.mean(rmses_mogp):.4f} ± {np.std(rmses_mogp):.4f}')
    print(f'NLPD MOGP: {np.mean(nlpds_mogp):.4f} ± {np.std(nlpds_mogp):.4f}')
    print(f'Time MOGP: {np.mean(times_mogp):.4f} ± {np.std(times_mogp):.4f}')

if run_morcgp:
    print(f'RMSE MORCGP: {np.mean(rmses_morcgp):.4f} ± {np.std(rmses_morcgp):.4f}')
    print(f'NLPD MORCGP: {np.mean(nlpds_morcgp):.4f} ± {np.std(nlpds_morcgp):.4f}')
    print(f'Time MORCGP: {np.mean(times_morcgp):.4f} ± {np.std(times_morcgp):.4f}')

if run_tmogp:
    print("RMSE:", ", ".join(f"{x:.4f}" for x in rmses_tmogp))
    print("NLPD:", ", ".join(f"{x:.4f}" for x in nlpds_tmogp))
    print("Time:", ", ".join(f"{x:.4f}" for x in times_tmogp))
    print(f'RMSE t-MOGP: {np.mean(rmses_tmogp):.4f} ± {np.std(rmses_tmogp):.4f}')
    print(f'NLPD t-MOGP: {np.mean(nlpds_tmogp):.4f} ± {np.std(nlpds_tmogp):.4f}')
    print(f'Time t-MOGP: {np.mean(times_tmogp):.4f} ± {np.std(times_tmogp):.4f}')

  5%|▌         | 1/20 [02:38<50:02, 158.00s/it]

No outliers: MOGP seed 0: RMSE = 0.11764126905016717, NLPD = -0.7841519965588765, Time = 158.00338864326477


# Uniform outliers

In [None]:
rmses_mogp, rmses_morcgp, rmses_tmogp = [], [], []
nlpds_mogp, nlpds_morcgp, nlpds_tmogp = [], [], []
times_mogp, times_morcgp, times_tmogp = [], [], []

prop_outliers = 0.1
num_seeds = 20

run_mogp = False
run_morcgp = False
run_tmogp = True

for i in tqdm(range(num_seeds)):
    # Split data into train and test sets (default test size = 25%)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.25, random_state=i
    )

    scaler_X = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train)
    X_test_scaled = scaler_X.transform(X_test)

    scaler_Y = StandardScaler()
    Y_train_scaled = scaler_Y.fit_transform(Y_train)
    Y_test_scaled = scaler_Y.transform(Y_test)

    Y_train_scaled = uniform_outliers_c1(Y=Y_train_scaled, percent_outliers=prop_outliers, start=6, end=9)

    if run_mogp:
        rmse_mogp, nlpd_mogp, time_mogp = run_MOGP_numpy(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, prop_outliers=prop_outliers)
        print(f'Uniform outliers: MOGP seed {i}: RMSE = {rmse_mogp}, NLPD = {nlpd_mogp}, Time = {time_mogp}')
        rmses_mogp.append(rmse_mogp)
        nlpds_mogp.append(nlpd_mogp)
        times_mogp.append(time_mogp)
    if run_morcgp:
        rmse_morcgp, nlpd_morcgp, time_morcgp = run_MORCGP(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, prop_outliers=prop_outliers, k=2)
        rmses_morcgp.append(rmse_morcgp)
        nlpds_morcgp.append(nlpd_morcgp)
        times_morcgp.append(time_morcgp)
    if run_tmogp:
        rmse_tmogp, nlpd_tmogp, time_tmogp = run_tMOGP(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, df=10)
        print(f'Uniform outliers: t-MOGP seed {i}: RMSE = {rmse_tmogp}, NLPD = {nlpd_tmogp}, Time = {time_tmogp}')
        rmses_tmogp.append(rmse_tmogp)
        nlpds_tmogp.append(nlpd_tmogp)
        times_tmogp.append(time_tmogp)

if run_mogp:
    print("RMSE:", ", ".join(f"{x:.4f}" for x in rmses_mogp))
    print("NLPD:", ", ".join(f"{x:.4f}" for x in nlpds_mogp))
    print("Time:", ", ".join(f"{x:.4f}" for x in times_mogp))
    print(f'RMSE MOGP: {np.mean(rmses_mogp):.4f} ± {np.std(rmses_mogp):.4f}')
    print(f'NLPD MOGP: {np.mean(nlpds_mogp):.4f} ± {np.std(nlpds_mogp):.4f}')
    print(f'Time MOGP: {np.mean(times_mogp):.4f} ± {np.std(times_mogp):.4f}')

if run_morcgp:
    print(f'RMSE MORCGP: {np.mean(rmses_morcgp):.4f} ± {np.std(rmses_morcgp):.4f}')
    print(f'NLPD MORCGP: {np.mean(nlpds_morcgp):.4f} ± {np.std(nlpds_morcgp):.4f}')
    print(f'Time MORCGP: {np.mean(times_morcgp):.4f} ± {np.std(times_morcgp):.4f}')

if run_tmogp:
    print("RMSE:", ", ".join(f"{x:.4f}" for x in rmses_tmogp))
    print("NLPD:", ", ".join(f"{x:.4f}" for x in nlpds_tmogp))
    print("Time:", ", ".join(f"{x:.4f}" for x in times_tmogp))
    print(f'RMSE t-MOGP: {np.mean(rmses_tmogp):.4f} ± {np.std(rmses_tmogp):.4f}')
    print(f'NLPD t-MOGP: {np.mean(nlpds_tmogp):.4f} ± {np.std(nlpds_tmogp):.4f}')
    print(f'Time t-MOGP: {np.mean(times_tmogp):.4f} ± {np.std(times_tmogp):.4f}')

  5%|▌         | 1/20 [05:19<1:41:06, 319.27s/it]

Uniform outliers: t-MOGP seed 0: RMSE = 0.16046423477245877, NLPD = -0.19587417636046048, Time = 319.26712107658386


 10%|█         | 2/20 [10:44<1:36:50, 322.79s/it]

Uniform outliers: t-MOGP seed 1: RMSE = 0.18169544708796684, NLPD = -0.15461373976985493, Time = 325.2557153701782


 15%|█▌        | 3/20 [16:23<1:33:31, 330.06s/it]

Uniform outliers: t-MOGP seed 2: RMSE = 0.1670191217701907, NLPD = -0.193453972583591, Time = 338.71628046035767


 20%|██        | 4/20 [21:57<1:28:26, 331.68s/it]

Uniform outliers: t-MOGP seed 3: RMSE = 0.1551287069745665, NLPD = -0.20255872378566578, Time = 334.16786074638367


 25%|██▌       | 5/20 [27:30<1:23:02, 332.16s/it]

Uniform outliers: t-MOGP seed 4: RMSE = 0.14433529112092974, NLPD = -0.2321454055852291, Time = 333.0067844390869


 30%|███       | 6/20 [33:07<1:17:54, 333.91s/it]

Uniform outliers: t-MOGP seed 5: RMSE = 0.15651104973572963, NLPD = -0.20483856766756717, Time = 337.28333282470703


 35%|███▌      | 7/20 [38:42<1:12:22, 334.03s/it]

Uniform outliers: t-MOGP seed 6: RMSE = 0.1660717040002731, NLPD = -0.20368355420485393, Time = 334.301744222641


 40%|████      | 8/20 [44:13<1:06:39, 333.31s/it]

Uniform outliers: t-MOGP seed 7: RMSE = 0.16133149580994577, NLPD = -0.19834573097481326, Time = 331.7658154964447


 45%|████▌     | 9/20 [49:50<1:01:18, 334.44s/it]

Uniform outliers: t-MOGP seed 8: RMSE = 0.16389321077233227, NLPD = -0.1776032497659946, Time = 336.93428802490234


 50%|█████     | 10/20 [55:27<55:51, 335.18s/it] 

Uniform outliers: t-MOGP seed 9: RMSE = 0.16552643655124014, NLPD = -0.19621299118977112, Time = 336.81599259376526


 55%|█████▌    | 11/20 [1:00:59<50:08, 334.23s/it]

Uniform outliers: t-MOGP seed 10: RMSE = 0.14972170103071591, NLPD = -0.2168111549002698, Time = 332.07683515548706


 60%|██████    | 12/20 [1:06:25<44:14, 331.83s/it]

Uniform outliers: t-MOGP seed 11: RMSE = 0.15575365592008436, NLPD = -0.21925368700120593, Time = 326.333708524704


 65%|██████▌   | 13/20 [1:12:02<38:53, 333.34s/it]

Uniform outliers: t-MOGP seed 12: RMSE = 0.16668539976508512, NLPD = -0.19028203493013432, Time = 336.80171728134155


 70%|███████   | 14/20 [1:17:36<33:20, 333.44s/it]

Uniform outliers: t-MOGP seed 13: RMSE = 0.15292915899550327, NLPD = -0.2138099136338459, Time = 333.6870172023773


 75%|███████▌  | 15/20 [1:23:11<27:50, 334.07s/it]

Uniform outliers: t-MOGP seed 14: RMSE = 0.14327585233240553, NLPD = -0.23267053873668134, Time = 335.5295424461365


 80%|████████  | 16/20 [1:28:43<22:12, 333.17s/it]

Uniform outliers: t-MOGP seed 15: RMSE = 0.1824531359294169, NLPD = -0.1625948848600273, Time = 331.0837371349335


 85%|████████▌ | 17/20 [1:34:13<16:37, 332.41s/it]

Uniform outliers: t-MOGP seed 16: RMSE = 0.1543115920918446, NLPD = -0.2026589857405748, Time = 330.65034008026123


 90%|█████████ | 18/20 [1:39:46<11:04, 332.40s/it]

Uniform outliers: t-MOGP seed 17: RMSE = 0.1531524648707857, NLPD = -0.22131697847621903, Time = 332.3805992603302


 95%|█████████▌| 19/20 [1:45:25<05:34, 334.54s/it]

Uniform outliers: t-MOGP seed 18: RMSE = 0.1652400049353904, NLPD = -0.17567344686906675, Time = 339.50209760665894


100%|██████████| 20/20 [1:51:04<00:00, 333.23s/it]

Uniform outliers: t-MOGP seed 19: RMSE = 0.1537309242943694, NLPD = -0.1942909020157757, Time = 339.0015666484833
RMSE: 0.1605, 0.1817, 0.1670, 0.1551, 0.1443, 0.1565, 0.1661, 0.1613, 0.1639, 0.1655, 0.1497, 0.1558, 0.1667, 0.1529, 0.1433, 0.1825, 0.1543, 0.1532, 0.1652, 0.1537
NLPD: -0.1959, -0.1546, -0.1935, -0.2026, -0.2321, -0.2048, -0.2037, -0.1983, -0.1776, -0.1962, -0.2168, -0.2193, -0.1903, -0.2138, -0.2327, -0.1626, -0.2027, -0.2213, -0.1757, -0.1943
Time: 319.2671, 325.2557, 338.7163, 334.1679, 333.0068, 337.2833, 334.3017, 331.7658, 336.9343, 336.8160, 332.0768, 326.3337, 336.8017, 333.6870, 335.5295, 331.0837, 330.6503, 332.3806, 339.5021, 339.0016
RMSE t-MOGP: 0.1600 ± 0.0101
NLPD t-MOGP: -0.1994 ± 0.0202
Time t-MOGP: 333.2281 ± 4.9379





# Asymmetric outliers

In [None]:
rmses_mogp, rmses_morcgp, rmses_tmogp = [], [], []
nlpds_mogp, nlpds_morcgp, nlpds_tmogp = [], [], []
times_mogp, times_morcgp, times_tmogp = [], [], []

prop_outliers = 0.1
num_seeds = 1

run_mogp = False
run_morcgp = False
run_tmogp = True

for i in tqdm(range(num_seeds)):
    # Split data into train and test sets (default test size = 25%)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.25, random_state=i
    )
    np.random.seed(i)

    scaler_X = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train)
    X_test_scaled = scaler_X.transform(X_test)

    scaler_Y = StandardScaler()
    Y_train_scaled = scaler_Y.fit_transform(Y_train)
    Y_test_scaled = scaler_Y.transform(Y_test)

    Y_train_scaled = asymmetric_outliers_c1(Y=Y_train_scaled, percent_outliers=prop_outliers, start=3, end=9)

    if run_mogp:
        rmse_mogp, nlpd_mogp, time_mogp = run_MOGP_numpy(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, prop_outliers=prop_outliers)
        print(f'Asymmetric outliers: MOGP seed {i}: RMSE = {rmse_mogp}, NLPD = {nlpd_mogp}, Time = {time_mogp}')
        rmses_mogp.append(rmse_mogp)
        nlpds_mogp.append(nlpd_mogp)
        times_mogp.append(time_mogp)
    if run_morcgp:
        rmse_morcgp, nlpd_morcgp, time_morcgp = run_MORCGP(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, prop_outliers=prop_outliers, k=2)
        rmses_morcgp.append(rmse_morcgp)
        nlpds_morcgp.append(nlpd_morcgp)
        times_morcgp.append(time_morcgp)
    if run_tmogp:
        rmse_tmogp, nlpd_tmogp, time_tmogp = run_tMOGP(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, df=10)
        print(f'Asymmetric outliers: t-MOGP seed {i}: RMSE = {rmse_tmogp}, NLPD = {nlpd_tmogp}, Time = {time_tmogp}')
        rmses_tmogp.append(rmse_tmogp)
        nlpds_tmogp.append(nlpd_tmogp)
        times_tmogp.append(time_tmogp)

if run_mogp:
    print("RMSE:", ", ".join(f"{x:.4f}" for x in rmses_mogp))
    print("NLPD:", ", ".join(f"{x:.4f}" for x in nlpds_mogp))
    print("Time:", ", ".join(f"{x:.4f}" for x in times_mogp))
    print(f'RMSE MOGP: {np.mean(rmses_mogp):.4f} ± {np.std(rmses_mogp):.4f}')
    print(f'NLPD MOGP: {np.mean(nlpds_mogp):.4f} ± {np.std(nlpds_mogp):.4f}')
    print(f'Time MOGP: {np.mean(times_mogp):.4f} ± {np.std(times_mogp):.4f}')

if run_morcgp:
    print(f'RMSE MORCGP: {np.mean(rmses_morcgp):.4f} ± {np.std(rmses_morcgp):.4f}')
    print(f'NLPD MORCGP: {np.mean(nlpds_morcgp):.4f} ± {np.std(nlpds_morcgp):.4f}')
    print(f'Time MORCGP: {np.mean(times_morcgp):.4f} ± {np.std(times_morcgp):.4f}')

if run_tmogp:
    print("RMSE:", ", ".join(f"{x:.4f}" for x in rmses_tmogp))
    print("NLPD:", ", ".join(f"{x:.4f}" for x in nlpds_tmogp))
    print("Time:", ", ".join(f"{x:.4f}" for x in times_tmogp))
    print(f'RMSE t-MOGP: {np.mean(rmses_tmogp):.4f} ± {np.std(rmses_tmogp):.4f}')
    print(f'NLPD t-MOGP: {np.mean(nlpds_tmogp):.4f} ± {np.std(nlpds_tmogp):.4f}')
    print(f'Time t-MOGP: {np.mean(times_tmogp):.4f} ± {np.std(times_tmogp):.4f}')

100%|██████████| 1/1 [01:55<00:00, 115.55s/it]

Asymmetric outliers: t-MOGP seed 0: RMSE = 0.2961992591670887, NLPD = 0.5697941383085201, Time = 115.55044722557068
RMSE: 0.2962
NLPD: 0.5698
Time: 115.5504
RMSE t-MOGP: 0.2962 ± 0.0000
NLPD t-MOGP: 0.5698 ± 0.0000
Time t-MOGP: 115.5504 ± 0.0000





In [99]:
# Measure total time
start_total = time.time()

prop_outliers = 0.1

mcd = MinCovDet(support_fraction=1-prop_outliers).fit(Y_train_scaled)
robust_covariance = mcd.covariance_
print(robust_covariance)
robust_init_A = np.linalg.cholesky(robust_covariance)

morcgp = MORCGP_shared_noise(mean=0, length_scale=1, noise_var=0.1, A=robust_init_A)
morcgp.fit(X_train_scaled, Y_train_scaled, epsilons=np.array([prop_outliers, 0]))
init_gamma, init_c, gamma, c = morcgp.optimize_loo_cv(print_opt_param=True, print_iter_objective=False, k=2, init_cov=robust_covariance, fix_weights=True)

mu_morcgp, var_morcgp = morcgp.predict(X_test_scaled)
std_morcgp = np.sqrt(var_morcgp + morcgp.noise_var)

end_total = time.time()
print(f"Total runtime: {end_total - start_total:.4f} seconds")

[[0.94723653 0.90867649]
 [0.90867649 0.90533608]]
Optimized length_scale: 1.1465
Optimized noise_var: 0.030009044314197594
Optimized A: [[ 1.26725414 -0.10627154]
 [ 0.99090882  0.28129842]]
Optimized B: 
[[1.61722669 1.22583928]
 [1.22583928 1.06102909]]
Total runtime: 59.0478 seconds


In [100]:
rmse_morcgp = calculate_rmse(Y_test_scaled, mu_morcgp)

print("RMSE MORCGP:", rmse_morcgp)

nlpd_morcgp = nlpd(Y_test_scaled, mu_morcgp, std_morcgp**2)

print("NLPD MORCGP:", nlpd_morcgp)

RMSE MORCGP: 0.15972676319172377
NLPD MORCGP: -0.1996795777671061


# Focused outliers

In [None]:
rmses_mogp, rmses_morcgp, rmses_tmogp = [], [], []
nlpds_mogp, nlpds_morcgp, nlpds_tmogp = [], [], []
times_mogp, times_morcgp, times_tmogp = [], [], []

prop_outliers = 0.1
num_seeds = 20

run_mogp = False
run_morcgp = False
run_tmogp = True

for i in tqdm(range(num_seeds)):
    # Split data into train and test sets (default test size = 25%)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.25, random_state=i
    )

    scaler_X = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train)
    X_test_scaled = scaler_X.transform(X_test)

    scaler_Y = StandardScaler()
    Y_train_scaled = scaler_Y.fit_transform(Y_train)
    Y_test_scaled = scaler_Y.transform(Y_test)

    X_train_scaled, Y_train_scaled = focused_outliers_c1(X=X_train_scaled, Y=Y_train_scaled, percent_outliers=prop_outliers, y_value=6, perturbation=0.1)

    if run_mogp:
        rmse_mogp, nlpd_mogp, time_mogp = run_MOGP(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, prop_outliers=prop_outliers)
        print(f'Focused outliers: MOGP seed {i}: RMSE = {rmse_mogp}, NLPD = {nlpd_mogp}, Time = {time_mogp}')
        rmses_mogp.append(rmse_mogp)
        nlpds_mogp.append(nlpd_mogp)
        times_mogp.append(time_mogp)
    if run_morcgp:
        rmse_morcgp, nlpd_morcgp, time_morcgp = run_MORCGP(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, prop_outliers=prop_outliers, k=2)
        rmses_morcgp.append(rmse_morcgp)
        nlpds_morcgp.append(nlpd_morcgp)
        times_morcgp.append(time_morcgp)
    if run_tmogp:
        rmse_tmogp, nlpd_tmogp, time_tmogp = run_tMOGP(X_train_scaled, Y_train_scaled, X_test_scaled, Y_test_scaled, df=9)
        print(f'Focused outliers: t-MOGP seed {i}: RMSE = {rmse_tmogp}, NLPD = {nlpd_tmogp}, Time = {time_tmogp}')
        rmses_tmogp.append(rmse_tmogp)
        nlpds_tmogp.append(nlpd_tmogp)
        times_tmogp.append(time_tmogp)

if run_mogp:
    print("RMSE:", ", ".join(f"{x:.4f}" for x in rmses_mogp))
    print("NLPD:", ", ".join(f"{x:.4f}" for x in nlpds_mogp))
    print("Time:", ", ".join(f"{x:.4f}" for x in times_mogp))
    print(f'RMSE MOGP: {np.mean(rmses_mogp):.4f} ± {np.std(rmses_mogp):.4f}')
    print(f'NLPD MOGP: {np.mean(nlpds_mogp):.4f} ± {np.std(nlpds_mogp):.4f}')
    print(f'Time MOGP: {np.mean(times_mogp):.4f} ± {np.std(times_mogp):.4f}')

if run_morcgp:
    print(f'RMSE MORCGP: {np.mean(rmses_morcgp):.4f} ± {np.std(rmses_morcgp):.4f}')
    print(f'NLPD MORCGP: {np.mean(nlpds_morcgp):.4f} ± {np.std(nlpds_morcgp):.4f}')
    print(f'Time MORCGP: {np.mean(times_morcgp):.4f} ± {np.std(times_morcgp):.4f}')

if run_tmogp:
    print("RMSE:", ", ".join(f"{x:.4f}" for x in rmses_tmogp))
    print("NLPD:", ", ".join(f"{x:.4f}" for x in nlpds_tmogp))
    print("Time:", ", ".join(f"{x:.4f}" for x in times_tmogp))
    print(f'RMSE t-MOGP: {np.mean(rmses_tmogp):.4f} ± {np.std(rmses_tmogp):.4f}')
    print(f'NLPD t-MOGP: {np.mean(nlpds_tmogp):.4f} ± {np.std(nlpds_tmogp):.4f}')
    print(f'Time t-MOGP: {np.mean(times_tmogp):.4f} ± {np.std(times_tmogp):.4f}')

  5%|▌         | 1/20 [05:31<1:45:00, 331.59s/it]

Focused outliers: t-MOGP seed 0: RMSE = 0.18238100502401008, NLPD = -0.10871702885997836, Time = 331.59401679039


 10%|█         | 2/20 [11:07<1:40:12, 334.01s/it]

Focused outliers: t-MOGP seed 1: RMSE = 0.2899976926245902, NLPD = 0.3274582337074868, Time = 335.6956226825714


 15%|█▌        | 3/20 [16:43<1:34:52, 334.84s/it]

Focused outliers: t-MOGP seed 2: RMSE = 0.22890229840069673, NLPD = 0.034181934887956206, Time = 335.8170235157013


 20%|██        | 4/20 [22:17<1:29:15, 334.74s/it]

Focused outliers: t-MOGP seed 3: RMSE = 0.19482610156083394, NLPD = -0.0656043131119932, Time = 334.58404755592346


 25%|██▌       | 5/20 [27:44<1:22:59, 332.00s/it]

Focused outliers: t-MOGP seed 4: RMSE = 0.17868894573179037, NLPD = -0.05071253329423427, Time = 327.1401779651642


 30%|███       | 6/20 [33:19<1:17:39, 332.80s/it]

Focused outliers: t-MOGP seed 5: RMSE = 0.2003460587414216, NLPD = 0.029682326790368422, Time = 334.3425076007843


 35%|███▌      | 7/20 [38:55<1:12:22, 334.06s/it]

Focused outliers: t-MOGP seed 6: RMSE = 0.18904617642243343, NLPD = -0.108629372525187, Time = 336.647696018219


 40%|████      | 8/20 [44:22<1:06:19, 331.65s/it]

Focused outliers: t-MOGP seed 7: RMSE = 0.23231910969819233, NLPD = 0.12212516707218828, Time = 326.48794865608215


 45%|████▌     | 9/20 [50:04<1:01:23, 334.85s/it]

Focused outliers: t-MOGP seed 8: RMSE = 0.29382333371260816, NLPD = 0.31485396359174495, Time = 341.8680913448334


 50%|█████     | 10/20 [55:31<55:24, 332.46s/it] 

Focused outliers: t-MOGP seed 9: RMSE = 0.2016329235173583, NLPD = -0.008614309758651031, Time = 327.10135102272034


 55%|█████▌    | 11/20 [1:00:55<49:28, 329.82s/it]

Focused outliers: t-MOGP seed 10: RMSE = 0.18537647958206493, NLPD = -0.03270702760039907, Time = 323.85032081604004


 60%|██████    | 12/20 [1:06:30<44:11, 331.43s/it]

Focused outliers: t-MOGP seed 11: RMSE = 0.18488609148280596, NLPD = -0.12117266698218256, Time = 335.0999584197998


 65%|██████▌   | 13/20 [1:12:11<38:59, 334.26s/it]

Focused outliers: t-MOGP seed 12: RMSE = 0.21478170368888777, NLPD = 0.0555955284138955, Time = 340.75965332984924


 70%|███████   | 14/20 [1:14:38<27:46, 277.75s/it]

Focused outliers: t-MOGP seed 13: RMSE = 0.23943352105980018, NLPD = 0.300792549932667, Time = 147.1593623161316


 75%|███████▌  | 15/20 [1:20:10<24:30, 294.05s/it]

Focused outliers: t-MOGP seed 14: RMSE = 0.17967001777181435, NLPD = -0.04736703640005272, Time = 331.8168122768402


 80%|████████  | 16/20 [1:25:42<20:22, 305.54s/it]

Focused outliers: t-MOGP seed 15: RMSE = 0.20658470421328293, NLPD = -0.10670307796532415, Time = 332.2172751426697


 85%|████████▌ | 17/20 [1:31:12<15:38, 312.92s/it]

Focused outliers: t-MOGP seed 16: RMSE = 0.20586815547777454, NLPD = 0.07513202312684197, Time = 330.1042242050171


 90%|█████████ | 18/20 [1:36:41<10:35, 317.66s/it]

Focused outliers: t-MOGP seed 17: RMSE = 0.1566333514791599, NLPD = -0.24120210381100401, Time = 328.67982602119446


 95%|█████████▌| 19/20 [1:42:06<05:19, 319.87s/it]

Focused outliers: t-MOGP seed 18: RMSE = 0.17591460729423594, NLPD = -0.08627920083673139, Time = 325.0340495109558


100%|██████████| 20/20 [1:47:47<00:00, 323.37s/it]

Focused outliers: t-MOGP seed 19: RMSE = 0.20563994706808092, NLPD = -0.022529247601039682, Time = 341.2397840023041
RMSE: 0.1824, 0.2900, 0.2289, 0.1948, 0.1787, 0.2003, 0.1890, 0.2323, 0.2938, 0.2016, 0.1854, 0.1849, 0.2148, 0.2394, 0.1797, 0.2066, 0.2059, 0.1566, 0.1759, 0.2056
NLPD: -0.1087, 0.3275, 0.0342, -0.0656, -0.0507, 0.0297, -0.1086, 0.1221, 0.3149, -0.0086, -0.0327, -0.1212, 0.0556, 0.3008, -0.0474, -0.1067, 0.0751, -0.2412, -0.0863, -0.0225
Time: 331.5940, 335.6956, 335.8170, 334.5840, 327.1402, 334.3425, 336.6477, 326.4879, 341.8681, 327.1014, 323.8503, 335.1000, 340.7597, 147.1594, 331.8168, 332.2173, 330.1042, 328.6798, 325.0340, 341.2398
RMSE t-MOGP: 0.2073 ± 0.0345
NLPD t-MOGP: 0.0130 ± 0.1492
Time t-MOGP: 323.3620 ± 40.7509



