In [None]:
import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

# Data

In [None]:
# Placeholders: each name below is bound to the Ellipsis literal (`...`).
# Replace them with the real training / test features and targets before
# running the modelling cells, which pass these names to `fit` / `predict`.
X_train = ...
y_train = ...

X_test = ...
y_test = ...

# SVM regression

In [None]:
# Scaling lives inside the pipeline so every CV fold is standardized using
# statistics from its own training portion only.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svr", SVR()),
])

# Hyper-parameter grid for the RBF-kernel SVR (keys are "<step>__<param>").
param_grid = dict(
    svr__kernel=["rbf"],
    svr__C=[0.1, 1, 10, 100],
    svr__epsilon=[0.01, 0.1, 0.5],
    svr__gamma=["scale", 0.01, 0.1, 1.0],
)

# Exhaustive 5-fold search; the best configuration is the one with the lowest MSE.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Evaluate the refitted best pipeline on the held-out test set.
best_model = grid.best_estimator_
pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, pred)
test_r2 = r2_score(y_test, pred)

print("Best params:", grid.best_params_)
print("Test MSE:", test_mse)
print("Test R2 :", test_r2)

# XGBoost regression

In [None]:
# Early stopping needs a validation set that is never used for fitting.
# Carve one out of the training data (20% here; adjust to taste) and run the
# hyper-parameter search on the remaining part only.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# `eval_metric` and `early_stopping_rounds` are estimator settings: recent
# XGBoost releases no longer accept them as `fit()` keyword arguments, so they
# are configured on the constructor instead.
base = XGBRegressor(
    objective="reg:squarederror",
    eval_metric="rmse",
    early_stopping_rounds=50,
    random_state=42,
    n_jobs=-1,
    # optional speed-ups:
    # tree_method="hist",
)

# Candidate values sampled by the randomized search.
param_dist = {
    "n_estimators": [500, 1000, 2000],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "max_depth": [3, 4, 6, 8],
    "min_child_weight": [1, 3, 5, 10],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "reg_lambda": [0.0, 1.0, 5.0, 10.0],
    "reg_alpha": [0.0, 0.1, 1.0],
}

search = RandomizedSearchCV(
    estimator=base,
    param_distributions=param_dist,
    n_iter=30,
    scoring="neg_mean_squared_error",
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

# Extra keyword arguments are forwarded to every `XGBRegressor.fit` call
# (each CV fit and the final refit), so all of them early-stop on the same
# held-out validation set.
search.fit(
    X_fit, y_fit,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)

# Evaluate the refitted best model on the untouched test set.
best_model = search.best_estimator_
pred = best_model.predict(X_test)

print("Best params:", search.best_params_)
print("Test MSE:", mean_squared_error(y_test, pred))
print("Test R2 :", r2_score(y_test, pred))

# Kernel Ridge regression

In [None]:
# Standardize inside the pipeline so each CV fold is scaled from its own
# training portion, then fit an RBF kernel ridge regressor.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("krr", KernelRidge(kernel="rbf")),
])

param_grid = dict(
    # regularization strength
    krr__alpha=[1e-3, 1e-2, 1e-1, 1, 10, 100],
    # RBF gamma: larger values give a narrower kernel and a wigglier fit
    krr__gamma=[1e-3, 1e-2, 1e-1, 1, 10],
)

# Exhaustive 5-fold search scored by (negated) mean squared error.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Evaluate the refitted best pipeline on the held-out test set.
best_model = grid.best_estimator_
pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, pred)
test_r2 = r2_score(y_test, pred)

print("Best params:", grid.best_params_)
print("Test MSE:", test_mse)
print("Test R2 :", test_r2)


# Graph Laplacian

In [None]:
import numpy as np
import scipy.sparse as sp
import scipy.sparse.linalg as spla
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def knn_rbf_adjacency(X, n_neighbors=15, sigma=1.0):
    """
    Build a sparse symmetric adjacency W using kNN and RBF weights:
      w_ij = exp(-||xi-xj||^2 / (2*sigma^2))

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points to connect; distances are Euclidean.
    n_neighbors : int, default=15
        Number of nearest neighbours (excluding the point itself) linked
        from each point.
    sigma : float, default=1.0
        RBF bandwidth; must be strictly positive.

    Returns
    -------
    W : scipy.sparse matrix of shape (n_samples, n_samples)
        Symmetrized adjacency ``0.5 * (W_knn + W_knn.T)``. An edge found in
        only one direction therefore carries half of its RBF weight.

    Raises
    ------
    ValueError
        If ``sigma`` is not strictly positive.
    """
    if sigma <= 0:
        raise ValueError(f"sigma must be positive, got {sigma!r}")

    nn = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean").fit(X)

    # Query the fitted points themselves (no argument): scikit-learn then
    # excludes each point from its own neighbour list. Requesting k+1
    # neighbours and dropping column 0 is not equivalent when X contains
    # duplicate rows, because the "self" hit is not guaranteed to come first.
    dists, idx = nn.kneighbors()

    n = idx.shape[0]
    rows = np.repeat(np.arange(n), n_neighbors)
    cols = idx.reshape(-1)

    weights = np.exp(-(dists.reshape(-1) ** 2) / (2.0 * sigma**2))

    W = sp.csr_matrix((weights, (rows, cols)), shape=(n, n))
    W = 0.5 * (W + W.T)  # symmetrize
    return W

def graph_laplacian(W: sp.spmatrix) -> sp.spmatrix:
    """
    Return the unnormalized graph Laplacian ``D - W``.

    ``D`` is the diagonal matrix whose entries are the row sums of the
    adjacency matrix ``W``.
    """
    degrees = np.asarray(W.sum(axis=1)).ravel()
    degree_matrix = sp.diags(degrees)
    return degree_matrix - W

def laplacian_regression(L: sp.spmatrix, y: np.ndarray, labeled_idx, lam=1.0, ridge=1e-8):
    """
    Laplacian-regularized least squares on a graph.

    Solves the sparse linear system

        (M + lam * L + ridge * I) f = b

    where ``M`` is the diagonal 0/1 indicator of labelled nodes and ``b``
    holds ``y`` on labelled nodes and 0 elsewhere. For a symmetric ``L`` this
    is the stationarity condition of

        sum_{i labelled} (f_i - y_i)^2 + lam * f^T L f + ridge * ||f||^2.

    Parameters
    ----------
    L : scipy.sparse matrix of shape (n, n)
        Graph Laplacian.
    y : array-like of length n
        Targets. Only the entries at ``labeled_idx`` are read, so unlabelled
        positions may hold NaN or any other placeholder.
    labeled_idx : array-like of int
        Indices of the nodes whose target is known.
    lam : float, default=1.0
        Weight of the smoothness penalty ``f^T L f``.
    ridge : float, default=1e-8
        Multiple of the identity added to the system matrix.

    Returns
    -------
    f : np.ndarray of shape (n,)
        Fitted values for every node.

    Raises
    ------
    ValueError
        If ``y`` does not have exactly ``n`` entries.
    """
    n = L.shape[0]
    labeled_idx = np.array(labeled_idx, dtype=int)

    y = np.asarray(y, dtype=float).ravel()
    if y.shape[0] != n:
        raise ValueError(f"y must have {n} entries (one per node), got {y.shape[0]}")

    m = np.zeros(n)
    m[labeled_idx] = 1.0
    M = sp.diags(m)

    # Copy only the labelled targets. Computing `M @ y` instead would evaluate
    # 0 * y_i on unlabelled nodes, which is NaN whenever y_i is NaN/inf and
    # would poison the whole solution.
    b = np.zeros(n)
    b[labeled_idx] = y[labeled_idx]

    A = M + lam * L + ridge * sp.eye(n, format="csr")
    return spla.spsolve(A.tocsr(), b)

# ---- Demo data (replace with your real X,y where y is known only for some nodes) ----
rng = np.random.RandomState(0)
n, p = 800, 10
X = rng.randn(n, p)

# Linear signal plus Gaussian noise (coefficients are drawn before the noise)
coef = rng.uniform(-2, 2, size=p)
noise = rng.randn(n)
y_full = X @ coef + 0.5 * noise

# Pretend only a random subset of nodes carries a label
labeled_idx = rng.choice(n, size=120, replace=False)

# 1) standardize features so the kNN distances / RBF weights are comparable
Xs = StandardScaler().fit_transform(X)

# 2) kNN graph and its Laplacian
W = knn_rbf_adjacency(Xs, n_neighbors=20, sigma=1.0)
L = graph_laplacian(W)

# 3) choose lambda by K-fold CV over the labeled nodes: each fold hides part of
#    the labels, fits on the rest and scores the hidden ones
lambdas = [0.1, 0.5, 1, 2, 5, 10, 20]
kf = KFold(n_splits=5, shuffle=True, random_state=42)
folds = list(kf.split(labeled_idx))  # fixed seed -> same folds for every lambda

best_lam, best_cv = None, float("inf")

for lam in lambdas:
    fold_mses = [
        mean_squared_error(
            y_full[labeled_idx[val_pos]],
            laplacian_regression(L, y_full, labeled_idx[fit_pos], lam=lam)[labeled_idx[val_pos]],
        )
        for fit_pos, val_pos in folds
    ]

    cv_mse = float(np.mean(fold_mses))
    if cv_mse < best_cv:  # strict: the first lambda wins on ties
        best_cv, best_lam = cv_mse, lam

# 4) refit on every labeled node with the selected lambda
f_final = laplacian_regression(L, y_full, labeled_idx, lam=best_lam)

print("Best lambda:", best_lam, "CV MSE:", best_cv)
print("Predictions for all nodes available in f_final (length =", len(f_final), ")")
