# Secure FL – Enhanced Evaluation (Single-Round Closed-Form)
This notebook extends the original *Protocol v1/v2* implementation with:
1. **Diverse datasets** – synthetic non-i.i.d. generator & California Housing real-world dataset.
2. **Baselines** – centralized, naive federated, and differential-privacy FL.
3. **Secure protocol** – single-round closed-form (`Protocol v3`).
4. **Metrics** – MSE & R$^2$ on pooled test data.

Running all cells will reproduce the results discussed in Section 4 of the paper.

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

np.random.seed(42)


In [2]:
def add_intercept(X: np.ndarray) -> np.ndarray:
    """Append a column of 1s to the left of X."""
    return np.hstack([np.ones((X.shape[0], 1)), X])


## 1  Synthetic non‑i.i.d. data generator

In [3]:
def generate_synthetic_noniid_clients(n_clients: int = 10,
                                      n_features: int = 7,
                                      min_size: int = 1000,
                                      max_size: int = 10000,
                                      concept_shift: bool = True,
                                      missing_feature_prob: float = 0.25,
                                      seed: int = 0):
    """Return a list of dicts {'X_train','y_train','X_test','y_test'}"""

    rng = np.random.default_rng(seed)
    clients = []

    # Global ground‑truth weight vector (for all clients) – random for each experiment
    true_w = rng.normal(size=n_features)

    for i in range(n_clients):
        m = rng.integers(min_size, max_size + 1)

        # Concept shift: client‑specific mean offset
        mu = 5.0 * i if concept_shift else 0.0

        X = rng.normal(loc=mu, scale=1.0, size=(m, n_features))

        # Optionally zero‑out one random feature for this client (missing feature scenario)
        if rng.random() < missing_feature_prob:
            miss_idx = rng.integers(0, n_features)
            X[:, miss_idx] = 0.0

        noise = rng.normal(scale=1.0, size=m)
        y = X @ true_w + noise

        # Train/test split (90/10)
        idx = rng.permutation(m)    # randomly permute indices
        split = int(0.9 * m)
        train_idx, test_idx = idx[:split], idx[split:]
        clients.append({
            "X_train": add_intercept(X[train_idx]),
            "y_train": y[train_idx],
            "X_test":  add_intercept(X[test_idx]),
            "y_test":  y[test_idx]
        })
    return clients


## 2  California Housing dataset split by latitude bands

In [4]:
def load_california_housing_clients(n_clients: int = 8,
                                   test_size: float = 0.1,
                                   seed: int = 0):
    """Partition California Housing by latitude quantiles."""

    rng = np.random.default_rng(seed)

    data = fetch_california_housing()
    X_full = data.data
    y_full = data.target

    # Latitude is feature index 6
    latitudes = X_full[:, 6]
    quantiles = np.quantile(latitudes, np.linspace(0, 1, n_clients + 1))
    clients = []

    for i in range(n_clients):
        mask = (latitudes >= quantiles[i]) & (latitudes < quantiles[i + 1])
        X_i = X_full[mask]
        y_i = y_full[mask]
        if len(y_i) < 50:  # guard against tiny splits
            continue
        X_train, X_test, y_train, y_test = train_test_split(
            X_i, y_i, test_size=test_size, random_state=seed
        )
        clients.append({
            "X_train": add_intercept(X_train),
            "y_train": y_train,
            "X_test":  add_intercept(X_test),
            "y_test":  y_test
        })

    return clients


## 3  Baseline methods

In [5]:
def centralized_solution(clients):
    """Pool all data and solve closed‑form linear regression."""

    X = np.vstack([c["X_train"] for c in clients])
    y = np.hstack([c["y_train"] for c in clients])

    XtX = X.T @ X
    Xty = X.T @ y
    w = np.linalg.solve(XtX, Xty)

    return w

def naive_federated_solution(clients):
    """Each client sends (XᵀX, Xᵀy) in the clear (no privacy)."""

    d = clients[0]["X_train"].shape[1]

    XtX_sum = np.zeros((d, d))    # to accumulate the sum of XᵀX from all clients.
    Xty_sum = np.zeros(d)         # to accumulate the sum of Xᵀy from all clients.

    for c in clients:
        XtX_sum += c["X_train"].T @ c["X_train"]
        Xty_sum += c["X_train"].T @ c["y_train"]

    return np.linalg.solve(XtX_sum, Xty_sum)

def dp_gaussian_federated_solution(clients, sigma: float = 1.0, seed: int = 0):
    """Gaussian DP: each client adds N(0,σ²) noise to its stats."""

    rng = np.random.default_rng(seed)

    d = clients[0]["X_train"].shape[1]

    XtX_sum = np.zeros((d, d))
    Xty_sum = np.zeros(d)

    for c in clients:
        XtX = c["X_train"].T @ c["X_train"] + rng.normal(scale=sigma, size=(d, d))
        Xty = c["X_train"].T @ c["y_train"] + rng.normal(scale=sigma, size=d)
        XtX_sum += XtX
        Xty_sum += Xty

    return np.linalg.solve(XtX_sum, Xty_sum)


## 4  Secure protocol – single‑round closed‑form (`Protocol v3`)
Implements the transformation from Section 3 with optional $\beta$‑rescaling.

In [6]:
def secure_protocol_single_round(clients,
                                 beta_range=(-0.1, 0.1),
                                 seed: int = 0):

    rng = np.random.default_rng(seed)

    # n = number of raw features (excluding intercept)
    n = clients[0]["X_train"].shape[1] - 1

    # Build invertible W ∈ ℝ^{(n+2)×(n+2)} with last row = 1 until it's invertible
    while True:
        W = rng.normal(size=(n+2, n+2))
        W[-1, :] = 1.0
        if np.linalg.matrix_rank(W) == n + 2:   # the probability of it being singular approaches zero as the size of matrix increases
            break                               # so it will terminate after one or few iterations

    L_sum = np.zeros((n + 1, n + 2))

    for c in clients:
        X = c["X_train"]                      # m × (n+1) (intercept already)
        y = c["y_train"]

        W_tilde = W[:-1, :]                   # (n+1) × (n+2)
        Y = np.tile(y.reshape(-1, 1), (1, n + 2))

        L_i = X.T @ (X @ W_tilde - Y)
        beta = rng.uniform(*beta_range)

        L_i = (1 - beta) * L_i
        L_sum += L_i

    # Aggregate and recover global weights
    C = L_sum @ np.linalg.inv(W)
    A, b = C[:, :-1], C[:, -1]
    w_opt = -np.linalg.solve(A, b)

    return w_opt


## 5  Evaluation utilities

In [7]:
def evaluate_global_model(w, clients):
    X_test = np.vstack([c["X_test"] for c in clients])
    y_test = np.hstack([c["y_test"] for c in clients])

    y_pred = X_test @ w

    mse = mean_squared_error(y_test, y_pred)
    r2  = r2_score(y_test, y_pred)

    return mse, r2

## 6  Experiments
### 6.1 Synthetic non‑i.i.d. scenario

In [8]:
clients_syn = generate_synthetic_noniid_clients(n_clients=10, n_features=7, seed=42)

results_syn = {}

# Centralized
w_central = centralized_solution(clients_syn)
results_syn["Centralized"] = evaluate_global_model(w_central, clients_syn)

# Naive FL
w_naive = naive_federated_solution(clients_syn)
results_syn["Naive FL"] = evaluate_global_model(w_naive, clients_syn)

# DP‑FL
w_dp = dp_gaussian_federated_solution(clients_syn, sigma=5.0, seed=42)
results_syn["DP (σ=5)"] = evaluate_global_model(w_dp, clients_syn)

# Secure Protocol v3
w_secure = secure_protocol_single_round(clients_syn, beta_range=(-0.1,0.1), seed=42)
results_syn["Protocol v3"] = evaluate_global_model(w_secure, clients_syn)

pd.DataFrame(results_syn, index=["MSE","R2"]).T


Unnamed: 0,MSE,R2
Centralized,1.001011,0.998741
Naive FL,1.001011,0.998741
DP (σ=5),1.000932,0.998741
Protocol v3,1.001007,0.998741


### 6.2 California Housing scenario

In [11]:
clients_cal = load_california_housing_clients(n_clients=8, seed=42)
results_cal = {}

w_central = centralized_solution(clients_cal)
results_cal["Centralized"] = evaluate_global_model(w_central, clients_cal)

w_naive = naive_federated_solution(clients_cal)
results_cal["Naive FL"] = evaluate_global_model(w_naive, clients_cal)

w_dp = dp_gaussian_federated_solution(clients_cal, sigma=5.0, seed=42)
results_cal["DP (σ=5)"] = evaluate_global_model(w_dp, clients_cal)

w_secure = secure_protocol_single_round(clients_cal, beta_range=(-0.1,0.1), seed=42)
results_cal["Protocol v3"] = evaluate_global_model(w_secure, clients_cal)

pd.DataFrame(results_cal, index=["MSE","R2"]).T


Unnamed: 0,MSE,R2
Centralized,0.524448,0.600005
Naive FL,0.524448,0.600005
DP (σ=5),0.578053,0.559121
Protocol v3,0.524684,0.599825


## 7  Discussion
The tables above replicate the extended evaluation requested in the **Enhanced Experimental Setup and Evaluation** feedback section. You can tweak parameters (e.g. `sigma`, `beta_range`) to explore the privacy‑utility trade‑offs.