# 102b Final Project (Python)
Converted from the original RMarkdown to a Python notebook.



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Function to perform LASSO regression using Coordinate Descent

def lasso_cd(response, features, coeffs_init=None, max_iter=1000, epsilon=1e-6, reg=0.1):
    """Fit LASSO coefficients with cyclic coordinate descent.

    Each coordinate update is the exact minimiser of
    ``0.5 * ||response - features @ coeffs||**2 + reg * ||coeffs||_1``
    with respect to one coefficient while the others are held fixed.
    No intercept is fitted and the squared loss is not scaled by 1/n.

    Parameters
    ----------
    response : array-like, shape (n_samples,)
        Target values.
    features : array-like, shape (n_samples, n_features)
        Design matrix.
    coeffs_init : array-like, shape (n_features,), optional
        Warm-start coefficients. Defaults to all zeros. The argument is
        copied and never modified in place.
    max_iter : int
        Maximum number of full sweeps over the coordinates.
    epsilon : float
        A sweep that moves the coefficient vector by less than this
        (Euclidean norm) stops the iteration.
    reg : float
        L1 penalty weight.

    Returns
    -------
    numpy.ndarray, shape (n_features,)
        The fitted coefficients.
    """
    features = np.asarray(features, dtype=float)
    response = np.asarray(response, dtype=float)
    n_features = features.shape[1]

    if coeffs_init is None:
        coeffs = np.zeros(n_features)
    else:
        # Copy AND cast to float: an integer warm start would otherwise
        # silently truncate every coefficient written back into it.
        coeffs = np.array(coeffs_init, dtype=float)

    def shrink(z, gamma):
        # Soft-thresholding operator.
        return np.sign(z) * np.maximum(np.abs(z) - gamma, 0)

    col_norms = np.sum(features ** 2, axis=0)
    # Running residual, kept in sync with `coeffs` after every update.
    resid = response - features @ coeffs

    for _ in range(max_iter):
        prev_coeffs = coeffs.copy()
        for k in range(n_features):
            if col_norms[k] == 0:
                # An all-zero column cannot reduce the loss; without this
                # guard the update below evaluates 0 / 0 and yields NaN.
                coeffs[k] = 0.0
                continue
            feature_k = features[:, k]
            # Residual with coordinate k's own contribution added back.
            partial_resid = resid + coeffs[k] * feature_k
            update_val = feature_k @ partial_resid
            coeffs[k] = shrink(update_val, reg) / col_norms[k]
            resid = partial_resid - coeffs[k] * feature_k

        if np.sqrt(np.sum((coeffs - prev_coeffs) ** 2)) < epsilon:
            break

    return coeffs


# Load the three per-node regression datasets and the shared test set
# from the current working directory (read in the order listed).

data_dir = Path('.')
dataset1, dataset2, dataset3, dataset_test = (
    pd.read_csv(data_dir / csv_name)
    for csv_name in (
        "regression_data_node1.csv",
        "regression_data_node2.csv",
        "regression_data_node3.csv",
        "test_data.csv",
    )
)

# Split datasets into training (80%) and validation (20%)

def split_data(data, split_index):
    """Cut ``data`` positionally into a leading and a trailing part.

    Rows ``[0, split_index)`` are returned under the key ``"train"`` and
    the remaining rows under the key ``"valid"``. Row order is preserved
    and no shuffling takes place.
    """
    head, tail = data.iloc[:split_index], data.iloc[split_index:]
    return dict(train=head, valid=tail)


# Each dataset is paired with the number of leading rows used for training;
# the rows after that position form the validation part.
d1, d2, d3 = (
    split_data(frame, n_train_rows)
    for frame, n_train_rows in (
        (dataset1, 160),
        (dataset2, 240),
        (dataset3, 400),
    )
)

# Extract test set components: every column except "y" forms the feature
# matrix, and the "y" column is the response vector (as floats).

X_test = dataset_test.drop(columns="y").to_numpy()
y_test = dataset_test.loc[:, "y"].to_numpy(dtype=float)

# Prepare training and validation sets (features and targets)

def extract_xy(df):
    """Separate a data frame into a feature matrix and a target vector.

    The column named ``"y"`` becomes a float NumPy vector (key ``"y"``);
    all remaining columns become the NumPy matrix stored under ``"X"``.
    """
    target = df.loc[:, "y"].to_numpy(dtype=float)
    design = df.drop(columns="y").to_numpy()
    return dict(X=design, y=target)


# One entry per dataset; each holds the X/y arrays of its training part
# under "train" and of its validation part under "val".
data_all = [
    dict(train=extract_xy(parts["train"]), val=extract_xy(parts["valid"]))
    for parts in (d1, d2, d3)
]

# Candidate lambda values to search: 0.05, 0.10, ..., 1.00

lambda_values = np.arange(0.05, 1.01, 0.05)

# One validation-loss curve per dataset (filled in during tuning) and the
# lambda eventually selected for each dataset.
val_errors = [np.zeros(lambda_values.shape[0]) for _ in range(3)]
optimal_lambdas = np.zeros(3)

# Model training and lambda tuning for each dataset: fit on the training
# part for every candidate lambda, score on the validation part, and keep
# the lambda with the smallest validation loss (the first one wins on ties).

for d, node in enumerate(data_all):
    train_part, val_part = node["train"], node["val"]
    n_val = val_part["X"].shape[0]
    lowest_mse = np.inf

    for lam_idx, lam in enumerate(lambda_values):
        fitted_model = lasso_cd(
            features=train_part["X"],
            response=train_part["y"],
            reg=lam,
        )

        resid_val = val_part["y"] - val_part["X"] @ fitted_model
        # Validation loss: half the mean squared error.
        mse_val = np.sum(resid_val ** 2) / (2 * n_val)
        val_errors[d][lam_idx] = mse_val

        if not mse_val < lowest_mse:
            continue
        lowest_mse = mse_val
        optimal_lambdas[d] = lam

    print(f"Best lambda for dataset {d + 1}: {optimal_lambdas[d]}")

# Refit one model per dataset on its training part, using the lambda that
# was selected for that dataset.

model1, model2, model3 = (
    lasso_cd(node["train"]["y"], node["train"]["X"], reg=best_lam)
    for node, best_lam in zip(data_all, optimal_lambdas)
)

# Plot validation losses: one figure per dataset, loss against lambda.

plot_titles = (
    "Validation Loss - Dataset 1",
    "Validation Loss - Dataset 2",
    "Validation Loss - Dataset 3",
)
for plot_title, loss_curve in zip(plot_titles, val_errors):
    plt.figure()
    plt.plot(lambda_values, loss_curve, marker='o')
    plt.title(plot_title)
    plt.xlabel("Lambda")
    plt.ylabel("Loss")
    plt.show()

# Identify non-zero coefficients. Indices are reported 1-based (hence "+ 1").

nz1 = np.where(model1 != 0)[0] + 1
nz2 = np.where(model2 != 0)[0] + 1
nz3 = np.where(model3 != 0)[0] + 1

# A bare expression in the middle of a Python cell/script displays nothing
# (unlike R, which auto-prints it), so the results are printed explicitly.
print("Non-zero coefficients (model 1):", ", ".join(str(i) for i in nz1))
print("Non-zero coefficients (model 2):", ", ".join(str(i) for i in nz2))
print("Non-zero coefficients (model 3):", ", ".join(str(i) for i in nz3))

# Common indices across all three models

intersect_all = sorted(set(nz1).intersection(nz2, nz3))
print("Non-zero in all three models:", ", ".join(str(i) for i in intersect_all))

# Calculate test losses

def test_loss(X, y, coeffs):
    """Return half the mean squared error of the linear fit ``X @ coeffs``.

    Computed as the sum of squared residuals ``y - X @ coeffs`` divided by
    twice the number of rows of ``X``.
    """
    residuals = y - X @ coeffs
    n_rows = X.shape[0]
    return np.sum(residuals ** 2) / (2 * n_rows)


# Evaluate each per-dataset model on the shared test set.
loss1 = test_loss(X_test, y_test, model1)
loss2 = test_loss(X_test, y_test, model2)
loss3 = test_loss(X_test, y_test, model3)

# Print explicitly: bare expressions are not auto-displayed in Python
# (a notebook would show only the last one of the cell).
print(f"Test loss (model 1): {loss1}")
print(f"Test loss (model 2): {loss2}")
print(f"Test loss (model 3): {loss3}")



In [None]:
# Part B (standalone) - recompute required Part A outputs

import numpy as np
import pandas as pd
from pathlib import Path

# Recompute minimal Part A outputs needed for Part B.

def soft_threshold(z, lambda_val):
    """Soft-thresholding operator.

    Shrinks ``z`` towards zero by ``lambda_val`` and returns zero wherever
    ``|z|`` does not exceed ``lambda_val``. Works element-wise on arrays.
    """
    magnitude = np.maximum(np.abs(z) - lambda_val, 0)
    return np.sign(z) * magnitude


def cd_lasso(X, y, lambda_val, tol=1e-4, max_iter=200):
    """Fit LASSO coefficients by cyclic coordinate descent (no intercept).

    Each coordinate update minimises
    ``(1 / (2 * n_obs)) * ||y - X @ beta||**2 + lambda_val * ||beta||_1``
    in one coefficient with the others held fixed.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_obs, n_feat)
        Design matrix.
    y : array-like, shape (n_obs,)
        Response vector.
    lambda_val : float
        L1 penalty weight.
    tol : float
        A sweep that moves the coefficient vector by less than this
        (Euclidean norm) stops the iteration.
    max_iter : int
        Maximum number of full sweeps over the coordinates.

    Returns
    -------
    numpy.ndarray, shape (n_feat,)
        The fitted coefficients, started from all zeros.
    """
    n_obs, n_feat = X.shape
    beta_vec = np.zeros(n_feat)

    # Loop-invariant per-column scale, computed once instead of once per
    # coordinate update.
    denom = np.sum(X ** 2, axis=0) / n_obs

    # Running residual y - X @ beta_vec (beta_vec starts at zero). Updating
    # it incrementally costs O(n_obs) per coordinate instead of a full
    # matrix-vector product.
    resid = np.array(y, dtype=float)

    for _ in range(max_iter):
        beta_prev = beta_vec.copy()
        for j in range(n_feat):
            if denom[j] == 0:
                # All-zero column (e.g. a constant feature after centring):
                # its coefficient stays 0 rather than becoming 0 / 0 = NaN.
                beta_vec[j] = 0.0
                continue
            x_j = X[:, j]
            # Residual with coordinate j's own contribution added back.
            r_j = resid + x_j * beta_vec[j]
            rho_j = np.sum(x_j * r_j) / n_obs
            beta_vec[j] = soft_threshold(rho_j, lambda_val) / denom[j]
            resid = r_j - x_j * beta_vec[j]
        diff_norm = np.sqrt(np.sum((beta_vec - beta_prev) ** 2))
        if diff_norm < tol:
            break
    return beta_vec


def make_lambda_grid(X, y, grid_size=10):
    """Build a log-spaced, decreasing grid of L1 penalty values.

    The largest value is ``max |Xc.T @ yc| / n`` where ``Xc`` and ``yc``
    are the column-centred features and the centred response; the smallest
    value is that maximum times ``1e-3``. ``grid_size`` values are spaced
    evenly on the log scale between the two, largest first.
    """
    n_rows = X.shape[0]
    X_dev = X - X.mean(axis=0, keepdims=True)
    y_dev = y - y.mean()

    lambda_max = np.max(np.abs(X_dev.T @ y_dev)) / n_rows
    log_hi = np.log(lambda_max)
    log_lo = np.log(lambda_max * 1e-3)
    return np.exp(np.linspace(log_hi, log_lo, grid_size))


def fit_and_eval(file_train, grid_size=10):
    """Tune the LASSO penalty for one CSV file and report the final support.

    The first 600 columns of the file are used as features and the column
    at position 600 as the response. The leading 80% of the rows (rounded
    down) are used for fitting and the rest for validation. For every
    lambda of the grid a model is fitted on centred training data and
    scored by validation mean squared error (with an intercept recovered
    from the training means). The best lambda is then refitted on all rows.

    Returns a dict with the selected ``"lambda"`` and ``"nonzero"``, the
    1-based indices of the non-zero coefficients of the refitted model.
    """
    frame = pd.read_csv(file_train)
    X_all = frame.iloc[:, :600].to_numpy()
    y_all = frame.iloc[:, 600].to_numpy(dtype=float)

    # Positional split: leading rows train, trailing rows validate.
    cut = int(np.floor(0.8 * X_all.shape[0]))
    X_train, X_vali = X_all[:cut], X_all[cut:]
    y_train, y_vali = y_all[:cut], y_all[cut:]

    # Centre with training statistics only.
    X_mean, y_mean = X_train.mean(axis=0), y_train.mean()
    X_train_c = X_train - X_mean
    y_train_c = y_train - y_mean

    lambda_values = make_lambda_grid(X_train, y_train, grid_size)

    def validation_mse(lam):
        # Fit without intercept on centred data, then restore the intercept
        # so predictions can be made on the uncentred validation rows.
        beta_hat = cd_lasso(X_train_c, y_train_c, lam)
        intercept = y_mean - np.sum(beta_hat * X_mean)
        y_pred = X_vali @ beta_hat + intercept
        return np.mean((y_vali - y_pred) ** 2)

    val_errors = np.array([validation_mse(lam) for lam in lambda_values])
    best_lambda = lambda_values[int(np.argmin(val_errors))]

    # Refit on training and validation rows together with the chosen lambda.
    X_combined = np.vstack([X_train, X_vali])
    y_combined = np.concatenate([y_train, y_vali])
    X_comb_c = X_combined - X_combined.mean(axis=0)
    y_comb_c = y_combined - y_combined.mean()
    final_beta = cd_lasso(X_comb_c, y_comb_c, best_lambda)

    support = np.where(final_beta != 0)[0] + 1
    return {"lambda": best_lambda, "nonzero": support}


# Tune and refit each node's own LASSO model, then collect the selected
# lambdas and the 1-based non-zero index sets for use below.
data_dir = Path('.')
node_files = [
    data_dir / csv_name
    for csv_name in (
        'regression_data_node1.csv',
        'regression_data_node2.csv',
        'regression_data_node3.csv',
    )
]

results_list = [fit_and_eval(node_file, grid_size=10) for node_file in node_files]

results_b = [
    dict(lam=outcome["lambda"], nonzero=outcome["nonzero"])
    for outcome in results_list
]

lambda_vec = np.array([entry["lam"] for entry in results_b])
nonzero_sets = [entry["nonzero"] for entry in results_b]


def soft_thresh(z, lam):
    """Soft-threshold ``z`` by ``lam`` (element-wise for arrays).

    Values with ``|z| <= lam`` map to zero; all others are moved towards
    zero by ``lam`` while keeping their sign.
    """
    excess = np.abs(z) - lam
    return np.sign(z) * np.maximum(excess, 0)


def cd_partial(Xmat, yvec, lam, beta_init, rounds=5):
    """Run a fixed number of LASSO coordinate-descent sweeps from a warm start.

    Each coordinate update minimises
    ``(1 / (2 * n)) * ||yvec - Xmat @ beta||**2 + lam * ||beta||_1``
    in one coefficient with the others held fixed. There is no convergence
    check: exactly ``rounds`` full sweeps are performed.

    Parameters
    ----------
    Xmat : numpy.ndarray, shape (n, p)
        Design matrix.
    yvec : array-like, shape (n,)
        Response vector.
    lam : float
        L1 penalty weight.
    beta_init : array-like, shape (p,)
        Starting coefficients. Copied, never modified in place.
    rounds : int
        Number of full sweeps over all coordinates.

    Returns
    -------
    numpy.ndarray, shape (p,)
        The coefficients after ``rounds`` sweeps.
    """
    # Copy AND cast to float so an integer warm start cannot truncate updates.
    beta = np.array(beta_init, dtype=float)
    n, p = Xmat.shape

    # Loop-invariant per-column scale, computed once.
    denom = np.sum(Xmat ** 2, axis=0) / n

    # Running residual, updated in O(n) per coordinate instead of
    # recomputing the full product Xmat @ beta for every coordinate.
    resid = yvec - Xmat @ beta

    for _ in range(rounds):
        for j in range(p):
            x_j = Xmat[:, j]
            if denom[j] == 0:
                # All-zero column: coefficient is 0 instead of 0 / 0 = NaN.
                # The residual is unaffected because the column is zero.
                beta[j] = 0.0
                continue
            # Residual with coordinate j's own contribution added back.
            partial_res = resid + x_j * beta[j]
            rho_j = np.sum(x_j * partial_res) / n
            beta[j] = soft_thresh(rho_j, lam) / denom[j]
            resid = partial_res - x_j * beta[j]
    return beta


# Number of feature columns; the response sits in the column right after them.
p = 600

# Seeded generator so the random training subsets are reproducible.
rng = np.random.default_rng(123)
node_data = []
for k in range(1, 4):
    df = pd.read_csv(data_dir / f"regression_data_node{k}.csv")
    X = df.iloc[:, :p].to_numpy()
    y = df.iloc[:, p].to_numpy(dtype=float)

    # Draw 80% of the rows (rounded down) without replacement for training.
    n_rows = X.shape[0]
    n_keep = int(np.floor(0.8 * n_rows))
    idx = rng.choice(n_rows, size=n_keep, replace=False)

    node_data.append(dict(Xtrain=X[idx], ytrain=y[idx], size=len(idx)))

# Training-set size of every node, used as aggregation weights.
sample_weights = np.array([entry["size"] for entry in node_data])

# Federated Learning (5 iterations/round)
#
# Every round, each node runs 5 coordinate-descent sweeps on its own data
# starting from the current global coefficients; the global coefficients
# are then replaced by the sample-size-weighted average of the node results.

beta_global = np.zeros(p)
converge_tol = 1e-6
# Safety cap: an unbounded `while True` never terminates if the iterates
# oscillate or turn NaN (NaN < tol is always False).
max_fed_rounds = 10_000

for fed_round in range(max_fed_rounds):
    beta_prev = beta_global.copy()

    # Column k holds node k's locally updated coefficients -> shape (p, 3).
    local_models = np.stack([
        cd_partial(
            d["Xtrain"], d["ytrain"],
            lam=lambda_vec[k],
            beta_init=beta_global,
            rounds=5,
        )
        for k, d in enumerate(node_data)
    ], axis=1)

    beta_global = (local_models * sample_weights).sum(axis=1) / sample_weights.sum()

    if np.sqrt(np.sum((beta_global - beta_prev) ** 2)) < converge_tol:
        break
else:
    print(
        "Warning: federated averaging (5 local iterations) did not converge "
        f"within {max_fed_rounds} rounds"
    )

agg_beta_5 = beta_global
# 1-based indices of the non-zero aggregated coefficients.
agg_nonzero_5 = np.where(agg_beta_5 != 0)[0] + 1

print("Non-zero coefficients:", ", ".join(str(i) for i in agg_nonzero_5))

# Confusion Matrix
#
# For each node, compare the support (non-zero features) of its own model
# ("Pred") with the support of the aggregated 5-iteration model ("True").
feature_ids = np.arange(1, p + 1)
# The reference support does not depend on the node, so build it once.
truth_vec = np.isin(feature_ids, agg_nonzero_5).astype(int)

for k in range(1, 4):
    pred_vec = np.isin(feature_ids, nonzero_sets[k - 1]).astype(int)

    tp = np.sum((truth_vec == 1) & (pred_vec == 1))
    fp = np.sum((truth_vec == 0) & (pred_vec == 1))
    fn = np.sum((truth_vec == 1) & (pred_vec == 0))
    tn = np.sum((truth_vec == 0) & (pred_vec == 0))

    conf_mat = pd.DataFrame(
        {
            "Pred_0": [tn, fn],
            "Pred_1": [fp, tp],
        },
        index=["True_0", "True_1"],
    )

    # The leading newline must be an escape sequence: a raw line break
    # inside a single-quoted f-string is a SyntaxError.
    print(f"\nConfusion Matrix (5 iterations, Node {k}):")
    print(conf_mat)

# Case 1: 5 iterations per node
#
# Mean squared error of the aggregated 5-iteration model on the test file
# (first p columns are features, the column at position p is the response).

df_test = pd.read_csv(data_dir / "test_data.csv")
Xtest = df_test.iloc[:, :p].to_numpy()
ytest = df_test.iloc[:, p].to_numpy(dtype=float)

test_loss_5 = np.mean((Xtest @ agg_beta_5 - ytest) ** 2)
# The leading newline must be an escape sequence: a raw line break inside
# a single-quoted f-string is a SyntaxError.
print(f"\nTest Loss (5 iterations): {test_loss_5:.6f}")

# Case 2: 10 iterations per node
#
# Same federated averaging scheme, but every node runs 10 coordinate-descent
# sweeps per round before the sample-size-weighted average is taken.

beta_global = np.zeros(p)
# Safety cap: an unbounded `while True` never terminates if the iterates
# oscillate or turn NaN (NaN < tol is always False).
max_fed_rounds = 10_000

for fed_round in range(max_fed_rounds):
    beta_prev = beta_global.copy()

    # Column k holds node k's locally updated coefficients -> shape (p, 3).
    local_models = np.stack([
        cd_partial(
            d["Xtrain"], d["ytrain"],
            lam=lambda_vec[k],
            beta_init=beta_global,
            rounds=10,
        )
        for k, d in enumerate(node_data)
    ], axis=1)
    beta_global = (local_models * sample_weights).sum(axis=1) / sample_weights.sum()
    if np.sqrt(np.sum((beta_global - beta_prev) ** 2)) < converge_tol:
        break
else:
    print(
        "Warning: federated averaging (10 local iterations) did not converge "
        f"within {max_fed_rounds} rounds"
    )

agg_beta_10 = beta_global
# 1-based indices of the non-zero aggregated coefficients.
agg_nonzero_10 = np.where(agg_beta_10 != 0)[0] + 1

print(", ".join(str(i) for i in agg_nonzero_10))

test_loss_10 = np.mean((Xtest @ agg_beta_10 - ytest) ** 2)
print(f"Test Loss (10 iterations): {test_loss_10:.6f}")
