<a href="https://colab.research.google.com/github/hbisgin/BigDatav1/blob/main/Lecture21_MLP_OneHiddenLayerOneNodeBreastCancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# NumPy-only MLP with exactly ONE hidden neuron
# Dataset: Breast Cancer Wisconsin (binary classification)
# Architecture: d -> 1 -> 1  (ReLU hidden, Sigmoid output, BCE loss)
import numpy as np
from sklearn.datasets import load_breast_cancer  # data loading only

np.random.seed(42)

# ---------- 1) Data ----------
data = load_breast_cancer()
X = data.data.astype(np.float64)                  # (n, d)
y = data.target.astype(np.float64).reshape(-1, 1) # (n, 1)  (0/1)

# Standardize features
mu = X.mean(axis=0, keepdims=True)
sd = X.std(axis=0, keepdims=True) + 1e-12
X = (X - mu) / sd

# Train/test split (80/20)
n = X.shape[0]
perm = np.random.permutation(n)
ntr = int(0.8 * n)
tr, te = perm[:ntr], perm[ntr:]
Xtr, ytr = X[tr], y[tr]
Xte, yte = X[te], y[te]

# ---------- 2) Model: d -> 1 -> 1 ----------
d = Xtr.shape[1]

# ONE hidden neuron: weights (d×1), bias (1×1)
W1 = np.random.randn(d, 1) * np.sqrt(2.0 / d)
b1 = np.zeros((1, 1))

# Output neuron: weights (1×1), bias (1×1)
W2 = np.random.randn(1, 1) * np.sqrt(2.0 / 1.0)
b2 = np.zeros((1, 1))

def relu(z): return np.maximum(0.0, z)
def relu_deriv(z): return (z > 0).astype(z.dtype)
def sigmoid(z): return 1.0 / (1.0 + np.exp(-z))

def bce_loss(y_true, y_prob, eps=1e-12):
    # manual clamp (no np.clip)
    p = np.maximum(eps, np.minimum(1.0 - eps, y_prob))
    return -np.mean(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))

def accuracy(y_true, y_prob):
    return np.mean((y_prob >= 0.5).astype(np.float64) == y_true)

# ---------- 3) Train (full-batch GD) ----------
lr = 0.05
epochs = 2000
wd = 0.0  # L2 weight decay (try 1e-4)

for t in range(1, epochs + 1):
    # Forward
    z1 = np.dot(Xtr, W1) + b1      # (ntr, 1)
    a1 = relu(z1)                  # (ntr, 1)
    z2 = np.dot(a1, W2) + b2       # (ntr, 1)
    p  = sigmoid(z2)               # (ntr, 1)

    # Loss (+ optional L2)
    loss = bce_loss(ytr, p)
    if wd > 0:
        loss += 0.5 * wd * (np.sum(W1*W1) + np.sum(W2*W2))

    # Backprop (sigmoid+BCE => dL/dz2 = (p - y)/n)
    ntr_ = Xtr.shape[0]
    dz2 = (p - ytr) / ntr_               # (ntr, 1)
    dW2 = np.dot(a1.T, dz2)              # (1, 1)
    db2 = np.sum(dz2, axis=0, keepdims=True)

    da1 = np.dot(dz2, W2.T)              # (ntr, 1)
    dz1 = da1 * relu_deriv(z1)           # (ntr, 1)
    dW1 = np.dot(Xtr.T, dz1)             # (d, 1)
    db1 = np.sum(dz1, axis=0, keepdims=True)

    if wd > 0:
        dW2 += wd * W2
        dW1 += wd * W1

    # Gradient step
    W2 -= lr * dW2; b2 -= lr * db2
    W1 -= lr * dW1; b1 -= lr * db1

    # Monitor
    if t % 200 == 0 or t == 1:
        train_acc = accuracy(ytr, p)
        p_te = sigmoid(np.dot(relu(np.dot(Xte, W1) + b1), W2) + b2)
        test_loss = bce_loss(yte, p_te)
        test_acc = accuracy(yte, p_te)
        print(f"iter {t:4d} | train loss {loss:.4f} acc {train_acc:.3f} | "
              f"test loss {test_loss:.4f} acc {test_acc:.3f}")

# ---------- 4) Final eval ----------
p_tr = sigmoid(np.dot(relu(np.dot(Xtr, W1) + b1), W2) + b2)
p_te = sigmoid(np.dot(relu(np.dot(Xte, W1) + b1), W2) + b2)
print("\nFinal:")
print("Train acc =", accuracy(ytr, p_tr))
print("Test  acc =", accuracy(yte, p_te))


iter    1 | train loss 0.7585 acc 0.611 | test loss 0.6942 acc 0.649
iter  200 | train loss 0.1973 acc 0.965 | test loss 0.2378 acc 0.947
iter  400 | train loss 0.1200 acc 0.978 | test loss 0.1643 acc 0.956
iter  600 | train loss 0.0915 acc 0.980 | test loss 0.1370 acc 0.965
iter  800 | train loss 0.0779 acc 0.985 | test loss 0.1256 acc 0.965
iter 1000 | train loss 0.0699 acc 0.985 | test loss 0.1200 acc 0.965
iter 1200 | train loss 0.0647 acc 0.985 | test loss 0.1169 acc 0.965
iter 1400 | train loss 0.0610 acc 0.985 | test loss 0.1153 acc 0.974
iter 1600 | train loss 0.0582 acc 0.987 | test loss 0.1148 acc 0.974
iter 1800 | train loss 0.0559 acc 0.987 | test loss 0.1151 acc 0.974
iter 2000 | train loss 0.0541 acc 0.987 | test loss 0.1161 acc 0.974

Final:
Train acc = 0.9868131868131869
Test  acc = 0.9736842105263158


#Below code has multiple nodes in one hidden layer

In [None]:
import numpy as np
from sklearn.datasets import load_breast_cancer  # data loading only

np.random.seed(42)

# ------------ 1) Data ------------
data = load_breast_cancer()
X = data.data.astype(np.float64)                  # (n, d)
y = data.target.astype(np.float64).reshape(-1,1)  # (n, 1)  (0/1)

# standardize features
mu = X.mean(axis=0, keepdims=True)
sd = X.std(axis=0, keepdims=True) + 1e-12
X = (X - mu) / sd

# simple train/test split (80/20)
n = X.shape[0]
perm = np.random.permutation(n)
ntr = int(0.8 * n)
tr, te = perm[:ntr], perm[ntr:]
Xtr, ytr = X[tr], y[tr]
Xte, yte = X[te], y[te]

# ------------ 2) Model ------------
d = Xtr.shape[1]
H = 16  # hidden width

W1 = np.random.randn(d, H) * np.sqrt(2.0 / d)
b1 = np.zeros((1, H))
W2 = np.random.randn(H, 1) * np.sqrt(2.0 / H)
b2 = np.zeros((1, 1))

def relu(z): return np.maximum(0.0, z)
def relu_deriv(z): return (z > 0).astype(z.dtype)
def sigmoid(z): return 1.0 / (1.0 + np.exp(-z))

def bce_loss(y_true, y_prob, eps=1e-12):
    p = np.maximum(eps, np.minimum(1.0 - eps, y_prob))
    return -np.mean(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))

def accuracy(y_true, y_prob):
    return np.mean((y_prob >= 0.5).astype(np.float64) == y_true)

# ------------ 3) Train (full-batch GD) ------------
lr = 0.05
epochs = 2000
wd = 0.0  # L2 weight decay (try 1e-4)

for t in range(1, epochs+1):
    # forward
    z1 = np.dot(Xtr, W1) + b1         # (ntr, H)
    a1 = relu(z1)                      # (ntr, H)
    z2 = np.dot(a1, W2) + b2          # (ntr, 1)
    p  = sigmoid(z2)                   # (ntr, 1)

    # loss (+ optional L2)
    loss = bce_loss(ytr, p)
    if wd > 0:
        loss += 0.5 * wd * (np.sum(W1*W1) + np.sum(W2*W2))

    # backward (sigmoid+BCE => dL/dz2 = (p - y) / n)
    ntr_ = Xtr.shape[0]
    dz2 = (p - ytr) / ntr_            # (ntr, 1)
    dW2 = np.dot(a1.T, dz2)           # (H, 1)
    db2 = np.sum(dz2, axis=0, keepdims=True)

    da1 = np.dot(dz2, W2.T)           # (ntr, H)
    dz1 = da1 * relu_deriv(z1)        # (ntr, H)
    dW1 = np.dot(Xtr.T, dz1)          # (d, H)
    db1 = np.sum(dz1, axis=0, keepdims=True)

    if wd > 0:
        dW2 += wd * W2
        dW1 += wd * W1

    # gradient step
    W2 -= lr * dW2; b2 -= lr * db2
    W1 -= lr * dW1; b1 -= lr * db1

    # monitor
    if t % 200 == 0 or t == 1:
        train_acc = accuracy(ytr, p)
        p_te = sigmoid(np.dot(relu(np.dot(Xte, W1) + b1), W2) + b2)
        test_loss = bce_loss(yte, p_te)
        test_acc = accuracy(yte, p_te)
        print(f"iter {t:4d} | train loss {loss:.4f} acc {train_acc:.3f} | "
              f"test loss {test_loss:.4f} acc {test_acc:.3f}")

# ------------ 4) Final eval ------------
p_tr = sigmoid(np.dot(relu(np.dot(Xtr, W1) + b1), W2) + b2)
p_te = sigmoid(np.dot(relu(np.dot(Xte, W1) + b1), W2) + b2)
print("\nFinal:")
print("Train acc =", accuracy(ytr, p_tr))
print("Test  acc =", accuracy(yte, p_te))


iter    1 | train loss 0.4410 acc 0.776 | test loss 0.4016 acc 0.798
iter  200 | train loss 0.0879 acc 0.978 | test loss 0.1173 acc 0.956
iter  400 | train loss 0.0656 acc 0.987 | test loss 0.1076 acc 0.965
iter  600 | train loss 0.0556 acc 0.987 | test loss 0.1032 acc 0.965
iter  800 | train loss 0.0487 acc 0.989 | test loss 0.1001 acc 0.965
iter 1000 | train loss 0.0437 acc 0.993 | test loss 0.0998 acc 0.974
iter 1200 | train loss 0.0398 acc 0.993 | test loss 0.1004 acc 0.965
iter 1400 | train loss 0.0367 acc 0.993 | test loss 0.1009 acc 0.965
iter 1600 | train loss 0.0339 acc 0.993 | test loss 0.1013 acc 0.965
iter 1800 | train loss 0.0314 acc 0.993 | test loss 0.1020 acc 0.965
iter 2000 | train loss 0.0292 acc 0.993 | test loss 0.1036 acc 0.965

Final:
Train acc = 0.9934065934065934
Test  acc = 0.9649122807017544
