Implemented a 2-layer neural network from scratch in NumPy to classify Iris flower species. Hand-coded forward and backward propagation using ReLU and softmax, and trained using cross-entropy loss. Achieved ~95% accuracy on the test set without using any deep learning libraries.

## Forward Pass

\begin{array}{|c|l|}
\hline
\textbf{Symbol / Formula} & \textbf{Meaning / Computation} \\
\hline
w^{[1]} & \text{Weights from input layer (layer 1) to hidden layer (layer 2)} \\
(i, j) & \text{Subscripts: } i = \text{hidden neuron index},\ j = \text{input neuron index} \\
w^{[1]}_{1,1} & \text{Weight from } x_1 \text{ to hidden neuron } a_1 \\
w^{[1]}_{2,1} & \text{Weight from } x_1 \text{ to } a_2 \\
w^{[2]}_{1,2} & \text{Weight from hidden neuron } a_2 \text{ to output neuron } y \\
\hline
a_1 & w^{[1]}_{1,1}x_1 + w^{[1]}_{1,2}x_2 + b^{[1]}_1 \\
    & = (0.5 \cdot 2.3) + (0.1 \cdot 10.2) + (-0.3) = 1.87 \\
\hline
a_2 & w^{[1]}_{2,1}x_1 + w^{[1]}_{2,2}x_2 + b^{[1]}_2 \\
    & = (-0.1 \cdot 2.3) + (0.3 \cdot 10.2) + (-0.2) = 2.63 \\
\hline
a_3 & w^{[1]}_{3,1}x_1 + w^{[1]}_{3,2}x_2 + b^{[1]}_3 \\
    & = (0.2 \cdot 2.3) + (-0.1 \cdot 10.2) + 0.5 = -0.06 \ \Rightarrow\ \text{ReLU}(-0.06) = 0 \\
\hline
z^{[2]} &= w^{[2]}_{1,1} a_1 + w^{[2]}_{1,2} a_2 + w^{[2]}_{1,3} a_3 + b^{[2]}_1 \\
        &= (-0.2 \cdot 1.87) + (-0.3 \cdot 2.63) + (-0.15 \cdot 0) + 0.05 = -1.113 \\
\hline
\hat{y} &= \sigma(z^{[2]}) = \frac{1}{1 + e^{-z^{[2]}}} = \frac{1}{1 + e^{-(-1.113)}} \approx 0.247 \\
\hline
\end{array}

### 🔹 Forward Pass
| Step             | Formula                                 | Meaning                                 |
|------------------|------------------------------------------|------------------------------------------|
| Hidden Z         | $z_1 = X W_1 + b_1$                      | Pre-activation for hidden layer          |
| Hidden A         | $a_1 = \text{ReLU}(z_1)$                 | Activation output of hidden layer        |
| Output Z         | $z_2 = a_1 W_2 + b_2$                    | Pre-activation for output layer          |
| Output ŷ         | $\hat{y} = \sigma(z_2)$                  | Sigmoid output                           |
### 🔹 Backward Pass
| Step                  | Formula                                                | Meaning                                   |
|------------------------|---------------------------------------------------------|--------------------------------------------|
| Output delta           | $\delta_2 = \hat{y} - y$                                | Error at output                            |
| Grad W₂                | $\frac{\partial L}{\partial W_2} = a_1^T \cdot \delta_2$| Weight gradient from hidden to output      |
| Grad b₂                | $\frac{\partial L}{\partial b_2} = \text{sum}(\delta_2)$| Bias gradient for output                   |
| Hidden delta           | $\delta_1 = (\delta_2 W_2^T) \circ \text{ReLU}'(z_1)$  | Error at hidden, ∘ = element-wise mult     |
| Grad W₁                | $\frac{\partial L}{\partial W_1} = X^T \cdot \delta_1$ | Weight gradient from input to hidden       |
| Grad b₁                | $\frac{\partial L}{\partial b_1} = \text{sum}(\delta_1)$| Bias gradient for hidden layer             |
### 🔹 Weight Updates (Gradient Descent)
| Parameter | Update Rule                                      |
|-----------|--------------------------------------------------|
| $W_1$     | $W_1 \leftarrow W_1 - \eta \cdot \frac{\partial L}{\partial W_1}$ |
| $b_1$     | $b_1 \leftarrow b_1 - \eta \cdot \frac{\partial L}{\partial b_1}$ |
| $W_2$     | $W_2 \leftarrow W_2 - \eta \cdot \frac{\partial L}{\partial W_2}$ |
| $b_2$     | $b_2 \leftarrow b_2 - \eta \cdot \frac{\partial L}{\partial b_2}$ |
### ✅ Mnemonic to Remember
> "**Z → A → Z → ŷ** (forward)  
>  **ŷ − y → backprop through $W^T$ → apply activation' → gradients → update**"

In [None]:
# Forward
z1 = x @ W1 + b1
a1 = ReLU(z1)
z2 = a1 @ W2 + b2
ŷ  = sigmoid(z2)

# Backward
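dL/dŷ = (ŷ - y) / (ŷ * (1 - ŷ))           # binary cross-entropy loss
dL/dz2 = dL/dŷ * sigmoid'(z2) = ŷ - y      # sigmoid'(z2) = ŷ * (1 - ŷ) cancels the denominator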
dL/dW2 = a1.T @ dL/dz2
dL/db2 = dL/dz2

dL/da1 = dL/dz2 @ W2.T
dL/dz1 = dL/da1 * ReLU'(z1)
dL/dW1 = x.T @ dL/dz1
dL/db1 = dL/dz1

#

### Code 

In [14]:
import numpy as np
# Activation functions and derivatives
def relu(z):
    """Rectified linear unit: element-wise ``max(0, z)``."""
    floor = 0
    return np.maximum(floor, z)
def relu_derivative(z):
    """Return the ReLU gradient: 1.0 where ``z > 0``, otherwise 0.0.

    The gradient at exactly ``z == 0`` is taken to be 0.

    Args:
        z: Pre-activation values; an ndarray, a nested list or a plain scalar.

    Returns:
        Float array (or NumPy float scalar) with the same shape as ``z``.
    """
    # np.asarray makes the comparison return a NumPy object even for Python
    # scalars and lists, whose plain bool/list result has no ``.astype``.
    return (np.asarray(z) > 0).astype(float)
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Args:
        z: Scalar or array of pre-activation values.

    Returns:
        Values in [0, 1] with the same shape as ``z`` (a NumPy scalar for
        scalar input).
    """
    z = np.asarray(z)
    # exp(-|z|) lies in (0, 1], so it cannot overflow however negative z is;
    # the naive exp(-z) overflows to inf (with a RuntimeWarning) for z << 0.
    decay = np.exp(-np.abs(z))
    out = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and is a no-op
    # for arrays of higher rank.
    return out[()]
def sigmoid_derivative(z):
    """Derivative of the logistic function: ``sigmoid(z) * (1 - sigmoid(z))``."""
    activation = sigmoid(z)
    complement = 1 - activation
    return activation * complement
def softmax(z, axis=1):
    """Softmax over one axis, shifted by the maximum for numerical stability.

    Args:
        z: Array of logits.
        axis: Axis holding the class scores. Defaults to 1, i.e. one row per
            sample for a 2-D ``(samples, classes)`` input; pass ``0`` or
            ``-1`` for a single 1-D logit vector.

    Returns:
        Array with the same shape as ``z`` whose entries along ``axis`` sum
        to 1.
    """
    z = np.asarray(z)
    # Subtracting the per-slice maximum leaves the result unchanged but keeps
    # every exponent <= 0, so np.exp cannot overflow.
    shifted = z - np.max(z, axis=axis, keepdims=True)
    exp_z = np.exp(shifted)
    return exp_z / np.sum(exp_z, axis=axis, keepdims=True)
# Loss function
def binary_cross_entropy(y_true, y_pred):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Args:
        y_true: Array of 0/1 labels.
        y_pred: Array of predicted probabilities, broadcastable to ``y_true``.

    Returns:
        The average loss over all elements, as a non-negative float.
    """
    eps = 1e-8
    y_true = np.asarray(y_true)
    # Clip instead of adding eps inside the log: log(p + eps) is positive for
    # p == 1, which made the reported loss slightly negative for perfect
    # predictions. Clipping still keeps log() away from 0.
    p = np.clip(y_pred, eps, 1 - eps)
    return -np.mean(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))
def categorical_cross_entropy(y_true, y_pred):
    """Mean categorical cross-entropy for one-hot labels.

    Args:
        y_true: One-hot labels with one row per sample (classes on axis 1).
        y_pred: Predicted class probabilities with the same shape.

    Returns:
        The per-sample loss averaged over the rows, as a non-negative float.
    """
    eps = 1e-8
    # Clip from below rather than adding eps inside the log: log(1 + eps) > 0
    # made the loss slightly negative when the true class had probability 1.
    p = np.clip(y_pred, eps, 1.0)
    return -np.mean(np.sum(y_true * np.log(p), axis=1))

#

In [1]:
# Network class
class FlexibleNN:
    """Two-layer fully connected network trained with batch gradient descent.

    Architecture: input -> linear -> ReLU -> linear -> sigmoid (when
    ``output_dim == 1``) or softmax (otherwise). Inputs are row-major: one
    sample per row, so weights have shape ``(fan_in, fan_out)``.
    """

    def __init__(self, input_dim, hidden_dim=3, output_dim=1):
        """Create the parameters.

        Args:
            input_dim: Number of input features.
            hidden_dim: Number of hidden units.
            output_dim: Number of outputs; 1 selects the sigmoid head,
                anything else the softmax head.
        """
        self.output_dim = output_dim
        # Weights are standard-normal draws scaled by sqrt(2 / fan_in);
        # biases start at zero.
        self.W1 = np.random.randn(input_dim, hidden_dim) * np.sqrt(2. / input_dim)
        self.b1 = np.zeros((1, hidden_dim))
        self.W2 = np.random.randn(hidden_dim, self.output_dim) * np.sqrt(2. / hidden_dim)
        self.b2 = np.zeros((1, self.output_dim))

    def forward(self, x):
        """Run a forward pass and cache the intermediates needed by ``backward``.

        Args:
            x: Input batch of shape ``(N, input_dim)``.

        Returns:
            Predictions of shape ``(N, output_dim)``.
        """
        self.x = x                             # Cache input
        self.z1 = x @ self.W1 + self.b1        # Linear -> hidden layer
        self.a1 = relu(self.z1)                # ReLU activation
        self.z2 = self.a1 @ self.W2 + self.b2  # Linear -> output
        if self.output_dim == 1:
            self.y_pred = sigmoid(self.z2)
        else:
            self.y_pred = softmax(self.z2)
        return self.y_pred

    def _align_labels(self, y_true):
        """Return ``y_true`` as an array shaped exactly like the cached predictions.

        Without this, 1-D labels of shape ``(N,)`` would broadcast against
        ``(N, 1)`` predictions into an ``(N, N)`` matrix and silently corrupt
        both the loss and the gradients. Labels with the wrong number of
        elements raise ``ValueError`` from ``reshape``.
        """
        return np.asarray(y_true).reshape(self.y_pred.shape)

    def backward(self, y_true, learning_rate=0.01):
        """Back-propagate from the last ``forward`` call and update the parameters.

        Args:
            y_true: Targets matching the cached predictions; 1-D labels are
                accepted for the single-output case.
            learning_rate: Gradient-descent step size.
        """
        y_true = self._align_labels(y_true)
        m = y_true.shape[0]

        # dL/dz2. Sigmoid + binary cross-entropy and softmax + categorical
        # cross-entropy both reduce to (prediction - target), so one
        # expression serves both heads.
        dz2 = self.y_pred - y_true

        dW2 = self.a1.T @ dz2 / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m

        # Hidden layer
        da1 = dz2 @ self.W2.T                     # dL/da1
        dz1 = da1 * relu_derivative(self.z1)      # dL/dz1
        dW1 = self.x.T @ dz1 / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m

        # Gradient descent (in place)
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1

    def train(self, x, y_true, learning_rate=0.01):
        """Perform one full-batch training step.

        Args:
            x: Input batch of shape ``(N, input_dim)``.
            y_true: Targets for the batch.
            learning_rate: Gradient-descent step size.

        Returns:
            Tuple ``(y_pred, loss)`` computed before the parameter update.
        """
        y_pred = self.forward(x)
        y_true = self._align_labels(y_true)
        if self.output_dim == 1:
            loss = binary_cross_entropy(y_true, y_pred)
        else:
            loss = categorical_cross_entropy(y_true, y_pred)

        self.backward(y_true, learning_rate)
        return y_pred, loss

In [2]:
# Smoke test: push a single two-feature sample through a freshly created
# network. The weights are random, so the printed value varies between runs.
x = np.array([[2.3, 10.2]])
model = FlexibleNN(input_dim=2, output_dim=1)
untrained_output = model.forward(x)
print("Output:", untrained_output)

Output: [[0.92181588]]


In [3]:
# Forward pass with hand-picked parameters so the result can be checked on paper.
x = np.array([[2.3, 10.2]])  # one sample, shape (1, 2)
model = FlexibleNN(input_dim=2, hidden_dim=3, output_dim=1)

# W1 is (inputs, hidden): row 0 holds the weights leaving x1, row 1 those leaving x2.
model.W1 = np.array([[0.5, -0.1, 0.2],
                     [0.1, 0.3, -0.1]])
model.b1 = np.array([[-0.3, -0.2, 0.5]])
# W2 is (hidden, outputs): one weight per hidden neuron.
model.W2 = np.array([[-0.2], [-0.3], [-0.15]])
model.b2 = np.array([[0.05]])

# z1 = [1.87, 2.63, -0.06] -> ReLU -> [1.87, 2.63, 0]; z2 = -1.113; sigmoid(z2) ≈ 0.2473
y_hat = model.forward(x)
print(f"Predicted output: {y_hat[0][0]:.4f}")

Predicted output: 0.2473


In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# 1. Load the data
data = load_iris()
X = data['data']            # shape (150, 4)
y = data['target'].reshape(-1, 1)  # shape (150, 1)

# 2. One-hot encode labels (for softmax output)
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(y)  # shape (150, 3)

# 3. Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y, random_state=42
)

# 4. Standardize features (fit on train only!)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 5. Build the network. FlexibleNN draws its initial weights from NumPy's
#    global RNG, so seed it here: the split is already fixed by
#    random_state=42, and without a seed the loss curve and test accuracy
#    would still change on every run.
np.random.seed(42)
model = FlexibleNN(input_dim=4, hidden_dim=5, output_dim=3)

# 6. Full-batch gradient descent, logging the loss every 100 epochs
for epoch in range(1000):
    _, loss = model.train(X_train, y_train, learning_rate=0.05)
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

# 7. Evaluate on the held-out split: most probable class vs. one-hot target
probs = model.forward(X_test)
y_pred_labels = np.argmax(probs, axis=1)
y_true_labels = np.argmax(y_test, axis=1)

accuracy = np.mean(y_pred_labels == y_true_labels)
print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, ReLU, Softmax
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
import numpy as np

# Keras baseline with the same 4 -> 5 -> 3 architecture, for comparison.
# load_iris, OneHotEncoder, train_test_split and StandardScaler are not
# imported in this cell; they come from the imports of the NumPy Iris cell.

# 1. Load the data and one-hot encode the class labels
iris = load_iris()
X = iris['data']
y = iris['target'].reshape(-1, 1)

encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(y)

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize features, using statistics from the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 2. Define model: ReLU hidden layer followed by a softmax output layer
hidden_layer = Dense(5, input_shape=(4,), activation='relu')
output_layer = Dense(3, activation='softmax')
model = Sequential([hidden_layer, output_layer])

# 3. Compile
model.compile(
    optimizer=Adam(learning_rate=0.05),
    loss=CategoricalCrossentropy(),
    metrics=['accuracy'],
)

# 4. Train silently in mini-batches of 32
model.fit(X_train, y_train, epochs=500, batch_size=32, verbose=0)

# 5. Evaluate on the held-out split
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
#C:\Users\vjs\Desktop\JobSearch\ReviewNB\ML\JPProjects>jupyter nbconvert NeuralNetwork_Fundamentals.ipynb --to html --template classic