In [4]:
import numpy as np

def initialize_parameters(layer_dims, seed=42, scale=0.01):
    """
    Create the weight matrices and bias vectors of a fully connected network.

    Arguments:
    layer_dims -- sequence of layer sizes, input layer first
    seed -- value passed to np.random.seed before the weights are drawn.
            The default (42) reproduces the original fixed initialization.
            Pass None to keep drawing from the current global NumPy RNG
            state without reseeding it.
    scale -- factor applied to the standard-normal draws (default 0.01)

    Returns:
    parameters -- dict holding, for every l in 1..len(layer_dims)-1,
                  "W{l}" of shape (layer_dims[l], layer_dims[l-1]) and
                  "b{l}" of zeros with shape (layer_dims[l], 1).
                  Empty when layer_dims has fewer than two entries.
    """
    # NOTE: seeding touches NumPy's *global* legacy RNG, so it also affects
    # any later np.random.* call made by the caller.
    if seed is not None:
        np.random.seed(seed)

    parameters = {}
    L = len(layer_dims)

    # Layer l maps layer_dims[l - 1] inputs to layer_dims[l] outputs.
    for l in range(1, L):
        parameters[f"W{l}"] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * scale
        parameters[f"b{l}"] = np.zeros((layer_dims[l], 1))

    return parameters


**Test the function initialize_parameters**


In [6]:
# Five sizes -> four weight/bias pairs; inspect the first and last layer.
layer_dims = [784, 20, 7, 5, 10]
parameters = initialize_parameters(layer_dims)

# Expected shapes, in order: (20, 784), (20, 1), (10, 5), (10, 1)
for key in ("W1", "b1", "W4", "b4"):
    print(parameters[key].shape)


(20, 784)
(20, 1)
(10, 5)
(10, 1)


In [7]:
def linear_forward(A, W, b):
    """
    Affine step of one layer: Z = W.A + b.

    Arguments:
    A -- activations coming from the previous layer
    W -- weights matrix of the current layer
    b -- bias vector of the current layer (added to np.dot(W, A))

    Returns:
    Z -- result of np.dot(W, A) + b
    linear_cache -- the inputs packed as the tuple (A, W, b), kept for
                    backpropagation
    """
    pre_activation = np.dot(W, A) + b
    return pre_activation, (A, W, b)


**Test the function linear_forward**


In [8]:
# Seed first so the printed Z is reproducible; the tuple is evaluated left to
# right, so values are drawn in the order A_prev, W, b.
np.random.seed(42)
A_prev, W, b = np.random.randn(3, 2), np.random.randn(4, 3), np.random.randn(4, 1)

Z, cache = linear_forward(A_prev, W, b)

# (4, 3) weights applied to (3, 2) activations -> Z should be (4, 2)
print("Z shape:", Z.shape)
print("Z:", Z)


Z shape: (4, 2)
Z: [[ 0.48338097  0.15237448]
 [-1.33390458 -2.08407482]
 [ 0.7505205  -1.0779219 ]
 [-1.23465366 -1.76418096]]


**Activation functions: Softmax and ReLU**

In [9]:
def softmax(Z):
    """
    Column-wise softmax activation.

    Arguments:
    Z -- numpy array of shape (number of classes, number of examples)

    Returns:
    A -- softmax(Z) taken along axis 0, same shape as Z
    activation_cache -- the unmodified input Z, kept for backpropagation
    """
    # Subtracting each column's maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing on large inputs.
    column_max = np.max(Z, axis=0, keepdims=True)
    exponentials = np.exp(Z - column_max)
    normalizer = np.sum(exponentials, axis=0, keepdims=True)

    return exponentials / normalizer, Z


In [10]:
def relu(Z):
    """
    Rectified linear unit: element-wise max(0, z).

    Arguments:
    Z -- numpy array of any shape

    Returns:
    A -- relu(Z), same shape as Z
    activation_cache -- the unmodified input Z, kept for backpropagation
    """
    rectified = np.maximum(0, Z)
    return rectified, Z


In [11]:
# Small integer input covering positive, negative and zero entries.
Z = np.array([[1, -1, 0], [2, -2, 3]])

# ReLU zeroes the negative entries and leaves the rest untouched.
A_relu, cache_relu = relu(Z)
print("ReLU result:\n", A_relu)

# Softmax normalizes each column so that it sums to 1.
A_softmax, cache_softmax = softmax(Z)
print("Softmax result:\n", A_softmax)


ReLU result:
 [[1 0 0]
 [2 0 3]]
Softmax result:
 [[0.26894142 0.73105858 0.04742587]
 [0.73105858 0.26894142 0.95257413]]


In [12]:
def linear_activation_forward(A_prev, W, B, activation):
    """
    One full layer of forward propagation: LINEAR followed by ACTIVATION.

    Arguments:
    A_prev -- activations from previous layer
    W -- weights matrix
    B -- bias vector
    activation -- name of the activation to apply: "relu" or "softmax"

    Returns:
    A -- output of the chosen activation function
    cache -- dict with keys "linear_cache" (from linear_forward) and
             "activation_cache" (from the activation function)

    Raises:
    ValueError -- if activation is neither "relu" nor "softmax"
    """
    # The linear step runs first, so a shape error in A_prev/W/B surfaces
    # before the activation name is checked.
    Z, linear_cache = linear_forward(A_prev, W, B)

    if activation == "relu":
        activate = relu
    elif activation == "softmax":
        activate = softmax
    else:
        raise ValueError("Activation must be 'relu' or 'softmax'")

    A, activation_cache = activate(Z)

    return A, {"linear_cache": linear_cache, "activation_cache": activation_cache}


In [13]:
# Same seed and draw order (A_prev, W, B) as the linear_forward demo, so both
# activations below are applied to the same Z printed there.
np.random.seed(42)
A_prev, W, B = np.random.randn(3, 2), np.random.randn(4, 3), np.random.randn(4, 1)

A_relu, cache_relu = linear_activation_forward(A_prev, W, B, "relu")
print("ReLU output:\n", A_relu)

A_softmax, cache_softmax = linear_activation_forward(A_prev, W, B, "softmax")
print("Softmax output:\n", A_softmax)


ReLU output:
 [[0.48338097 0.15237448]
 [0.         0.        ]
 [0.7505205  0.        ]
 [0.         0.        ]]
Softmax output:
 [[0.37762821 0.64676528]
 [0.0613518  0.06909858]
 [0.49326653 0.18898867]
 [0.06775346 0.09514747]]


In [18]:
def l_model_forward(X, parameters, use_batchnorm=False):
    """
    Full forward pass: [LINEAR->RELU]*(L-1) -> LINEAR -> SOFTMAX.

    Arguments:
    X -- input data, fed to the first layer as its "previous activations"
    parameters -- dictionary containing W1...WL, b1...bL
    use_batchnorm -- when truthy, every hidden layer's ReLU output is passed
                     through apply_batchnorm before it feeds the next layer;
                     the softmax output layer is never normalized

    Returns:
    AL -- output of the final softmax layer
    caches -- list with one cache per layer, in layer order, exactly as
              returned by linear_activation_forward
    """
    # Each layer contributes one W and one b entry to the dictionary.
    num_layers = len(parameters) // 2
    caches = []
    activations = X

    # Hidden layers 1 .. L-1: LINEAR -> RELU (optionally normalized)
    for idx in range(1, num_layers):
        activations, layer_cache = linear_activation_forward(
            activations,
            parameters[f"W{idx}"],
            parameters[f"b{idx}"],
            activation="relu",
        )
        if use_batchnorm:
            activations = apply_batchnorm(activations)
        caches.append(layer_cache)

    # Output layer L: LINEAR -> SOFTMAX
    AL, output_cache = linear_activation_forward(
        activations,
        parameters[f"W{num_layers}"],
        parameters[f"b{num_layers}"],
        activation="softmax",
    )
    caches.append(output_cache)

    return AL, caches


In [14]:
def compute_cost(AL, Y):
    """
    Categorical cross-entropy averaged over the examples.

    Arguments:
    AL -- probability predictions from softmax, shape (num_classes, m)
    Y -- true labels (one-hot encoded), shape (num_classes, m)

    Returns:
    cost -- cross-entropy cost (scalar)
    """
    num_examples = Y.shape[1]

    # Clip probabilities into [epsilon, 1 - epsilon] so np.log never sees 0.
    epsilon = 1e-15
    clipped = np.clip(AL, epsilon, 1 - epsilon)

    log_likelihood = np.sum(Y * np.log(clipped))
    return -log_likelihood / num_examples


In [15]:
# Two classes, two samples: one-hot labels and the matching predictions.
AL = np.array([[0.8, 0.1], [0.2, 0.9]])  # predictions
Y = np.array([[1, 0], [0, 1]])  # true labels (2 classes, 2 samples)

# Both samples are predicted well, so the cost should be a small positive number.
cost = compute_cost(AL, Y)
print("Cost:", cost)


Cost: 0.164252033486018


**BatchNorm**

In [19]:
def apply_batchnorm(A, epsilon=1e-8):
    """
    Normalize each row of A to zero mean and (approximately) unit variance.

    Arguments:
    A -- activation values of shape (layer size, number of examples)
    epsilon -- small float added to the variance to prevent division by zero

    Returns:
    NA -- normalized activations (same shape as A)
    """
    # Statistics are taken along axis 1; keepdims makes them broadcast back
    # against A row by row.
    row_mean = np.mean(A, axis=1, keepdims=True)
    row_var = np.var(A, axis=1, keepdims=True)

    centered = A - row_mean
    return centered / np.sqrt(row_var + epsilon)


In [20]:
# 5 units x 10 examples; each row (unit) is normalized across its examples.
A = np.random.randn(5, 10)
NA = apply_batchnorm(A)

# Row means before and after: the normalized ones should be ~0.
print("Before normalization: mean =", A.mean(axis=1))
print("After normalization: mean =", NA.mean(axis=1))


Before normalization: mean = [-0.22077365 -0.34418034 -0.34990549  0.13905794  0.24136955]
After normalization: mean = [ 4.44089210e-17 -2.22044605e-17 -8.46545056e-17  5.55111512e-18
  2.22044605e-17]
