In [None]:
import numpy as np

def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), element-wise.

    The value is computed in a numerically stable form: only exp(-|x|) is
    ever evaluated, so large-magnitude inputs cannot overflow ``np.exp``
    (the naive formula overflows for very negative ``x``).

    Args:
        x: A scalar or array-like of numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an ndarray with the same
        shape as ``x``.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1], so neither branch below can overflow.
    decay = np.exp(-np.abs(x))
    # For x >= 0: 1 / (1 + e^-x).  For x < 0: e^x / (1 + e^x), same value.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d array to a scalar and is a no-op otherwise.
    return result[()]

def sigmoid_derivative(x):
    """Return the derivative of the sigmoid at ``x``: s(x) * (1 - s(x)).

    Args:
        x: Input accepted by ``sigmoid`` (the pre-activation value, not the
            already-activated output).

    Returns:
        The element-wise derivative, with the same shape ``sigmoid(x)`` has.
    """
    # Evaluate the sigmoid once and reuse it instead of computing it twice.
    s = sigmoid(x)
    return s * (1 - s)

def initialize_weights(input_dim, hidden_layers, output_dim):
    """Create the weight matrices and bias vectors for a fully connected net.

    Args:
        input_dim: Number of input features.
        hidden_layers: Iterable (list, tuple, array, ...) giving the width of
            each hidden layer, in order. May be empty.
        output_dim: Number of output units.

    Returns:
        A dict with keys ``'W1'..'Wn'`` and ``'b1'..'bn'``. ``Wi`` has shape
        (units in layer i, units in layer i-1) and is drawn from
        ``np.random.randn`` scaled by 0.01; ``bi`` is a zero column vector of
        shape (units in layer i, 1).
    """
    # Unpacking (rather than list concatenation) accepts any iterable and
    # avoids accidental element-wise addition when given a NumPy array.
    layer_dims = [input_dim, *hidden_layers, output_dim]
    weights = {}
    # Pair each layer's fan-in with its fan-out; layers are numbered from 1.
    for i, (n_in, n_out) in enumerate(zip(layer_dims, layer_dims[1:]), start=1):
        weights[f'W{i}'] = np.random.randn(n_out, n_in) * 0.01
        weights[f'b{i}'] = np.zeros((n_out, 1))
    return weights

def forward_propagation(X, weights):
    """Run a forward pass: sigmoid hidden layers followed by a linear output.

    Args:
        X: Input activations; multiplied on the left by ``W1``, so its first
            axis must match ``W1``'s second axis.
        weights: Dict holding ``'W1'..'Wn'`` and ``'b1'..'bn'``.

    Returns:
        A tuple ``(AL, caches)`` where ``AL`` is the output of the last
        (linear) layer and ``caches`` is a list with one
        ``(layer input, pre-activation)`` pair per layer, in layer order.
    """
    # Every layer contributes exactly one W and one b entry.
    num_layers = len(weights) // 2
    caches = []
    activation = X

    # Hidden layers 1 .. num_layers-1 use a sigmoid non-linearity.
    for idx in range(1, num_layers):
        layer_input = activation
        pre_activation = np.dot(weights[f'W{idx}'], layer_input) + weights[f'b{idx}']
        activation = sigmoid(pre_activation)
        caches.append((layer_input, pre_activation))

    # The output layer is linear, so its pre-activation is the prediction.
    output = np.dot(weights[f'W{num_layers}'], activation) + weights[f'b{num_layers}']
    caches.append((activation, output))
    return output, caches

def compute_cost(AL, Y):
    """Return the halved mean squared error between predictions and targets.

    Computes ``sum((AL - Y) ** 2) / (2 * m)`` where ``m`` is the size of
    ``Y``'s second axis.

    Args:
        AL: Predicted values.
        Y: Target values; must expose ``shape`` with at least two axes.

    Returns:
        The scalar cost.
    """
    num_examples = Y.shape[1]
    residual = AL - Y
    total_squared_error = np.sum(residual ** 2)
    return total_squared_error / (2 * num_examples)

def backward_propagation(AL, Y, caches, weights):
    """Compute parameter gradients for a sigmoid-hidden / linear-output net.

    The gradients correspond to a squared-error cost of the form
    ``sum((AL - Y) ** 2) / (2 * m)``, with ``m = AL.shape[1]``.

    Args:
        AL: Output of the final (linear) layer.
        Y: Targets, broadcast-compatible with ``AL``.
        caches: One ``(layer input, pre-activation)`` pair per layer, in
            layer order.
        weights: Dict holding ``'W1'..'Wn'``; only the weights of layers 2
            and above are read, to push the error back through them.

    Returns:
        A dict with ``'dW<i>'`` and ``'db<i>'`` for every layer ``i``.

    Raises:
        IndexError: If ``caches`` is empty.
    """
    grads = {}
    num_layers = len(caches)
    m = AL.shape[1]

    # Linear output layer: the error signal is simply the residual.
    layer_input, _ = caches[num_layers - 1]
    dZ = AL - Y
    grads[f'dW{num_layers}'] = np.dot(dZ, layer_input.T) / m
    grads[f'db{num_layers}'] = np.sum(dZ, axis=1, keepdims=True) / m

    # Walk the hidden layers from last to first. The error is projected back
    # through the *following* layer's weights only when a layer needs it, so
    # no gradient with respect to the network input is computed and discarded.
    for layer in range(num_layers - 1, 0, -1):
        layer_input, Z = caches[layer - 1]
        dA = np.dot(weights[f'W{layer + 1}'].T, dZ)
        dZ = dA * sigmoid_derivative(Z)
        grads[f'dW{layer}'] = np.dot(dZ, layer_input.T) / m
        grads[f'db{layer}'] = np.sum(dZ, axis=1, keepdims=True) / m
    return grads
