In [1]:
# Stack overflow
from typing import Optional, Union

import numpy as np

from tensorflow.keras.losses import binary_crossentropy, mean_squared_error
import tensorflow as tf

def deriv_bce(y, y_hat):
    if y == 1:
        return -1 / y_hat
    else:
        return 1 / (1 - y_hat)

class MeanSquaredError:
    """Mean squared error cost (loss) function.

    The predictions are the activations of the network. The order of
    arguments in the `derivative` was based on
    `Four fundamental equations behind backpropagation` from
    Nielsen (Ch.2, 2015). Similarly, the gradient calculation in BP1a of 
    is described in the same resource.
    """

    def gradient(
            self, inputs: tuple[np.ndarray, np.ndarray]) -> np.ndarray:
        """Computes the gradient with respect to all activations (preds).

        This is a vectorized function and is called on each element of 
        an activation vector in order to compute the partial derivative
        of the cost with respect to the j^{th} activation for the 
        l^{th} layer.

        MSE = (1/dims) * (pred - true)^{2}
        dMSE/dPred =  (2/dim) * (pred - true)

        Args:
            inputs: Targets, predictions vectors.

        Returns:
            Vector (gradient) of values.
        """

        targets, predictions = inputs
        return (2 / targets.shape[-1]) * (predictions - targets)

    def __call__(
            self,
            inputs: tuple[np.ndarray, np.ndarray],
            axis: Optional[int] = None) -> np.float64:
        """Compute cost given inputs.

        Args:
            inputs: Targets and predictions vectors.

        Return:
            Scalar cost.
        """

        targets, predictions = inputs
        return np.mean(np.square(targets - predictions), axis=axis)

class BinaryCrossEntropy:
    """Binary cross entropy loss (cost) function."""

    def __init__(self, from_logits: bool = False):
        """Initializes sigmoid function for binary cross entropy.

        Args:
         from_logits: True for logits, false for normalized log 
                probabilities (i.e., used sigmoid activation function).
                Assumes not from logits.
        """

        self.sigmoid = lambda t: 1 / (1 + np.exp(-t))
        self.from_logits = from_logits

    def gradient(self, inputs: tuple[np.ndarray, np.ndarray]) -> np.ndarray:
        """Derivative with respect to a single activation (same as derivative).

        Should there be a from logits check here??

        Args:
            inputs: Targets, predictions vectors. Presumably, the inputs 
            here also have to be normalized log probabilities.

        Returns:
            Vector (gradient) of values.
        """
        targets, predictions = inputs

        if self.from_logits:
            predictions = self.sigmoid(predictions)

        return -1 * ((targets/predictions) - ((1-targets) / (1-predictions)))

    def __call__(self,
                 inputs: tuple[np.ndarray, np.ndarray],
                 axis: Optional[int] = None) -> np.ndarray:
        """Compute cost given inputs.

        Args:
            inputs: Targets and predictions vectors. 
                Assumes predictions are not from logits.

        Return:
            Scalar cost.
        """

        targets, predictions = inputs

        if self.from_logits:
            predictions = self.sigmoid(predictions)

        return -1 * np.mean(targets * np.log(predictions) + (1 - targets) * np.log(1 - predictions), axis=axis)

# MSE gradient example

# Instantiate cost function objects
mse = MeanSquaredError()
bce = BinaryCrossEntropy()
sigmoid = lambda t: 1 / (1 + np.exp(-t))

# Validate MSE grad
a_L_np = np.array([0.12, 0.35, 0.61])
y_true_np = np.array([0.11, 0.01, 0.59])
a_L_tf = tf.Variable(a_L_np)
y_true_tf = tf.constant(y_true_np)

# tf gradient context
with tf.GradientTape() as tape:
    C = mean_squared_error(y_true=y_true_tf, y_pred=a_L_tf)

dC_daL = tape.gradient(C, a_L_tf)
print('-- MSE -- ')
print('tf gradient tape:', dC_daL.numpy())

# My implementation
dC_daL_np = mse.gradient((y_true_np, a_L_np))
print('mse.gradient:', dC_daL_np)
print()

#### BCE ####
y_true = tf.constant(np.array([0., 1., 0., 0.]))
y_pred_logits = np.array([-18.6, 0.51, 2.94, -12.8])
y_pred_proba = tf.Variable(sigmoid(y_pred_logits))

with tf.GradientTape() as tape:
    C = binary_crossentropy(y_true, y_pred_proba)

print('-- BCE --')
dC_dProbaActivation = tape.gradient(C, y_pred_proba)
print('tf gradient tape:', dC_dProbaActivation.numpy())
dC_dProbaActivationMine = bce.gradient((y_true, y_pred_proba))
print('bce.gradient:', dC_dProbaActivationMine.numpy())
print('ml master deriv bce:', [deriv_bce(y_true[sample].numpy(), y_pred_proba[sample].numpy()) for sample in range(y_true.shape[0])])

#### Outputs ####
# -- MSE -- 
# tf gradient tape: [0.00666667 0.22666667 0.01333333]
# mse.gradient: [0.00666667 0.22666667 0.01333333]

# -- BCE --
# tf gradient tape: [ 0.         -0.40012383  4.97895166  0.25000067]
# bce.gradient: [ 1.00000001 -1.60049558 19.91584631  1.00000276]

-- MSE -- 
tf gradient tape: [0.00666667 0.22666667 0.01333333]
mse.gradient: [0.00666667 0.22666667 0.01333333]

-- BCE --
tf gradient tape: [ 0.         -0.40012383  4.97895166  0.25000067]
bce.gradient: [ 1.00000001 -1.60049558 19.91584631  1.00000276]
ml master deriv bce: [1.00000000835839, -1.600495578812266, 19.915846312255084, 1.000002760772572]
