In [21]:
import numpy as np

class SoftmaxActivation:
    """Softmax activation layer with a vectorised backward pass.

    The softmax is taken over the last axis of the input, so both a batch
    of score rows with shape ``(batch, classes)`` and a single 1-D score
    vector are accepted. ``forward`` caches its input and output on the
    instance; ``backward`` relies on the cached output.
    """

    def __init__(self):
        # Populated by forward(); both stay None until the first forward call.
        self.input = None
        self.output = None

    def softmax(self, x):
        """Return the softmax of ``x`` along its last axis.

        Args:
            x: Array of scores.

        Returns:
            Array with the same shape as ``x`` whose entries along the last
            axis are non-negative and sum to 1.
        """
        # Subtracting the per-row maximum does not change the result but
        # keeps np.exp from overflowing for large scores.
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exp_x = np.exp(shifted)
        # The same axis (-1) is used here and in backward() so the two
        # passes always agree, whatever the rank of the input.
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    def forward(self, x):
        """Compute the softmax of ``x``, cache input and output, and return the output."""
        self.input = x
        self.output = self.softmax(x)
        return self.output

    def backward(self, grad_output):
        """Propagate a gradient through the softmax.

        Must be called after ``forward`` because it reads ``self.output``.

        Args:
            grad_output: Gradient of the loss with respect to the softmax
                output, same shape as the cached output.

        Returns:
            Gradient of the loss with respect to the softmax input.
        """
        # Closed form of the Jacobian-vector product:
        #   dL/dx_i = s_i * (g_i - sum_j s_j * g_j)
        weighted_sum = np.sum(self.output * grad_output, axis=-1, keepdims=True)
        grad_input = self.output * (grad_output - weighted_sum)
        return grad_input

# Example usage: run one sample through the layer, then back-propagate.
x = np.array([[1.0, 2.0, 3.0]])            # a single row of three scores
grad_output = np.array([[0.1, 0.2, 0.3]])  # example gradient from the next layer

softmax_layer = SoftmaxActivation()

# Forward pass
output = softmax_layer.forward(x)
print("Softmax output:", output)

# Backward pass
grad_input = softmax_layer.backward(grad_output)
print("Softmax gradient:", grad_input)


Softmax output: [[0.09003057 0.24472847 0.66524096]]
Softmax gradient: [[-0.01418171 -0.01407704  0.02825875]]


In [22]:
# https://github.com/TheIndependentCode/Neural-Network/blob/master/activations.py
class Softmax:
    """Softmax layer for a single sample.

    All elements of the input are treated as one set of scores: the
    normalisation in ``forward`` sums over the whole array. The sample may
    be a row vector ``(1, n)``, a column vector ``(n, 1)`` or a flat
    ``(n,)`` array.
    """

    def forward(self, input):
        """Compute the softmax of ``input``, cache it in ``self.output`` and return it.

        Args:
            input: Array of scores for one sample.

        Returns:
            Array with the same shape as ``input`` whose elements sum to 1.
        """
        # Shifting by the maximum leaves the result mathematically unchanged
        # but stops np.exp from overflowing to inf (and the output from
        # turning into nan) for large scores. An empty input has no maximum,
        # so no shift is applied in that case.
        shift = np.max(input) if np.size(input) else 0.0
        tmp = np.exp(input - shift)
        self.output = tmp / np.sum(tmp)
        return self.output

    def backward(self, output_gradient, learning_rate=None):
        """Propagate a gradient through the softmax.

        Must be called after ``forward`` because it reads ``self.output``.

        Args:
            output_gradient: Gradient of the loss with respect to the
                softmax output; same number of elements as the output.
            learning_rate: Unused; accepted so the signature matches layers
                that update parameters during the backward pass.

        Returns:
            Gradient of the loss with respect to the input, with the same
            shape as ``output_gradient``.
        """
        # Work on explicit column vectors so that broadcasting builds the
        # true Jacobian regardless of whether the sample was stored as a
        # row, a column or a flat array.
        s = np.reshape(self.output, (-1, 1))
        n = s.size
        # J[i, j] = s_i * (delta_ij - s_j). This broadcasted form avoids the
        # np.tile construction of the reference formula:
        #   tmp = np.tile(s, n); J = tmp * (np.identity(n) - tmp.T)
        jacobian = (np.identity(n) - s.T) * s
        grad = np.dot(jacobian, np.reshape(output_gradient, (-1, 1)))
        return grad.reshape(np.shape(output_gradient))
    
# Example usage:
# Instantiate the Softmax class defined in this cell, so that the demo
# exercises this implementation rather than one from an earlier cell.
softmax_layer = Softmax()

# Forward pass
x = np.array([[1.0, 2.0, 3.0]])
output = softmax_layer.forward(x)
print("Softmax output:", output)

# Example gradient from the next layer
grad_output = np.array([[0.1, 0.2, 0.3]])

# Backward pass (this layer's backward() also takes a learning_rate
# argument, which it does not use)
grad_input = softmax_layer.backward(grad_output, learning_rate=None)
print("Softmax gradient:", grad_input)

Softmax output: [[0.09003057 0.24472847 0.66524096]]
Softmax gradient: [[-0.01418171 -0.01407704  0.02825875]]


In [23]:

# Softmax activation
class Activation_Softmax:
    """Softmax activation that stores its results on the instance.

    ``forward`` writes ``self.inputs`` and ``self.output``; ``backward``
    writes ``self.dinputs``. Neither method returns a value.
    """

    # Forward pass
    def forward(self, inputs, training):
        """Compute row-wise softmax probabilities and store them in ``self.output``.

        Args:
            inputs: 2-D array of scores; the softmax is taken along axis 1.
            training: Unused by this layer.
        """
        # Remember input values
        self.inputs = inputs

        # Get unnormalized probabilities; subtracting the row maximum
        # keeps np.exp from overflowing without changing the result
        exp_values = np.exp(inputs - np.max(inputs, axis=1,
                                            keepdims=True))
        # Normalize them for each sample
        probabilities = exp_values / np.sum(exp_values, axis=1,
                                            keepdims=True)

        self.output = probabilities

    # Backward pass
    def backward(self, dvalues):
        """Compute the gradient with respect to the inputs and store it in ``self.dinputs``.

        Must be called after ``forward`` because it reads ``self.output``.

        Args:
            dvalues: Gradient of the loss with respect to this layer's
                output, one row per sample.
        """
        dvalues = np.asarray(dvalues)

        # Create uninitialized array. The dtype is promoted with the
        # output's floating dtype: copying the dtype of dvalues would
        # truncate the gradient to integers whenever dvalues is an
        # integer array.
        self.dinputs = np.empty(
            dvalues.shape,
            dtype=np.result_type(self.output.dtype, dvalues.dtype))

        # Enumerate outputs and gradients
        for index, (single_output, single_dvalues) in \
                enumerate(zip(self.output, dvalues)):
            # Turn the sample's output into a column vector
            single_output = single_output.reshape(-1, 1)
            # Calculate Jacobian matrix of the output:
            # J[i, j] = s_i * (delta_ij - s_j)
            jacobian_matrix = np.diagflat(single_output) - \
                              np.dot(single_output, single_output.T)
            # Calculate sample-wise gradient
            # and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix,
                                         single_dvalues)

    # Calculate predictions for outputs
    def predictions(self, outputs):
        """Return the index of the largest value in each row of ``outputs``."""
        return np.argmax(outputs, axis=1)
    

# Example usage: this layer stores its results as attributes instead of
# returning them, so they are read back from the instance.
softmax_layer = Activation_Softmax()

# Inputs for the demo
x = np.array([[1.0, 2.0, 3.0]])
# Example gradient from the next layer
grad_output = np.array([[0.1, 0.2, 0.3]])

# Forward pass
softmax_layer.forward(x, None)
print("Softmax output:", softmax_layer.output)

# Backward pass
softmax_layer.backward(grad_output)
print("Softmax gradient:", softmax_layer.dinputs)

Softmax output: [[0.09003057 0.24472847 0.66524096]]
Softmax gradient: [[-0.01418171 -0.01407704  0.02825875]]


In [25]:
class Softmax:
    """Stateless row-wise softmax with an explicit-Jacobian backward pass.

    Both methods are static: nothing is cached, so ``backward`` recomputes
    the softmax outputs from the inputs it is given.
    """

    # Forward pass
    @staticmethod
    def forward(inputs):
        """Return the softmax of each row of ``inputs``.

        Args:
            inputs: 2-D array of scores; the softmax is taken along axis 1.

        Returns:
            Array of the same shape whose rows each sum to 1.
        """
        # get row max to subtract from each row
        # this is to prevent overflow
        # the max value in each row will be 0
        # the output will be exactly the same
        row_maxs = np.max(inputs, axis=1, keepdims=True)

        # Get unnormalized probabilities
        # exponentiate values
        exp_values = np.exp(inputs - row_maxs)

        # Get normalization bases for each sample
        norm_bases = np.sum(exp_values, axis=1, keepdims=True)

        # Normalize them for each sample
        probabilities = exp_values / norm_bases

        return probabilities


    # back propagation
    @staticmethod
    def backward(output_grads, inputs):
        """Return the gradient of the loss with respect to ``inputs``.

        Args:
            output_grads: Gradient of the loss with respect to the softmax
                output, one row per sample.
            inputs: The scores that were passed to ``forward``.

        Returns:
            Array with the shape of ``output_grads`` holding the gradient
            with respect to ``inputs``.
        """
        output_grads = np.asarray(output_grads)

        # Recompute the softmax outputs for these inputs
        outputs = Softmax.forward(inputs)

        # Create uninitialized array. The dtype is promoted with the
        # floating dtype of the outputs: copying the dtype of output_grads
        # would truncate the gradient to integers whenever output_grads is
        # an integer array.
        input_grads = np.empty(
            output_grads.shape,
            dtype=np.result_type(outputs.dtype, output_grads.dtype))

        # Loop over the samples
        for idx, (output_grad, output) in enumerate(zip(output_grads, outputs)):
            # Turn the sample's output into a column vector
            single_output = output.reshape(-1, 1)

            # Calculate Jacobian matrix of the output:
            # J[i, j] = s_i * (delta_ij - s_j)
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)

            # Calculate sample-wise gradient
            # and add it to the array of sample gradients
            input_grads[idx] = np.dot(jacobian_matrix, output_grad)

        return input_grads
    
# Example usage: backward() takes the original inputs as its second
# argument, so x is passed to both calls.
softmax_layer = Softmax()
x = np.array([[1.0, 2.0, 3.0]])

# Forward pass
output = softmax_layer.forward(x)
print("Softmax output:", output)

# Backward pass, using an example gradient from the next layer
grad_output = np.array([[0.1, 0.2, 0.3]])
grad_input = softmax_layer.backward(grad_output, x)
print("Softmax gradient:", grad_input)

Softmax output: [[0.09003057 0.24472847 0.66524096]]
Softmax gradient: [[-0.01418171 -0.01407704  0.02825875]]
