In [None]:
import numpy as np
import matplotlib.pyplot as plt

def spiral_data(samples, classes):
    """Generate a 2-D toy dataset with one noisy spiral arm per class.

    Args:
        samples: Number of points generated for every class.
        classes: Number of classes (spiral arms).

    Returns:
        Tuple ``(X, y)``: ``X`` has shape ``(samples*classes, 2)`` and holds
        the point coordinates, ``y`` has shape ``(samples*classes,)`` and
        dtype ``uint8`` and holds the class index of every point.
    """
    total = samples * classes
    X = np.zeros((total, 2))
    y = np.zeros(total, dtype='uint8')

    # The radius profile is the same for every arm: 0 at the centre, 1 at the tip.
    radius = np.linspace(0.0, 1, samples)

    for label in range(classes):
        first = samples * label
        rows = slice(first, first + samples)
        # Each arm sweeps its own angle interval; standard-normal noise
        # scaled by 0.2 jitters the points around the ideal curve.
        angle = np.linspace(label*4, (label+1)*4, samples) + np.random.randn(samples)*0.2
        X[rows] = np.c_[radius*np.sin(angle*2.5), radius*np.cos(angle*2.5)]
        y[rows] = label
    return X, y

# Build a three-class spiral dataset (100 points per class) and draw it,
# colouring every point by its class label.
X, y = spiral_data(samples=100, classes=3)

fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(
    X[:, 0], X[:, 1],
    c=y, cmap="brg", marker="o", s=500, alpha=0.6,
)
plt.show()

# Learning Rate

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def our_function(x):
    """Toy "loss" curve: a parabola with three sine waves added on top.

    Args:
        x: Scalar or NumPy array of positions.

    Returns:
        ``x**2 + 2x + sin(x) + 5 sin(2x) - 10 sin(x/3) + 5``, evaluated
        element-wise.
    """
    return (x**2 + 2*x) + np.sin(x) + 5*np.sin(x*2) - 10*np.sin(x/3) + 5

def d_f(x):
    """Derivative of the toy curve ``our_function`` with respect to ``x``.

    Args:
        x: Scalar or NumPy array of positions.

    Returns:
        ``2x + 2 + cos(x) + 10 cos(2x) - (10/3) cos(x/3)``, evaluated
        element-wise.
    """
    return 2*x + 2 + np.cos(x) + 10*np.cos(x*2) - (10/3) * np.cos(x/3)

# Sample the toy curve and its derivative at 100 points on [-5, 5].
x = np.linspace(-5, 5, num=100)
y, df = our_function(x), d_f(x)

fig, ax = plt.subplots(figsize=(12, 12))
ax.plot(x, y)
# Optional overlays - uncomment to also draw the derivative and a grid.
#ax.plot(x, df)
#ax.grid()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def our_function(x):
    """Evaluate the toy "loss" curve used by the gradient-descent demos.

    The curve is a parabola with three sine waves of different frequency
    added on top. Works element-wise on scalars and NumPy arrays.
    """
    return (
        (x**2 + 2*x)
        + np.sin(x)
        + 5*np.sin(x*2)
        - 10*np.sin(x/3)
        + 5
    )

def d_f(x):
    """Evaluate the derivative of ``our_function`` (the slope of the curve).

    Works element-wise on scalars and NumPy arrays.
    """
    return (
        2*x + 2
        + np.cos(x)
        + 10*np.cos(x*2)
        - (10/3) * np.cos(x/3)
    )

# Sample the curve (and its derivative) for the background plot.
x = np.linspace(-5, 5, num=100)
y, df = our_function(x), d_f(x)

# Starting point of the descent.
x1 = -4
y1 = our_function(x1)
print(f"X1: {x1:.2f} \t Y1: {y1:.2f}")

# One gradient-descent update: move against the slope, scaled by the
# learning rate.
learning_rate = 0.01
dx1 = d_f(x1)
new_x1 = x1 - learning_rate*dx1
new_y1 = our_function(new_x1)
print(f"X1: {new_x1:.2f} \t Y1: {new_y1:.2f}")

fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 24))
curve_ax, loss_ax = axs

# Top panel: the curve with the point before and after the update.
curve_ax.plot(x, y, alpha=0.4)
curve_ax.scatter([x1, new_x1], [y1, new_y1], color="orange", s=50)

# Bottom panel: the function value before (1) and after (2) the update.
loss_ax.plot([1, 2], [y1, new_y1])
loss_ax.set_ylabel("Loss")
plt.show()

In [None]:
def plot_learning(x_path, axs):
    """Draw a gradient-descent path on the curve plus the loss per step.

    Reads the notebook-level ``x`` and ``y`` arrays (the sampled curve) and
    ``our_function``.

    Args:
        x_path: NumPy array of visited x positions, in visiting order.
        axs: Indexable pair of matplotlib axes; ``axs[0]`` receives the
            curve and the path, ``axs[1]`` the loss values.
    """
    curve_ax, loss_ax = axs[0], axs[1]

    # Background: the "loss" function itself.
    curve_ax.plot(x, y, linewidth=3.5, alpha=0.4)

    # Every segment joins two consecutive positions; later segments are
    # drawn more opaque so the direction of travel is visible.
    fade = np.linspace(0.4, 1, len(x_path))
    for step, (x_from, x_to) in enumerate(zip(x_path[:-1], x_path[1:]), start=1):
        segment = np.array([x_from, x_to])
        curve_ax.plot(segment, our_function(segment), c="orange", marker="o", alpha=fade[step])

    loss_ax.plot(range(len(x_path)), our_function(x_path))
    loss_ax.set_ylabel("Loss")
        
# Curve samples; plot_learning reads these notebook-level arrays.
x = np.linspace(-5, 5, num=100)
y = our_function(x)

# Gradient-descent settings.
epochs = 10
learning_rate = 0.01

# Start from a fixed position and repeatedly step against the slope.
x_path = [-4.5]
#x_path = [np.random.randint(-5, 5)]
for _ in range(epochs):
    position = x_path[-1]
    x_path.append(position - learning_rate*d_f(position))

# Report the function value at every visited position.
x_path = np.array(x_path)
for visited_x in x_path:
    print(f"Loss: {our_function(visited_x)}")

fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 24))
plot_learning(x_path, axs)
plt.show()

In [None]:
def plot_learning(x_path, axs, show_n_xpath=10):
    """Draw the start of a gradient-descent path and the full loss history.

    Reads the notebook-level ``x`` and ``y`` arrays (the sampled curve) and
    ``our_function``.

    Args:
        x_path: NumPy array of visited x positions, in visiting order.
        axs: Indexable pair of matplotlib axes; ``axs[0]`` receives the
            curve and the path, ``axs[1]`` the loss values.
        show_n_xpath: Only the first ``show_n_xpath`` positions are drawn on
            the curve; the loss panel always shows every position.
    """
    curve_ax, loss_ax = axs[0], axs[1]

    # Background: the "loss" function itself.
    curve_ax.plot(x, y, linewidth=3.5, alpha=0.4)

    # Join consecutive positions with segments of increasing opacity.
    shown = x_path[:show_n_xpath]
    fade = np.linspace(0.4, 1, len(shown))
    for step, (x_from, x_to) in enumerate(zip(shown[:-1], shown[1:]), start=1):
        segment = np.array([x_from, x_to])
        curve_ax.plot(segment, our_function(segment), c="orange", marker="o", alpha=fade[step])

    loss_ax.plot(range(len(x_path)), our_function(x_path))
    loss_ax.set_ylabel("Loss")
        
# Curve samples; plot_learning reads these notebook-level arrays.
x = np.linspace(-5, 5, num=100)
y = our_function(x)

# Settings under study in this cell: more epochs and a learning rate of 1.
epochs = 25
learning_rate = 1

# Start from a fixed position and repeatedly step against the slope.
x_path = [-4.5]
#x_path = [np.random.randint(-5, 5)]
for _ in range(epochs):
    position = x_path[-1]
    x_path.append(position - learning_rate*d_f(position))

# Report the function value at every visited position.
x_path = np.array(x_path)
for visited_x in x_path:
    print(f"Loss: {our_function(visited_x)}")

fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 24))
plot_learning(x_path, axs)
plt.show()

In [None]:
def plot_learning(x_path, axs, show_n_xpath=10):
    """Plot a descent path over the curve (top) and its loss values (bottom).

    The curve itself comes from the notebook-level ``x`` and ``y`` arrays;
    function values along the path are computed with ``our_function``.

    Args:
        x_path: NumPy array of visited x positions, in visiting order.
        axs: Indexable pair of matplotlib axes (``axs[0]`` curve and path,
            ``axs[1]`` loss values).
        show_n_xpath: Number of leading positions drawn on the curve. The
            loss panel is not truncated.
    """
    top, bottom = axs[0], axs[1]

    # The "loss" function as a faint thick line.
    top.plot(x, y, linewidth=3.5, alpha=0.4)

    # One line per pair of neighbouring positions; opacity grows with the
    # step number.
    visible = x_path[:show_n_xpath]
    opacities = np.linspace(0.4, 1, len(visible))
    for end in range(1, len(opacities)):
        pair = np.array([visible[end-1], visible[end]])
        top.plot(pair, our_function(pair), c="orange", marker="o", alpha=opacities[end])

    losses_along_path = our_function(x_path)
    bottom.plot(range(len(x_path)), losses_along_path)
    bottom.set_ylabel("Loss")
        
# Curve samples; plot_learning reads these notebook-level arrays.
x = np.linspace(-5, 5, num=100)
y = our_function(x)

# Settings under study in this cell: 10 epochs with a learning rate of 0.19.
epochs = 10
learning_rate = 0.19

# Start from a fixed position and repeatedly step against the slope.
x_path = [-4.5]
#x_path = [np.random.randint(-5, 5)]
for _ in range(epochs):
    position = x_path[-1]
    x_path.append(position - learning_rate*d_f(position))

# Report the function value at every visited position.
x_path = np.array(x_path)
for visited_x in x_path:
    print(f"Loss: {our_function(visited_x)}")

fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 24))
plot_learning(x_path, axs)
plt.show()

# Learning Rate Decay

Splošna enačba za learning rate decay je:

$\Large l_r = l_{r start} * \frac{1}{1 + decay * step} $

* $l_r$ - new learning rate
* $l_{r start}$ - starting learning rate
* $decay$ - learning rate decay
* $step$ - current step we are on

In [None]:
def calc_lr(step):
    """Return the decayed learning rate for ``step``.

    Uses the notebook-level ``starting_learning_rate`` and
    ``learning_rate_decay``: ``start * 1 / (1 + decay * step)``.
    ``step`` may be a scalar or a NumPy array.
    """
    decay_factor = 1. / (1 + learning_rate_decay * step)
    return starting_learning_rate * decay_factor

# Settings read by calc_lr.
starting_learning_rate = 1.
learning_rate_decay = 0.1

# 100 evenly spaced step values between 0 and 100, and the learning rate
# that calc_lr yields at each of them.
steps = np.linspace(0, 100, num=100)
learning_rate = calc_lr(steps)

plt.plot(steps, learning_rate)
plt.show()

In [None]:
# <=== HERE ===>
def calc_lr(step):
    """Learning rate after ``step`` updates, using inverse-time decay.

    Reads ``starting_learning_rate`` and ``learning_rate_decay`` from the
    notebook namespace at call time.
    """
    shrink = 1. / (1 + learning_rate_decay * step)
    return starting_learning_rate * shrink

# Settings read by calc_lr.
starting_learning_rate = 0.19
learning_rate_decay = 0.1

# Curve samples; plot_learning reads these notebook-level arrays.
x = np.linspace(-5, 5, num=100)
y = our_function(x)

epochs = 10

# Gradient descent where the step size shrinks with the epoch number:
# the fixed learning rate is replaced by calc_lr(step).
x_path = [-4.5]
for step in range(epochs):
    position = x_path[-1]
    x_path.append(position - calc_lr(step)*d_f(position))

# Report the function value at every visited position.
x_path = np.array(x_path)
for visited_x in x_path:
    print(f"Loss: {our_function(visited_x)}")

fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 24))
plot_learning(x_path, axs)
plt.show()

In [None]:
def calc_lr(step):
    """Compute ``starting_learning_rate / (1 + learning_rate_decay * step)``.

    Both settings are notebook-level variables looked up on every call; the
    result is formed as ``start * (1 / (1 + decay * step))``.
    """
    denominator = 1 + learning_rate_decay * step
    return starting_learning_rate * (1. / denominator)

# Settings read by calc_lr.
starting_learning_rate = 0.19
learning_rate_decay = 0.1

# Curve samples; plot_learning reads these notebook-level arrays.
x = np.linspace(-5, 5, num=100)
y = our_function(x)

# Setting under study in this cell: a longer run of 50 epochs.
epochs = 50

# Gradient descent with a step size that shrinks with the epoch number.
x_path = [-4.5]
for step in range(epochs):
    position = x_path[-1]
    x_path.append(position - calc_lr(step)*d_f(position))

# Report the function value at every visited position.
x_path = np.array(x_path)
for visited_x in x_path:
    print(f"Loss: {our_function(visited_x)}")

fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 24))
plot_learning(x_path, axs)
plt.show()

In [None]:
# SGD optimizer
class Optimizer_SGD:
    """Plain stochastic gradient descent with optional learning-rate decay.

    Per training step, call ``pre_update_params()`` once, ``update_params()``
    for every layer, then ``post_update_params()`` once.
    """

    def __init__(self, learning_rate=1., decay=0.):
        """Store the settings.

        Args:
            learning_rate: Initial learning rate (defaults to 1.).
            decay: Decay rate; a falsy value keeps the learning rate fixed.
        """
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0

    def pre_update_params(self):
        """Refresh ``current_learning_rate`` from the iteration counter."""
        if not self.decay:
            return
        self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):
        """Move ``layer.weights`` / ``layer.biases`` against their gradients.

        The layer must expose ``dweights`` and ``dbiases``; the parameters
        are updated in place.
        """
        layer.weights -= self.current_learning_rate * layer.dweights
        layer.biases -= self.current_learning_rate * layer.dbiases

    def post_update_params(self):
        """Count one finished update step."""
        self.iterations += 1


In [None]:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(2020)

# Dense layer
class Layer_Dense:
    """Fully connected layer: ``output = inputs . weights + biases``."""

    def __init__(self, n_inputs, n_neurons):
        """Create the parameters.

        Weights are drawn from a standard normal distribution and scaled by
        0.01 (shape ``(n_inputs, n_neurons)``); biases start at zero (shape
        ``(1, n_neurons)``).
        """
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Compute ``self.output`` and remember ``inputs`` for the backward pass."""
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    def backward(self, dvalues):
        """Back-propagate ``dvalues`` (gradient with respect to the output).

        Sets ``dinputs`` (gradient passed to the previous layer) and
        ``dweights`` / ``dbiases`` (gradients of the parameters).
        """
        # Gradient flowing back to whatever produced the inputs.
        self.dinputs = np.dot(dvalues, self.weights.T)
        # Gradients of the parameters; biases sum over the batch axis.
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)


# ReLU activation
class Activation_ReLU:
    """Rectified linear activation: ``max(0, x)`` element-wise."""

    def forward(self, inputs):
        """Store ``inputs`` and set ``self.output`` to their positive part."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Pass ``dvalues`` through where the input was positive.

        The gradient is zeroed wherever the stored input was ``<= 0``.
        ``dvalues`` itself is left untouched; the result goes to
        ``self.dinputs``.
        """
        gradient = dvalues.copy()
        gradient[self.inputs <= 0] = 0
        self.dinputs = gradient


# Softmax activation
class Activation_Softmax:
    """Softmax activation: turns every row of scores into probabilities."""

    def forward(self, inputs):
        """Compute row-wise softmax of ``inputs`` into ``self.output``."""
        self.inputs = inputs

        # Subtract the row maximum first, so the largest exponent in every
        # row is exp(0) = 1.
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exp_values = np.exp(shifted)

        # Normalise every row so it sums to 1.
        row_totals = np.sum(exp_values, axis=1, keepdims=True)
        self.output = exp_values / row_totals

    def backward(self, dvalues):
        """Back-propagate ``dvalues`` through the softmax, sample by sample.

        For every sample the Jacobian ``diag(s) - s s^T`` of its softmax
        output ``s`` is built and multiplied with that sample's gradient.
        The result is stored in ``self.dinputs``.
        """
        self.dinputs = np.empty_like(dvalues)

        for row, (sample_output, sample_dvalues) in enumerate(zip(self.output, dvalues)):
            # Column vector of this sample's probabilities.
            column = sample_output.reshape(-1, 1)
            jacobian = np.diagflat(column) - np.dot(column, column.T)
            self.dinputs[row] = np.dot(jacobian, sample_dvalues)


# Common loss class
class Loss:
    """Base class of the loss functions.

    Subclasses implement ``forward(output, y)`` returning one loss value per
    sample.
    """

    def calculate(self, output, y):
        """Return the mean of the per-sample losses.

        Args:
            output: Model output handed to ``forward``.
            y: Ground-truth values handed to ``forward``.
        """
        sample_losses = self.forward(output, y)
        data_loss = np.mean(sample_losses)
        return data_loss


# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy for sparse (1-D) or one-hot (2-D) labels."""

    # Smallest probability used in logs and divisions.
    _EPSILON = 1e-7

    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of every sample.

        Args:
            y_pred: Predicted probabilities, one row per sample.
            y_true: Either a 1-D array of class indices or a 2-D one-hot
                array with the same shape as ``y_pred``.

        Raises:
            ValueError: If ``y_true`` is neither 1-D nor 2-D.
        """
        samples = len(y_pred)

        # Clip both sides so log(0) cannot occur and the mean is not
        # dragged towards either end.
        y_pred_clipped = np.clip(y_pred, self._EPSILON, 1 - self._EPSILON)

        label_dims = len(y_true.shape)
        if label_dims == 1:
            # Sparse labels: pick the probability of the target class.
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif label_dims == 2:
            # One-hot labels: mask everything except the target class.
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            # Previously this fell through to an UnboundLocalError.
            raise ValueError(
                "y_true must be 1-D (class indices) or 2-D (one-hot), "
                f"got {label_dims} dimensions"
            )

        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Compute the gradient of the loss into ``self.dinputs``.

        Args:
            dvalues: Predicted probabilities, one row per sample.
            y_true: 1-D class indices or 2-D one-hot labels.
        """
        samples = len(dvalues)
        # Number of labels, counted on the first sample.
        labels = len(dvalues[0])

        # Sparse labels are expanded to one-hot rows.
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Guard the division: a probability of exactly 0 would otherwise
        # give -0/0 = nan for non-target classes and -inf for the target.
        safe_values = np.clip(dvalues, self._EPSILON, None)

        # Gradient, normalised by the number of samples.
        self.dinputs = -y_true / safe_values
        self.dinputs = self.dinputs / samples


# Softmax classifier - combined Softmax activation
# and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy:
    """Softmax activation and cross-entropy loss combined into one step.

    The combined backward pass reduces to ``probabilities - one_hot``.
    """

    def __init__(self):
        """Create the wrapped activation and loss objects."""
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    def forward(self, inputs, y_true):
        """Run softmax on ``inputs``, keep the probabilities in
        ``self.output`` and return the mean loss against ``y_true``."""
        softmax = self.activation
        softmax.forward(inputs)
        self.output = softmax.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Compute the gradient with respect to the softmax inputs.

        Args:
            dvalues: Softmax probabilities, one row per sample (not modified).
            y_true: 1-D class indices or 2-D one-hot labels.
        """
        batch_size = len(dvalues)

        # One-hot labels are reduced to class indices.
        targets = np.argmax(y_true, axis=1) if len(y_true.shape) == 2 else y_true

        # Subtract 1 at the target class of every sample, then average
        # over the batch.
        gradient = dvalues.copy()
        gradient[range(batch_size), targets] -= 1
        self.dinputs = gradient / batch_size


        
# SGD optimizer
class Optimizer_SGD:
    """Stochastic gradient descent optimizer with inverse-time decay.

    Call order per training step: ``pre_update_params()``, then
    ``update_params(layer)`` for each layer, then ``post_update_params()``.
    """

    def __init__(self, learning_rate=1., decay=0.):
        """Remember the initial learning rate (default 1.) and decay rate."""
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0

    def pre_update_params(self):
        """Recompute the effective learning rate when decay is enabled."""
        if self.decay:
            decay_factor = 1. / (1. + self.decay * self.iterations)
            self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Apply one in-place descent step to the layer's weights and biases."""
        step_size = self.current_learning_rate
        layer.weights += -step_size * layer.dweights
        layer.biases += -step_size * layer.dbiases

    def post_update_params(self):
        """Advance the iteration counter used by the decay formula."""
        self.iterations += 1


# Create dataset
X, y = spiral_data(samples=100, classes=3)

# Create Dense layer with 2 input features and 128 output values
dense1 = Layer_Dense(2, 128)

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 128 input features (as we take output
# of previous layer here) and 3 output values (one per class)
dense2 = Layer_Dense(128, 3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Create optimizer (initial learning rate 1, decay rate 1e-4)
optimizer = Optimizer_SGD(learning_rate=1, decay=1e-4)

# Train in loop
losses = [] # Used to plot loss values and see how our model learned
for epoch in range(10_001):

    # Perform a forward pass of our training data through the first layer
    dense1.forward(X)

    # Perform a forward pass through activation function
    # takes the output of first dense layer here
    activation1.forward(dense1.output)

    # Perform a forward pass through second Dense layer
    # takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)

    # Perform a forward pass through the activation/loss function
    # takes the output of second dense layer here and returns loss
    loss = loss_activation.forward(dense2.output, y)

    # Calculate accuracy from the softmax output and the targets:
    # the predicted class is the index of the largest probability in a row
    predictions = np.argmax(loss_activation.output, axis=1)
    # One-hot targets are reduced to class indices before comparing
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    # Every 100th epoch: record the loss for the plot and print progress
    if not epoch % 100:
        losses.append(loss)
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f}, ' +
              f'lr: {optimizer.current_learning_rate}')
    # Backward pass, from the loss back to the first layer
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

    
# Top panel: large markers show the true class, small black-edged markers
# (drawn on a twin axes created with twinx) show the prediction from the
# last epoch
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 24))
axs[0].scatter(X[:, 0], X[:, 1], c=y, cmap="brg", marker="o", s=500, alpha=0.6)
ax2 = axs[0].twinx()
ax2.scatter(X[:, 0], X[:, 1], c=predictions, cmap="brg", marker="o", s=100, edgecolors="black")

# Bottom panel: the loss recorded every 100 epochs
axs[1].plot(range(len(losses)), losses)
axs[1].set_xlabel("100 * Epoch")
axs[1].set_ylabel("Loss")
plt.show()

In [None]:
# Validate the model

# Create test dataset (a fresh draw from the same generator)
X_test, y_test = spiral_data(samples=100, classes=3)





# Perform a forward pass of the test data through the first layer
# (no backward pass or parameter update happens in this cell)
dense1.forward(X_test)

# Perform a forward pass through activation function
# takes the output of first dense layer here
activation1.forward(dense1.output)

# Perform a forward pass through second Dense layer
# takes outputs of activation function of first layer as inputs
dense2.forward(activation1.output)

# Perform a forward pass through the activation/loss function
# takes the output of second dense layer here and returns loss
loss = loss_activation.forward(dense2.output, y_test)

# Calculate accuracy from the softmax output and the test targets:
# the predicted class is the index of the largest probability in a row
predictions = np.argmax(loss_activation.output, axis=1)
# One-hot targets are reduced to class indices before comparing
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions==y_test)

print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')

# Large markers show the true class, small black-edged markers (drawn on a
# twin axes created with twinx) show the predicted class
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap="brg", marker="o", s=500, alpha=0.6)
ax2 = ax.twinx()
ax2.scatter(X_test[:, 0], X_test[:, 1], c=predictions, cmap="brg", marker="o", s=100, edgecolors="black")
plt.show()

![Loss krivulje](images/loss_krivulje.png)

---

# L1 and L2 Regularization

**L2 Regularization penalty for weights**

$\Large L_{2w} = \lambda \sum_m w_m^2 $

**L2 Regularization penalty for biases**

$\Large L_{2b} = \lambda \sum_n b_n^2 $

**L1 Regularization penalty for weights**

$\Large L_{1w} = \lambda \sum_m |w_m| $

**L1 Regularization penalty for biases**

$\Large L_{1b} = \lambda \sum_n |b_n| $

$\Large total\_loss = loss + L_{1w} + L_{1b} + L_{2w} + L_{2b} $

In [None]:
# Layer initialization
def __init__(self, n_inputs, n_neurons,
             weight_regularizer_l1=0, weight_regularizer_l2=0,
             bias_regularizer_l1=0, bias_regularizer_l2=0):
    """Initialise a dense layer that carries regularization settings.

    Args:
        n_inputs: Number of input features.
        n_neurons: Number of neurons (outputs).
        weight_regularizer_l1: L1 strength (lambda) for the weights.
        weight_regularizer_l2: L2 strength (lambda) for the weights.
        bias_regularizer_l1: L1 strength (lambda) for the biases.
        bias_regularizer_l2: L2 strength (lambda) for the biases.
    """
    # Regularization strengths; the default 0 leaves a term switched off.
    self.weight_regularizer_l1 = weight_regularizer_l1
    self.weight_regularizer_l2 = weight_regularizer_l2
    self.bias_regularizer_l1 = bias_regularizer_l1
    self.bias_regularizer_l2 = bias_regularizer_l2

    # Parameters: scaled standard-normal weights, zero biases.
    self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
    self.biases = np.zeros((1, n_neurons))

In [None]:
# Regularization loss calculation
def regularization_loss(self, layer):
    """Return the regularization penalty contributed by ``layer``.

    Each of the four terms is added only when its strength is greater
    than 0:

    * L1 on weights: ``lambda * sum(|w|)``
    * L2 on weights: ``lambda * sum(w * w)``
    * L1 on biases:  ``lambda * sum(|b|)``
    * L2 on biases:  ``lambda * sum(b * b)``

    Returns 0 when every strength is 0.
    """
    penalty = 0

    l1_weights = layer.weight_regularizer_l1
    if l1_weights > 0:
        penalty += l1_weights * np.sum(np.abs(layer.weights))

    l2_weights = layer.weight_regularizer_l2
    if l2_weights > 0:
        penalty += l2_weights * np.sum(layer.weights * layer.weights)

    l1_biases = layer.bias_regularizer_l1
    if l1_biases > 0:
        penalty += l1_biases * np.sum(np.abs(layer.biases))

    l2_biases = layer.bias_regularizer_l2
    if l2_biases > 0:
        penalty += l2_biases * np.sum(layer.biases * layer.biases)

    return penalty

In [None]:
# NOTE(review): illustrative snippet - `loss_function` and `activation2` are
# not created in the cells shown above; define them before running this cell.

# Calculate the mean data loss from the output of activation2 (the softmax
# activation). `calculate` averages the per-sample losses that `forward`
# returns, so the total below is a single number rather than an array.
data_loss = loss_function.calculate(activation2.output, y)

# Calculate regularization penalty (sum over both dense layers)
regularization_loss = loss_function.regularization_loss(dense1) + \
                      loss_function.regularization_loss(dense2)

# Calculate overall loss
loss = data_loss + regularization_loss

---

**L2 odvod uteži**

$\Large \frac{\partial L_{2w}}{\partial w_m} = \frac{\partial }{\partial w_m}( \lambda \sum_{m}w_m^2) = \lambda \frac{\partial }{\partial w_m}w_m^2 = \lambda 2 w_m$

**L2 odvod bias**

$\Large \frac{\partial L_{2b}}{\partial b_n} = \frac{\partial }{\partial b_n}( \lambda \sum_{n}b_n^2) = \lambda \frac{\partial }{\partial b_n}b_n^2 = \lambda 2 b_n$

$\Large 
|x| = 
\left\{
	\begin{array}{ll}
		x  & x > 0 \\
        -x  & x < 0
	\end{array}
\right.
$

Odvod absolutne funkcije je:

$\Large 
|x|' = 
\left\{
	\begin{array}{ll}
		1  & x > 0 \\
        -1  & x < 0
	\end{array}
\right.
$

**L1 odvod uteži**

$\Large \frac{\partial L_{1w}}{\partial w_m} = \frac{\partial }{\partial w_m}( \lambda \sum_m {|w_m|}) = \lambda \frac{\partial }{\partial w_m} |w_m| = \lambda \left\{
	\begin{array}{ll}
		1  & w_m > 0 \\
        -1  & w_m < 0
	\end{array}
\right.$

**L1 odvod bias**

$\Large \frac{\partial L_{1b}}{\partial b_n} = \frac{\partial }{\partial b_n}( \lambda \sum_n {|b_n|}) = \lambda \frac{\partial }{\partial b_n} |b_n| = \lambda \left\{
	\begin{array}{ll}
		1  & b_n > 0 \\
        -1  & b_n < 0
	\end{array}
\right.$

In [None]:
weights = [0.2, 0.8, -0.5]
weight_regularizer_l1 = 0.1 # lambda

# Partial derivatives of the L1 penalty: +lambda for a non-negative weight,
# -lambda for a negative one.
dL1 = [
    weight_regularizer_l1 * (1 if weight >= 0 else -1)
    for weight in weights
]
print(dL1)

In [None]:
# Three sets of weights, one row per neuron.
weights = [[0.2, 0.8, -0.5, 1],
           [0.5, -0.91, 0.26, -0.5],
           [-0.26, -0.27, 0.17, 0.87]]
weight_regularizer_l1 = 0.1 # lambda

# Partial derivatives of the L1 penalty, kept in the same row layout:
# +lambda for a non-negative weight, -lambda for a negative one.
dL1 = [
    [weight_regularizer_l1 * (1 if weight >= 0 else -1) for weight in neuron]
    for neuron in weights
]

for neuron_dL1 in dL1:
    print(neuron_dL1)

In [None]:
import numpy as np

weights = np.array([[0.2, 0.8, -0.5, 1],
                    [0.5, -0.91, 0.26, -0.5],
                    [-0.26, -0.27, 0.17, 0.87]])
weight_regularizer_l1 = 0.1 # lambda

# Vectorised L1 gradient: a matrix of -1 (negative weight) and +1
# (otherwise), scaled by lambda.
dL1 = np.where(weights < 0, -1.0, 1.0) * weight_regularizer_l1

print(dL1)

---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(2020)

class Layer_Dense:
    """Fully connected layer with optional L1/L2 regularization.

    The regularization strengths only affect the gradients computed in
    ``backward``; a strength of 0 (the default) switches a term off.
    """

    def __init__(self, n_inputs, n_neurons,
                 weight_regularizer_l1=0, weight_regularizer_l2=0,
                 bias_regularizer_l1=0, bias_regularizer_l2=0):
        """Create the parameters and store the regularization strengths.

        Weights are standard-normal draws scaled by 0.01 with shape
        ``(n_inputs, n_neurons)``; biases are zeros of shape
        ``(1, n_neurons)``.
        """
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2

    def forward(self, inputs):
        """Compute ``self.output`` and remember ``inputs`` for the backward pass."""
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    @staticmethod
    def _l1_direction(params):
        """Return an array like ``params`` holding -1 where it is negative, else 1."""
        direction = np.ones_like(params)
        direction[params < 0] = -1
        return direction

    def backward(self, dvalues):
        """Back-propagate ``dvalues`` and add the regularization gradients.

        Sets ``dweights``, ``dbiases`` and ``dinputs``.
        """
        # Gradients of the data loss with respect to the parameters.
        dweights = np.dot(self.inputs.T, dvalues)
        dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # Weight penalties: L1 adds lambda * sign, L2 adds 2 * lambda * w.
        if self.weight_regularizer_l1 > 0:
            dweights += self.weight_regularizer_l1 * self._l1_direction(self.weights)
        if self.weight_regularizer_l2 > 0:
            dweights += 2 * self.weight_regularizer_l2 * self.weights

        # Bias penalties, same two forms.
        if self.bias_regularizer_l1 > 0:
            dbiases += self.bias_regularizer_l1 * self._l1_direction(self.biases)
        if self.bias_regularizer_l2 > 0:
            dbiases += 2 * self.bias_regularizer_l2 * self.biases

        self.dweights = dweights
        self.dbiases = dbiases

        # Gradient passed on to the previous layer.
        self.dinputs = np.dot(dvalues, self.weights.T)



# ReLU activation
class Activation_ReLU:
    """ReLU activation: keeps positive values and replaces the rest with 0."""

    def forward(self, inputs):
        """Remember ``inputs`` and store ``max(0, inputs)`` in ``self.output``."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Store in ``self.dinputs`` a copy of ``dvalues`` with blocked entries zeroed.

        An entry is blocked when the corresponding stored input was ``<= 0``.
        The copy keeps the caller's ``dvalues`` array unchanged.
        """
        blocked = self.inputs <= 0
        passed_on = dvalues.copy()
        passed_on[blocked] = 0
        self.dinputs = passed_on


# Softmax activation
class Activation_Softmax:
    """Softmax activation applied independently to every row of the input."""

    def forward(self, inputs):
        """Store the row-wise softmax of ``inputs`` in ``self.output``."""
        self.inputs = inputs

        # Unnormalised probabilities; the row maximum is subtracted first,
        # so no exponent exceeds 0.
        row_max = np.max(inputs, axis=1, keepdims=True)
        exp_values = np.exp(inputs - row_max)

        # Divide every row by its sum.
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        self.output = probabilities

    def backward(self, dvalues):
        """Store the gradient with respect to the softmax inputs in ``self.dinputs``.

        Samples are handled one at a time: the Jacobian of a sample's
        output ``s`` is ``diag(s) - s s^T``, and it is multiplied with the
        gradient arriving for that sample.
        """
        self.dinputs = np.empty_like(dvalues)

        paired = zip(self.output, dvalues)
        for index, (single_output, single_dvalues) in enumerate(paired):
            probs = single_output.reshape(-1, 1)
            jacobian_matrix = np.diagflat(probs) - np.dot(probs, probs.T)
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)


# Common loss class
class Loss:
    """Base class of the loss functions.

    Subclasses implement ``forward(output, y)`` returning one loss value per
    sample.
    """

    def regularization_loss(self, layer):
        """Return the regularization penalty contributed by ``layer``.

        Each L1/L2 term on the weights and biases is added only when its
        strength on the layer is greater than 0; the result is 0 when all
        strengths are 0.
        """
        regularization_loss = 0

        # L1 regularization - weights
        if layer.weight_regularizer_l1 > 0:
            regularization_loss += layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights))

        # L2 regularization - weights
        if layer.weight_regularizer_l2 > 0:
            regularization_loss += layer.weight_regularizer_l2 * np.sum(layer.weights * layer.weights)

        # L1 regularization - biases
        if layer.bias_regularizer_l1 > 0:
            regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.biases))

        # L2 regularization - biases
        if layer.bias_regularizer_l2 > 0:
            regularization_loss += layer.bias_regularizer_l2 * np.sum(layer.biases * layer.biases)

        return regularization_loss

    def calculate(self, output, y):
        """Return the mean of the per-sample data losses.

        The regularization penalty is not included; obtain it separately
        from ``regularization_loss``.

        Args:
            output: Model output handed to ``forward``.
            y: Ground-truth values handed to ``forward``.
        """
        sample_losses = self.forward(output, y)
        data_loss = np.mean(sample_losses)
        return data_loss


# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy for sparse (1-D) or one-hot (2-D) labels."""

    # Smallest probability used in logs and divisions.
    _EPSILON = 1e-7

    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of every sample.

        Args:
            y_pred: Predicted probabilities, one row per sample.
            y_true: Either a 1-D array of class indices or a 2-D one-hot
                array with the same shape as ``y_pred``.

        Raises:
            ValueError: If ``y_true`` is neither 1-D nor 2-D.
        """
        samples = len(y_pred)

        # Clip both sides so log(0) cannot occur and the mean is not
        # dragged towards either end.
        y_pred_clipped = np.clip(y_pred, self._EPSILON, 1 - self._EPSILON)

        label_dims = len(y_true.shape)
        if label_dims == 1:
            # Sparse labels: pick the probability of the target class.
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif label_dims == 2:
            # One-hot labels: mask everything except the target class.
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            # Previously this fell through to an UnboundLocalError.
            raise ValueError(
                "y_true must be 1-D (class indices) or 2-D (one-hot), "
                f"got {label_dims} dimensions"
            )

        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Compute the gradient of the loss into ``self.dinputs``.

        Args:
            dvalues: Predicted probabilities, one row per sample.
            y_true: 1-D class indices or 2-D one-hot labels.
        """
        samples = len(dvalues)
        # Number of labels, counted on the first sample.
        labels = len(dvalues[0])

        # Sparse labels are expanded to one-hot rows.
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Guard the division: a probability of exactly 0 would otherwise
        # give -0/0 = nan for non-target classes and -inf for the target.
        safe_values = np.clip(dvalues, self._EPSILON, None)

        # Gradient, normalised by the number of samples.
        self.dinputs = -y_true / safe_values
        self.dinputs = self.dinputs / samples


# Softmax classifier - combined Softmax activation
# and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy:
    """Softmax classifier head: softmax activation plus cross-entropy loss.

    Combining the two gives a backward step that is simply the predicted
    probabilities minus the one-hot targets, averaged over the batch.
    """

    def __init__(self):
        """Instantiate the softmax activation and the cross-entropy loss."""
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    def forward(self, inputs, y_true):
        """Return the mean loss of ``softmax(inputs)`` against ``y_true``.

        The probabilities are kept in ``self.output``.
        """
        self.activation.forward(inputs)
        probabilities = self.activation.output
        self.output = probabilities
        return self.loss.calculate(probabilities, y_true)

    def backward(self, dvalues, y_true):
        """Store the gradient with respect to the softmax inputs in ``self.dinputs``.

        Args:
            dvalues: Softmax probabilities, one row per sample; the array is
                copied, not modified.
            y_true: 1-D class indices or 2-D one-hot labels.
        """
        samples = len(dvalues)

        # Work with class indices; one-hot rows are converted.
        if len(y_true.shape) == 2:
            class_ids = np.argmax(y_true, axis=1)
        else:
            class_ids = y_true

        grads = dvalues.copy()
        # probabilities - one_hot: only the target column changes.
        grads[range(samples), class_ids] -= 1
        # Average over the batch.
        self.dinputs = grads / samples


        
# SGD optimizer
class Optimizer_SGD:
    """Plain stochastic gradient descent with optional learning-rate decay.

    Intended call order per training step: ``pre_update_params()``, then
    ``update_params(layer)`` for every layer, then ``post_update_params()``.
    """

    def __init__(self, learning_rate=1., decay=0.):
        """Store the settings; the learning rate defaults to 1."""
        self.iterations = 0
        self.decay = decay
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate

    def pre_update_params(self):
        """Refresh the current learning rate; a no-op when decay is falsy."""
        if not self.decay:
            return
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Move the layer's weights and biases, in place, against their gradients."""
        step = -self.current_learning_rate
        layer.weights += step * layer.dweights
        layer.biases += step * layer.dbiases

    def post_update_params(self):
        """Count one finished update step."""
        self.iterations += 1


# Create dataset
X, y = spiral_data(samples=100, classes=3)

# Create Dense layer with 2 input features and 128 output values
dense1 = Layer_Dense(2, 128)

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 128 input features (as we take output
# of previous layer here) and 3 output values (one per class)
dense2 = Layer_Dense(128, 3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Create optimizer
optimizer = Optimizer_SGD(learning_rate=1, decay=1e-4)

# Train in loop
losses = [] # Loss values sampled every 100 epochs, plotted after training
for epoch in range(10_001):

    # Perform a forward pass of our training data through the first Dense layer
    dense1.forward(X)

    # Perform a forward pass through activation function
    # takes the output of first dense layer here
    activation1.forward(dense1.output)

    # Perform a forward pass through second Dense layer
    # takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)
    
    
    # Perform a forward pass through the activation/loss function
    # takes the output of second dense layer here and returns loss
    data_loss = loss_activation.forward(dense2.output, y)

    # Calculate regularization penalty
    regularization_loss = \
        loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2)

    # Calculate overall loss
    loss = data_loss + regularization_loss

    # Calculate accuracy from the softmax output (loss_activation.output)
    # and targets; argmax along axis 1 picks the predicted class per sample
    predictions = np.argmax(loss_activation.output, axis=1)
    # If targets are one-hot encoded, convert them to class indices
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    # Report progress every 100 epochs
    if not epoch % 100:
        # Record the data loss (without the regularization penalty) for the plot
        losses.append(data_loss)
        # NOTE: the printed lr is the rate used by the previous update, because
        # pre_update_params() only runs further down in this iteration
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f}, ' +
              f'lr: {optimizer.current_learning_rate}')
    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

    
# Top panel: true labels (large, translucent markers) with the predicted
# labels (small, black-edged markers) drawn on a twin axis over them.
# `predictions` come from the last forward pass, i.e. before the final update.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 24))
axs[0].scatter(X[:, 0], X[:, 1], c=y, cmap="brg", marker="o", s=500, alpha=0.6)
ax2 = axs[0].twinx()
ax2.scatter(X[:, 0], X[:, 1], c=predictions, cmap="brg", marker="o", s=100, edgecolors="black")

# Bottom panel: recorded loss values, one point per 100 epochs
axs[1].plot(range(len(losses)), losses)
axs[1].set_xlabel("100 * Epoch")
axs[1].set_ylabel("Loss")
plt.show()

In [None]:
# Validate the model

# Create test dataset
X_test, y_test = spiral_data(samples=100, classes=3)

# Perform a forward pass of our test data through the first Dense layer
dense1.forward(X_test)

# Perform a forward pass through activation function
# takes the output of first dense layer here
activation1.forward(dense1.output)

# Perform a forward pass through second Dense layer
# takes outputs of activation function of first layer as inputs
dense2.forward(activation1.output)

# Perform a forward pass through the activation/loss function
# takes the output of second dense layer here and returns loss
# (no regularization penalty is added here, only the data loss)
loss = loss_activation.forward(dense2.output, y_test)

# Calculate accuracy from the softmax output (loss_activation.output)
# and targets; argmax along axis 1 picks the predicted class per sample
predictions = np.argmax(loss_activation.output, axis=1)
# If targets are one-hot encoded, convert them to class indices
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions==y_test)

print(f'acc: {accuracy:.3f}, loss: {loss:.3f}')

# True test labels (large, translucent markers) with the predicted labels
# (small, black-edged markers) drawn on a twin axis over them
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap="brg", marker="o", s=500, alpha=0.6)
ax2 = ax.twinx()
ax2.scatter(X_test[:, 0], X_test[:, 1], c=predictions, cmap="brg", marker="o", s=100, edgecolors="black")
plt.show()

---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(2020)

class Layer_Dense:
    """Fully connected layer with optional L1/L2 penalties on its parameters.

    ``forward`` stores ``inputs`` and ``output``; ``backward`` stores
    ``dweights``, ``dbiases`` and ``dinputs``.
    """

    def __init__(self, n_inputs, n_neurons,
                 weight_regularizer_l1=0,
                 weight_regularizer_l2=0,
                 bias_regularizer_l1=0,
                 bias_regularizer_l2=0):
        """Create the parameters and remember the regularization strengths.

        Weights are small Gaussian values of shape ``(n_inputs, n_neurons)``
        drawn from NumPy's global random state; biases start as a
        ``(1, n_neurons)`` row of zeros.
        """
        # Regularization strengths (0 disables the corresponding penalty)
        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2
        # Trainable parameters
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Compute ``inputs . weights + biases`` and keep the inputs for backward."""
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    @staticmethod
    def _l1_slope(params):
        """Return the L1 derivative used here: -1 for negative entries, else +1."""
        slope = np.ones_like(params)
        slope[params < 0] = -1
        return slope

    def backward(self, dvalues):
        """Compute gradients for weights, biases and inputs from ``dvalues``."""
        # Gradients of the data term
        grad_weights = np.dot(self.inputs.T, dvalues)
        grad_biases = np.sum(dvalues, axis=0, keepdims=True)

        # Regularization terms for the weights (L1 first, then L2)
        if self.weight_regularizer_l1 > 0:
            grad_weights += self.weight_regularizer_l1 * self._l1_slope(self.weights)
        if self.weight_regularizer_l2 > 0:
            grad_weights += 2 * self.weight_regularizer_l2 * self.weights

        # Regularization terms for the biases (L1 first, then L2)
        if self.bias_regularizer_l1 > 0:
            grad_biases += self.bias_regularizer_l1 * self._l1_slope(self.biases)
        if self.bias_regularizer_l2 > 0:
            grad_biases += 2 * self.bias_regularizer_l2 * self.biases

        self.dweights = grad_weights
        self.dbiases = grad_biases

        # Gradient passed on to the previous layer
        self.dinputs = np.dot(dvalues, self.weights.T)



# ReLU activation
class Activation_ReLU:
    """Rectified linear unit: keeps positive values and replaces the rest with 0."""

    def forward(self, inputs):
        """Store ``inputs`` and set ``self.output`` to their element-wise max with 0."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Store in ``self.dinputs`` a copy of ``dvalues`` zeroed where input <= 0.

        ``dvalues`` itself is left untouched.
        """
        # Positions where the unit did not fire pass no gradient back
        inactive = self.inputs <= 0
        gradient = dvalues.copy()
        gradient[inactive] = 0
        self.dinputs = gradient


# Softmax activation
class Activation_Softmax:
    """Softmax activation applied row-wise (one row per sample)."""

    # Forward pass
    def forward(self, inputs):
        """Store ``inputs`` and set ``self.output`` to the row-wise softmax."""
        # Remember input values
        self.inputs = inputs

        # Get unnormalized probabilities; subtracting each row's maximum
        # keeps np.exp from overflowing and does not change the result
        exp_values = np.exp(inputs - np.max(inputs, axis=1,
                                            keepdims=True))
        # Normalize them for each sample
        probabilities = exp_values / np.sum(exp_values, axis=1,
                                            keepdims=True)

        self.output = probabilities

    # Backward pass
    def backward(self, dvalues):
        """Store in ``self.dinputs`` the gradient with respect to the inputs.

        For one sample with softmax output ``s`` and incoming gradient ``d``
        the Jacobian is ``diag(s) - s s^T``, so the result is
        ``J d = s * d - s * (s . d)``. That product is evaluated for all
        samples at once instead of building a Jacobian per sample in a loop.

        The result takes its dtype from ``output * dvalues``, so integer
        ``dvalues`` (e.g. one-hot rows) no longer truncate the gradient.
        """
        dvalues = np.asarray(dvalues)

        # s * d for every sample
        weighted = self.output * dvalues
        # s * d - s * (s . d), with the dot product taken per row
        self.dinputs = weighted - self.output * np.sum(weighted, axis=1,
                                                       keepdims=True)


# Common loss class
class Loss:
    """Base class for losses; subclasses supply ``forward(output, y)``."""

    def regularization_loss(self, layer):
        """Return the sum of the layer's enabled L1/L2 penalties.

        A penalty is included only when its strength on ``layer`` is greater
        than 0; when none is enabled the result is the integer 0.
        """
        # (strength, element-wise transform, parameter attribute), in the order
        # weights-L1, weights-L2, biases-L1, biases-L2
        terms = (
            (layer.weight_regularizer_l1, np.abs, "weights"),
            (layer.weight_regularizer_l2, np.square, "weights"),
            (layer.bias_regularizer_l1, np.abs, "biases"),
            (layer.bias_regularizer_l2, np.square, "biases"),
        )

        penalty = 0
        for strength, transform, attribute in terms:
            if strength > 0:
                penalty += strength * np.sum(transform(getattr(layer, attribute)))
        return penalty

    def calculate(self, output, y):
        """Return the mean of the per-sample losses from ``self.forward``."""
        return np.mean(self.forward(output, y))


# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy loss for sparse or one-hot targets."""

    # Forward pass
    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the target class per sample.

        Parameters
        ----------
        y_pred : array of predicted values, one row per sample
        y_true : 1-D array of class indices, or 2-D one-hot array

        Raises
        ------
        ValueError
            If ``y_true`` is neither 1-D nor 2-D.
        """

        # Number of samples in a batch
        samples = len(y_pred)

        # Clip data to prevent log(0)
        # Clip both sides to not drag mean towards any value
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        label_dims = len(y_true.shape)

        # Probabilities for target values -
        # only if categorical labels
        if label_dims == 1:
            correct_confidences = y_pred_clipped[
                range(samples),
                y_true
            ]

        # Mask values - only for one-hot encoded labels
        elif label_dims == 2:
            correct_confidences = np.sum(
                y_pred_clipped * y_true,
                axis=1
            )

        # Any other label shape used to fall through and fail with an
        # UnboundLocalError on correct_confidences; report it explicitly
        else:
            raise ValueError(
                'y_true must be 1-D (class indices) or 2-D (one-hot), '
                f'got {label_dims} dimensions'
            )

        # Losses
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

    # Backward pass
    def backward(self, dvalues, y_true):
        """Store the gradient with respect to the predictions in ``self.dinputs``.

        ``dvalues`` holds the predicted values, one row per sample;
        ``y_true`` holds class indices (1-D) or one-hot rows (2-D).
        """

        # Number of samples
        samples = len(dvalues)
        # Number of labels in every sample
        # We'll use the first sample to count them
        labels = len(dvalues[0])

        # If labels are sparse, turn them into one-hot vector
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Entries whose target is 0 contribute no gradient. Dividing them by 1
        # instead of by the prediction keeps a prediction of exactly 0 from
        # turning -0 / 0 into NaN; entries with a non-zero target are divided
        # by the prediction exactly as before.
        safe_dvalues = np.where(y_true != 0, dvalues, 1)

        # Calculate gradient
        self.dinputs = -y_true / safe_dvalues
        # Normalize gradient
        self.dinputs = self.dinputs / samples


# Softmax classifier - combined Softmax activation
# and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy:
    """Softmax activation bundled with categorical cross-entropy loss.

    The forward pass runs the softmax and then the loss; the backward pass
    computes the gradient of both steps at once as ``(output - one_hot) / N``.
    """

    def __init__(self):
        """Create the wrapped activation and loss objects."""
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    def forward(self, inputs, y_true):
        """Run softmax on ``inputs``, keep it in ``self.output``, return the loss."""
        softmax = self.activation
        softmax.forward(inputs)
        self.output = softmax.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Store the combined softmax/cross-entropy gradient in ``self.dinputs``.

        ``dvalues`` is the softmax output; ``y_true`` holds either class
        indices (1-D) or one-hot rows (2-D). ``dvalues`` itself is not modified.
        """
        count = len(dvalues)

        # One-hot targets are reduced to class indices
        targets = np.argmax(y_true, axis=1) if len(y_true.shape) == 2 else y_true

        # Work on a copy, subtract 1 at each sample's target class,
        # then average over the batch
        gradient = dvalues.copy()
        gradient[np.arange(count), targets] -= 1
        self.dinputs = gradient / count


        
# SGD optimizer
class Optimizer_SGD:
    """Plain stochastic gradient descent with optional learning-rate decay.

    Intended call order per training step: ``pre_update_params()``, then
    ``update_params(layer)`` for every layer, then ``post_update_params()``.
    """

    def __init__(self, learning_rate=1., decay=0.):
        """Store the settings; the learning rate defaults to 1."""
        self.iterations = 0
        self.decay = decay
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate

    def pre_update_params(self):
        """Refresh the current learning rate; a no-op when decay is falsy."""
        if not self.decay:
            return
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Move the layer's weights and biases, in place, against their gradients."""
        step = -self.current_learning_rate
        layer.weights += step * layer.dweights
        layer.biases += step * layer.dbiases

    def post_update_params(self):
        """Count one finished update step."""
        self.iterations += 1

# <=== HERE ===>
# Create dataset (500 samples per class in this run)
X, y = spiral_data(samples=500, classes=3)
# <=== HERE ===>

# Create Dense layer with 2 input features and 128 output values
dense1 = Layer_Dense(2, 128)

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 128 input features (as we take output
# of previous layer here) and 3 output values (one per class)
dense2 = Layer_Dense(128, 3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Create optimizer
optimizer = Optimizer_SGD(learning_rate=1, decay=1e-4)

# Train in loop
losses = [] # Loss values sampled every 100 epochs, plotted after training
for epoch in range(10_001):

    # Perform a forward pass of our training data through the first Dense layer
    dense1.forward(X)

    # Perform a forward pass through activation function
    # takes the output of first dense layer here
    activation1.forward(dense1.output)

    # Perform a forward pass through second Dense layer
    # takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)
    
    
    # Perform a forward pass through the activation/loss function
    # takes the output of second dense layer here and returns loss
    data_loss = loss_activation.forward(dense2.output, y)

    # Calculate regularization penalty
    # (both layers above use the default regularizer strengths of 0,
    # so this evaluates to 0 here)
    regularization_loss = \
        loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2)

    # Calculate overall loss
    loss = data_loss + regularization_loss

    # Calculate accuracy from the softmax output (loss_activation.output)
    # and targets; argmax along axis 1 picks the predicted class per sample
    predictions = np.argmax(loss_activation.output, axis=1)
    # If targets are one-hot encoded, convert them to class indices
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    # Report progress every 100 epochs
    if not epoch % 100:
        # Record the overall loss (data loss plus regularization penalty)
        losses.append(loss)
        # NOTE: the printed lr is the rate used by the previous update, because
        # pre_update_params() only runs further down in this iteration
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f}, ' +
              f'lr: {optimizer.current_learning_rate}')
    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

    
# Top panel: true labels (large, translucent markers) with the predicted
# labels (small, black-edged markers) drawn on a twin axis over them.
# `predictions` come from the last forward pass, i.e. before the final update.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 24))
axs[0].scatter(X[:, 0], X[:, 1], c=y, cmap="brg", marker="o", s=500, alpha=0.6)
ax2 = axs[0].twinx()
ax2.scatter(X[:, 0], X[:, 1], c=predictions, cmap="brg", marker="o", s=100, edgecolors="black")

# Bottom panel: recorded loss values, one point per 100 epochs
axs[1].plot(range(len(losses)), losses)
axs[1].set_xlabel("100 * Epoch")
axs[1].set_ylabel("Loss")
plt.show()

In [None]:
# Validate the model

# Create test dataset
X_test, y_test = spiral_data(samples=100, classes=3)

# Perform a forward pass of our test data through the first Dense layer
dense1.forward(X_test)

# Perform a forward pass through activation function
# takes the output of first dense layer here
activation1.forward(dense1.output)

# Perform a forward pass through second Dense layer
# takes outputs of activation function of first layer as inputs
dense2.forward(activation1.output)

# Perform a forward pass through the activation/loss function
# takes the output of second dense layer here and returns loss
# (no regularization penalty is added here, only the data loss)
loss = loss_activation.forward(dense2.output, y_test)

# Calculate accuracy from the softmax output (loss_activation.output)
# and targets; argmax along axis 1 picks the predicted class per sample
predictions = np.argmax(loss_activation.output, axis=1)
# If targets are one-hot encoded, convert them to class indices
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions==y_test)

print(f'acc: {accuracy:.3f}, loss: {loss:.3f}')

# True test labels (large, translucent markers) with the predicted labels
# (small, black-edged markers) drawn on a twin axis over them
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap="brg", marker="o", s=500, alpha=0.6)
ax2 = ax.twinx()
ax2.scatter(X_test[:, 0], X_test[:, 1], c=predictions, cmap="brg", marker="o", s=100, edgecolors="black")
plt.show()

----