In [1]:
from typing import List

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.animation as animation
import time
import struct
import tensorflow as tf
import random as rd

from math import *
from array import array

# import keras._tf_keras.keras as keras 
# from keras._tf_keras.keras
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from tensorflow.python.ops.init_ops_v2 import glorot_uniform

# my project
from module.conf import PROJECT_DIR

# matplotlib.use("QTAgg")
%matplotlib inline

Load data:
- Train data: 60k 28x28 images
- Test data: 

In [2]:
mnist_path = "/data/sample/mnist"
training_images_filepath = "".join([PROJECT_DIR, mnist_path, "/train-images.idx3-ubyte"])
training_labels_filepath = "".join([PROJECT_DIR, mnist_path, "/train-labels.idx1-ubyte"])
test_images_filepath = "".join([PROJECT_DIR, mnist_path, "/t10k-images.idx3-ubyte"])
test_labels_filepath = "".join([PROJECT_DIR, mnist_path, "/t10k-labels.idx1-ubyte"])

def read_images_labels(images_filepath, labels_filepath) -> tuple:
    labels = []
    with open(labels_filepath, 'rb') as file:
        magic, size = struct.unpack(">II", file.read(8))
        if magic != 2049:
            raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
        # labels = array("B", file.read())
        labels = array("B", file.read())

    with open(images_filepath, 'rb') as file:
        magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
        if magic != 2051:
            raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
        image_data = array("B", file.read())       
     
    images = []
    # for i in range(size):
    #     images.append([0] * rows * cols)
    for i in range(size):
        img = np.array(image_data[i * rows * cols:(i + 1) * rows * cols])
        img = img.reshape(28, 28)
        # images[i][:] = img
        images.append(img)
    
    return images, labels

def load_data() -> tuple:
    x_train, y_train = read_images_labels(training_images_filepath, training_labels_filepath)
    x_test, y_test = read_images_labels(test_images_filepath, test_labels_filepath)
    return (x_train, y_train),(x_test, y_test)

(X_train, y_train), (X_test, y_test) = load_data()

In [3]:
# print(f"{type(X_train[0])}")
# mnist = tf.keras.datasets.mnist

# (x_train, y_train), (x_test, y_test) = mnist.load_data()


In [4]:
X_train = np.asarray(X_train) / 255
y_train = np.asarray(y_train)
X_test  = np.asarray(X_test) / 255
y_test  = np.asarray(y_test)

## 1. Activation functions:

### 1.1. Linear:
$ \begin{align}
f(\mathbf z) &= \mathbf z \\
\rightarrow \frac{\partial f(\mathbf z)}{\partial \mathbf z} &=\mathbf 1 \\ 
\end{align} $

In [5]:
def linear(z): return z
def grad_linear(z): return 1

### 1.2. ReLU:
$\begin{align}
ReLU(\mathbf z) &= \max(\mathbf z, \mathbf 0) \\
\rightarrow \frac{\partial ReLU(\mathbf z)}{\partial \mathbf z} &= \begin{cases}
z_i = 1 \text{ if } z_i > 0 \\
z_i = 0 \text{ if } z_i \leqslant 0 \\
\end{cases} \\
\end{align}$

In [6]:
def relu(z): return np.maximum(0, z)
def grad_relu(z): return np.where(z > 0, 1, 0) # (x > 0).astype(float) # np.array([1 if z_i > 0 else 0 for z_i in z])

In [7]:
z = np.array([1,-6, 3, 4, 0])
relu_z = relu(z)
relu_z
grad_relu_z = grad_relu(relu_z)
grad_relu_z

array([1, 0, 1, 1, 0])

### 1.2. Sigmoid:
$\begin{align}
\sigma(\mathbf z) &= \frac{1}{1 + e^{-z}} \\
\rightarrow \frac{\partial\sigma(\mathbf z)}{\partial \mathbf z} &= \sigma(\mathbf z)\cdot\left(1 - \sigma(\mathbf z)\right) \\ 
\end{align} $

In [8]:
def sigmoid(z): return 1/(1 + np.exp(-z))
def grad_sigmoid(z): return sigmoid(z) * (1 - sigmoid(z))    

In [9]:
z = np.array([1,-6, 3, 4, 0])
sigmoid(z)
grad_sigmoid(z)

array([0.19661193, 0.00246651, 0.04517666, 0.01766271, 0.25      ])

### 1.3. Softmax:
$\begin{align}
\sigma(\mathbf z) &= \frac{e^{\mathbf z}}{\sum_{i=1}^{C}e^{z_i}} \\
\rightarrow \frac{\partial \sigma(\mathbf z)}{\partial \mathbf z} &= \sigma(z_i) \cdot (\delta_{ij} - \sigma(z_j)) 
\rightarrow \delta_{ij} = \begin{cases} 
1 \text{ if } i = j \\
0 \text{ if } i \neq j  
\end{cases} \\
&= diag(\mathbf z) - \mathbf z * \mathbf z^T \\
C &\text{ is number of class} \\
diag &\text{ is diagonal matrix }
\end{align}$

In [10]:
# def softmax(z): return np.exp(z)/np.sum(np.exp(z))
# def grad_softmax(z): return np.diag(z) - np.outer(z,z)
def softmax(Z):
    Z_max = np.max(Z, axis=-1, keepdims=True)
    Z_exp = np.exp(Z - Z_max)
    Z_sum = np.sum(Z_exp, axis=-1, keepdims=True)
    return Z_exp / Z_sum
    # e_x = np.exp(Z - np.max(Z, axis=-1, keepdims=True))
    # return e_x / np.sum(e_x, axis=-1, keepdims=True)

def grad_softmax(Z):
    S = softmax(Z)  # Tính softmax cho Z
    batch_size, num_classes = S.shape
    # Init Jacobian matrix: gradient foreach row
    dSoftmax = np.zeros((batch_size, num_classes, num_classes))
    for i in range(batch_size):
        # S_i is softmax for i-row
        s_i = S[i].reshape(-1, 1)  # Transpose to Column vector
        # Jacobian matrix for i-row
        # dSoftmax[i] = np.diagflat(s_i) - np.dot(s_i, s_i.T)
        dSoftmax[i] = np.diagflat(s_i) - s_i @ s_i.T
    # dSoftmax in hidden layer - transpose of Jacobian matrix    
    return dSoftmax.transpose()

In [11]:
z = np.array([[1,-6, 3, 4, 0],
              [1,-6, 3, 7, 0]])
# softmax(z)
z_max = np.max(z, axis=z.ndim-1, keepdims=True)
z_max
# np.exp(z-z_max)
# grad_softmax(z)

array([[4],
       [7]])

## 2. Loss function:

### 2.1. Cross Entropy:
$\begin{align}
CrossEntropy = - \log(\hat{y}_{true})\\
\end{align}$

### 2.2. Categorical Crossentropy:
$\begin{align}
Y &\text{ is label in one-hot matrix } N \times C  \\
\hat{Y} &\text{ is predicted matrix } N \times C \\
C &\text{ is number of classes}\\
L &= -\sum_{i=1}^{C} Y_i \log(\hat{Y}_{i}) \\
\hat{Y}_{i,j} &= \frac{exp(Z_{i,j})}{\sum_{k=1}^{C} exp(Z_{i,k})} \\
\rightarrow \mathcal L &= -\frac{1}{N} \sum_{i=1}^{N}\sum_{j=1}^{C} Y_{i,j} \log(\hat{Y}_{i,j}) \\
&= -\frac{1}{N} \sum_{i=1}^{N} \log(\hat{Y}_{i,true}) \\
\hat{Y}_{i,true} &\text{ is predicted result corresponding to one-hot is 1}
\end{align}$

Gradient:
$\begin{align}
\frac{\partial L}{\partial Z} &= \hat{Y}_{i,j} - Y_{i,j} \\
\end{align}$

### 2.3. Sparse Categorical Crossentropy:
$ \begin{align}
\hat{Y} &= A_n = softmax(Z) \\
Z &\text{ is } n \times C \text{ matrix. n is number of samples, C is number of classes} \\
CrossEntropy_i &= -\log(\hat{y}_{i, y_{sparse}}) \\
CrossEntropy &\text{ is a vector size n} \\
\rightarrow \frac{\partial L}{\partial Z_{i,j}} &= \hat{Y}_{i,j} - \delta(j, y_{sparse,i}) 
\rightarrow \delta(j, y_{sparse,i}) = \begin{cases}
1 \text{ if } j = y_{sparse,i} \\
0 \text{ if } j \neq y_{sparse,i}\\
\end{cases} \\
\rightarrow \frac{\partial L}{\partial Z_n} &= \hat{Y} - Y = \hat{Y} - SparseLabels \\
SparseLabels &\text{ can be considered as one-hot matrix}
\end{align} $

In [12]:
# should apply the Vectorization
def delta_kronecker_matrix(y_train, mY_pred):
    """
    Transform to one-hot encoding
    y_train: a vector size n
    mY_pred: a matrix (C, n)
    """
    mY_train = np.zeros(shape=mY_pred.T.shape)
    for i in range(len(y_train)): mY_train[i][y_train[i]] = 1
    return mY_train

def sparse_categorical_crossentropy_Z(y_train, mY_pred):
    y_pred = np.array([mY_pred[i][y_train[i]] for i in range(len(y_train))])
    return -np.sum(np.log(y_pred))

def grad_sparse_categorical_crossentropy_Z(y_train, mY_pred):
    return mY_pred - delta_kronecker_matrix(y_train=y_train, mY_pred=mY_pred)

def sparse_categorical_crossentropy(y_true, y_pred):
    # Loss func
    # batch_size = y_pred.shape[0]
    # y_true_indices = (np.arange(batch_size), y_true)
    # correct_class_probabilities = y_pred[y_true_indices]
    # loss = -np.log(correct_class_probabilities + 1e-9)
    # loss = -np.log(y_pred[:y_pred.shape[0], y_true] + 1e-9)
    # return loss
    batch_size = y_true.shape[0]
    y_true_indices = (y_true, np.arange(batch_size))
    correct_class_probabilities = y_pred[y_true_indices]
    # loss = -np.sum(np.log(correct_class_probabilities + 1e-9)) / batch_size
    loss = -np.log(correct_class_probabilities + 1e-7).mean()
    return loss

def grad_sparse_categorical_crossentropy(y_true, y_pred):
    # Grad
    grad = np.zeros_like(y_pred)
    grad[:y_pred.shape[0], y_true] = -1 / y_pred[:y_pred.shape[0], y_true]  # Grad for true-class
    return grad

In [13]:
# test detal_kronecker_matrix
y_train_validate = np.array([0, 2, 2, 1, 0])
mY_pred = np.array([[0.2, 0.1 , 0.3],
                    [0.3, 0.2, 0.7],
                    [0.3, 0.2, 0.7],
                    [0.3, 0.2, 0.2],
                    [0.3, 0.2, 0.4]]).T
delta = delta_kronecker_matrix(y_train_validate, mY_pred)
# mY_pred[:mY_pred.shape[0], [1, 2]]

In [14]:
cross_entropy = sparse_categorical_crossentropy(y_train_validate, mY_pred)
print(cross_entropy)

1.0272393796048616


In [15]:
import numpy as np
from scipy.sparse import csr_matrix
# ex
# [[0, 0, 0, 0],
#  [5, 8, 0, 0],
#  [0, 0, 3, 0],
#  [0, 6, 0, 0]]
#

data = np.array([5, 8, 3, 6, 7])        # values
indices = np.array([0, 1, 2, 1, 0])     # col index for each value
indptr = np.array([0, 2, 3, 4, 5])      # start - end in data values
csr_m = csr_matrix((data, indices, indptr), shape=(4, 4))
csr_m.toarray()


array([[5, 8, 0, 0],
       [0, 0, 3, 0],
       [0, 6, 0, 0],
       [7, 0, 0, 0]])

## 3. Optimizers: 

In [16]:
def optimize_basicGD(w:np.ndarray, b:np.ndarray, grad_w: np.ndarray, grad_b: np.ndarray, learning_rate=1e-2):
    w -= learning_rate * grad_w
    b -= learning_rate * grad_b
    return w, b

### 3.1. SGD
$\begin{align}
\theta &= \theta - \eta \cdot \nabla_{\theta} L(\theta, x_i, y_i) \\
\theta &\text{ is weight}\\
\eta &\text{ is learning rate}\\
\nabla_{\theta} L(\theta, x_i, y_i) &\text{ is gradient respect to }\theta \text{ of }(x_i, y_i) \\
\end{align}$

### 3.2. RMSProps:
$\begin{align}
v_{t} &= \beta v_{t-1} + (1 + \beta)g_t^2 \\
\\
\theta_t &= \theta_{t-1} - \frac{\eta}{\sqrt{v_t} + \epsilon} g_t \\
\eta &\text{ is learning rate} \\
v_t &\text{ is velocity at } t \text{ time} \\
g_t &\text{ is gradient at } t \text{ time} \\
\epsilon &\text{ is very small number - avoid device by 0} \\
\theta &\text{ is weight matrix or bias vector} \\
\end{align}$

In [17]:
# def optimize_RMSProps(w: np.ndarray, learning_rate=0.01, beta=0.99, epsilon=1e-7, loss=None, gradient=None):
    # """
    # update weight matrix or bias
    # :param w: weight matrix or bias
    # :param learning_rate: 
    # :param beta: 
    # :param epsilon: default
    # :param loss: 
    # :param gradient: 
    # :return: None
    # """
    # v = 0 
    # epochs = 1000
    
    # RMSProp
    # for epoch in range(epochs):
        # g = gradient(w)
        # v = beta * v + (1 - beta) * g ** 2
        # w = w - learning_rate * g / (np.sqrt(v) + epsilon)
    
        # if epoch % 100 == 0:
        #     print(f'Epoch {epoch}: w = {w}, Loss = {loss(w)}')
    # print(f'Final w: {w}, Final Loss: {loss(w)}')

def optimize_RMSProps(w: np.ndarray, b: np.ndarray, grad_w: np.ndarray, grad_b: np.ndarray, v_w:np.ndarray=None, v_b:np.ndarray=None,
                      learning_rate=0.01, beta=0.9, epsilon=1e-7):
    # if v_w is None: v_w = np.zeros_like(w)
    # if v_b is None: v_b = np.zeros_like(b)
    v_w = beta * v_w + (1 - beta) * grad_w ** 2
    v_b = beta * v_b + (1 - beta) * grad_b ** 2
    # print(f"v_w: {v_w.shape} v_b: {v_b.shape}")
    w -= learning_rate * grad_w / (np.sqrt(v_w) + epsilon)
    b -= learning_rate * grad_b / (np.sqrt(v_b) + epsilon)
    return w, b, v_w, v_b

def optimize_RMSPropsL1(w: np.ndarray, b: np.ndarray, grad_w: np.ndarray, grad_b: np.ndarray, v_w:np.ndarray=None, v_b:np.ndarray=None,
                      learning_rate=0.01, beta=0.9, epsilon=1e-7, lambda_l1 = 1e-2, loss=0.0):
    # l1_loss = lambda_l1 * np.sum(np.abs(w))
    l1_loss = lambda_l1 * np.sum(w**2)
    loss += l1_loss
    grad_w += lambda_l1 * np.sign(w)
    # grad_b += lambda_l1 * np.sign(b)
    v_w = beta * v_w + (1 - beta) * grad_w ** 2
    v_b = beta * v_b + (1 - beta) * grad_b ** 2
    # print(f"v_w: {v_w.shape} v_b: {v_b.shape}")
    w -= learning_rate * grad_w / (np.sqrt(v_w) + epsilon)
    b -= learning_rate * grad_b / (np.sqrt(v_b) + epsilon)
    return w, b, v_w, v_b, loss + l1_loss

### 3.3. Adagrad
$\begin{align}
G_t &= G_{t-1} + g_t^2 \\
\theta_t &= \theta_{t-1} - \frac{\eta}{\sqrt{G_t - \epsilon}} \cdot g_t \\
\eta &\text{ is learning rate} \\
g_t &\text{ is gradient at } t \text{ time} \\
\epsilon &\text{ is very small number - avoid device by 0} \\
\theta &\text{ is weight matrix or bias vector} \\
G &\text{ sum of square of gradient} \\
\end{align}$

### 3.4. Adaprops:

### 3.5. Adamax:

### 3.6.Adam: 

In [18]:
def optimize_Adam(w: np.ndarray, b: np.ndarray, grad_w: np.ndarray, grad_b: np.ndarray,
                    cache_w: list[np.ndarray], cache_b:list[np.ndarray],
                    learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    # if v_w is None: v_w = np.zeros_like(w)
    # if v_b is None: v_b = np.zeros_like(b)
    cache_w[1] = beta1 * cache_w[1] + (1 - beta1) * grad_w
    cache_b[1] = beta1 * cache_b[1] + (1 - beta1) * grad_b
    cache_w[0] = beta2 * cache_w[0] + (1 - beta2) * grad_w ** 2
    cache_b[0] = beta2 * cache_b[0] + (1 - beta2) * grad_b ** 2
    m_w_hat = cache_w[1] / (1 - beta1)
    m_b_hat = cache_b[1] / (1 - beta1)
    v_w_hat = cache_w[0] / (1 - beta2)
    v_b_hat = cache_b[0] / (1 - beta2)
    # print(f"v_w: {v_w.shape} v_b: {v_b.shape}")
    w -= learning_rate * m_w_hat / (np.sqrt(v_w_hat) + epsilon)
    b -= learning_rate * m_b_hat / (np.sqrt(v_b_hat) + epsilon)
    return w, b, cache_w, cache_b

### 5. Demo NN


### 5.1. Load/Prepare data

### 5.2. Neural network manually:
- Flat 28 x 28 data
- There are 03 layers: `[32, "relu"] [128, "sigmoid"] [10, "softmax"]`
- Loss func: `SparseCategoricalCrossentropy`, `digits = False`
- Optimizer: `RMSProp` with `learning_rate=1e-3`
- **(opt)** metrics: `accuracy`

### 5.3. Flat input data:

In [19]:
def flat_data(imp_data: np.ndarray) -> np.ndarray:
    """
    Flat data from 02 dim matrix to vector
    :param imp_data: (n, m_0, m_1) matrix, n is number of rows 
    :return: matrix: (n, m_0 * m_1)
    """
    if len(imp_data.shape) < 3 : return imp_data
    return imp_data.reshape((imp_data.shape[0], imp_data.shape[1]* imp_data.shape[2]))

In [20]:
marr = np.array([[[1, 2, 3, 3],[4, 5, 6, 6],[7, 8, 9, 9]],
        [[1, 2, 3, 3],[4, 5, 6, 6],[7, 8, 9, 9]],
        [[1, 2, 3, 3],[4, 5, 6, 6],[7, 8, 9, 9]],
        [[1, 2, 3, 3],[4, 5, 6, 6],[7, 8, 9, 9]],
        [[1, 2, 3, 3],[4, 5, 6, 6],[7, 8, 9, 9]]])
x_0 = flat_data(marr)
x_0
# marr.shape
# X_train.shape
x_1 = X_train[:5].copy()
x_1.shape
x_1 = x_1.reshape((x_1.shape[0], x_1.shape[1] * x_1.shape[2]))
x_1

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### 5.4. Loop

In [21]:
class DenseLayer (object):
    activation_map = {"linear": linear, "relu": relu, "sigmoid": sigmoid, "softmax": softmax}
    grad_map = {"linear": grad_linear, "relu": grad_relu, "sigmoid": grad_sigmoid, "softmax": grad_softmax}
    # weights = None;
    # bias = None;

    def glorot_normal(self, shape, n_in, n_out):
        stddev = np.sqrt(2 / (n_in + n_out))
        return np.random.normal(loc=0, scale=stddev, size=shape)

    def glorot_uniform(self, shape, n_in, n_out):
        limit = np.sqrt(6 / (n_in + n_out))
        return np.random.uniform(low=-limit, high=limit, size=shape)

    def __init__(self, input_size, output_size, activation = 'linear'):
        self._activation_func = self.activation_map['linear']
        self._grad_func = self.grad_map['linear']
        self._activation = activation.lower()
        # self._weights = np.random.rand(output_size, input_size) * 1e-2
        # self._weights = np.random.randn(output_size, input_size) * np.sqrt(2 / input_size)
        self._weights = self.glorot_uniform(shape=(output_size, input_size), n_in=input_size, n_out=output_size)
        # self._bias = np.random.rand(output_size, 1)
        # self._bias = np.random.randn(output_size, 1) * np.sqrt(2 / input_size)
        self._bias = np.zeros(shape=(output_size, 1))
        self._cache_w = [np.zeros_like(self._weights), np.zeros_like(self._weights)]
        self._cache_b = [np.zeros_like(self._bias), np.zeros_like(self._bias)]
        self._init()
        return
    def _init(self) -> None:
        self._activation_func = DenseLayer.activation_map[self._activation] if self._activation in DenseLayer.activation_map else DenseLayer.activation_map['linear']
        self._grad_func = DenseLayer.grad_map[self._activation] if self._activation in DenseLayer.grad_map else DenseLayer.grad_map['linear']
        return None

    @property
    def activation_func(self):
        return self._activation_func
    @property
    def grad_func(self):
        return self._grad_func

    @property
    def weights(self) -> np.ndarray:
        return self._weights
    @weights.setter
    def weights(self, val):
        self._weights = val
    @property
    def bias(self) -> np.ndarray:
        return self._bias
    @bias.setter
    def bias(self, val):
        self._bias = val

    @property
    def cache_w(self):
        return self._cache_w
    @cache_w.setter
    def cache_w(self, val):
        self._cache_w = val
    @property
    def cache_b(self):
        return self._cache_b
    @cache_b.setter
    def cache_b(self, val):
        self._cache_b = val

    pass

In [22]:
def fit(X_train, Y_train, epochs=1, learning_rate=1e-3) -> list[DenseLayer]:
    """
    :param X_train: matrix(n, m) n - number of rows, m - features
    :param Y_train: vector(n)
    :param epoch: 
    :return: list of weight matrix, from 0
    """
    model: list[DenseLayer] = []
    # [32, "relu"] [128, "sigmoid"] [10, "softmax"]
    # X_treated shape [m, n]
    X_treated = flat_data(X_train).transpose()
    if X_treated is not None and len(X_treated) > 0:
        input_size = X_treated.shape[0]
        denses: list[DenseLayer] = [DenseLayer(input_size=input_size, output_size=32, activation='relu'),
                  DenseLayer(input_size=32, output_size=128, activation='relu'),
                  DenseLayer(input_size=128, output_size=10, activation='linear')]
        # denses: list[DenseLayer] = [DenseLayer(input_size=input_size, output_size=3, activation='relu'),
        #         #   DenseLayer(input_size=32, output_size=128, activation='relu'),
        #           DenseLayer(input_size=3, output_size=2, activation='linear')]
        fwd_A = [np.ndarray, len(denses)]   # ndarray(m, n)
        fwd_Z = [np.ndarray, len(denses)]   # ndarray(m, n)
        # fwd_b = [float, len(denses)]        # bias

        grad_A: list[np.ndarray] = [np.zeros(shape=(1,1))] * len(denses)
        grad_Z: list[np.ndarray] = [np.zeros(shape=(1,1))] * len(denses)
        # grad_W: list[np.ndarray] = [np.zeros(shape=(1,1))] * len(denses)
        # grad_b: list[np.ndarray] = [np.zeros(shape=(1,))] * len(denses)
        # fwd_A.append(A)
        # print(f"denses: {len(denses)}")
        for epoch in range(epochs):
            fwd_A.clear()
            fwd_Z.clear()
            A = X_treated
            # fwd_Z.append(X_treated)
            # Forward propagation
            for dense in denses:
                # calculate linear
                Z = A
                # print(f"dense.weights:{dense.weights.shape} Z:{Z.shape} bias:{dense.bias.shape}")
                Z = dense.weights @ Z + dense.bias
                # print(f"fwd {idx}: {Z}")
                # apply activation function
                A = dense.activation_func(Z)    # return ndarray
                fwd_A.append(A)
                fwd_Z.append(Z)
                # fwd_b.append(dense.bias.copy())
                pass
            # Calculate Cost function
            # Here, A has shape(C, input_size)
            A = softmax(A)
            loss = sparse_categorical_crossentropy(Y_train, A)
            # grad_cost = grad_sparse_categorical_crossentropy(Y_train, A)
            # Combine the grad_sparse_categorical_crossentropy and softmax (~ from_digit = True)
            # dL/dZ = dA/dZ @ dL/dA = Y_pred - Y_train
            # grad_A[-1] = grad_Z[-1] = (fwd_A[-1] - Y_train)/m
            # print(f"grad_Z:{fwd_A[-1].shape} - {Y_train.shape}")
            # grad_Z[-1] = (fwd_A[-1] - Y_train) / Y_train.shape[0]
            batch_size = Y_train.shape[0]
            grad_Z[-1] = A
            grad_Z[-1][Y_train, np.arange(batch_size)] -= 1
            grad_Z[-1] /= batch_size
            
            last_Z = fwd_A[-2] if len(fwd_A) > 1 else X_treated
            # print(f"grad_Z[-1] @ last_Z:{grad_Z[-1].shape} @ {last_Z.shape}")
            denses[-1].weights -= learning_rate * (grad_Z[-1] @ last_Z.T)
            denses[-1].bias -= learning_rate * (grad_Z[-1] @ np.ones(shape=(grad_Z[-1].shape[1], 1)))
            # denses[-1].weights = (grad_Z[-1] @ last_Z.T)
            # denses[-1].bias = (grad_Z[-1] @ np.ones(shape=(grad_Z[-1].shape[1], 1)))
            # print(f"len(grad_Z): {len(grad_Z)}")
            # Back propagation\
            grad_dZdW = [None] * len(denses)
            for idx, dense in reversed(list(enumerate(denses))):
                # ignore the last layer
                if idx >= len(denses) - 1: continue
                # calculate grad of activation function
                # calculate dA/dZ = grad_func(A[idx])
                # grad_dA_dZ = dense.grad_func(fwd_Z[idx])
                # calculate dL/dA = dZ[idx + 1]/dA @ dL/dZ[idx + 1]
                # print(f"in {idx}: {fwd_Z[idx].shape} @ {grad_Z[idx + 1].shape}")
                # grad_A[idx] = fwd_Z[idx] @ grad_Z[idx + 1]
                # print(f"in {idx} weights: {denses[idx+1].weights.shape} grad_Z:{grad_Z[idx + 1].shape}")
                grad_A[idx] = denses[idx+1].weights.T @ grad_Z[idx + 1]
                # calculate dL/dZ = dA/dZ @ dL/dA
                # print(f"in {idx} grad_A * grad_func = {grad_A[idx].shape} * {dense.grad_func(fwd_Z[idx]).shape}")
                grad_Z[idx] = grad_A[idx] * dense.grad_func(fwd_Z[idx]) # dense.grad_func(fwd_Z[idx]).T @ grad_A[idx]
                # calculate dZ/dW = grad_func(W[idx])
                last_Z = fwd_Z[idx-1] if idx > 0 else X_treated
                # print(f"in {idx} grad_dZ_dW = {grad_Z[idx].shape} @ {last_Z.shape}")
                grad_dZ_dW = grad_Z[idx] @ last_Z.T
                # calculate dZ/db = grad_func(b[idx])
                grad_dZ_db = grad_Z[idx] @ np.ones(shape=(grad_Z[idx].shape[1], 1))

                grad_dZdW[idx] = (grad_dZ_dW, grad_dZ_db)
                # update W and b
                # dense.weights, dense.bias = optimize_basicGD(w=dense.weights, b=dense.bias,
                #     grad_w=grad_dZ_dW, grad_b=grad_dZ_db, learning_rate=learning_rate)
                # dense.weights, dense.bias, dense.cache_w, dense.cache_b = optimize_RMSProps(w=dense.weights, b=dense.bias,
                #     grad_w=grad_dZ_dW, grad_b=grad_dZ_db, v_w=dense.cache_w, v_b=dense.cache_b,
                #     learning_rate=learning_rate, beta=0.9)
                pass
            for idx in range(len(denses)):
                if idx >= len(denses) - 1: continue
                denses[idx].weights, denses[idx].bias = optimize_basicGD(
                    w=denses[idx].weights, b=denses[idx].bias,
                    grad_w=grad_dZdW[idx][0], grad_b=grad_dZdW[idx][1], learning_rate=learning_rate)
                # denses[idx].weights, denses[idx].bias, denses[idx].cache_w, denses[idx].cache_b = optimize_RMSProps(
                #     w=denses[idx].weights, b=denses[idx].bias,
                #     grad_w=grad_dZdW[idx][0], grad_b=grad_dZdW[idx][1], 
                #     v_w=denses[idx].cache_w, v_b=denses[idx].cache_b,
                #     learning_rate=learning_rate, beta=0.9)
                # denses[idx].weights, denses[idx].bias, denses[idx].cache_w, denses[idx].cache_b, dL = optimize_RMSPropsL1(
                #     w=denses[idx].weights, b=denses[idx].bias,
                #     grad_w=grad_dZdW[idx][0], grad_b=grad_dZdW[idx][1], 
                #     v_w=denses[idx].cache_w, v_b=denses[idx].cache_b,
                #     learning_rate=learning_rate, beta=0.9, loss=loss, lambda_l1=1e-3)
                # loss = dL
                # denses[idx].weights, denses[idx].bias, denses[idx].cache_w, denses[idx].cache_b = optimize_Adam(
                #     w=denses[idx].weights, b=denses[idx].bias,
                #     grad_w=grad_dZdW[idx][0], grad_b=grad_dZdW[idx][1], 
                #     cache_w=denses[idx].cache_w, cache_b=denses[idx].cache_b,
                #     learning_rate=learning_rate, beta1=0.9, beta2=0.999)
                pass
            if (epoch+1) % 10 == 0:
                print(f"Cost {epoch+1:4d}: {loss}")
                pass
            pass
        model.extend(denses)
    return model

In [23]:
def evaluate(model: list[DenseLayer], X_test, Y_test):
    """
    :param X_test: matrix(n, m) n - number of rows, m - features
    :param Y_test: vector(n,)
    :return: [lost, accuracy] 
    """
    loss = 0.0
    accuracy = 0.0
    X_test = flat_data(X_test).transpose()
    A = X_test
    # print(f"A:{A.shape}")
    for dense in model:
        Z = A
        Z = dense.weights @ Z + dense.bias
        A = dense.activation_func(Z)
        pass
    Y_pred = softmax(A)
    loss = sparse_categorical_crossentropy(Y_test, Y_pred)
    # print(f"{Y_test.shape} {np.argmax(Y_pred[Y_test, np.arange(Y_test.shape[0])], keepdims=True)}")
    Y_ans = np.argmax(Y_pred, axis=0, keepdims=False)
    # print(f"Y_ans:{Y_ans.shape}")
    accuracy = accuracy_score(Y_test, Y_ans)
    rs = [loss, accuracy]
    return rs

In [24]:
# split the train data
X_train_20 = train_test_split(X_train, test_size=0.2, random_state=42)[1]
Y_train_20 = train_test_split(y_train, test_size=0.2, random_state=42)[1]

len(X_train_20)
X_train_20.shape

(12000, 28, 28)

In [25]:
rs1 = fit(X_train_20, Y_train_20, epochs=100, learning_rate=1e-3)

Cost   10: 9.392159276966616
Cost   20: 9.390475943938826
Cost   30: 9.388807622584487
Cost   40: 9.38715954386288
Cost   50: 9.385533259690451
Cost   60: 9.383929723573226
Cost   70: 9.382343266013347
Cost   80: 9.380776022718067
Cost   90: 9.37923261767259
Cost  100: 9.377714050984844


In [26]:
# split the train data
rand_seed = np.random.randint(low=0, high=1000)
X_test_20 = train_test_split(X_train, test_size=0.1, random_state=rand_seed)[1]
Y_test_20 = train_test_split(y_train, test_size=0.1, random_state=rand_seed)[1]

In [27]:
evaluate(rs1, X_test_20, Y_test_20)

[8.683935182299814, 0.13133333333333333]

#### Draft:

In [28]:
class ManualNN (object):
    """
    Layers:
    - (units=32, activation=tf.keras.activations.relu),
    - (units=128, activation=tf.keras.activations.relu),
    - (units=10, activation=tf.keras.activations.linear)
    Loss: softmax + SparseCategoricalCrossEntropy
    Optimizer: basicGD
    """
    def glorot_normal(self, shape, n_in, n_out):
        stddev = np.sqrt(2 / (n_in + n_out))
        return np.random.normal(loc=0, scale=stddev, size=shape)
    
    def glorot_uniform(self, shape, n_in, n_out):
        limit = np.sqrt(6 / (n_in + n_out))
        return np.random.uniform(low=-limit, high=limit, size=shape)
    
    def __init__(self, input_size: int, output_size: int, learning_rate=1e-2):
        self.list_W = [None]
        self.list_b = [None]
        self.learning_rate = learning_rate
        # - (units=32, activation=tf.keras.activations.relu),
        # W = np.random.randn(32, input_size) * learning_rate
        W = self.glorot_uniform(shape=(32, input_size), n_in=input_size, n_out=output_size)
        b = np.zeros((32, 1))
        self.list_W.append(W)
        self.list_b.append(b)
        # - (units=128, activation=tf.keras.activations.relu),
        # W = np.random.randn(128, 32) * learning_rate
        W = self.glorot_uniform(shape=(128, 32), n_in=input_size, n_out=output_size)
        b = np.zeros((128, 1))
        self.list_W.append(W)
        self.list_b.append(b)
        # - (units=10, activation=tf.keras.activations.linear)
        # W = np.random.randn(output_size, 128) * learning_rate
        W = self.glorot_uniform(shape=(output_size, 128), n_in=input_size, n_out=output_size)
        b = np.zeros((output_size, 1))
        self.list_W.append(W)
        self.list_b.append(b)
        return

    def linear(self, x): return x

    def relu(self, x): return np.maximum(0, x)
    
    def grad_relu(self, x): return np.where(x > 0, 1, 0) #return x > 0

    def softmax(self, x):
        # exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        # return exp_x / np.sum(exp_x, axis=1, keepdims=True)
        x_max = np.max(x, axis=-1, keepdims=True)
        x_exp = np.exp(x - x_max)
        x_sum = np.sum(x_exp, axis=-1, keepdims=True)
        return x_exp / x_sum

    def compute_loss(self, y_true, y_pred):
        selector = (y_true, np.arange(y_true.shape[0]))
        value = y_pred[selector]
        # print(f"loss: {value} <=0:{np.any(value <= 0)} isnan:{np.any(np.isnan(value))}")
        loss = -np.log(value + 1e-7)
        return np.mean(loss)
    
    def forward(self, X):
        self.a = []
        self.z = [0]    # default z0 is not use
        # input layer: a[0] = X
        self.a.append(X)
        # layer 1
        tmp_z = self.list_W[1] @ self.a[0] + self.list_b[1]
        tmp_a = self.relu(tmp_z)
        self.z.append(tmp_z)
        self.a.append(tmp_a)
        # layer 2
        tmp_z = self.list_W[2] @ self.a[1] + self.list_b[2]
        tmp_a = self.relu(tmp_z)
        self.z.append(tmp_z)
        self.a.append(tmp_a)
        # layer 3
        tmp_z = self.list_W[3] @ self.a[2] + self.list_b[3]
        tmp_a = self.softmax(tmp_z) #self.linear(tmp_z)
        self.z.append(tmp_z)
        self.a.append(tmp_a)
        return self.a[-1]
    
    def backward(self, X, y_true, y_pred):
        """
        
        :param self: 
        :param X: (n, m): n features, m samples 
        :param y_true: (m,): m labels 
        :param y_pred: (n, m): predicted labels
        :return: 
        """
        m = X.shape[1]
        grad_W: list = [None] * 4
        grad_b: list = [None] * 4
        
        # y_pred - y_true
        grad_Z3 = y_pred
        grad_Z3[y_true, range(m)] -= 1
        grad_Z3 /= m
        
        # grad for layer 3
        # grad_W[3] = grad_Z3 @ dZ3/dW[3] = grad_Z3 @ d(W[3]A[2] + b[3])/dW[3]
        grad_W[3] = grad_Z3 @ self.a[2].T
        # grad_b[3] = grad_Z3 @ np.ones(shape=(grad_Z3.shape[1], 1))
        grad_b[3] = np.sum(grad_Z3, axis=1, keepdims=True)
        
        # grad for layer 2
        # grad_A2 = grad_Z3 @ dZ3/dA2 = grad_Z3 @ d(W[3]A[2] + b[3])/dA2 
        grad_A2 = self.list_W[3].T @ grad_Z3
        # grad_Z2 = grad_Z3 * dA2/dZ2 = grad_relu(Z2) * grad_A2
        grad_Z2 = self.grad_relu(self.z[2]) * grad_A2
        grad_W[2] = grad_Z2 @ self.a[1].T
        grad_b[2] = np.sum(grad_Z2, axis=1, keepdims=True)
        
        # grad for layer 1
        grad_A1 = self.list_W[2].T @ grad_Z2
        grad_Z1 = self.grad_relu(self.z[1]) * grad_A1
        grad_W[1] = grad_Z1 @ self.a[0].T
        grad_b[1] = np.sum(grad_Z1, axis=1, keepdims=True)
        
        # update W, b
        self.list_W[3] -= self.learning_rate * grad_W[3]
        self.list_b[3] -= self.learning_rate * grad_b[3]
        self.list_W[2] -= self.learning_rate * grad_W[2]
        self.list_b[2] -= self.learning_rate * grad_b[2]
        self.list_W[1] -= self.learning_rate * grad_W[1]
        self.list_b[1] -= self.learning_rate * grad_b[1]
        
        return
    
    def fit(self, X: np.ndarray, y: np.ndarray, epochs=1):
        """
        fit
        :param self: 
        :param X: (n, m): n features, m samples 
        :param y: (m,): m labels
        :param epochs: 
        :param learning_rate: 
        :return: 
        """
        for epoch in range(epochs):
            y_pred = self.forward(X=X)
            loss = self.compute_loss(y, y_pred)
            self.backward(X, y, y_pred)
            if (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch+1:4d}, Loss: {loss}")
            pass
        return
    
    def evaluate(self, X_test: np.ndarray, y_test: np.ndarray):
        """
        :param X_test: matrix(n, m) n - number of rows, m - features
        :param Y_test: vector(n,)
        :return: [lost, accuracy] 
        """
        X_test = flat_data(X_test).transpose()
        A = X_test
        # print(f"A:{A.shape}")
        A = self.forward(A)
        Y_pred = softmax(A)
        loss = sparse_categorical_crossentropy(y_test, Y_pred)
        # print(f"{Y_test.shape} {np.argmax(Y_pred[Y_test, np.arange(Y_test.shape[0])], keepdims=True)}")
        y_ans = np.argmax(Y_pred, axis=0, keepdims=False)
        # print(f"Y_ans:{Y_ans.shape}")
        accuracy = accuracy_score(y_test, y_ans)
        return loss, accuracy
    pass

In [29]:
class ManualNN2 (object):
    """
    Layers:
    - (units=32, activation=tf.keras.activations.relu),
    - (units=128, activation=tf.keras.activations.relu),
    - (units=10, activation=tf.keras.activations.linear)
    Loss: softmax + SparseCategoricalCrossEntropy
    Optimizer: basicGD
    """
    def glorot_normal(self, shape, n_in, n_out):
        stddev = np.sqrt(2 / (n_in + n_out))
        return np.random.normal(loc=0, scale=stddev, size=shape)

    def glorot_uniform(self, shape, n_in, n_out):
        limit = np.sqrt(6 / (n_in + n_out))
        return np.random.uniform(low=-limit, high=limit, size=shape)

    def __init__(self, input_size: int, output_size: int, learning_rate=1e-2):
        self.list_W = [None]
        self.list_b = [None]
        self.learning_rate = learning_rate
        # - (units=32, activation=tf.keras.activations.relu),
        # W = np.random.randn(32, input_size) * learning_rate
        W = self.glorot_uniform(shape=(32, input_size), n_in=input_size, n_out=output_size)
        b = np.zeros((32, 1))
        self.list_W.append(W)
        self.list_b.append(b)
        # - (units=128, activation=tf.keras.activations.relu),
        # W = np.random.randn(128, 32) * learning_rate
        W = self.glorot_uniform(shape=(128, 32), n_in=input_size, n_out=output_size)
        b = np.zeros((128, 1))
        self.list_W.append(W)
        self.list_b.append(b)
        # - (units=10, activation=tf.keras.activations.linear)
        # W = np.random.randn(output_size, 128) * learning_rate
        W = self.glorot_uniform(shape=(output_size, 128), n_in=input_size, n_out=output_size)
        b = np.zeros((output_size, 1))
        self.list_W.append(W)
        self.list_b.append(b)
        return

    def linear(self, x): return x

    def relu(self, x): return np.maximum(0, x)

    def grad_relu(self, x): return np.where(x > 0, 1, 0) #return x > 0

    def softmax(self, x):
        # exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        # return exp_x / np.sum(exp_x, axis=1, keepdims=True)
        x_max = np.max(x, axis=-1, keepdims=True)
        x_exp = np.exp(x - x_max)
        # x_exp = np.exp(x)
        x_sum = np.sum(x_exp, axis=-1, keepdims=True)
        return x_exp / x_sum

    def compute_loss(self, y_true, y_pred):
        selector = (y_true, np.arange(y_true.shape[0]))
        value = y_pred[selector]
        # print(f"loss: {value} <=0:{np.any(value <= 0)} isnan:{np.any(np.isnan(value))}")
        loss = -np.log(value + 1e-7)
        return np.mean(loss)

    def forward(self, X):
        self.a = []
        self.z = [0]    # default z0 is not use
        # input layer: a[0] = X
        self.a.append(X)
        # layer 1
        tmp_z = self.list_W[1] @ self.a[0] + self.list_b[1]
        tmp_a = self.relu(tmp_z)
        self.z.append(tmp_z)
        self.a.append(tmp_a)
        # layer 2
        tmp_z = self.list_W[2] @ self.a[1] + self.list_b[2]
        tmp_a = self.relu(tmp_z)
        self.z.append(tmp_z)
        self.a.append(tmp_a)
        # layer 3
        tmp_z = self.list_W[3] @ self.a[2] + self.list_b[3]
        tmp_a = self.softmax(tmp_z) #self.linear(tmp_z)
        self.z.append(tmp_z)
        self.a.append(tmp_a)
        return self.a[-1]

    def backward(self, X, y_true, y_pred):
        """
        
        :param self: 
        :param X: (n, m): n features, m samples 
        :param y_true: (m,): m labels 
        :param y_pred: (n, m): predicted labels
        :return: 
        """
        m = X.shape[1]
        grad_W: list = [None] * 4
        grad_b: list = [None] * 4
        grad_A: list = [None] * 4
        grad_Z: list = [None] * 4

        # y_pred - y_true
        grad_Z[3] = y_pred
        grad_Z[3][y_true, range(m)] -= 1
        grad_Z[3] /= m

        # grad for layer 3
        # grad_W[3] = grad_Z3 @ dZ3/dW[3] = grad_Z3 @ d(W[3]A[2] + b[3])/dW[3]
        grad_W[3] = grad_Z[3] @ self.a[2].T
        # grad_b[3] = grad_Z3 @ np.ones(shape=(grad_Z3.shape[1], 1))
        grad_b[3] = np.sum(grad_Z[3], axis=1, keepdims=True)
        
        # grad for layer 2
        # grad_A2 = grad_Z3 @ dZ3/dA2 = grad_Z3 @ d(W[3]A[2] + b[3])/dA2 
        grad_A[2] = self.list_W[3].T @ grad_Z[3]
        # grad_Z2 = grad_Z3 * dA2/dZ2 = grad_relu(Z2) * grad_A2
        grad_Z[2] = self.grad_relu(self.z[2]) * grad_A[2]
        grad_W[2] = grad_Z[2] @ self.a[1].T
        grad_b[2] = np.sum(grad_Z[2], axis=1, keepdims=True)
        
        # grad for layer 1
        grad_A[1] = self.list_W[2].T @ grad_Z[2]
        grad_Z[1] = self.grad_relu(self.z[1]) * grad_A[1]
        grad_W[1] = grad_Z[1] @ self.a[0].T
        grad_b[1] = np.sum(grad_Z[1], axis=1, keepdims=True)

        self.list_W[3] -= self.learning_rate * grad_W[3]
        self.list_b[3] -= self.learning_rate * grad_b[3]
        self.list_W[2] -= self.learning_rate * grad_W[2]
        self.list_b[2] -= self.learning_rate * grad_b[2]
        self.list_W[1] -= self.learning_rate * grad_W[1]
        self.list_b[1] -= self.learning_rate * grad_b[1]

        return

    def fit(self, X: np.ndarray, y: np.ndarray, epochs=1):
        """
        fit
        :param self: 
        :param X: (n, m): n features, m samples 
        :param y: (m,): m labels
        :param epochs: 
        :param learning_rate: 
        :return: 
        """
        for epoch in range(epochs):
            y_pred = self.forward(X_treated)
            loss = self.compute_loss(y, y_pred)
            self.backward(X, y, y_pred)
            if (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch+1:4d}, Loss: {loss}")
            pass
        return

    def evaluate(self, X_test: np.ndarray, y_test: np.ndarray):
        """
        :param X_test: matrix(n, m) n - number of rows, m - features
        :param Y_test: vector(n,)
        :return: [lost, accuracy] 
        """
        X_test = flat_data(X_test).transpose()
        A = X_test
        # print(f"A:{A.shape}")
        A = self.forward(A)
        Y_pred = softmax(A)
        loss = sparse_categorical_crossentropy(y_test, Y_pred)
        # print(f"{Y_test.shape} {np.argmax(Y_pred[Y_test, np.arange(Y_test.shape[0])], keepdims=True)}")
        y_ans = np.argmax(Y_pred, axis=0, keepdims=False)
        # print(f"Y_ans:{Y_ans.shape}")
        accuracy = accuracy_score(y_test, y_ans)
        return loss, accuracy
    pass

In [38]:
X_treated = flat_data(X_train_20).transpose()
nn = ManualNN2(input_size=X_treated.shape[0], output_size=10, learning_rate=1e-3)
nn.fit(X_treated, Y_train_20, epochs=100)

Epoch   10, Loss: 9.394127909503426
Epoch   20, Loss: 9.393994215551787
Epoch   30, Loss: 9.39386151523321
Epoch   40, Loss: 9.39372954715865
Epoch   50, Loss: 9.393598434994782
Epoch   60, Loss: 9.393467920440175
Epoch   70, Loss: 9.393338194143345
Epoch   80, Loss: 9.393208985834102
Epoch   90, Loss: 9.393080055816034
Epoch  100, Loss: 9.392951589294913


In [39]:
rs2 = nn.evaluate(X_test_20, Y_test_20)
rs2

(8.698915236367096, 0.08683333333333333)

\begin{align}
\frac{\partial \mathcal{L} }{ \partial W^{[2]} } &= 
\frac{1}{m}\left(A^{[2]}-Y\right)\left(A^{[1]}\right)^T,\\
\frac{\partial \mathcal{L} }{ \partial b^{[2]} } &= 
\frac{1}{m}\left(A^{[2]}-Y\right)\mathbf{1},\\
\frac{\partial \mathcal{L} }{ \partial W^{[1]}} &= \frac{1}{m}\left(\left(W^{[2]}\right)^T \left(A^{[2]} - Y\right)\cdot \left(A^{[1]}\cdot\left(1-A^{[1]}\right)\right)\right)X^T,\\
\frac{\partial \mathcal{L} }{ \partial b^{[1]}} &= \frac{1}{m}\left(\left(W^{[2]}\right)^T \left(A^{[2]} - Y\right)\cdot \left(A^{[1]}\cdot\left(1-A^{[1]}\right)\right)\right)\mathbf{1},\\
\tag{15}
\end{align}

In [32]:
a = np.array([[1, 2, 3, 3],
              [4, 5, 6, 6],
              [7, 8, 9, 9]])
b = np.array([1, 2, 3])
a[np.arange(a.shape[0]), b]
# a

array([2, 6, 9])

In [33]:
# Sample
model = tf.keras.models.Sequential(layers=[
    tf.keras.layers.Flatten(input_shape=(28, 28,)),
    tf.keras.layers.Dense(units=32, activation=tf.keras.activations.relu),
    tf.keras.layers.Dense(units=128, activation=tf.keras.activations.relu),
    # tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(units=10, activation=tf.keras.activations.linear)
    # tf.keras.layers.Dense(units=10, activation=tf.keras.activations.softmax)
])
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3), #tf.keras.optimizers.RMSprop(learning_rate=1e-3),
              loss=loss_fn,
              metrics=["accuracy"])
# model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-3), #tf.keras.optimizers.RMSprop(learning_rate=1e-3),
#               loss=loss_fn,
#               metrics=["accuracy"])

In [36]:
model.fit(x=X_train_20, y=Y_train_20, epochs=100, batch_size=Y_train_20.shape[0], workers=8, use_multiprocessing=True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x13f9d4910>

In [37]:
rs3 = model.evaluate(X_test_20, Y_test_20, verbose=0)
rs3  # [lost, accuracy]

[2.293394088745117, 0.1536666601896286]

In [None]:
def show_image(img_data: np.ndarray) -> tuple:
    fig, axes = plt.subplots(figsize=(1.60, 1.20))
    axes.imshow(X=img_data, cmap="gray")
    return fig, axes

# print(y_test[5854])
show_image(X_test[4823])
plt.show()

In [None]:
show_image(X_train[0])

### 5.2. Linear and Activation Function:

### 5.3. Loss/Cost Function

### 5.4. Optimizer

In [None]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
# tf.config.experimental_connect_to_cluster(resolver)
# # This is the TPU initialization code that has to be at the beginning.
# tf.tpu.experimental.initialize_tpu_system(resolver)
# print("All devices: ", tf.config.list_logical_devices('TPU'))