In [45]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import time
import seaborn as sns
import math


In [46]:
def timer(func):
    def wrapper(*args, **kwargs):
        print('Start time: ', time.ctime())
        start_time = time.time()  # start time

        result = func(*args, **kwargs)  # run

        end_time = time.time()  # end time
        print('End time: ', time.ctime())
        print(f"{func.__name__} executed in {(end_time - start_time):.4f} seconds")
        return result
    return wrapper

### Pre-process

Min-max normalization:

$$x_{min-max} = {{x-min(x)}\over{max(x)-min(x)}}$$

Standardization:

$$x_{norm} = {{x-\mu}\over{\sigma}}$$

In [47]:
def pre_processing(X, mode=None):
    if mode == 'min-max':
        print('Pre-process: min-max normalization')
        min_each_feature = np.min(X, axis=0)
        max_each_feature = np.max(X, axis=0)
        scale = max_each_feature - min_each_feature
        scale[scale == 0] = 1   # To avoid divided by 0
        scaled_train = (X - min_each_feature) / scale
        return scaled_train

    if mode == 'standardization':
        print('Pre-process: standardization')
        std_each_feature = np.std(X, axis=0)
        mean_each_feature = np.mean(X, axis=0)
        std_each_feature[std_each_feature == 0] = 1     # To avoid divided by 0
        norm_train = (X - mean_each_feature) / std_each_feature
        norm_test = (X - mean_each_feature) / std_each_feature
        return norm_train

    print('No pre-process')

    return X

In [48]:
def accuracy(y_hat,y):
    '''
    y_hat : predicted value
    :param y_hat: [batch_size,num_of_class]
    :param y: [batch_size,1
    :return: 
    '''
    preds=y_hat.argmax(axis=1,keepdims=True)
    return np.mean(preds == y)*100

In [49]:
def calculate_gain(nonlinearity, param=None):
    if nonlinearity == 'sigmoid':
        return 1
    elif nonlinearity == 'tanh':
        return 5.0 / 3
    elif nonlinearity == 'relu':
        return math.sqrt(2.0)
    elif nonlinearity == 'leaky_relu':
        if param is None:
            negative_slope = 0.01
        elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float):
            # True/False are instances of int, hence check above
            negative_slope = param
        else:
            raise ValueError(f"negative_slope {param} not a valid number")
        return math.sqrt(2.0 / (1 + negative_slope ** 2))
    elif nonlinearity == 'selu':
        return 3.0 / 4  # Value found empirically (https://github.com/pytorch/pytorch/pull/50664)
    else:
        raise ValueError(f"Unsupported nonlinearity {nonlinearity}")

def _calculate_fan_in_and_fan_out(array):
    dimensions = len(array.shape)
    if dimensions < 2:
        raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions")

    num_input_fmaps = array.shape[1]
    num_output_fmaps = array.shape[0]
    receptive_field_size = 1
    if dimensions > 2:
        # math.prod is not always available, accumulate the product manually
        # we could use functools.reduce but that is not supported by TorchScript
        for s in array.shape[2:]:
            receptive_field_size *= s
    fan_in = num_input_fmaps * receptive_field_size
    fan_out = num_output_fmaps * receptive_field_size

    return fan_in, fan_out

def _calculate_correct_fan(array, mode):
    mode = mode.lower()
    valid_modes = ['fan_in', 'fan_out']
    if mode not in valid_modes:
        raise ValueError(f"Mode {mode} not supported, please use one of {valid_modes}")

    fan_in, fan_out = _calculate_fan_in_and_fan_out(array)
    return fan_in if mode == 'fan_in' else fan_out

def kaiming_normal_(array: np.array, a: float = 0, mode: str = 'fan_in', nonlinearity: str = 'relu'):
    fan = _calculate_correct_fan(array, mode)
    gain = calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    return np.random.normal(0, std, array.shape)

In [50]:
# class Parameter(object):
#     """Parameter class for saving data and gradients"""
#     def __init__(self, data, requires_grad, skip_decay=False):
#         self.data = data
#         self.grad = None
#         self.skip_decay = skip_decay
#         self.requires_grad = requires_grad

In [51]:
class Layer(object):
    def __init__(self, name, requires_grad=False):
        self.name = name
        self.requires_grad = requires_grad

    def _forward(self, *args):
        pass

    def _backward(self, *args):
        pass

In [52]:
class ReLU(Layer):
    def __init__(self, name, requires_grad=False):
        super().__init__(name, requires_grad)

    def _forward(self, x):
        self.x = x
        return np.maximum(0, x)

    def _backward(self, gradient_output):
        gradient_output[self.x <= 0] = 0
        return gradient_output

Forward:

$$\mathbf{y} = \mathbf{xW} + \mathbf{b}$$

Backward:

$$\frac{\partial L}{\partial \mathbf{x}} = \frac{\partial L}{\partial \mathbf{y}} \frac{\partial \mathbf{y}}{\partial \mathbf{x}} = \frac{\partial L}{\partial \mathbf{y}} \mathbf{W}^T$$

Gradient of W:

$$\frac{\partial L}{\partial \mathbf{W}} = \frac{\partial L}{\partial \mathbf{y}} \frac{\partial \mathbf{y}}{\partial \mathbf{W}} = \frac{\partial L}{\partial \mathbf{y}} \mathbf{x}^T$$

Gradient of b:

$$\frac{\partial L}{\partial \mathbf{b}} = \frac{\partial L}{\partial \mathbf{y}} $$

Gradient of x:
$$\frac{\partial L}{\partial \mathbf{x}} = \frac{\partial L}{\partial \mathbf{y}} \mathbf{W}^T$$

In [53]:
class FCLayer(Layer):
    def __init__(self, name: str, n_in: int, n_out: int, skip_decay=False) -> None:
        '''
        Weight matrix W is of shape (n_in,n_out)
        and the bias vector b is of shape (n_out,)
        :param n_in: dimensionality of input
        :param n_out: number of hidden units
        '''
        super().__init__(name, requires_grad=True)
        self.n_in = n_in
        self.n_out = n_out
        W = kaiming_normal_(np.array([0] * n_in * n_out).reshape(n_in, n_out), a=math.sqrt(5))
        self.W = W
        self.b = np.zeros(self.n_out)
        self.W_grad = None
        self.b_grad = None
        self.skip_decay = skip_decay

    def _forward(self, x: np.ndarray) -> np.ndarray:
        """
            x: [batch size, n_in]
            W: [n_in, n_out]
            b: [n_out]
        """
        self.x = x
        temp = x @ self.W + self.b
        # [batch_size,n_in] @ [n_in,n_out] + [n_output] => [batch_size,n_out]
        return x @ self.W + self.b

    def _backward(self, delta: np.ndarray) -> np.ndarray:
        '''
        delta: the gradient of the loss function respect to this layer's output 这层损失函数对于这层输出的梯度
        :param delta: [batch size, n_out]:
        :return:
        '''
        batch_size = delta.shape[0]
        self.W_grad = self.x.T @ delta / batch_size  # [batch_size,n_in]^T @ [batch size, n_out] => [n_in,n_out]
        self.b_grad = delta.sum(axis=0) / batch_size  # divide by batch size to get average of gradient
        return delta @ self.W.T  # return the gradient of input(x) back to last layer

### Softmax 
Formula:
$$softmax(z_i) = \frac{e^{z_i}}{\sum_{j=1}^{n} e^{z_j}}$$

不需要计算 softmax 的完整 Jacobian 矩阵，因为与交叉熵结合后公式极大简化了。
只需用 preds - ground_truth 作为梯度，这个计算在 CrossEntropyLoss 里完成了：
self.grad = preds - ground_truth
因此，在 softmax.backward(gradient_output) 时，不需要额外计算 softmax 的梯度，而是直接返回 gradient_output


In [54]:
class Softmax(Layer):
    def __init__(self,name,requires_grad=False):
        super().__init__(name,requires_grad)
    def _forward(self, x: np.ndarray) -> np.ndarray:
        x_exp =  np.exp(x - np.max(x, axis=1, keepdims=True))
        return x_exp/x_exp.sum(axis=1, keepdims=True)
    def _backward(self, gradient_output: np.ndarray) -> np.ndarray:
        return gradient_output

### Loss Function - Cross Entropy
Formula:
$$CrossEntropy= - \sum_{i=1}^{n} y_i log(\hat {y_i})$$

Gradient of softmax:

$$\frac{\partial L}{\partial z_k} = \sum_{i}^{c} \left( \frac{\partial L}{\partial \hat{y}_i} \frac{\partial \hat{y}_i}{\partial z_k} \right)$$

$$\frac{\partial L}{\partial \hat{y}_i} = - \frac{y_i}{\hat{y}_i}, \qquad \frac{\partial \hat{y}_i}{\partial z_k} = \begin{cases}
\hat{y}_i(1 - \hat{y}_i) & \text{if } i = k \\
-\hat{y}_k\hat{y}_i & \text{if } i \neq k
\end{cases}
$$

$$\frac{\partial L}{\partial z_k} = - \left( (y_k(1 - \hat{y}_k)) - \sum_{i \neq k}^{c} y_i \hat{y}_k \right) = -(y_k - \hat{y}_k \sum_{i}^{c} y_i) = \hat{y}_k - y_k
$$

$$=> \frac{\partial L}{\partial z} = \hat{y} - y$$


In [55]:
class CrossEntropy(object):
    def __init__(self):
        self.softmax = Softmax('softmax')

    def __call__(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        '''

        :param x:
        :param y: [batch_size, 1]
        :return:
        '''
        self.batch_size = x.shape[0]
        self.class_num = x.shape[1]

        y_hat = self.softmax._forward(x) #[batch_size,num_class]

        y=self.one_hot_encoding(y)
        self.grad = y_hat - y

        loss = -1 * (y * np.log(y_hat + 1e-8)).sum() / self.batch_size  # to avoid divided by 0
        return loss

    def one_hot_encoding(self, x):
        one_hot_encoded = np.zeros((self.batch_size, self.class_num))
        one_hot_encoded[np.arange(x.shape[0]), x.flatten()] = 1
        return one_hot_encoded

In [56]:
class MLP_V2():
    def __init__(self):
        self.layers = [
            FCLayer('fc1', n_in=128, n_out=512),
            # Dropout('dropout1', 0.6),
            # ReLU('relu1'),
            # FCLayer('fc2', n_in=512, n_out=256),
            # Dropout('dropout2', 0.4),
            BatchNormalization("batchnorm1",feature_num=512),
            # ReLU('relu2'),
            FCLayer('fc2', n_in=512, n_out=128),
            ReLU('relu2'),
            FCLayer('fc3', n_in=128, n_out=10)
        ]
        self.parameters = []
        for layer in self.layers:
            if hasattr(layer, "W"):
                # 将每个参数及其梯度、skip_decay 作为引用添加到 parameters 中
                self.parameters.append([layer.W, layer.W_grad, layer.skip_decay])
            if hasattr(layer, "b"):
                self.parameters.append([layer.b, layer.b_grad, layer.skip_decay])
            if hasattr(layer, "gamma"):
                self.parameters.append([layer.gamma, layer.gamma_grad, layer.skip_decay])
            if hasattr(layer, "beta"):
                self.parameters.append([layer.beta, layer.beta_grad, layer.skip_decay])

    def _forward(self, x: np.ndarray) -> np.ndarray:
        for layer in self.layers:
            x= layer._forward(x)
        return x


    def _backward(self, gradient_output: np.ndarray) -> np.ndarray:
        for layer in self.layers[::-1]:
            gradient_output= layer._backward(gradient_output)
        return gradient_output

    def _fit(self,mode='train'):
        if mode=='train':
            for layer in self.layers:
                layer.train=True
        elif mode=='eval':
            for layer in self.layers:
                layer.train=False

### AdamW
Formula:

$$m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t$$

$$v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2$$

$$ \text{bias correction: } \ \   \hat{m}_t = \frac{m_t}{1 - \beta_1^t}, \hat{v}_t = \frac{v_t}{1 - \beta_2^t}$$

$$\theta_t = \theta_{t-1} - \alpha \cdot \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}$$

use decoupled weight decay:
$$
\theta_t = \theta_{t-1} - \eta \frac{\hat{m_t}}{\sqrt{\hat{v_t}} + \epsilon}
$$

$$
\theta_t = \theta_t (1 - \eta \lambda)
$$

In [57]:
class AdamW(object):
    '''
    the parameters have all the layers W and b
    '''

    def __init__(self, model, lr=1e-3, decoupled_weight_decay=0, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.model = model
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.decoupled_weight_decay = decoupled_weight_decay
        self.epsilon = epsilon
        self.t = 0
        self.m = [np.zeros(p[0].shape) for p in self.get_parameters()]
        self.v = [np.zeros(p[0].shape) for p in self.get_parameters()]

    def get_parameters(self):
        parameters = []
        for layer in self.model.layers:
            if hasattr(layer, "W"):
                parameters.append([layer.W, layer.W_grad, layer.skip_decay])
            if hasattr(layer, "b"):
                parameters.append([layer.b, layer.b_grad, layer.skip_decay])
            if hasattr(layer, "gamma"):
                parameters.append([layer.gamma, layer.gamma_grad, layer.skip_decay])
            if hasattr(layer, "beta"):
                parameters.append([layer.beta, layer.beta_grad, layer.skip_decay])
        return parameters

    def step(self):
        parameters = self.get_parameters()  # 动态获取最新的参数
        for i, (param_list, m, v) in enumerate(zip(parameters, self.m, self.v)):
            param, param_grad, skip_decay = param_list
            self.t += 1

            # 计算动量和方差
            m = self.beta1 * m + (1 - self.beta1) * param_grad
            v = self.beta2 * v + (1 - self.beta2) * np.power(param_grad, 2)

            # 更新 m 和 v
            self.m[i] = m
            self.v[i] = v

            # 计算偏差修正后的 m 和 v
            m_hat = m / (1 - np.power(self.beta1, self.t))
            v_hat = v / (1 - np.power(self.beta2, self.t))

            # 更新参数
            update = self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)

            # 如果 skip_decay 为 True, 不应用权重衰减
            if not skip_decay:
                param -= update
                param *= (1 - self.lr * self.decoupled_weight_decay)  # 权重衰减
            else:
                param -= update  # 仅应用 Adam 更新步骤

### SGD with Momentum
SGD Formula:
$$θ_{t+1}=θ_t−\eta  \cdot ∇L(θ_t)$$

Momentum 梯度下降的公式如下：
$$
\begin{equation}
v_t = \beta v_{t-1} - \eta \nabla L(\theta_t)
\end{equation}

\begin{equation}
\theta_{t+1} = \theta_t + v_t
\end{equation}
$$

其中：
$$
    \( v_t \) 是当前动量\\
    \( \beta \) 是动量系数（通常取 0.9）\\
    \( \eta \) 是学习率\\
    \( \nabla L(\theta_t) \) 是损失函数对参数的梯度
$$

In [58]:
class SGDMomentum:
    def __init__(self, model, lr=0.01, momentum=0.9, weight_decay=0.0001):
        self.model = model  # 引用模型
        self.lr = lr
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.v = [np.zeros(param[0].shape) for param in self.model.parameters]
    def get_parameters(self):
        # 直接从模型获取最新的参数
        parameters = []
        for layer in self.model.layers:
            if hasattr(layer, "W"):
                parameters.append([layer.W, layer.W_grad, layer.skip_decay])
            if hasattr(layer, "b"):
                parameters.append([layer.b, layer.b_grad, layer.skip_decay])
            if hasattr(layer, "gamma"):
                parameters.append([layer.gamma, layer.gamma_grad, layer.skip_decay])
            if hasattr(layer, "beta"):
                parameters.append([layer.beta, layer.beta_grad, layer.skip_decay])
        return parameters

    def step(self):
        # 动态获取最新的模型参数
        self.parameters = self.get_parameters()
        # 直接从 model 中获取 parameters 和 gradients
        for i, (v, param_list) in enumerate(zip(self.v, self.parameters)):
            param, param_grad, skip_decay = param_list
            if param_grad is not None:
                if not skip_decay:
                    param -= self.weight_decay * param  # 应用权重衰减
                v[:] = self.momentum * v + self.lr * param_grad  # 更新动量
                self.v[i] = v
                param -= v  # 更新参数

### Batch Normalization
核心思想是：在每一层输入神经元的激活值进行归一化（normalization）然后进行缩放和平移，以保持模型的表达能力
Forward:
$$\hat x_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}}$$
$$\mathbf{y}=\gamma\frac{\mathbf{x}-E(\mathbf{x})}{\sqrt{\sigma^2_B+\epsilon}}+\beta = \gamma \hat {\mathbf{x}}+\beta$$

$E(\mathbf{x})$ is the mean of the current mini-batch, $\sigma^2_B$ is the variance of the current mini-batch

Backward:

m is the number of batch size



Gradient of $\gamma$:

$$\frac{\partial L}{\partial \gamma} = \frac{1}{m} \sum_{i=1}^{m} \frac{\partial L}{\partial y_i} \frac{\partial y_i}{\partial \gamma} = \frac{1}{m} \sum_{i=1}^{m} \frac{\partial L}{\partial y_i} \hat{x}_i
, \qquad where \ \ \hat{x}_i = \frac{x_i - \mu}{\sqrt{(\sigma^2)+\epsilon}}
$$
Gradient of $\beta$:

$$\frac{\partial L}{\partial \beta} = \frac{1}{m} \sum_{i=1}^{m} \frac{\partial L}{\partial y_i} \frac{\partial y_i}{\partial \beta} = \frac{1}{m} \sum_{i=1}^{m} \frac{\partial L}{\partial y_i}=\frac{1}{m}\sum_{i=1}^{m} gradient\_output$$

Gradient of $x_i$:

$$\frac{\partial L}{\partial \sigma^2} = \frac{\partial L}{\partial y_i} \cdot \frac{\partial y_i}{\partial \hat {x_i}} \cdot \frac{\partial \hat {x_i}}{\partial \sigma^2}\\=\frac{\partial L}{\partial y_i} \cdot \gamma \cdot -\frac{1}{2} \hat{x_i} (\sigma^2+\epsilon)^{-1}\\$$

to simplify the formula let $\sigma = \sqrt{\sigma^2+\epsilon}$ so the final formula is :
$$\frac{\partial L}{\partial \sigma^2} = -\frac{\gamma}{\sigma} \sum_{i=1}^{m} \hat {x_i}\frac{\partial L}{\partial \gamma} $$
$$\frac{\partial \mu}{\partial x_i}=\frac{1}{m}$$
$$\frac{\partial \sigma^2}{\partial x_i}=\frac{2}{m}(x_i-\mu)$$
$$\frac{\partial \hat {x_i}}{\partial x_i}=\frac{1}{\sigma} - \frac{1}{m}\sum_{j=1}^{m}\frac{1}{\sigma}$$

$$\frac{\partial L}{\partial \hat{x_i}}= \gamma \cdot  gradient\_output_i$$
$$\frac{\partial L}{\partial x_i} = \frac{\partial L}{\partial \hat x_i} \cdot \frac{\partial \hat x_i}{\partial x_i} + \frac{\partial L}{\partial \sigma^2} \cdot \frac{\partial \sigma^2}{\partial x_i} + \frac{\partial L}{\partial \mu} \cdot \frac{\partial \mu}{\partial x_i} \\=\frac{\gamma}{\sigma} \left( \frac{\partial L}{\partial y_i} - \frac{1}{m} \hat{x}_i \frac{\partial L}{\partial \gamma} - \frac{\partial L}{\partial \beta}  \right)$$

In [59]:
class BatchNormalization(Layer):
    def __init__(self, name, feature_num,skip_decay=True, epsilon=1e-5, requires_grad=True):
        super().__init__(name)
        self.epsilon = epsilon
        self.requires_grad = requires_grad
        self.skip_decay = skip_decay
        self.gamma = np.ones(feature_num)
        self.beta = np.zeros(feature_num)

        self.gamma_grad = None
        self.beta_grad = None

        self.ema = np.zeros(feature_num)
        self.emv = np.zeros(feature_num)

    def _forward(self, x: np.ndarray) -> np.ndarray:
        '''
        x: [batch_size,feature number]
        gamma: [feature number]
        beta: [feature number]
        :param x:
        :return:
        '''
        if self.train:
            batch_mean = x.mean(axis=0)
            batch_variance = x.var(axis=0)
            batch_std = np.sqrt(batch_variance + self.epsilon)
            # record exponential moving average and variance to
            momentum = 0.9
            self.ema = momentum * self.ema + (1 - momentum) * batch_mean
            self.emv = momentum * self.emv + (1 - momentum) * batch_variance
        else:
            batch_mean = self.ema.data
            batch_std = np.sqrt(self.emv + self.epsilon)
        self.norm = (x - batch_mean) / batch_std
        self.gamma_norm = self.gamma / batch_std

        return self.gamma * self.norm + self.beta

    def _backward(self, gradient_output: np.ndarray) -> np.ndarray:
        # make sure that gradient_output is the gradient of next layer, indicating that the gradient of loss about y
        batch_size = gradient_output.shape[0]
        self.gamma_grad = (gradient_output * self.norm).sum(axis=0) / batch_size
        self.beta_grad = gradient_output.sum(axis=0) / batch_size
        dLdx = self.gamma_norm * (gradient_output - self.norm * self.gamma_grad - self.beta_grad)
        return dLdx

In [60]:
class AverageMeterics(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [61]:
class Adam:
    pass


class CosineLR:
    pass


class Trainer(object):
    def __init__(self,config,model=None,train_loader=None,valid_loader=None):
        self.config=config
        self.epochs = self.config['epoch']
        self.lr=self.config['lr']
        self.model=model
        self.train_loader=train_loader
        self.valid_loader=valid_loader
        self.print_freq=self.config['print_freq']
        # self.scheduler= self.config['scheduler']
        self.train_precision=[]
        self.valid_precision=[]
        self.train_loss=[]
        self.valid_loss=[]
        self.criterion=CrossEntropy()
        if self.config['optimizer'] == 'sgd':
            self.optimizer = SGDMomentum(self.model, self.lr, self.config['momentum'],
                                         self.config['weight_decay'])
        elif self.config['optimizer'] == 'adamw':
            self.optimizer = AdamW(self.model, self.lr, self.config['weight_decay'])
        # if self.scheduler == 'cos':
        #     self.train_scheduler = CosineLR(self.optimizer, T_max=self.epochs)
    @timer
    def train(self):
        best_accuracy=0
        for epoch in range(self.epochs):
            print('current lr {:.5e}'.format(self.optimizer.lr))
            self.train_per_epoch(epoch)
            acc1 = self.validate(epoch)

            # remember best prec@1
            best_acc1 = max(acc1, best_accuracy)
            output_best = 'Best Prec@1: %.3f\n' % (best_acc1)
            print(output_best)
    def train_per_epoch(self,epoch):
        batch_time=AverageMeterics()
        losses=AverageMeterics()
        top1=AverageMeterics()        
        self.model._fit()
        end_time = time.time()
        for i,(X,y) in enumerate(self.train_loader):
            y_hat=self.model._forward(X)
            loss=self.criterion(y_hat,y)
            
            self.model._backward(self.criterion.grad)
            self.optimizer.step()
            precision=accuracy(y_hat,y)
            losses.update(loss,X.shape[0])
            top1.update(precision,X.shape[0])
            
            batch_time.update(time.time() - end_time)
            end_time = time.time()
            if (i%self.print_freq ==0) or (i==len(self.train_loader)-1):
                print('Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                        epoch + 1, i, len(self.train_loader) - 1, batch_time=batch_time,
                        loss=losses, top1=top1))
        print('EPOCH: {epoch} {flag} Results: Prec@1 {top1.avg:.3f} Loss: {losses.avg:.4f}'.format(epoch=epoch + 1 , flag='train', top1=top1, losses=losses))
        self.train_loss.append(losses.avg)
        self.train_precision.append(top1.avg)
    def validate(self, epoch):
        batch_time = AverageMeterics()
        losses = AverageMeterics()
        top1 = AverageMeterics()

        self.model._fit(mode='test')

        end = time.time()
        for i, (X, y) in enumerate(self.valid_loader):
            # compute output
            y_hat = self.model._forward(X)
            loss = self.criterion(y_hat, y)

            # measure accuracy and record loss
            precision = accuracy(y_hat, y)
            losses.update(loss, X.shape[0])
            top1.update(precision, X.shape[0])

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if (i % self.print_freq == 0) or (i == len(self.valid_loader) - 1):
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Accuracy {top1.val:.3f} ({top1.avg:.3f})'.format(
                    i, len(self.valid_loader) - 1, batch_time=batch_time, loss=losses,
                    top1=top1))

        print('EPOCH: {epoch} {flag} Results: Accuracy {top1.avg:.3f} Loss: {losses.avg:.4f}'.format(epoch=epoch + 1,
                                                                                                   flag='val',
                                                                                                   top1=top1,
                                                                                                   losses=losses))
        self.valid_loss.append(losses.avg)
        self.valid_precision.append(top1.avg)

        return top1.avg

In [62]:
class Dropout(Layer):
    def __init__(self, name, drop_rate=0.5, requires_grad=False):
        super().__init__(name, requires_grad)
        self.drop_rate = drop_rate
        self.fix_value = 1 / (1 - self.drop_rate)   # to keep average fixed

    def _forward(self, x):
        if self.train:
            self.mask = np.random.uniform(0, 1, x.shape) > self.drop_rate
            return x * self.mask * self.fix_value
        else:
            return x

    def _backward(self, grad_output):
        if self.train:
            return grad_output * self.mask
        else:
            return grad_output

In [63]:
class Dataloader(object):
    def __init__(self, X, y, batch_size, shuffle=True, seed=None):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.seed = seed
        self.index = np.arange(X.shape[0])

    def __iter__(self):
        if self.shuffle:
            if self.seed is not None:
                np.random.seed(self.seed)
            np.random.shuffle(self.index)
        self.n = 0
        return self

    def __next__(self):
        if self.n >= len(self.index):
            raise StopIteration

        index = self.index[self.n:self.n + self.batch_size]
        batch_X = self.X[index]
        batch_y = self.y[index]
        self.n += self.batch_size

        return batch_X, batch_y

    def __len__(self):
        """
            num of batch
        """
        return (len(self.index) + self.batch_size - 1) // self.batch_size  # ceiling

In [64]:

batch_size=1024
config={'lr': 0.01,'batch_size': batch_size,'momentum': 0.9,'weight_decay': 5e-4,'seed': 0,'epoch': 200,
    'optimizer': 'adamw',     # adam, sgd
    'scheduler': None,      # cos, None
    'pre-process': 'standardization',      # min-max, standardization, None
    'print_freq': 50000 // batch_size // 5
}
np.random.seed(config['seed'])

In [65]:
dir_path='E:\\Postgraduate\\25S1\\COMP5329\\Assignment\\Assignment1\\Assignment1-Dataset\\'
train_file='train_data.npy'
train_label_file='train_label.npy'
train_data=np.load(dir_path+train_file)
train_label=np.load(dir_path+train_label_file)
test_file='test_data.npy'
test_label_file='test_label.npy'

In [None]:
train_X=pre_processing(train_data,config['pre-process'])
train_dataloader=Dataloader(train_X, train_label, config['batch_size'], shuffle=True, seed=config['seed'])
test_X=np.load(dir_path+test_file)
test_label=np.load(dir_path+test_label_file)
test_X=pre_processing(test_X,config['pre-process'])
test_dataloader=Dataloader(test_X, test_label, config['batch_size'], shuffle=False, seed=config['seed'])

model = MLP_V2()
trainer=Trainer(config,model,train_dataloader,test_dataloader)
trainer.train()


Pre-process: standardization
Pre-process: standardization
Start time:  Wed Mar 26 21:15:47 2025
current lr 1.00000e-02
Epoch: [1][0/48]	Time 0.047 (0.047)	Loss 12.2224 (12.2224)	Prec@1 11.426 (11.426)
Epoch: [1][9/48]	Time 0.032 (0.027)	Loss 6.9798 (7.9511)	Prec@1 26.855 (25.039)
Epoch: [1][18/48]	Time 0.016 (0.026)	Loss 4.7496 (6.8890)	Prec@1 28.906 (26.336)
Epoch: [1][27/48]	Time 0.027 (0.026)	Loss 2.6217 (5.8386)	Prec@1 32.324 (27.560)
Epoch: [1][36/48]	Time 0.016 (0.025)	Loss 1.9379 (4.9444)	Prec@1 31.543 (27.972)
Epoch: [1][45/48]	Time 0.035 (0.025)	Loss 1.9493 (4.3600)	Prec@1 30.762 (28.072)
Epoch: [1][48/48]	Time 0.028 (0.025)	Loss 1.9407 (4.2170)	Prec@1 31.486 (28.288)
EPOCH: 1 train Results: Prec@1 28.288 Loss: 4.2170
Test: [0/9]	Time 0.003 (0.003)	Loss 1.8065 (1.8065)	Accuracy 34.961 (34.961)
Test: [9/9]	Time 0.000 (0.010)	Loss 1.8088 (1.8244)	Accuracy 33.673 (33.910)
EPOCH: 1 val Results: Accuracy 33.910 Loss: 1.8244
Best Prec@1: 33.910

current lr 1.00000e-02
Epoch: [2][0/4