# 520556528

# Import Libs

In [45]:
import numpy as np
import time
import math

# Load Datasets

# Utils

## Test array

In [46]:
test_array = np.random.randn(3, 4)
test_array

array([[-0.85672763,  1.89363852, -0.25110738, -1.09727424],
       [-0.23227008, -0.11473912, -1.69045798,  1.31190671],
       [ 1.44085915,  0.37027676, -0.38280027,  0.4089585 ]])

## Timer

In [47]:
def timer(func):
    def wrapper(*args, **kwargs):
        print('Start time: ', time.ctime())
        start_time = time.time()  # start time

        result = func(*args, **kwargs)  # run

        end_time = time.time()  # end time
        print('End time: ', time.ctime())
        print(f"{func.__name__} executed in {(end_time - start_time):.4f} seconds")
        return result
    return wrapper

@timer
def test_fun(x):
    time.sleep(x)

test_fun(1)

Start time:  Fri Mar 15 20:19:56 2024
End time:  Fri Mar 15 20:19:57 2024
test_fun executed in 1.0017 seconds


## Kaiming Init

Refer from https://github.com/pytorch/pytorch/blob/main/torch/nn/init.py.

Modify tensor to np.array

In [48]:
def calculate_gain(nonlinearity, param=None):
    r"""Return the recommended gain value for the given nonlinearity function.
    The values are as follows:

    ================= ====================================================
    nonlinearity      gain
    ================= ====================================================
    Linear / Identity :math:`1`
    Conv{1,2,3}D      :math:`1`
    Sigmoid           :math:`1`
    Tanh              :math:`\frac{5}{3}`
    ReLU              :math:`\sqrt{2}`
    Leaky Relu        :math:`\sqrt{\frac{2}{1 + \text{negative\_slope}^2}}`
    SELU              :math:`\frac{3}{4}`
    ================= ====================================================
    """
    
    if nonlinearity == 'sigmoid':
        return 1
    elif nonlinearity == 'tanh':
        return 5.0 / 3
    elif nonlinearity == 'relu':
        return math.sqrt(2.0)
    elif nonlinearity == 'leaky_relu':
        if param is None:
            negative_slope = 0.01
        elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float):
            # True/False are instances of int, hence check above
            negative_slope = param
        else:
            raise ValueError(f"negative_slope {param} not a valid number")
        return math.sqrt(2.0 / (1 + negative_slope ** 2))
    elif nonlinearity == 'selu':
        return 3.0 / 4  # Value found empirically (https://github.com/pytorch/pytorch/pull/50664)
    else:
        raise ValueError(f"Unsupported nonlinearity {nonlinearity}")

def _calculate_fan_in_and_fan_out(array):
    dimensions = len(array.shape)
    if dimensions < 2:
        raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions")

    num_input_fmaps = array.shape[1]
    num_output_fmaps = array.shape[0]
    receptive_field_size = 1
    if dimensions > 2:
        # math.prod is not always available, accumulate the product manually
        # we could use functools.reduce but that is not supported by TorchScript
        for s in array.shape[2:]:
            receptive_field_size *= s
    fan_in = num_input_fmaps * receptive_field_size
    fan_out = num_output_fmaps * receptive_field_size

    return fan_in, fan_out

def _calculate_correct_fan(array, mode):
    mode = mode.lower()
    valid_modes = ['fan_in', 'fan_out']
    if mode not in valid_modes:
        raise ValueError(f"Mode {mode} not supported, please use one of {valid_modes}")

    fan_in, fan_out = _calculate_fan_in_and_fan_out(array)
    return fan_in if mode == 'fan_in' else fan_out

def kaiming_normal_(array: np.array, a: float = 0, mode: str = 'fan_in', nonlinearity: str = 'leaky_relu'):
    fan = _calculate_correct_fan(array, mode)
    gain = calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    return np.random.normal(0, std, array.shape)
    

kaiming_normal_(np.array([0] * 30).reshape(5, 6))

array([[-0.45850532,  0.58140505,  0.29454185,  0.1586542 ,  0.71893752,
        -0.0909816 ],
       [ 0.13477889,  0.27962191, -0.11743695, -0.07979469,  0.12256677,
        -0.55592582],
       [-0.290316  , -0.08481382, -0.53662092,  0.0805716 ,  0.21629346,
        -0.8638151 ],
       [ 1.12419121,  1.25536453,  0.52773539,  0.26890837, -0.49224248,
         0.62645091],
       [ 0.11095123, -0.70820161,  0.66312001,  0.43883652, -0.61102968,
         0.24015082]])

## Parameter

In [49]:
class Parameter(object):
    def __init__(self, data, requires_grad, skip_decay=False):
        self.data = data
        self.grad = None
        self.skip_decay = skip_decay
        self.requires_grad = requires_grad

## AverageMeter

In [50]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

## Accuracy

In [51]:
def accuracy(output, target):
    preds = output.argmax(axis=-1, keepdims=True)
    return np.mean(preds == target)

# Layers

## Base layer

In [52]:
class Layer(object):
    def __init__(self, name, requires_grad=False):
        self.name = name 
        self.requires_grad = requires_grad
        
    def forward(self, *args):
        pass

    def backward(self, *args):
        pass

## Activation

### Relu

In [53]:
class relu(Layer):
    def __init__(self, name, requires_grad=False):
        super().__init__(name, requires_grad)

    def forward(self, input):
        self.input = input
        return np.maximum(0, input)
    
    def backward(self, grad_output):
        grad_output[self.input <= 0] = 0
        return grad_output
    

test_relu = relu('test_relu')
_ = test_relu.forward(test_array)
test_relu.backward(test_array)

array([[0.        , 1.89363852, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 1.31190671],
       [1.44085915, 0.37027676, 0.        , 0.4089585 ]])

### Sigmoid

In [54]:
class sigmoid(Layer):
    def __init__(self, name, requires_grad=False):
        super().__init__(name, requires_grad)
        
    def forward(self, input):
        self.y = 1. / (1. + np.exp(-input))   # save sigmoid for more convenient grad computation
        return self.y
    
    def backward(self, grad_output):
        return self.y * (1 - self.y) * grad_output

### Softmax 

In [55]:
class softmax(Layer):
    def __init__(self, name, requires_grad=False):
        super().__init__(name, requires_grad)
        
    def forward(self, input):
        """
            input.shape = [batch size, num_class]
        """
        x_max = input.max(axis=-1, keepdims=True)       # to avoid overflow
        x_exp = np.exp(input - x_max)
        return x_exp / x_exp.sum(axis=-1, keepdims=True)
    
    def backward(self, grad_output):
        # packaged in CrossEntropyLoss
        return grad_output

softmax('test_softmax').forward(test_array)

array([[0.10369682, 0.68890954, 0.10369682, 0.10369682],
       [0.14895921, 0.14895921, 0.14895921, 0.55312236],
       [0.5165657 , 0.17708327, 0.12228365, 0.18406737]])

In [56]:
#TODO: more activation, tanh, gelu, leaky_relu ...

## Hidden layer

In [57]:
class HiddenLayer(Layer):
    def __init__(self, name, in_num, out_num):
        super().__init__(name, requires_grad=True)
        self.in_num = in_num
        self.out_num = out_num

        W = kaiming_normal_(np.array([0] * in_num * out_num).reshape(in_num, out_num), a=math.sqrt(5))     # Kaiming Init
        self.W = Parameter(W, self.requires_grad)
        self.b = Parameter(np.zeros(out_num), self.requires_grad)

    def forward(self, input):
        self.input = input
        return input @ self.W.data + self.b.data      # [batch size, in_num] @ [in_num, out_num] + [out_num] => [batch size, out_num]
    
    def backward(self, grad_output):
        """
            grad_output: [batch size, out_num]
        """
        batch_size = grad_output.shape[0]
        self.W.grad = self.input.T @ grad_output / batch_size
        self.b.grad = grad_output.sum(axis=0) / batch_size
        return grad_output @ self.W.data.T

## Batch Norm

In [58]:
#TODO

## Dropout

In [59]:
#TODO

# Loss Function

Cross Entropy

In [60]:
class CrossEntropyLoss(object):
    def __init__(self):
        self.softmax = softmax('softmax')

    def __call__(self, input, ground_truth):
        self.bacth_size = input.shape[0]
        self.class_num = input.shape[1]

        preds = self.softmax.forward(input)
        ground_truth = self.one_hot_encoding(ground_truth)

        self.grad = preds - ground_truth    #TODO: 推导要写在report上不？

        loss = -1 * (ground_truth * np.log(preds + 1e-8)).sum() / self.bacth_size

        return loss
    
    def one_hot_encoding(self, x):
        one_hot_encoded = np.zeros((self.bacth_size, self.class_num))
        one_hot_encoded[np.arange(x.size), x.flatten()] = 1
        return one_hot_encoded

# MLP

In [61]:
class MLP(object):
    def __init__(self):
        self.layers = []
        self.params = []
        self.num_layers = 0
    
    def add_layer(self, layer):
        self.layers.append(layer)
        if layer.requires_grad:
            if hasattr(layer, 'W'):
                self.params.append(layer.W)
            if hasattr(layer, 'b'):
                self.params.append(layer.b)
            if hasattr(layer, 'gamma'):
                self.params.append(layer.gamma)
            if hasattr(layer, 'beta'):
                self.params.append(layer.beta)
        self.num_layers += 1

    def forward(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        return x
    
    def backward(self, x):
        for layer in self.layers[::-1]:
            x = layer.backward(x)
        return x
    
    def train(self):
        for layer in self.layers:
            layer.train = True
    
    def test(self):
        for layer in self.layers:
            layer.train = False

# Optimizer

## SGD with Momentum

In [62]:
class SGD(object):
    def __init__(self, parameters, momentum, lr, weight_decay):
        self.parameters = parameters
        self.momentum = momentum
        self.lr = lr
        self.weight_decay = weight_decay
        self.v = [np.zeros(p.data.shape) for p in self.parameters]

    def step(self):
        for i, (v, p) in enumerate(zip(self.v, self.parameters)):
            if not p.skip_decay:
                p.data -= self.weight_decay * p.data
            v = self.momentum * v + self.lr * p.grad
            self.v[i] = v
            p.data -= self.v[i]

## Adam

In [63]:
#TODO: Adam

# Scheduler

## Cosine

In [64]:
class CosineLR(object):
    def __init__(self, optimizer, T_max):
        self.optimizer = optimizer
        self.T_max = T_max
        self.n = -1
        self.base_lr = optimizer.lr
        self.step()

    def step(self):
        self.n += 1
        lr = self.get_lr()
        self.optimizer.lr = lr

    def get_lr(self):
        cos = np.cos(np.pi * self.n / self.T_max)
        return self.base_lr * (1 + cos) / 2

## MultiStep

In [65]:
#TODO:

# Trainer

In [66]:
class Trainer(object):
    def __init__(self, config, model=None, train_loader=None, val_loader=None):
        self.config = config
        self.epochs = self.config['epoch']
        self.lr = self.config['lr']
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader

        self.criterion = CrossEntropyLoss()
        self.optimizer = SGD(self.model.params, self.config['momentum'], self.lr, self.config['weight_decay'])
        self.train_scheduler = CosineLR(self.optimizer, T_max=self.epochs)

    def train(self):
        best_acc1 = 0
        for epoch in range(self.epochs):
            print('current lr {:.5e}'.format(self.optimizer.lr))
            self.train_per_epoch(epoch)
            self.train_scheduler.step()

            # evaluate on validation set
            acc1 = self.validate(epoch)

            # remember best prec@1
            best_acc1 = max(acc1, best_acc1)
            output_best = 'Best Prec@1: %.3f\n' % (best_acc1)
            print(output_best)

    
    def train_per_epoch(self, epoch):
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()

        self.model.train()

        end = time.time()

        for i, (input, target) in enumerate(self.train_loader):
            data_time.update(time.time() - end)

            # compute output
            output = self.model.forward(input)
            loss = self.criterion(output, target)

            # compute gradient and do SGD step
            self.model.backward(self.criterion.grad)
            self.optimizer.step()

            # measure accuracy and record loss
            prec1 = accuracy(output, target)
            losses.update(loss, input.shape[0])
            top1.update(prec1, input.shape[0])

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % 100 == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                        epoch, i, len(self.train_loader), batch_time=batch_time,
                        data_time=data_time, loss=losses, top1=top1))
                
    def validate(self, epoch):
        batch_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()

        self.model.test()

        end = time.time()
        for i, (input, target) in enumerate(self.val_loader):
            # compute output
            output = self.model.forward(input)
            loss = self.criterion(output, target)

            # measure accuracy and record loss
            prec1 = accuracy(output, target)
            losses.update(loss, input.shape[0])
            top1.update(prec1, input.shape[0])

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % 100 == 0:
                print('Test: [{0}/{1}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                        i, len(self.val_loader), batch_time=batch_time, loss=losses,
                        top1=top1))
        
        output = ('EPOCH: {epoch} {flag} Results: Prec@1 {top1.avg:.3f} '.format(epoch=epoch + 1 , flag='val', top1=top1))
        print(output)

        return top1.avg

# Dataloader

In [67]:
class Dataloader(object):
    def __init__(self, X, y, batch_size, shuffle=True, seed=None):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.seed = seed
        self.index = np.arange(X.shape[0])
    
    def __iter__(self):
        if self.shuffle:
            if self.seed is not None:
                np.random.seed(self.seed)
            np.random.shuffle(self.index)
        self.n = 0
        return self
    
    def __next__(self):
        if self.n >= len(self.index):
            raise StopIteration
        
        index = self.index[self.n:self.n + self.batch_size]
        batch_X = self.X[index]
        batch_y = self.y[index]
        self.n += self.batch_size

        return batch_X, batch_y
    
    def __len__(self):
        """
            num of batch
        """
        return (len(self.index) + self.batch_size - 1) // self.batch_size  # ceiling

# Main

## load data

In [68]:
file_path = './Assignment1-Dataset/'

train_X = np.load(file_path + 'train_data.npy')
train_y = np.load(file_path + 'train_label.npy')
test_X = np.load(file_path + 'test_data.npy')
test_y = np.load(file_path + 'test_label.npy')

In [69]:
train_X.shape

(50000, 128)

In [70]:
set([train_y[i][0] for i in range(train_y.shape[0])])

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

## run!

In [71]:
def get_model(layers):
    model = MLP()
    str2obj = {
        'linear': HiddenLayer, 
        'relu': relu, 
        'sigmoid': sigmoid, 
        'softmax': softmax,
    }
    for i in layers:
        model.add_layer(str2obj[i['type']](**i['params']))

    return model

layers = [
    {'type': 'linear', 'params': {'name': 'fc1', 'in_num': 128, 'out_num': 64}},
    # {'type': 'batchnorm', 'params': {'name': 'bn1', 'shape': 64}}, 
    {'type': 'relu', 'params': {'name': 'relu1'}}, 
    # {'type': 'linear', 'params': {'name': 'fc2', 'in_num': 256, 'out_num': 128}},
    # {'type': 'relu', 'params': {'name': 'relu2'}}, 
    {'type': 'linear', 'params': {'name': 'fc3', 'in_num': 64, 'out_num': 10}},
]
lr = 0.1
bs = 1024
momentum = 0.9
weight_decay = 5e-4     # 2e-4, 1e-4
seed = 0
epoch = 100

config = {
    'layers': layers,
    'lr': lr, 
    'bs': bs,
    'momentum': momentum,
    'weight_decay': weight_decay,
    'seed': seed,
    'epoch': epoch,
    
}


train_dataloader = Dataloader(train_X, train_y, config['bs'], shuffle=True, seed=config['seed'])
test_dataloader = Dataloader(test_X, test_y, config['bs'], shuffle=False)
model = get_model(config['layers'])
trainer = Trainer(config, model, train_dataloader, test_dataloader)
trainer.train()

current lr 1.00000e-01
Epoch: [0][0/49]	Time 0.011 (0.011)	Data 0.002 (0.002)	Loss 2.7202 (2.7202)	Prec@1 0.100 (0.100)


Test: [0/10]	Time 0.001 (0.001)	Loss 1.6218 (1.6218)	Prec@1 0.431 (0.431)
EPOCH: 1 val Results: Prec@1 0.428 
Best Prec@1: 0.428

current lr 9.99753e-02
Epoch: [1][0/49]	Time 0.002 (0.002)	Data 0.001 (0.001)	Loss 1.5995 (1.5995)	Prec@1 0.443 (0.443)
Test: [0/10]	Time 0.001 (0.001)	Loss 1.5370 (1.5370)	Prec@1 0.444 (0.444)
EPOCH: 2 val Results: Prec@1 0.453 
Best Prec@1: 0.453

current lr 9.99013e-02
Epoch: [2][0/49]	Time 0.004 (0.004)	Data 0.001 (0.001)	Loss 1.5724 (1.5724)	Prec@1 0.460 (0.460)
Test: [0/10]	Time 0.001 (0.001)	Loss 1.4811 (1.4811)	Prec@1 0.468 (0.468)
EPOCH: 3 val Results: Prec@1 0.475 
Best Prec@1: 0.475

current lr 9.97781e-02
Epoch: [3][0/49]	Time 0.002 (0.002)	Data 0.001 (0.001)	Loss 1.4724 (1.4724)	Prec@1 0.496 (0.496)
Test: [0/10]	Time 0.001 (0.001)	Loss 1.4243 (1.4243)	Prec@1 0.493 (0.493)
EPOCH: 4 val Results: Prec@1 0.484 
Best Prec@1: 0.484

current lr 9.96057e-02
Epoch: [4][0/49]	Time 0.004 (0.004)	Data 0.001 (0.001)	Loss 1.4171 (1.4171)	Prec@1 0.491 (0.491)
