# 520556528

# Import Libs

In [84]:
import numpy as np
import time
import math

# Load Datasets

In [85]:
#test
# wye test

# Utils

## Test array

In [86]:
test_array = np.random.randn(3, 4)
test_array

array([[-1.18590096, -1.50606023, -0.03233924, -0.32722782],
       [-1.54116619, -0.52125553,  0.16971559, -0.99620885],
       [ 0.81831748,  0.96090466,  1.35616532, -0.70111426]])

## Timer

In [87]:
def timer(func):
    def wrapper(*args, **kwargs):
        print('Start time: ', time.ctime())
        start_time = time.time()  # start time

        result = func(*args, **kwargs)  # run

        end_time = time.time()  # end time
        print('End time: ', time.ctime())
        print(f"{func.__name__} executed in {(end_time - start_time):.4f} seconds")
        return result
    return wrapper

@timer
def test_fun(x):
    time.sleep(x)

test_fun(1)

Start time:  Fri Mar 29 01:36:20 2024
End time:  Fri Mar 29 01:36:21 2024
test_fun executed in 1.0050 seconds


## Kaiming Init

Refer from https://github.com/pytorch/pytorch/blob/main/torch/nn/init.py.

Modify tensor to np.array

In [88]:
def calculate_gain(nonlinearity, param=None):
    r"""Return the recommended gain value for the given nonlinearity function.
    The values are as follows:

    ================= ====================================================
    nonlinearity      gain
    ================= ====================================================
    Linear / Identity :math:`1`
    Conv{1,2,3}D      :math:`1`
    Sigmoid           :math:`1`
    Tanh              :math:`\frac{5}{3}`
    ReLU              :math:`\sqrt{2}`
    Leaky Relu        :math:`\sqrt{\frac{2}{1 + \text{negative\_slope}^2}}`
    SELU              :math:`\frac{3}{4}`
    ================= ====================================================
    """
    
    if nonlinearity == 'sigmoid':
        return 1
    elif nonlinearity == 'tanh':
        return 5.0 / 3
    elif nonlinearity == 'relu':
        return math.sqrt(2.0)
    elif nonlinearity == 'leaky_relu':
        if param is None:
            negative_slope = 0.01
        elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float):
            # True/False are instances of int, hence check above
            negative_slope = param
        else:
            raise ValueError(f"negative_slope {param} not a valid number")
        return math.sqrt(2.0 / (1 + negative_slope ** 2))
    elif nonlinearity == 'selu':
        return 3.0 / 4  # Value found empirically (https://github.com/pytorch/pytorch/pull/50664)
    else:
        raise ValueError(f"Unsupported nonlinearity {nonlinearity}")

def _calculate_fan_in_and_fan_out(array):
    dimensions = len(array.shape)
    if dimensions < 2:
        raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions")

    num_input_fmaps = array.shape[1]
    num_output_fmaps = array.shape[0]
    receptive_field_size = 1
    if dimensions > 2:
        # math.prod is not always available, accumulate the product manually
        # we could use functools.reduce but that is not supported by TorchScript
        for s in array.shape[2:]:
            receptive_field_size *= s
    fan_in = num_input_fmaps * receptive_field_size
    fan_out = num_output_fmaps * receptive_field_size

    return fan_in, fan_out

def _calculate_correct_fan(array, mode):
    mode = mode.lower()
    valid_modes = ['fan_in', 'fan_out']
    if mode not in valid_modes:
        raise ValueError(f"Mode {mode} not supported, please use one of {valid_modes}")

    fan_in, fan_out = _calculate_fan_in_and_fan_out(array)
    return fan_in if mode == 'fan_in' else fan_out

def kaiming_normal_(array: np.array, a: float = 0, mode: str = 'fan_in', nonlinearity: str = 'relu'):
    fan = _calculate_correct_fan(array, mode)
    gain = calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    return np.random.normal(0, std, array.shape)

kaiming_normal_(np.array([0] * 30).reshape(5, 6))

array([[ 0.38053532,  0.1735732 , -1.06077562, -0.10529021,  0.75652059,
        -0.71530825],
       [ 0.58908577,  0.65507079, -0.20940775, -1.11856796, -0.02528333,
        -0.10241898],
       [-0.90975463, -0.04098603, -0.60648683,  0.32438372,  0.76327211,
         0.27928326],
       [-0.69533035, -0.13487909,  0.49598378, -0.79924988, -0.01481499,
        -0.49228997],
       [-0.25353233, -0.45903369, -0.16702028,  0.00479639, -0.37243647,
        -0.23640761]])

## Parameter

In [89]:
class Parameter(object):
    def __init__(self, data, requires_grad, skip_decay=False):
        self.data = data
        self.grad = None
        self.skip_decay = skip_decay
        self.requires_grad = requires_grad

## AverageMeter

In [90]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

## Accuracy

In [91]:
def accuracy(output, target):
    preds = output.argmax(axis=-1, keepdims=True)
    return np.mean(preds == target) * 100

## Pre-process

In [92]:
def get_transform(train_X, test_X, mode=None):
    if mode == 'min-max':
        min_each_feature = np.min(train_X, axis=0)
        max_each_feature = np.max(train_X, axis=0)
        scale = max_each_feature - min_each_feature
        scale[scale == 0] = 1   # To avoid divided by 0
        scaled_train = (train_X - min_each_feature) / scale
        scaled_test = (test_X - min_each_feature) / scale
        return scaled_train, scaled_test

    if mode == 'norm':
        std_each_feature = np.std(train_X, axis=0)
        mean_each_feature = np.mean(train_X, axis=0)
        std_each_feature[std_each_feature == 0] = 1     # To avoid divided by 0
        norm_train = (train_X - mean_each_feature) / std_each_feature
        norm_test = (test_X - mean_each_feature) / std_each_feature
        return norm_train, norm_test

    return train_X, test_X

# Layers

## Base layer

In [93]:
class Layer(object):
    def __init__(self, name, requires_grad=False):
        self.name = name 
        self.requires_grad = requires_grad
        
    def forward(self, *args):
        pass

    def backward(self, *args):
        pass

## Activation

### Relu

In [94]:
class relu(Layer):
    def __init__(self, name, requires_grad=False):
        super().__init__(name, requires_grad)

    def forward(self, input):
        self.input = input
        return np.maximum(0, input)
    
    def backward(self, grad_output):
        grad_output[self.input <= 0] = 0
        return grad_output
    

test_relu = relu('test_relu')
_ = test_relu.forward(test_array)
test_relu.backward(test_array)

array([[0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.16971559, 0.        ],
       [0.81831748, 0.96090466, 1.35616532, 0.        ]])

### Leaky Relu

In [95]:
class leaky_relu(Layer):
    def __init__(self, name, alpha, requires_grad=False):
        super().__init__(name, requires_grad)
        self.alpha = alpha

    def forward(self, input):
        return np.where(input > 0, input, self.alpha * input)
    
    def backward(self, grad_output):
        x = np.ones_like(grad_output)
        x[grad_output < 0] *= self.alpha
        return x

### Gelu

In [96]:
class gelu(Layer):
    def __init__(self, name, requires_grad=False):
        super().__init__(name, requires_grad)

    def forward(self, input):
        vec_erf = np.vectorize(math.erf)
        return 0.5 * input * (1 + vec_erf(input / np.sqrt(2)))
    
    def backward(self, grad_output):
        vec_erf = np.vectorize(math.erf)
        return 0.5 + 0.5 * vec_erf(grad_output / np.sqrt(2)) + \
            ((0.5 * grad_output * ((2 / np.sqrt(np.pi)) * np.exp(-(grad_output ** 2)))) / np.sqrt(2))

### Sigmoid

In [97]:
class sigmoid(Layer):
    def __init__(self, name, requires_grad=False):
        super().__init__(name, requires_grad)
        
    def forward(self, input):
        self.y = 1. / (1. + np.exp(-input))   # save sigmoid for more convenient grad computation
        return self.y
    
    def backward(self, grad_output):
        return self.y * (1 - self.y) * grad_output

### Tanh

In [98]:
class tanh(Layer):
    def __init__(self, name, requires_grad=False):
        super().__init__(name, requires_grad)

    def forward(self, input):
        return np.tanh(input)
    
    def backward(self, grad_output):
        return 1 - np.tanh(grad_output) ** 2

### Softmax 

In [99]:
class softmax(Layer):
    def __init__(self, name, requires_grad=False):
        super().__init__(name, requires_grad)
        
    def forward(self, input):
        """
            input.shape = [batch size, num_class]
        """
        x_max = input.max(axis=-1, keepdims=True)       # to avoid overflow
        x_exp = np.exp(input - x_max)
        return x_exp / x_exp.sum(axis=-1, keepdims=True)
    
    def backward(self, grad_output):
        # packaged in CrossEntropyLoss
        return grad_output

softmax('test_softmax').forward(test_array)

array([[0.25      , 0.25      , 0.25      , 0.25      ],
       [0.23895047, 0.23895047, 0.2831486 , 0.23895047],
       [0.23219394, 0.2677785 , 0.39758979, 0.10243777]])

## Hidden layer

In [100]:
class HiddenLayer(Layer):
    def __init__(self, name, in_num, out_num):
        super().__init__(name, requires_grad=True)
        self.in_num = in_num
        self.out_num = out_num

        W = kaiming_normal_(np.array([0] * in_num * out_num).reshape(in_num, out_num), a=math.sqrt(5))     # Kaiming Init
        self.W = Parameter(W, self.requires_grad)
        self.b = Parameter(np.zeros(out_num), self.requires_grad)

    def forward(self, input):
        self.input = input
        return input @ self.W.data + self.b.data      # [batch size, in_num] @ [in_num, out_num] + [out_num] => [batch size, out_num]
    
    def backward(self, grad_output):
        """
            grad_output: [batch size, out_num]
        """
        batch_size = grad_output.shape[0]
        self.W.grad = self.input.T @ grad_output / batch_size
        self.b.grad = grad_output.sum(axis=0) / batch_size
        return grad_output @ self.W.data.T

## Batch Norm

In [101]:
class batchnorm(Layer):
    def __init__(self, name, shape, requires_grad=True):
        super().__init__(name)
        self.gamma = Parameter(np.random.uniform(0.9, 1.1, shape), requires_grad, skip_decay=True)
        self.beta = Parameter(np.random.uniform(-0.1, 0.1, shape), requires_grad, skip_decay=True)
        self.requires_grad = requires_grad

        self.running_mean = Parameter(np.zeros(shape), False)
        self.running_var = Parameter(np.zeros(shape), False)

    
    def forward(self, input):
        if self.train:
            batch_mean = input.mean(axis=0)
            batch_var = input.var(axis=0)
            batch_std = np.sqrt(batch_var + 1e-8)    # To avoid divided by 0

            momentum = 0.9
            self.running_mean.data = momentum * self.running_mean.data + (1 - momentum) * batch_mean
            self.running_var.data = momentum * self.running_var.data + (1 - momentum) * batch_var
            
        
        else:
            batch_mean = self.running_mean.data
            batch_std = np.sqrt(self.running_var.data)

        self.norm = (input - batch_mean) / batch_std
        self.gamma_norm = self.gamma.data / batch_std

        return self.gamma.data * self.norm + self.beta.data
        
    
    def backward(self, grad_output):        
        batch_size = grad_output.shape[0]
        self.gamma.grad = (grad_output * self.norm).sum(axis=0) / batch_size
        self.beta.grad = grad_output.sum(axis=0) / batch_size
        return self.gamma_norm * (grad_output - self.norm * self.gamma.grad - self.beta.grad)       # TODO: 推导

## Dropout

In [102]:
class dropout(Layer):
    def __init__(self, name, drop_rate, requires_grad=False):
        super().__init__(name, requires_grad)
        self.drop_rate = drop_rate
        self.fix_value = 1 / (1 - self.drop_rate)   # to keep average fixed

    def forward(self, input):
        if self.train:
            self.mask = np.random.uniform(0, 1, input.shape) > self.drop_rate
            return input * self.mask * self.fix_value
        else:
            return input

    def backward(self, grad_output):
        if self.train:
            return grad_output * self.mask
        else:
            return grad_output

# Loss Function

Cross Entropy

In [103]:
class CrossEntropyLoss(object):
    def __init__(self):
        self.softmax = softmax('softmax')

    def __call__(self, input, ground_truth):
        self.bacth_size = input.shape[0]
        self.class_num = input.shape[1]

        preds = self.softmax.forward(input)
        ground_truth = self.one_hot_encoding(ground_truth)

        self.grad = preds - ground_truth    #TODO: 推导要写在report上不？

        loss = -1 * (ground_truth * np.log(preds + 1e-8)).sum() / self.bacth_size

        return loss
    
    def one_hot_encoding(self, x):
        one_hot_encoded = np.zeros((self.bacth_size, self.class_num))
        one_hot_encoded[np.arange(x.size), x.flatten()] = 1
        return one_hot_encoded

# MLP

In [104]:
class MLP(object):
    def __init__(self):
        self.layers = []
        self.params = []
        self.num_layers = 0
    
    def add_layer(self, layer):
        self.layers.append(layer)
        if layer.requires_grad:
            if hasattr(layer, 'W'):
                self.params.append(layer.W)
            if hasattr(layer, 'b'):
                self.params.append(layer.b)
            if hasattr(layer, 'gamma'):
                self.params.append(layer.gamma)
            if hasattr(layer, 'beta'):
                self.params.append(layer.beta)
        self.num_layers += 1

    def forward(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        return x
    
    def backward(self, x):
        for layer in self.layers[::-1]:
            x = layer.backward(x)
        return x
    
    def train(self):
        for layer in self.layers:
            layer.train = True
    
    def test(self):
        for layer in self.layers:
            layer.train = False

# Optimizer

## SGD with Momentum

In [105]:
class SGD(object):
    def __init__(self, parameters, momentum, lr, weight_decay):
        self.parameters = parameters
        self.momentum = momentum
        self.lr = lr
        self.weight_decay = weight_decay
        self.v = [np.zeros(p.data.shape) for p in self.parameters]

    def step(self):
        for i, (v, p) in enumerate(zip(self.v, self.parameters)):
            if not p.skip_decay:
                p.data -= self.weight_decay * p.data
            v = self.momentum * v + self.lr * p.grad
            self.v[i] = v
            p.data -= self.v[i]

## Adam

In [106]:
class Adam(object):
    def __init__(self, parameters, lr, weight_decay=0, beta=(0.9, 0.999), eps=1e-8):
        self.beta1 = beta[0]
        self.beta2 = beta[1]
        self.lr = lr
        self.weight_decay = weight_decay
        self.eps = eps
        self.parameters = parameters
        self.m = [np.zeros(p.data.shape) for p in self.parameters]
        self.v = [np.zeros(p.data.shape) for p in self.parameters]

        self.iterations = 0
    
    def step(self):
        self.iterations += 1
        for i, (p, m, v) in enumerate(zip(self.parameters, self.m, self.v)):
            if not p.skip_decay:
                p.data -= self.weight_decay * p.data
            m = self.beta1 * m + (1 - self.beta1) * p.grad
            v = self.beta2 * v + (1 - self.beta2) * np.power(p.grad, 2)

            self.m[i] = m
            self.v[i] = v
            
            # bias correction
            m = m / (1 - np.power(self.beta1, self.iterations))
            v = v / (1 - np.power(self.beta2, self.iterations))

            p.data -= self.lr * m / (np.sqrt(v + self.eps))

# Scheduler

## Cosine

In [107]:
class CosineLR(object):
    def __init__(self, optimizer, T_max):
        self.optimizer = optimizer
        self.T_max = T_max
        self.n = -1
        self.base_lr = optimizer.lr
        self.step()

    def step(self):
        self.n += 1
        lr = self.get_lr()
        self.optimizer.lr = lr

    def get_lr(self):
        cos = np.cos(np.pi * self.n / self.T_max)
        return self.base_lr * (1 + cos) / 2

# Trainer

In [108]:
class Trainer(object):
    def __init__(self, config, model=None, train_loader=None, val_loader=None):
        self.config = config
        self.epochs = self.config['epoch']
        self.lr = self.config['lr']
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.print_freq = self.config['print_freq']

        self.criterion = CrossEntropyLoss()
        if self.config['optimizer'] == 'sgd':
            self.optimizer = SGD(self.model.params, self.config['momentum'], self.lr, self.config['weight_decay'])
        elif self.config['optimizer'] == 'adam':
            self.optimizer = Adam(self.model.params, self.lr, self.config['weight_decay'])
        self.train_scheduler = CosineLR(self.optimizer, T_max=self.epochs)

    def train(self):
        best_acc1 = 0
        for epoch in range(self.epochs):
            print('current lr {:.5e}'.format(self.optimizer.lr))
            self.train_per_epoch(epoch)
            self.train_scheduler.step()

            # evaluate on validation set
            acc1 = self.validate(epoch)

            # remember best prec@1
            best_acc1 = max(acc1, best_acc1)
            output_best = 'Best Prec@1: %.3f\n' % (best_acc1)
            print(output_best)
            # time.sleep(1)

    
    def train_per_epoch(self, epoch):
        batch_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()

        self.model.train()

        end = time.time()

        for i, (input, target) in enumerate(self.train_loader):
            # compute output
            output = self.model.forward(input)
            loss = self.criterion(output, target)

            # compute gradient and do SGD step
            self.model.backward(self.criterion.grad)
            self.optimizer.step()

            # measure accuracy and record loss
            prec1 = accuracy(output, target)
            losses.update(loss, input.shape[0])
            top1.update(prec1, input.shape[0])

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if (i % self.print_freq == 0) or (i == len(self.train_loader) - 1):
                print('Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                        epoch + 1, i, len(self.train_loader) - 1, batch_time=batch_time,
                        loss=losses, top1=top1))
        
        output = ('EPOCH: {epoch} {flag} Results: Prec@1 {top1.avg:.3f} '.format(epoch=epoch + 1 , flag='train', top1=top1))
        print(output)
                
    def validate(self, epoch):
        batch_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()

        self.model.test()

        end = time.time()
        for i, (input, target) in enumerate(self.val_loader):
            # compute output
            output = self.model.forward(input)
            loss = self.criterion(output, target)

            # measure accuracy and record loss
            prec1 = accuracy(output, target)
            losses.update(loss, input.shape[0])
            top1.update(prec1, input.shape[0])

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if (i % self.print_freq == 0) or (i == len(self.val_loader) - 1):
                print('Test: [{0}/{1}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                        i, len(self.val_loader) - 1, batch_time=batch_time, loss=losses,
                        top1=top1))
        
        output = ('EPOCH: {epoch} {flag} Results: Prec@1 {top1.avg:.3f} '.format(epoch=epoch + 1 , flag='val', top1=top1))
        print(output)

        return top1.avg

# Dataloader

In [109]:
class Dataloader(object):
    def __init__(self, X, y, batch_size, shuffle=True, seed=None):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.seed = seed
        self.index = np.arange(X.shape[0])
    
    def __iter__(self):
        if self.shuffle:
            if self.seed is not None:
                np.random.seed(self.seed)
            np.random.shuffle(self.index)
        self.n = 0
        return self
    
    def __next__(self):
        if self.n >= len(self.index):
            raise StopIteration
        
        index = self.index[self.n:self.n + self.batch_size]
        batch_X = self.X[index]
        batch_y = self.y[index]
        self.n += self.batch_size

        return batch_X, batch_y
    
    def __len__(self):
        """
            num of batch
        """
        return (len(self.index) + self.batch_size - 1) // self.batch_size  # ceiling

# Main

## load data

In [110]:
file_path = './Assignment1-Dataset/'

train_X = np.load(file_path + 'train_data.npy')
train_y = np.load(file_path + 'train_label.npy')
test_X = np.load(file_path + 'test_data.npy')
test_y = np.load(file_path + 'test_label.npy')

In [111]:
train_X.shape

(50000, 128)

In [112]:
set([train_y[i][0] for i in range(train_y.shape[0])])

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

## run!

In [113]:
def get_model(layers):
    model = MLP()
    str2obj = {
        'linear': HiddenLayer, 
        'relu': relu, 
        'leaky_relu': leaky_relu,
        'gelu': gelu,
        'sigmoid': sigmoid, 
        'tanh': tanh,
        'batchnorm': batchnorm,
        'dropout': dropout
    }
    for i in layers:
        model.add_layer(str2obj[i['type']](**i['params']))
    
    return model

@timer
def main():
    file_path = './Assignment1-Dataset/'

    train_X = np.load(file_path + 'train_data.npy')
    train_y = np.load(file_path + 'train_label.npy')
    test_X = np.load(file_path + 'test_data.npy')
    test_y = np.load(file_path + 'test_label.npy')

    layers = [
        {'type': 'linear', 'params': {'name': 'fc1', 'in_num': 128, 'out_num': 64}},
        {'type': 'batchnorm', 'params': {'name': 'bn1', 'shape': 64}}, 
        {'type': 'dropout', 'params': {'name': 'dropout', 'drop_rate': 0.1}},
        # {'type': 'sigmoid', 'params': {'name': 'sigmoid'}},  
        # {'type': 'leaky_relu', 'params': {'name': 'leaky_relu1', 'alpha': 0.1}},  
        {'type': 'relu', 'params': {'name': 'relu1'}},  
        #{'type': 'tanh', 'params': {'name': 'tanh1'}},  
        #{'type': 'gelu', 'params': {'name': 'gelu1'}},  
        # {'type': 'linear', 'params': {'name': 'fc2', 'in_num': 256, 'out_num': 128}},
        # {'type': 'relu', 'params': {'name': 'relu2'}}, 
        {'type': 'linear', 'params': {'name': 'fc3', 'in_num': 64, 'out_num': 10}},
    ]
  
    bs = 1024
    config = {
        'layers': layers,
        'lr': 0.001, 
        'bs': bs,
        'momentum': 0.9,
        'weight_decay': 5e-4,   # 5e-4, 2e-4, 1e-4, 5e-3, 0
        'seed': 0,
        'epoch': 100,
        'optimizer': 'adam',  # adam, sgd
        'pre-process': 'norm',      # min-max, norm, None
        'print_freq': 50000 // bs // 5
    }
    np.random.seed(config['seed'])

    # pre process
    train_X, test_X = get_transform(train_X, test_X, config['pre-process'])

    train_dataloader = Dataloader(train_X, train_y, config['bs'], shuffle=True, seed=config['seed'])
    test_dataloader = Dataloader(test_X, test_y, config['bs'], shuffle=False)
    model = get_model(config['layers'])
    trainer = Trainer(config, model, train_dataloader, test_dataloader)
    trainer.train()

main()

Start time:  Fri Mar 29 01:36:21 2024
current lr 1.00000e-03
Epoch: [1][0/48]	Time 0.019 (0.019)	Loss 4.4425 (4.4425)	Prec@1 10.742 (10.742)
Epoch: [1][9/48]	Time 0.065 (0.027)	Loss 4.2057 (4.2830)	Prec@1 9.766 (10.400)


Epoch: [1][18/48]	Time 0.036 (0.025)	Loss 3.7002 (4.1060)	Prec@1 14.062 (10.855)
Epoch: [1][27/48]	Time 0.021 (0.026)	Loss 3.6169 (3.9681)	Prec@1 10.742 (11.133)
Epoch: [1][36/48]	Time 0.003 (0.021)	Loss 3.4748 (3.8420)	Prec@1 12.305 (11.407)
Epoch: [1][45/48]	Time 0.003 (0.018)	Loss 3.1282 (3.7278)	Prec@1 13.574 (11.702)
Epoch: [1][48/48]	Time 0.004 (0.017)	Loss 3.0823 (3.6922)	Prec@1 13.915 (11.874)
EPOCH: 1 train Results: Prec@1 11.874 
Test: [0/9]	Time 0.001 (0.001)	Loss 2.8707 (2.8707)	Prec@1 15.332 (15.332)
Test: [9/9]	Time 0.001 (0.002)	Loss 2.9027 (2.9146)	Prec@1 13.903 (14.680)
EPOCH: 1 val Results: Prec@1 14.680 
Best Prec@1: 14.680

current lr 9.99753e-04
Epoch: [2][0/48]	Time 0.014 (0.014)	Loss 3.0145 (3.0145)	Prec@1 15.625 (15.625)
Epoch: [2][9/48]	Time 0.003 (0.006)	Loss 2.9625 (2.9890)	Prec@1 16.016 (15.098)
Epoch: [2][18/48]	Time 0.009 (0.006)	Loss 2.8462 (2.9251)	Prec@1 16.406 (15.568)
Epoch: [2][27/48]	Time 0.006 (0.006)	Loss 2.6859 (2.8784)	Prec@1 18.555 (15.859)
Epo