In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import time
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
import itertools
def timer(func):
    def wrapper(*args, **kwargs):
        print('Start time: ', time.ctime())
        start_time = time.time()  # start time
        result = func(*args, **kwargs)  # run
        end_time = time.time()  # end time
        print('End time: ', time.ctime())
        print(f"{func.__name__} executed in {(end_time - start_time):.4f} seconds")
        return result
    return wrapper

def pre_processing(X, mode=None):
    if mode == 'min-max':
        print('Pre-process: min-max normalization')
        min_each_feature = np.min(X, axis=0)
        max_each_feature = np.max(X, axis=0)
        scale = max_each_feature - min_each_feature
        scale[scale == 0] = 1   # To avoid divided by 0
        scaled_train = (X - min_each_feature) / scale
        return scaled_train

    elif mode == 'standardization':
        print('Pre-process: standardization')
        std_each_feature = np.std(X, axis=0)
        mean_each_feature = np.mean(X, axis=0)
        std_each_feature[std_each_feature == 0] = 1     # To avoid divided by 0
        norm_train = (X - mean_each_feature) / std_each_feature
        return norm_train

    else:
        print('No pre-process')
    return X

def accuracy(y_hat, y):
    preds = y_hat.argmax(axis=1, keepdims=True)
    return np.mean(preds == y) * 100

def calculate_gain(nonlinearity, param=None):
    gains = {
        'sigmoid': 1.0,
        'tanh': 5.0 / 3,
        'relu': math.sqrt(2.0),
        'selu': 3.0 / 4
    }

    if nonlinearity in gains:
        return gains[nonlinearity]

    if nonlinearity == 'leaky_relu':
        negative_slope = param if isinstance(param, (int, float)) and not isinstance(param, bool) else 0.01
        return math.sqrt(2.0 / (1 + negative_slope ** 2))

    raise ValueError(f"Unsupported nonlinearity: {nonlinearity}")

def calculate_fan(array):
    if array.ndim < 2:
        raise ValueError("Fan in and fan out require at least 2D tensors")

    fan_in = array.shape[1] * np.prod(array.shape[2:]) if array.ndim > 2 else array.shape[1]
    fan_out = array.shape[0] * np.prod(array.shape[2:]) if array.ndim > 2 else array.shape[0]

    return fan_in, fan_out

def get_correct_fan(array, mode):
    mode = mode.lower()
    if mode not in {'fan_in', 'fan_out'}:
        raise ValueError("Mode must be 'fan_in' or 'fan_out'")

    fan_in, fan_out = calculate_fan(array)
    return fan_in if mode == 'fan_in' else fan_out

def kaiming_normal(array: np.ndarray, a: float = 0, mode: str = 'fan_in', nonlinearity: str = 'relu'):
    fan = get_correct_fan(array, mode)
    gain = calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    return np.random.normal(0, std, array.shape)

class Layer(object):
    def __init__(self, name, requires_grad=False):
        self.name = name
        self.requires_grad = requires_grad
        self.train = True  # Added train mode flag

    def _forward(self, *args):
        pass

    def _backward(self, *args):
        pass

    def _fit(self, mode='train'):
        self.train = mode == 'train'

class ReLU(Layer):
    def __init__(self, name, requires_grad=False):
        super().__init__(name, requires_grad)

    def _forward(self, x):
        self.x = x
        return np.maximum(0, x)

    def _backward(self, gradient_output):
        gradient_output[self.x <= 0] = 0
        return gradient_output

class FCLayer(Layer):
    def __init__(self, name: str, n_in: int, n_out: int, skip_decay=False) -> None:
        super().__init__(name, requires_grad=True)
        self.n_in = n_in
        self.n_out = n_out
        W = kaiming_normal(np.array([0] * n_in * n_out).reshape(n_in, n_out), a=math.sqrt(5))
        self.W = W
        self.b = np.zeros(self.n_out)
        self.W_grad = None
        self.b_grad = None
        self.skip_decay = skip_decay

    def _forward(self, x: np.ndarray) -> np.ndarray:
        self.x = x
        temp = x @ self.W + self.b
        return temp

    def _backward(self, delta: np.ndarray) -> np.ndarray:
        batch_size = delta.shape[0]
        self.W_grad = self.x.T @ delta / batch_size
        self.b_grad = delta.sum(axis=0) / batch_size
        return delta @ self.W.T

class Softmax(Layer):
    def __init__(self, name, requires_grad=False):
        super().__init__(name, requires_grad)

    def _forward(self, x: np.ndarray) -> np.ndarray:
        x_exp = np.exp(x - np.max(x, axis=1, keepdims=True))
        return x_exp/x_exp.sum(axis=1, keepdims=True)

    def _backward(self, gradient_output: np.ndarray) -> np.ndarray:
        return gradient_output

class CrossEntropy(object):
    def __init__(self):
        self.softmax = Softmax('softmax')

    def __call__(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        self.batch_size = x.shape[0]
        self.class_num = x.shape[1]
        y_hat = self.softmax._forward(x)
        y = self.one_hot_encoding(y)
        self.grad = y_hat - y
        loss = -1 * (y * np.log(y_hat + 1e-8)).sum() / self.batch_size
        return loss

    def one_hot_encoding(self, x):
        one_hot_encoded = np.zeros((self.batch_size, self.class_num))
        one_hot_encoded[np.arange(x.shape[0]), x.flatten()] = 1
        return one_hot_encoded

class BatchNormalization(Layer):
    def __init__(self, name, feature_num, skip_decay=True, epsilon=1e-5, requires_grad=True):
        super().__init__(name)
        self.epsilon = epsilon
        self.requires_grad = requires_grad
        self.skip_decay = skip_decay
        self.gamma = np.ones(feature_num)
        self.beta = np.zeros(feature_num)
        self.gamma_grad = None
        self.beta_grad = None
        self.ema = np.zeros(feature_num)
        self.emv = np.zeros(feature_num)

    def _forward(self, x: np.ndarray) -> np.ndarray:
        if self.train:
            batch_mean = x.mean(axis=0)
            batch_variance = x.var(axis=0)
            batch_std = np.sqrt(batch_variance + self.epsilon)
            momentum = 0.9
            self.ema = momentum * self.ema + (1 - momentum) * batch_mean
            self.emv = momentum * self.emv + (1 - momentum) * batch_variance
        else:
            batch_mean = self.ema
            batch_std = np.sqrt(self.emv + self.epsilon)
        self.norm = (x - batch_mean) / batch_std
        self.gamma_norm = self.gamma / batch_std
        return self.gamma * self.norm + self.beta

    def _backward(self, gradient_output: np.ndarray) -> np.ndarray:
        batch_size = gradient_output.shape[0]
        self.gamma_grad = (gradient_output * self.norm).sum(axis=0) / batch_size
        self.beta_grad = gradient_output.sum(axis=0) / batch_size
        dLdx = self.gamma_norm * (gradient_output - self.norm * self.gamma_grad - self.beta_grad)
        return dLdx

class Dropout(Layer):
    def __init__(self, name, drop_rate=0.5, requires_grad=False):
        super().__init__(name, requires_grad)
        self.drop_rate = drop_rate
        self.fix_value = 1 / (1 - self.drop_rate)

    def _forward(self, x):
        if self.train:
            self.mask = np.random.uniform(0, 1, x.shape) > self.drop_rate
            return x * self.mask * self.fix_value
        else:
            return x

    def _backward(self, grad_output):
        if self.train:
            return grad_output * self.mask
        else:
            return grad_output

class MLP():
    def __init__(self, hidden_units=[256, 128], dropout_rates=[0.3, 0.3]):
        self.layers = []
        # Input layer
        self.layers.append(FCLayer('fc1', n_in=128, n_out=hidden_units[0]))
        self.layers.append(BatchNormalization("batchnorm1", feature_num=hidden_units[0]))
        self.layers.append(Dropout('dropout1', drop_rate=dropout_rates[0]))
        self.layers.append(ReLU('relu1'))

        # Hidden layers
        for i in range(1, len(hidden_units)):
            self.layers.append(FCLayer(f'fc{i+1}', n_in=hidden_units[i-1], n_out=hidden_units[i]))
            self.layers.append(BatchNormalization(f"batchnorm{i+1}", feature_num=hidden_units[i]))
            self.layers.append(Dropout(f'dropout{i+1}', drop_rate=dropout_rates[i]))
            self.layers.append(ReLU(f'relu{i+1}'))

        # Output layer
        self.layers.append(FCLayer(f'fc{len(hidden_units)+1}', n_in=hidden_units[-1], n_out=10))

        # Initialize parameters
        self.parameters = []
        for layer in self.layers:
            if hasattr(layer, "W"):
                self.parameters.append([layer.W, layer.W_grad, layer.skip_decay])
            if hasattr(layer, "b"):
                self.parameters.append([layer.b, layer.b_grad, layer.skip_decay])
            if hasattr(layer, "gamma"):
                self.parameters.append([layer.gamma, layer.gamma_grad, layer.skip_decay])
            if hasattr(layer, "beta"):
                self.parameters.append([layer.beta, layer.beta_grad, layer.skip_decay])

    def _forward(self, x: np.ndarray) -> np.ndarray:
        for layer in self.layers:
            x = layer._forward(x)
        return x

    def _backward(self, gradient_output: np.ndarray) -> np.ndarray:
        for layer in self.layers[::-1]:
            gradient_output = layer._backward(gradient_output)
        return gradient_output

    def _fit(self, mode='train'):
        for layer in self.layers:
            layer._fit(mode)

    def _predict(self, x: np.ndarray) -> np.ndarray:
        self._fit('eval')
        y_hat = self._forward(x)
        return y_hat

class AdamW(object):
    def __init__(self, model, lr=1e-3, decoupled_weight_decay=0, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.model = model
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.decoupled_weight_decay = decoupled_weight_decay
        self.epsilon = epsilon
        self.t = 0
        self.m = [np.zeros(p[0].shape) for p in self.get_parameters()]
        self.v = [np.zeros(p[0].shape) for p in self.get_parameters()]

    def get_parameters(self):
        parameters = []
        for layer in self.model.layers:
            if hasattr(layer, "W"):
                parameters.append([layer.W, layer.W_grad, layer.skip_decay])
            if hasattr(layer, "b"):
                parameters.append([layer.b, layer.b_grad, layer.skip_decay])
            if hasattr(layer, "gamma"):
                parameters.append([layer.gamma, layer.gamma_grad, layer.skip_decay])
            if hasattr(layer, "beta"):
                parameters.append([layer.beta, layer.beta_grad, layer.skip_decay])
        return parameters

    def step(self):
        parameters = self.get_parameters()
        for i, (param_list, m, v) in enumerate(zip(parameters, self.m, self.v)):
            param, param_grad, skip_decay = param_list
            self.t += 1
            m = self.beta1 * m + (1 - self.beta1) * param_grad
            v = self.beta2 * v + (1 - self.beta2) * np.power(param_grad, 2)
            self.m[i] = m
            self.v[i] = v
            m_hat = m / (1 - np.power(self.beta1, self.t))
            v_hat = v / (1 - np.power(self.beta2, self.t))

            update = self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)
            if not skip_decay:
                param -= update
                param *= (1 - self.lr * self.decoupled_weight_decay)
            else:
                param -= update

class SGDMomentum:
    def __init__(self, model, lr=0.01, momentum=0.9, weight_decay=0.0001):
        self.model = model
        self.lr = lr
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.v = [np.zeros(param[0].shape) for param in self.model.parameters]

    def get_parameters(self):
        parameters = []
        for layer in self.model.layers:
            if hasattr(layer, "W"):
                parameters.append([layer.W, layer.W_grad, layer.skip_decay])
            if hasattr(layer, "b"):
                parameters.append([layer.b, layer.b_grad, layer.skip_decay])
            if hasattr(layer, "gamma"):
                parameters.append([layer.gamma, layer.gamma_grad, layer.skip_decay])
            if hasattr(layer, "beta"):
                parameters.append([layer.beta, layer.beta_grad, layer.skip_decay])
        return parameters

    def step(self):
        self.parameters = self.get_parameters()
        for i, (v, param_list) in enumerate(zip(self.v, self.parameters)):
            param, param_grad, skip_decay = param_list
            if param_grad is not None:
                if not skip_decay:
                    param -= self.weight_decay * param
                v[:] = self.momentum * v + self.lr * param_grad
                self.v[i] = v
                param -= v

class AverageMeterics(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

class Trainer(object):
    def __init__(self, config, model=None, train_loader=None, valid_loader=None):
        self.config = config
        self.epochs = self.config['epoch']
        self.lr = self.config['lr']
        self.model = model
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.print_freq = self.config['print_freq']
        self.train_accuracy = []
        self.valid_accuracy = []
        self.train_loss = []
        self.valid_loss = []
        self.criterion = CrossEntropy()

        if self.config['optimizer'] == 'sgd':
            self.optimizer = SGDMomentum(self.model, self.lr, self.config['momentum'],
                                       self.config['weight_decay'])
        elif self.config['optimizer'] == 'adamw':
            self.optimizer = AdamW(self.model, self.lr, self.config['weight_decay'])

    @timer
    def train(self):
        best_accuracy = 0
        for epoch in range(self.epochs):
            print('current lr {:.5e}'.format(self.optimizer.lr))
            self.train_per_epoch(epoch)
            acc1 = self.validate(epoch)
            best_accuracy = max(acc1, best_accuracy)
            output_best = f'Best Accuracy: {best_accuracy:.4f}\n'
            print(output_best)

    def train_per_epoch(self, epoch):
        batch_time = AverageMeterics()
        losses = AverageMeterics()
        best_acc = AverageMeterics()
        self.model._fit()
        end_time = time.time()

        for i, (X, y) in enumerate(self.train_loader):
            y_hat = self.model._forward(X)
            loss = self.criterion(y_hat, y)

            self.model._backward(self.criterion.grad)
            self.optimizer.step()

            acc = accuracy(y_hat, y)
            losses.update(loss, X.shape[0])
            best_acc.update(acc, X.shape[0])

            batch_time.update(time.time() - end_time)
            end_time = time.time()

            if (i % self.print_freq == 0) or (i == len(self.train_loader)-1):
                print(f'Epoch: [{epoch + 1}][{i}/{len(self.train_loader) - 1}]\tTime {batch_time.val:.3f} (Avg-Time {batch_time.avg:.3f})\t '
                      f'Loss {losses.val:.4f} (Avg-Loss {losses.avg:.4f})\t'
                      f'Acc {best_acc.val:.4f} (Avg-Acc {best_acc.avg:.4f})')

        print(f'EPOCH: {epoch+1} train Results: Acc {best_acc.avg:.3f} Loss: {losses.avg:.4f}')
        self.train_loss.append(losses.avg)
        self.train_accuracy.append(best_acc.avg)

    def validate(self, epoch):
        batch_time = AverageMeterics()
        losses = AverageMeterics()
        best_acc = AverageMeterics()
        self.model._fit(mode='eval')
        end = time.time()

        for i, (X, y) in enumerate(self.valid_loader):
            y_hat = self.model._forward(X)
            loss = self.criterion(y_hat, y)
            acc = accuracy(y_hat, y)
            losses.update(loss, X.shape[0])
            best_acc.update(acc, X.shape[0])
            batch_time.update(time.time() - end)
            end = time.time()

            if (i % self.print_freq == 0) or (i == len(self.valid_loader) - 1):
                print(f'Epoch: [{epoch + 1}][{i}/{len(self.valid_loader) - 1}]\tTime {batch_time.val:.3f} (Avg-Time {batch_time.avg:.3f})\t '
                      f'Loss {losses.val:.4f} (Avg-Loss {losses.avg:.4f})\t'
                      f'Acc {best_acc.val:.4f} (Avg-Acc {best_acc.avg:.4f})')

        print(f'EPOCH: {epoch+1} Validation Results: Acc {best_acc.avg:.3f} Loss: {losses.avg:.4f}')
        self.valid_loss.append(losses.avg)
        self.valid_accuracy.append(best_acc.avg)
        return best_acc.avg

class Dataloader(object):
    def __init__(self, X, y, batch_size, shuffle=True, seed=None):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.seed = seed
        self.index = np.arange(X.shape[0])

    def __iter__(self):
        if self.shuffle:
            if self.seed is not None:
                np.random.seed(self.seed)
            np.random.shuffle(self.index)
        self.n = 0
        return self

    def __next__(self):
        if self.n >= len(self.index):
            raise StopIteration
        index = self.index[self.n:self.n + self.batch_size]
        batch_X = self.X[index]
        batch_y = self.y[index]
        self.n += self.batch_size
        return batch_X, batch_y

    def __len__(self):
        return (len(self.index) + self.batch_size - 1) // self.batch_size

def hyperparameter_tuning():
    # Define hyperparameter grid for exhaustive search
    param_grid = {
        'lr': [0.005, 0.01, 0.02],
        'batch_size': [512, 1024, 2048],
        'hidden_units': [
            [256, 128],  # Original best
            [512, 256],  # Larger network
            [256, 256],  # Equal size layers
            [128, 64]    # Smaller network
        ],
        'dropout_rates': [
            [0.3, 0.3],  # Original
            [0.2, 0.2],  # Less dropout
            [0.4, 0.4]   # More dropout
        ],
        'pre-process': [None, 'standardization'], #'min-max'
        'weight_decay': [5e-4, 1e-3],
        'optimizer': ['sgd', 'adamw']
    }

    # Generate all combinations
    keys, values = zip(*param_grid.items())
    configs = [dict(zip(keys, v)) for v in itertools.product(*values)]

    # Limit to reasonable number of combinations (optional)
    #configs = configs[:20]  # Test first 20 combinations for demo

    best_acc = 0
    best_config = {}
    results = []

    # Load data
    dir_path = ''
    train_data = np.load(dir_path + 'train_data.npy')
    train_label = np.load(dir_path + 'train_label.npy')
    test_X = np.load(dir_path + 'test_data.npy')
    test_label = np.load(dir_path + 'test_label.npy')

    # Split validation set
    train_X, val_X, train_y, val_y = train_test_split(train_data, train_label,
                                                     test_size=0.2, random_state=5329)

    for i, cfg in enumerate(configs):
        print(f"\n\n=== Testing Configuration {i+1}/{len(configs)} ===")
        print("Current Config:", cfg)

        # Set default values for missing params
        cfg.setdefault('epoch', 200)  # Reduced epochs for faster grid search
        cfg.setdefault('momentum', 0.9)
        cfg.setdefault('seed', 0)

        # Data preprocessing
        train_X_processed = pre_processing(train_X, cfg['pre-process'])
        val_X_processed = pre_processing(val_X, cfg['pre-process'])
        test_X_processed = pre_processing(test_X, cfg['pre-process'])

        # Create dataloaders
        train_dataloader = Dataloader(train_X_processed, train_y, cfg['batch_size'], shuffle=True)
        val_dataloader = Dataloader(val_X_processed, val_y, cfg['batch_size'], shuffle=False)
        test_dataloader = Dataloader(test_X_processed, test_label, cfg['batch_size'], shuffle=False)

        # Training config
        current_config = {
            'lr': cfg['lr'],
            'batch_size': cfg['batch_size'],
            'momentum': cfg['momentum'],
            'weight_decay': cfg['weight_decay'],
            'seed': cfg['seed'],
            'epoch': cfg['epoch'],
            'optimizer': cfg['optimizer'],
            'scheduler': None,
            'pre-process': cfg['pre-process'],
            'print_freq': max(1, 50000 // cfg['batch_size'] // 5)
        }

        # Initialize model
        model = MLP(hidden_units=cfg['hidden_units'],
                   dropout_rates=cfg['dropout_rates'])

        # Train with reduced verbosity
        trainer = Trainer(current_config, model, train_dataloader, val_dataloader)

        # Simple training without detailed prints
        best_val_acc = 0
        for epoch in range(current_config['epoch']):
            trainer.train_per_epoch(epoch)
            val_acc = trainer.validate(epoch)
            best_val_acc = max(val_acc, best_val_acc)

        # Final test evaluation
        model._fit(mode='eval')
        y_hat = model._predict(test_dataloader.X)
        test_acc = accuracy(y_hat, test_dataloader.y)

        # Store results
        results.append({
            'config': cfg,
            'val_accuracy': best_val_acc,
            'test_accuracy': test_acc
        })

        # Update best config
        if test_acc > best_acc:
            best_acc = test_acc
            best_config = cfg.copy()
            best_config['test_accuracy'] = test_acc

        print(f"\nTest Accuracy: {test_acc:.2f}% (Best so far: {best_acc:.2f}%)")

    # Results analysis
    print("\n\n=== Grid Search Results ===")
    print(f"Best Accuracy: {best_acc:.2f}%")
    print("Best Configuration:")
    for k, v in best_config.items():
        print(f"{k:>15}: {v}")


    results_df = pd.DataFrame(results)

    print(results_df)

if __name__ == "__main__":
    hyperparameter_tuning()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: [61][19/78]	Time 0.047 (Avg-Time 0.081)	 Loss 1.1043 (Avg-Loss 1.0325)	Acc 61.7188 (Avg-Acc 63.1836)
Epoch: [61][38/78]	Time 0.047 (Avg-Time 0.066)	 Loss 0.9973 (Avg-Loss 1.0279)	Acc 65.0391 (Avg-Acc 63.5517)
Epoch: [61][57/78]	Time 0.052 (Avg-Time 0.060)	 Loss 1.0339 (Avg-Loss 1.0319)	Acc 65.6250 (Avg-Acc 63.5237)
Epoch: [61][76/78]	Time 0.048 (Avg-Time 0.058)	 Loss 1.0527 (Avg-Loss 1.0382)	Acc 61.9141 (Avg-Acc 63.2787)
Epoch: [61][78/78]	Time 0.009 (Avg-Time 0.057)	 Loss 1.1588 (Avg-Loss 1.0382)	Acc 54.6875 (Avg-Acc 63.2875)
EPOCH: 61 train Results: Acc 63.288 Loss: 1.0382
Epoch: [61][0/19]	Time 0.014 (Avg-Time 0.014)	 Loss 1.1804 (Avg-Loss 1.1804)	Acc 58.2031 (Avg-Acc 58.2031)
Epoch: [61][19/19]	Time 0.007 (Avg-Time 0.012)	 Loss 1.2617 (Avg-Loss 1.2250)	Acc 51.8382 (Avg-Acc 56.7500)
EPOCH: 61 Validation Results: Acc 56.750 Loss: 1.2250
Epoch: [62][0/78]	Time 0.050 (Avg-Time 0.050)	 Loss 0.9639 (Avg-Loss 0.9639)	

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import time
import seaborn as sns
import math
from sklearn.model_selection import train_test_split

def timer(func):
    def wrapper(*args, **kwargs):
        print('Start time: ', time.ctime())
        start_time = time.time()  # start time
        result = func(*args, **kwargs)  # run
        end_time = time.time()  # end time
        print('End time: ', time.ctime())
        print(f"{func.__name__} executed in {(end_time - start_time):.4f} seconds")
        return result
    return wrapper

def pre_processing(X, mode=None):
    if mode == 'min-max':
        print('Pre-process: min-max normalization')
        min_each_feature = np.min(X, axis=0)
        max_each_feature = np.max(X, axis=0)
        scale = max_each_feature - min_each_feature
        scale[scale == 0] = 1   # To avoid divided by 0
        scaled_train = (X - min_each_feature) / scale
        return scaled_train

    elif mode == 'standardization':
        print('Pre-process: standardization')
        std_each_feature = np.std(X, axis=0)
        mean_each_feature = np.mean(X, axis=0)
        std_each_feature[std_each_feature == 0] = 1     # To avoid divided by 0
        norm_train = (X - mean_each_feature) / std_each_feature
        return norm_train

    else:
        print('No pre-process')
    return X

def accuracy(y_hat, y):
    preds = y_hat.argmax(axis=1, keepdims=True)
    return np.mean(preds == y) * 100

def calculate_gain(nonlinearity, param=None):
    gains = {
        'sigmoid': 1.0,
        'tanh': 5.0 / 3,
        'relu': math.sqrt(2.0),
        'selu': 3.0 / 4
    }

    if nonlinearity in gains:
        return gains[nonlinearity]

    if nonlinearity == 'leaky_relu':
        negative_slope = param if isinstance(param, (int, float)) and not isinstance(param, bool) else 0.01
        return math.sqrt(2.0 / (1 + negative_slope ** 2))

    raise ValueError(f"Unsupported nonlinearity: {nonlinearity}")

def calculate_fan(array):
    if array.ndim < 2:
        raise ValueError("Fan in and fan out require at least 2D tensors")

    fan_in = array.shape[1] * np.prod(array.shape[2:]) if array.ndim > 2 else array.shape[1]
    fan_out = array.shape[0] * np.prod(array.shape[2:]) if array.ndim > 2 else array.shape[0]

    return fan_in, fan_out

def get_correct_fan(array, mode):
    mode = mode.lower()
    if mode not in {'fan_in', 'fan_out'}:
        raise ValueError("Mode must be 'fan_in' or 'fan_out'")

    fan_in, fan_out = calculate_fan(array)
    return fan_in if mode == 'fan_in' else fan_out

def kaiming_normal(array: np.ndarray, a: float = 0, mode: str = 'fan_in', nonlinearity: str = 'relu'):
    fan = get_correct_fan(array, mode)
    gain = calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    return np.random.normal(0, std, array.shape)

class Layer(object):
    def __init__(self, name, requires_grad=False):
        self.name = name
        self.requires_grad = requires_grad
        self.train = True  # Added train mode flag

    def _forward(self, *args):
        pass

    def _backward(self, *args):
        pass

    def _fit(self, mode='train'):
        self.train = mode == 'train'

class ReLU(Layer):
    def __init__(self, name, requires_grad=False):
        super().__init__(name, requires_grad)

    def _forward(self, x):
        self.x = x
        return np.maximum(0, x)

    def _backward(self, gradient_output):
        gradient_output[self.x <= 0] = 0
        return gradient_output

class FCLayer(Layer):
    def __init__(self, name: str, n_in: int, n_out: int, skip_decay=False) -> None:
        super().__init__(name, requires_grad=True)
        self.n_in = n_in
        self.n_out = n_out
        W = kaiming_normal(np.array([0] * n_in * n_out).reshape(n_in, n_out), a=math.sqrt(5))
        self.W = W
        self.b = np.zeros(self.n_out)
        self.W_grad = None
        self.b_grad = None
        self.skip_decay = skip_decay

    def _forward(self, x: np.ndarray) -> np.ndarray:
        self.x = x
        temp = x @ self.W + self.b
        return temp

    def _backward(self, delta: np.ndarray) -> np.ndarray:
        batch_size = delta.shape[0]
        self.W_grad = self.x.T @ delta / batch_size
        self.b_grad = delta.sum(axis=0) / batch_size
        return delta @ self.W.T

class Softmax(Layer):
    def __init__(self, name, requires_grad=False):
        super().__init__(name, requires_grad)

    def _forward(self, x: np.ndarray) -> np.ndarray:
        x_exp = np.exp(x - np.max(x, axis=1, keepdims=True))
        return x_exp/x_exp.sum(axis=1, keepdims=True)

    def _backward(self, gradient_output: np.ndarray) -> np.ndarray:
        return gradient_output

class CrossEntropy(object):
    def __init__(self):
        self.softmax = Softmax('softmax')

    def __call__(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        self.batch_size = x.shape[0]
        self.class_num = x.shape[1]
        y_hat = self.softmax._forward(x)
        y = self.one_hot_encoding(y)
        self.grad = y_hat - y
        loss = -1 * (y * np.log(y_hat + 1e-8)).sum() / self.batch_size
        return loss

    def one_hot_encoding(self, x):
        one_hot_encoded = np.zeros((self.batch_size, self.class_num))
        one_hot_encoded[np.arange(x.shape[0]), x.flatten()] = 1
        return one_hot_encoded

class BatchNormalization(Layer):
    def __init__(self, name, feature_num, skip_decay=True, epsilon=1e-5, requires_grad=True):
        super().__init__(name)
        self.epsilon = epsilon
        self.requires_grad = requires_grad
        self.skip_decay = skip_decay
        self.gamma = np.ones(feature_num)
        self.beta = np.zeros(feature_num)
        self.gamma_grad = None
        self.beta_grad = None
        self.ema = np.zeros(feature_num)
        self.emv = np.zeros(feature_num)

    def _forward(self, x: np.ndarray) -> np.ndarray:
        if self.train:
            batch_mean = x.mean(axis=0)
            batch_variance = x.var(axis=0)
            batch_std = np.sqrt(batch_variance + self.epsilon)
            momentum = 0.9
            self.ema = momentum * self.ema + (1 - momentum) * batch_mean
            self.emv = momentum * self.emv + (1 - momentum) * batch_variance
        else:
            batch_mean = self.ema
            batch_std = np.sqrt(self.emv + self.epsilon)
        self.norm = (x - batch_mean) / batch_std
        self.gamma_norm = self.gamma / batch_std
        return self.gamma * self.norm + self.beta

    def _backward(self, gradient_output: np.ndarray) -> np.ndarray:
        batch_size = gradient_output.shape[0]
        self.gamma_grad = (gradient_output * self.norm).sum(axis=0) / batch_size
        self.beta_grad = gradient_output.sum(axis=0) / batch_size
        dLdx = self.gamma_norm * (gradient_output - self.norm * self.gamma_grad - self.beta_grad)
        return dLdx

class Dropout(Layer):
    def __init__(self, name, drop_rate=0.5, requires_grad=False):
        super().__init__(name, requires_grad)
        self.drop_rate = drop_rate
        self.fix_value = 1 / (1 - self.drop_rate)

    def _forward(self, x):
        if self.train:
            self.mask = np.random.uniform(0, 1, x.shape) > self.drop_rate
            return x * self.mask * self.fix_value
        else:
            return x

    def _backward(self, grad_output):
        if self.train:
            return grad_output * self.mask
        else:
            return grad_output

class MLP():
    def __init__(self, hidden_units=[256, 128], dropout_rates=[0.3, 0.3]):
        self.layers = []
        # Input layer
        self.layers.append(FCLayer('fc1', n_in=128, n_out=hidden_units[0]))
        self.layers.append(BatchNormalization("batchnorm1", feature_num=hidden_units[0]))
        self.layers.append(Dropout('dropout1', drop_rate=dropout_rates[0]))
        self.layers.append(ReLU('relu1'))

        # Hidden layers
        for i in range(1, len(hidden_units)):
            self.layers.append(FCLayer(f'fc{i+1}', n_in=hidden_units[i-1], n_out=hidden_units[i]))
            self.layers.append(BatchNormalization(f"batchnorm{i+1}", feature_num=hidden_units[i]))
            self.layers.append(Dropout(f'dropout{i+1}', drop_rate=dropout_rates[i]))
            self.layers.append(ReLU(f'relu{i+1}'))

        # Output layer
        self.layers.append(FCLayer(f'fc{len(hidden_units)+1}', n_in=hidden_units[-1], n_out=10))

        # Initialize parameters
        self.parameters = []
        for layer in self.layers:
            if hasattr(layer, "W"):
                self.parameters.append([layer.W, layer.W_grad, layer.skip_decay])
            if hasattr(layer, "b"):
                self.parameters.append([layer.b, layer.b_grad, layer.skip_decay])
            if hasattr(layer, "gamma"):
                self.parameters.append([layer.gamma, layer.gamma_grad, layer.skip_decay])
            if hasattr(layer, "beta"):
                self.parameters.append([layer.beta, layer.beta_grad, layer.skip_decay])

    def _forward(self, x: np.ndarray) -> np.ndarray:
        for layer in self.layers:
            x = layer._forward(x)
        return x

    def _backward(self, gradient_output: np.ndarray) -> np.ndarray:
        for layer in self.layers[::-1]:
            gradient_output = layer._backward(gradient_output)
        return gradient_output

    def _fit(self, mode='train'):
        for layer in self.layers:
            layer._fit(mode)

    def _predict(self, x: np.ndarray) -> np.ndarray:
        self._fit('eval')
        y_hat = self._forward(x)
        return y_hat

class AdamW(object):
    def __init__(self, model, lr=1e-3, decoupled_weight_decay=0, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.model = model
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.decoupled_weight_decay = decoupled_weight_decay
        self.epsilon = epsilon
        self.t = 0
        self.m = [np.zeros(p[0].shape) for p in self.get_parameters()]
        self.v = [np.zeros(p[0].shape) for p in self.get_parameters()]

    def get_parameters(self):
        parameters = []
        for layer in self.model.layers:
            if hasattr(layer, "W"):
                parameters.append([layer.W, layer.W_grad, layer.skip_decay])
            if hasattr(layer, "b"):
                parameters.append([layer.b, layer.b_grad, layer.skip_decay])
            if hasattr(layer, "gamma"):
                parameters.append([layer.gamma, layer.gamma_grad, layer.skip_decay])
            if hasattr(layer, "beta"):
                parameters.append([layer.beta, layer.beta_grad, layer.skip_decay])
        return parameters

    def step(self):
        parameters = self.get_parameters()
        for i, (param_list, m, v) in enumerate(zip(parameters, self.m, self.v)):
            param, param_grad, skip_decay = param_list
            self.t += 1
            m = self.beta1 * m + (1 - self.beta1) * param_grad
            v = self.beta2 * v + (1 - self.beta2) * np.power(param_grad, 2)
            self.m[i] = m
            self.v[i] = v
            m_hat = m / (1 - np.power(self.beta1, self.t))
            v_hat = v / (1 - np.power(self.beta2, self.t))

            update = self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)
            if not skip_decay:
                param -= update
                param *= (1 - self.lr * self.decoupled_weight_decay)
            else:
                param -= update

class SGDMomentum:
    def __init__(self, model, lr=0.01, momentum=0.9, weight_decay=0.0001):
        self.model = model
        self.lr = lr
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.v = [np.zeros(param[0].shape) for param in self.model.parameters]

    def get_parameters(self):
        parameters = []
        for layer in self.model.layers:
            if hasattr(layer, "W"):
                parameters.append([layer.W, layer.W_grad, layer.skip_decay])
            if hasattr(layer, "b"):
                parameters.append([layer.b, layer.b_grad, layer.skip_decay])
            if hasattr(layer, "gamma"):
                parameters.append([layer.gamma, layer.gamma_grad, layer.skip_decay])
            if hasattr(layer, "beta"):
                parameters.append([layer.beta, layer.beta_grad, layer.skip_decay])
        return parameters

    def step(self):
        self.parameters = self.get_parameters()
        for i, (v, param_list) in enumerate(zip(self.v, self.parameters)):
            param, param_grad, skip_decay = param_list
            if param_grad is not None:
                if not skip_decay:
                    param -= self.weight_decay * param
                v[:] = self.momentum * v + self.lr * param_grad
                self.v[i] = v
                param -= v

class AverageMeterics(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

class Trainer(object):
    def __init__(self, config, model=None, train_loader=None, valid_loader=None):
        self.config = config
        self.epochs = self.config['epoch']
        self.lr = self.config['lr']
        self.model = model
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.print_freq = self.config['print_freq']
        self.train_accuracy = []
        self.valid_accuracy = []
        self.train_loss = []
        self.valid_loss = []
        self.criterion = CrossEntropy()

        if self.config['optimizer'] == 'sgd':
            self.optimizer = SGDMomentum(self.model, self.lr, self.config['momentum'],
                                       self.config['weight_decay'])
        elif self.config['optimizer'] == 'adamw':
            self.optimizer = AdamW(self.model, self.lr, self.config['weight_decay'])

    @timer
    def train(self):
        best_accuracy = 0
        for epoch in range(self.epochs):
            print('current lr {:.5e}'.format(self.optimizer.lr))
            self.train_per_epoch(epoch)
            acc1 = self.validate(epoch)
            best_accuracy = max(acc1, best_accuracy)
            output_best = f'Best Accuracy: {best_accuracy:.4f}\n'
            print(output_best)

    def train_per_epoch(self, epoch):
        batch_time = AverageMeterics()
        losses = AverageMeterics()
        best_acc = AverageMeterics()
        self.model._fit()
        end_time = time.time()

        for i, (X, y) in enumerate(self.train_loader):
            y_hat = self.model._forward(X)
            loss = self.criterion(y_hat, y)

            self.model._backward(self.criterion.grad)
            self.optimizer.step()

            acc = accuracy(y_hat, y)
            losses.update(loss, X.shape[0])
            best_acc.update(acc, X.shape[0])

            batch_time.update(time.time() - end_time)
            end_time = time.time()

            if (i % self.print_freq == 0) or (i == len(self.train_loader)-1):
                print(f'Epoch: [{epoch + 1}][{i}/{len(self.train_loader) - 1}]\tTime {batch_time.val:.3f} (Avg-Time {batch_time.avg:.3f})\t '
                      f'Loss {losses.val:.4f} (Avg-Loss {losses.avg:.4f})\t'
                      f'Acc {best_acc.val:.4f} (Avg-Acc {best_acc.avg:.4f})')

        print(f'EPOCH: {epoch+1} train Results: Acc {best_acc.avg:.3f} Loss: {losses.avg:.4f}')
        self.train_loss.append(losses.avg)
        self.train_accuracy.append(best_acc.avg)

    def validate(self, epoch):
        batch_time = AverageMeterics()
        losses = AverageMeterics()
        best_acc = AverageMeterics()
        self.model._fit(mode='eval')
        end = time.time()

        for i, (X, y) in enumerate(self.valid_loader):
            y_hat = self.model._forward(X)
            loss = self.criterion(y_hat, y)
            acc = accuracy(y_hat, y)
            losses.update(loss, X.shape[0])
            best_acc.update(acc, X.shape[0])
            batch_time.update(time.time() - end)
            end = time.time()

            if (i % self.print_freq == 0) or (i == len(self.valid_loader) - 1):
                print(f'Epoch: [{epoch + 1}][{i}/{len(self.valid_loader) - 1}]\tTime {batch_time.val:.3f} (Avg-Time {batch_time.avg:.3f})\t '
                      f'Loss {losses.val:.4f} (Avg-Loss {losses.avg:.4f})\t'
                      f'Acc {best_acc.val:.4f} (Avg-Acc {best_acc.avg:.4f})')

        print(f'EPOCH: {epoch+1} Validation Results: Acc {best_acc.avg:.3f} Loss: {losses.avg:.4f}')
        self.valid_loss.append(losses.avg)
        self.valid_accuracy.append(best_acc.avg)
        return best_acc.avg

class Dataloader(object):
    def __init__(self, X, y, batch_size, shuffle=True, seed=None):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.seed = seed
        self.index = np.arange(X.shape[0])

    def __iter__(self):
        if self.shuffle:
            if self.seed is not None:
                np.random.seed(self.seed)
            np.random.shuffle(self.index)
        self.n = 0
        return self

    def __next__(self):
        if self.n >= len(self.index):
            raise StopIteration
        index = self.index[self.n:self.n + self.batch_size]
        batch_X = self.X[index]
        batch_y = self.y[index]
        self.n += self.batch_size
        return batch_X, batch_y

    def __len__(self):
        return (len(self.index) + self.batch_size - 1) // self.batch_size

def hyperparameter_tuning():
    # Define hyperparameter search space
    configs = [
        {
            'lr': 0.005,
            'batch_size': 512,
            'hidden_units': [256, 128],
            'dropout_rates': [0.3, 0.3],
            'pre-process': None,
            'epoch': 200,
            'weight_decay': 5e-4,
            'momentum': 0.9,
            'optimizer': 'sgd'
        },
        {
            'lr': 0.01,
            'batch_size': 1024,
            'hidden_units': [256, 128],
            'dropout_rates': [0.25, 0.25],
            'pre-process': 'standardization',
            'epoch': 200,
            'weight_decay': 5e-4,
            'momentum': 0.9,
            'optimizer': 'sgd'
        },
        {
            'lr': 0.02,
            'batch_size': 2048,
            'hidden_units': [512, 256],
            'dropout_rates': [0.4, 0.4],
            'pre-process': None,
            'epoch': 200,
            'weight_decay': 5e-4,
            'momentum': 0.9,
            'optimizer': 'sgd'
        },
        {
            'lr': 0.01,
            'batch_size': 1024,
            'hidden_units': [256, 256],
            'dropout_rates': [0.3, 0.3],
            'pre-process': None,
            'epoch': 200,
            'weight_decay': 5e-4,
            'momentum': 0.9,
            'optimizer': 'sgd'
        }
    ]

    best_acc = 0
    best_config = {}

    # Load data
    dir_path = ''
    train_data = np.load(dir_path + 'train_data.npy')
    train_label = np.load(dir_path + 'train_label.npy')
    test_X = np.load(dir_path + 'test_data.npy')
    test_label = np.load(dir_path + 'test_label.npy')

    # Split validation set
    train_X, val_X, train_y, val_y = train_test_split(train_data, train_label,
                                                     test_size=0.2, random_state=5329)

    for cfg in configs:
        print("\n\n=== Testing New Configuration ===")
        print(f"Config: {cfg}")

        # Data preprocessing
        train_X_processed = pre_processing(train_X, cfg['pre-process'])
        val_X_processed = pre_processing(val_X, cfg['pre-process'])
        test_X_processed = pre_processing(test_X, cfg['pre-process'])

        # Create dataloaders
        train_dataloader = Dataloader(train_X_processed, train_y, cfg['batch_size'], shuffle=True)
        val_dataloader = Dataloader(val_X_processed, val_y, cfg['batch_size'], shuffle=False)
        test_dataloader = Dataloader(test_X_processed, test_label, cfg['batch_size'], shuffle=False)

        # Update config
        current_config = {
            'lr': cfg['lr'],
            'batch_size': cfg['batch_size'],
            'momentum': cfg['momentum'],
            'weight_decay': cfg['weight_decay'],
            'seed': 0,
            'epoch': cfg['epoch'],
            'optimizer': cfg['optimizer'],
            'scheduler': None,
            'pre-process': cfg['pre-process'],
            'print_freq': 50000 // cfg['batch_size'] // 5
        }

        # Initialize model with current config
        model = MLP(hidden_units=cfg['hidden_units'],
                   dropout_rates=cfg['dropout_rates'])

        # Train
        trainer = Trainer(current_config, model, train_dataloader, val_dataloader)
        trainer.train()

        # Evaluate on test set
        model._fit(mode='eval')
        y_hat = model._predict(test_dataloader.X)
        test_acc = accuracy(y_hat, test_dataloader.y)

        print(f"\nTest Accuracy for Config: {test_acc:.2f}%")

        # Update best configuration
        if test_acc > best_acc:
            best_acc = test_acc
            best_config = cfg

    print("\n\n=== Tuning Results ===")
    print(f"Best Accuracy: {best_acc:.2f}%")
    print("Best Configuration:")
    print(best_config)

if __name__ == "__main__":
    hyperparameter_tuning()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: [18][0/19]	Time 0.226 (Avg-Time 0.226)	 Loss 1.7518 (Avg-Loss 1.7518)	Acc 37.7930 (Avg-Acc 37.7930)
Epoch: [18][4/19]	Time 0.185 (Avg-Time 0.197)	 Loss 1.7319 (Avg-Loss 1.7695)	Acc 38.9648 (Avg-Acc 37.8711)
Epoch: [18][8/19]	Time 0.183 (Avg-Time 0.196)	 Loss 1.8245 (Avg-Loss 1.7747)	Acc 37.2559 (Avg-Acc 38.0263)
Epoch: [18][12/19]	Time 0.403 (Avg-Time 0.222)	 Loss 1.7969 (Avg-Loss 1.7744)	Acc 37.3047 (Avg-Acc 37.7441)
Epoch: [18][16/19]	Time 0.395 (Avg-Time 0.288)	 Loss 1.7201 (Avg-Loss 1.7698)	Acc 40.4785 (Avg-Acc 38.0256)
Epoch: [18][19/19]	Time 0.099 (Avg-Time 0.284)	 Loss 1.8160 (Avg-Loss 1.7684)	Acc 35.3860 (Avg-Acc 38.0350)
EPOCH: 18 train Results: Acc 38.035 Loss: 1.7684
Epoch: [18][0/4]	Time 0.047 (Avg-Time 0.047)	 Loss 1.6059 (Avg-Loss 1.6059)	Acc 45.1172 (Avg-Acc 45.1172)
Epoch: [18][4/4]	Time 0.053 (Avg-Time 0.049)	 Loss 1.5981 (Avg-Loss 1.5900)	Acc 44.3584 (Avg-Acc 44.6800)
EPOCH: 18 Validation Results: