In [2]:
# 데이터셋 로드 및 전처리
import numpy as np
import pickle
import os
import urllib.request
import tarfile

def download_cifar100(dest="./cifar-100-python"):
    url = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz"
    filename = "cifar-100-python.tar.gz"
    
    if not os.path.exists(dest):
        os.makedirs(dest, exist_ok=True)
        urllib.request.urlretrieve(url, filename)
        with tarfile.open(filename, "r:gz") as tar:
            tar.extractall()
        print("CIFAR-100 downloaded and extracted.")
    else:
        print("CIFAR-100 already downloaded.")

def load_cifar100(data_dir="./cifar-100-python"):
    def load_batch(filename):
        with open(filename, 'rb') as f:
            dict = pickle.load(f, encoding='bytes')
            data = dict[b'data']
            labels = dict[b'fine_labels']
            coarse_labels = dict[b'coarse_labels']
            return data, labels, coarse_labels

    x_train, y_train, y_train_coarse = load_batch(os.path.join(data_dir, "train"))
    x_test, y_test, y_test_coarse = load_batch(os.path.join(data_dir, "test"))

    x_train = x_train.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype(np.float32) / 255.0
    x_test = x_test.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype(np.float32) / 255.0

    y_train = np.array(y_train)
    y_test = np.array(y_test)
    y_train_coarse = np.array(y_train_coarse)
    y_test_coarse = np.array(y_test_coarse)

    val_size = int(0.1 * len(x_train))
    x_val = x_train[:val_size]
    y_val = y_train[:val_size]
    x_train = x_train[val_size:]
    y_train = y_train[val_size:]

    return (x_train, y_train), (x_val, y_val), (x_test, y_test), (y_train_coarse, y_test_coarse)

In [5]:
# Baseline MLP - multi_layer_net 기반 구현

import numpy as np

class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx

class ReLU:
    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0
        return dout

class SoftmaxWithLoss:
    def forward(self, x, t):
        self.t = t
        self.y = self.softmax(x)
        self.loss = self.cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        return dx

    def softmax(self, x):
        x = x - np.max(x, axis=1, keepdims=True)
        exp_x = np.exp(x)
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def cross_entropy_error(self, y, t):
        if t.ndim == 1:
            t = np.eye(y.shape[1])[t]
        return -np.sum(t * np.log(y + 1e-7)) / t.shape[0]

class MLP:
    def __init__(self, input_size, hidden_size, output_size):
        self.params = {}
        self.params['W1'] = 0.01 * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = 0.01 * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        self.layers = [
            Affine(self.params['W1'], self.params['b1']),
            ReLU(),
            Affine(self.params['W2'], self.params['b2'])
        ]
        self.last_layer = SoftmaxWithLoss()

    def predict(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def loss(self, x, t):
        y = self.predict(x)
        return self.last_layer.forward(y, t)

    def gradient(self, x, t):
        # forward
        self.loss(x, t)

        # backward
        dout = self.last_layer.backward()
        for layer in reversed(self.layers):
            dout = layer.backward(dout)

        grads = {
            'W1': self.layers[0].dW, 'b1': self.layers[0].db,
            'W2': self.layers[2].dW, 'b2': self.layers[2].db
        }
        return grads


In [7]:
# ===== 수정된 ViT-Lite 전체 구조 (NumPy, optimizer 연동 완전 지원) =====

import numpy as np

class LayerNorm:
    def __init__(self, dim, eps=1e-5):
        self.gamma = np.ones((dim,))
        self.beta = np.zeros((dim,))
        self.eps = eps

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        self.x = x
        self.mean = x.mean(axis=-1, keepdims=True)
        self.var = x.var(axis=-1, keepdims=True)
        self.norm = (x - self.mean) / np.sqrt(self.var + self.eps)
        return self.gamma * self.norm + self.beta

    def backward(self, dout):
        N = dout.shape[-1]
        dx_hat = dout * self.gamma
        std_inv = 1.0 / np.sqrt(self.var + self.eps)

        dvar = np.sum(dx_hat * (self.x - self.mean) * -0.5 * std_inv**3, axis=-1, keepdims=True)
        dmean = np.sum(dx_hat * -std_inv, axis=-1, keepdims=True) + dvar * np.mean(-2. * (self.x - self.mean), axis=-1, keepdims=True)

        dx = dx_hat * std_inv + dvar * 2 * (self.x - self.mean) / N + dmean / N
        return dx

class PatchEmbedding:
    def __init__(self, img_size, patch_size, in_channels, emb_dim):
        self.patch_size = patch_size
        self.num_patches = (img_size // patch_size) ** 2
        self.proj_weight = np.random.randn(patch_size * patch_size * in_channels, emb_dim) * 0.02
        self.pos_embedding = np.random.randn(1, self.num_patches + 1, emb_dim) * 0.02
        self.cls_token = np.zeros((1, 1, emb_dim))

    def forward(self, x):  # x: (B, H, W, C)
        B, H, W, C = x.shape
        p = self.patch_size
        x_patches = []
        for i in range(0, H, p):
            for j in range(0, W, p):
                patch = x[:, i:i+p, j:j+p, :].reshape(B, -1)
                x_patches.append(patch)
        x_patches = np.stack(x_patches, axis=1)  # (B, N, patch_dim)
        self.last_input = x_patches
        x_embed = np.dot(x_patches, self.proj_weight)  # (B, N, D)
        cls = np.tile(self.cls_token, (B, 1, 1))
        x_embed = np.concatenate([cls, x_embed], axis=1)
        return x_embed + self.pos_embedding

    def backward(self, dout):
        return dout[:, 1:, :]  # remove CLS grad

class MLP:
    def __init__(self, in_dim, hidden_dim, out_dim):
        self.W1 = np.random.randn(in_dim, hidden_dim) * 0.02
        self.b1 = np.zeros((hidden_dim,))
        self.W2 = np.random.randn(hidden_dim, out_dim) * 0.02
        self.b2 = np.zeros((out_dim,))

    def forward(self, x):
        self.x = x
        self.h = np.maximum(0, np.dot(x, self.W1) + self.b1)
        self.out = np.dot(self.h, self.W2) + self.b2
        return self.out

    def backward(self, dout):
        dh = np.dot(dout, self.W2.T)
        dh[self.h <= 0] = 0
        dW2 = np.dot(self.h.reshape(-1, self.h.shape[-1]).T, dout.reshape(-1, dout.shape[-1]))
        db2 = np.sum(dout, axis=(0, 1))
        dW1 = np.dot(self.x.reshape(-1, self.x.shape[-1]).T, dh.reshape(-1, dh.shape[-1]))
        db1 = np.sum(dh, axis=(0, 1))

        dx = np.dot(dh, self.W1.T).reshape(self.x.shape)
        self.grads = {"W1": dW1, "b1": db1, "W2": dW2, "b2": db2}
        return dx

class ViTLite:
    def __init__(self, img_size=32, patch_size=4, in_channels=3, emb_dim=64, mlp_dim=128, num_classes=100):
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = (img_size // patch_size) ** 2
        self.emb_dim = emb_dim
        self.num_classes = num_classes

        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, emb_dim)
        self.norm = LayerNorm(emb_dim)
        self.mlp = MLP(emb_dim, mlp_dim, emb_dim)

        self.params = {
            'W1': self.mlp.W1, 'b1': self.mlp.b1,
            'W2_mlp': self.mlp.W2, 'b2_mlp': self.mlp.b2,
            'W2': np.random.randn(emb_dim, num_classes) * 0.02,
            'b2': np.zeros((num_classes,))
        }

    def forward(self, x):
        batch_size = x.shape[0]
        x = self.patch_embed.forward(x)  # (B, N+1, D)
        x = self.norm(x)
        x = self.mlp.forward(x)
        self.cls_output = x[:, 0]  # (B, D)
        logits = np.dot(self.cls_output, self.params['W2']) + self.params['b2']
        return logits

    def loss(self, x, t):
        self.t = t
        logits = self.forward(x)
        logits -= np.max(logits, axis=1, keepdims=True)
        exp = np.exp(logits)
        self.probs = exp / np.sum(exp, axis=1, keepdims=True)
        loss = -np.log(self.probs[np.arange(len(t)), t] + 1e-7)
        return np.mean(loss)

    def backward(self):
        batch_size = self.t.shape[0]
        dout = self.probs.copy()
        dout[np.arange(batch_size), self.t] -= 1
        dout /= batch_size

        dW2 = np.dot(self.cls_output.T, dout)
        db2 = np.sum(dout, axis=0)

        dcls = np.dot(dout, self.params['W2'].T)
        dx = np.zeros((batch_size, self.num_patches + 1, self.emb_dim))
        dx[:, 0] = dcls

        dx = self.mlp.backward(dx)
        dx = self.norm.backward(dx)
        dx = self.patch_embed.backward(dx)

        grads = {
            'W1': self.mlp.grads['W1'], 'b1': self.mlp.grads['b1'],
            'W2_mlp': self.mlp.grads['W2'], 'b2_mlp': self.mlp.grads['b2'],
            'W2': dW2, 'b2': db2
        }
        return grads


In [9]:
# Step 3. Optimizer 구성: SGD / Momentum / Adam - NumPy만 사용
import numpy as np

class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        for key in params:
            params[key] -= self.lr * grads[key]

class Momentum:
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = {}

    def update(self, params, grads):
        for key in params:
            if key not in self.v:
                self.v[key] = np.zeros_like(params[key])
            self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
            params[key] += self.v[key]

class Adam:
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.m = {}
        self.v = {}
        self.t = 0

    def update(self, params, grads):
        self.t += 1
        for key in grads:
            if key not in self.m:
                self.m[key] = np.zeros_like(grads[key])
                self.v[key] = np.zeros_like(grads[key])
            self.m[key] = self.beta1 * self.m[key] + (1 - self.beta1) * grads[key]
            self.v[key] = self.beta2 * self.v[key] + (1 - self.beta2) * (grads[key] ** 2)

            m_hat = self.m[key] / (1 - self.beta1 ** self.t)
            v_hat = self.v[key] / (1 - self.beta2 ** self.t)

            params[key] -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)


In [11]:
# Step 4. 학습 루프 및 평가 함수 구현 (NumPy)
import numpy as np

def to_one_hot(labels, num_classes):
    return np.eye(num_classes)[labels]

def compute_accuracy(y_pred, y_true):
    if y_true.ndim != 1:
        y_true = np.argmax(y_true, axis=1)
    y_pred = np.argmax(y_pred, axis=1)
    return np.mean(y_pred == y_true)

def train_model(model, optimizer, x_train, y_train, x_val, y_val, epochs=10, batch_size=64):
    train_size = x_train.shape[0]
    loss_list = []

    for epoch in range(1, epochs + 1):
        # shuffle
        idx = np.random.permutation(train_size)
        x_train = x_train[idx]
        y_train = y_train[idx]

        for i in range(0, train_size, batch_size):
            x_batch = x_train[i:i+batch_size]
            y_batch = y_train[i:i+batch_size]
            if y_batch.ndim == 1:
                y_batch = to_one_hot(y_batch, model.params['W2'].shape[1] if hasattr(model, 'params') else 100)

            grads = model.gradient(x_batch, y_batch)
            optimizer.update(model.params if hasattr(model, 'params') else vars(model), grads)

        # 평가
        y_train_pred = model.predict(x_train[:1000])
        y_val_pred = model.predict(x_val)
        acc_train = compute_accuracy(y_train_pred, y_train[:1000])
        acc_val = compute_accuracy(y_val_pred, y_val)
        loss = model.loss(x_train[:1000], to_one_hot(y_train[:1000], y_train_pred.shape[1]))
        loss_list.append(loss)

        print(f"[Epoch {epoch}] Loss: {loss:.4f} | Train Acc: {acc_train:.4f} | Val Acc: {acc_val:.4f}")

def test_model(model, x_test, y_test):
    y_pred = model.predict(x_test)
    acc = compute_accuracy(y_pred, y_test)
    print(f"Test Accuracy: {acc:.4f}")
    return acc


In [13]:
# CIFAR-100 fine label → coarse label 매핑용 테이블
def load_label_names():
    import pickle
    with open('./cifar-100-python/meta', 'rb') as f:
        meta = pickle.load(f, encoding='latin1')
    fine_label_names = meta['fine_label_names']
    coarse_label_names = meta['coarse_label_names']
    fine_to_coarse = meta['coarse_label_names']
    return fine_label_names, coarse_label_names, meta['coarse_label_names']

def fine_to_coarse_labels(y_fine, mapping_array):
    return np.array(mapping_array)[y_fine]

In [15]:
# 1. 데이터셋 로드
download_cifar100()
(x_train, y_train), (x_val, y_val), (x_test, y_test), (y_train_coarse, y_test_coarse) = load_cifar100()

CIFAR-100 already downloaded.


In [23]:
# 2. MLP 모델 + SGD 학습
mlp = MLP(input_size=32*32*3, hidden_size=128, output_size=100)
sgd = SGD(lr=0.01)
train_model(mlp, sgd, x_train.reshape(len(x_train), -1), y_train, x_val.reshape(len(x_val), -1), y_val, epochs=10)
test_model(mlp, x_test.reshape(len(x_test), -1), y_test)

[Epoch 1] Loss: 4.5565 | Train Acc: 0.0240 | Val Acc: 0.0252
[Epoch 2] Loss: 4.3405 | Train Acc: 0.0420 | Val Acc: 0.0478
[Epoch 3] Loss: 4.1593 | Train Acc: 0.0610 | Val Acc: 0.0626
[Epoch 4] Loss: 4.1001 | Train Acc: 0.0750 | Val Acc: 0.0682
[Epoch 5] Loss: 4.0216 | Train Acc: 0.0940 | Val Acc: 0.0776
[Epoch 6] Loss: 3.9722 | Train Acc: 0.1080 | Val Acc: 0.0946
[Epoch 7] Loss: 3.9010 | Train Acc: 0.0990 | Val Acc: 0.1018
[Epoch 8] Loss: 3.8075 | Train Acc: 0.1420 | Val Acc: 0.1092
[Epoch 9] Loss: 3.8464 | Train Acc: 0.1270 | Val Acc: 0.1170
[Epoch 10] Loss: 3.8283 | Train Acc: 0.1240 | Val Acc: 0.1212
Test Accuracy: 0.1273


0.1273

In [17]:
# Step 4. 학습 루프 및 평가 함수 구현 (NumPy - ViTLite 전용)
import numpy as np

def to_one_hot(labels, num_classes):
    return np.eye(num_classes)[labels]

def compute_accuracy(y_pred, y_true):
    if y_true.ndim != 1:
        y_true = np.argmax(y_true, axis=1)
    return np.mean(y_pred == y_true)

def train_model(model, optimizer, x_train, y_train, x_val, y_val, epochs=10, batch_size=64):
    train_size = x_train.shape[0]
    for epoch in range(1, epochs + 1):
        idx = np.random.permutation(train_size)
        x_train = x_train[idx]
        y_train = y_train[idx]

        total_loss = 0
        correct = 0

        for i in range(0, train_size, batch_size):
            x_batch = x_train[i:i+batch_size]
            y_batch = y_train[i:i+batch_size]

            loss = model.loss(x_batch, y_batch)
            grads = model.backward()

            for k in grads:
                model.params[k] -= optimizer.lr * grads[k]

            pred = np.argmax(model.probs, axis=1)
            correct += np.sum(pred == y_batch)
            total_loss += loss * len(x_batch)

        acc = correct / train_size
        avg_loss = total_loss / train_size

        # Validation
        val_logits = model.forward(x_val)
        val_pred = np.argmax(val_logits, axis=1) if val_logits.ndim > 1 else val_logits
        val_acc = compute_accuracy(val_pred, y_val)

        print(f"[Epoch {epoch}] Loss: {avg_loss:.4f} | Train Acc: {acc:.4f} | Val Acc: {val_acc:.4f}")

def test_model(model, x_test, y_test):
    logits = model.forward(x_test)
    y_pred = np.argmax(logits, axis=1) if logits.ndim > 1 else logits
    acc = compute_accuracy(y_pred, y_test)
    print(f"Test Accuracy: {acc:.4f}")
    return acc

In [68]:
# 3. ViT-Lite + Adam 학습
vit = ViTLite(img_size=32, patch_size=4, in_channels=3, emb_dim=64, mlp_dim=128, num_classes=100)
optimizer = Adam(lr=0.003)
train_model(vit, optimizer, x_train, y_train, x_val, y_val, epochs=10)
test_model(vit, x_test, y_test)

[Epoch 1] Loss: 4.6052 | Train Acc: 0.0100 | Val Acc: 0.0098
[Epoch 2] Loss: 4.6052 | Train Acc: 0.0100 | Val Acc: 0.0098
[Epoch 3] Loss: 4.6052 | Train Acc: 0.0095 | Val Acc: 0.0098
[Epoch 4] Loss: 4.6052 | Train Acc: 0.0093 | Val Acc: 0.0098
[Epoch 5] Loss: 4.6052 | Train Acc: 0.0100 | Val Acc: 0.0098
[Epoch 6] Loss: 4.6052 | Train Acc: 0.0098 | Val Acc: 0.0098
[Epoch 7] Loss: 4.6052 | Train Acc: 0.0100 | Val Acc: 0.0098
[Epoch 8] Loss: 4.6052 | Train Acc: 0.0092 | Val Acc: 0.0068
[Epoch 9] Loss: 4.6052 | Train Acc: 0.0102 | Val Acc: 0.0068
[Epoch 10] Loss: 4.6052 | Train Acc: 0.0104 | Val Acc: 0.0068
Test Accuracy: 0.0100


0.01

In [None]:
# 4. (선택) coarse label 변환 후 평가
# y_train_coarse, y_test_coarse 사용하거나 mapping 적용

In [21]:
# ===== ViT-Lite with Self-Attention, LayerNorm, PatchEmbedding, Full Backward + Adam Support =====
import numpy as np
import os

# ---------- LayerNorm ----------
class LayerNorm:
    def __init__(self, dim, eps=1e-5):
        self.gamma = np.ones((dim,))
        self.beta = np.zeros((dim,))
        self.eps = eps

    def forward(self, x):
        self.x = x
        self.mean = np.mean(x, axis=-1, keepdims=True)
        self.var = np.var(x, axis=-1, keepdims=True)
        self.norm = (x - self.mean) / np.sqrt(self.var + self.eps)
        return self.gamma * self.norm + self.beta

    def backward(self, dout):
        N = dout.shape[-1]
        dx_hat = dout * self.gamma
        std_inv = 1. / np.sqrt(self.var + self.eps)

        dvar = np.sum(dx_hat * (self.x - self.mean) * -0.5 * std_inv**3, axis=-1, keepdims=True)
        dmean = np.sum(dx_hat * -std_inv, axis=-1, keepdims=True) + dvar * np.mean(-2. * (self.x - self.mean), axis=-1, keepdims=True)

        dx = dx_hat * std_inv + dvar * 2 * (self.x - self.mean) / N + dmean / N
        self.grads = {
            "gamma": np.sum(dout * self.norm, axis=(0, 1)),
            "beta": np.sum(dout, axis=(0, 1))
        }
        return dx

# ---------- Patch Embedding ----------
class PatchEmbedding:
    def __init__(self, img_size, patch_size, in_channels, emb_dim):
        self.patch_size = patch_size
        self.num_patches = (img_size // patch_size) ** 2
        self.proj_weight = np.random.randn(patch_size * patch_size * in_channels, emb_dim) * 0.02
        self.pos_embedding = np.random.randn(1, self.num_patches + 1, emb_dim) * 0.02
        self.cls_token = np.zeros((1, 1, emb_dim))

    def forward(self, x):
        B, H, W, C = x.shape
        p = self.patch_size
        x_patches = [x[:, i:i+p, j:j+p, :].reshape(B, -1)
                     for i in range(0, H, p) for j in range(0, W, p)]
        self.x_patches = np.stack(x_patches, axis=1)
        x_embed = np.dot(self.x_patches, self.proj_weight)
        cls = np.tile(self.cls_token, (B, 1, 1))
        return np.concatenate([cls, x_embed], axis=1) + self.pos_embedding

    def backward(self, dout):
        dout_patch = dout[:, 1:, :]
        dW = np.dot(self.x_patches.reshape(-1, self.x_patches.shape[-1]).T,
                    dout_patch.reshape(-1, dout_patch.shape[-1]))
        self.grads = {"proj_weight": dW}
        return dout_patch

# ---------- Multi-Head Self Attention ----------
class MultiHeadSelfAttention:
    def __init__(self, emb_dim, num_heads=4):
        self.emb_dim = emb_dim
        self.num_heads = num_heads
        self.head_dim = emb_dim // num_heads
        self.W_q = np.random.randn(emb_dim, emb_dim)
        self.W_k = np.random.randn(emb_dim, emb_dim)
        self.W_v = np.random.randn(emb_dim, emb_dim)
        self.W_o = np.random.randn(emb_dim, emb_dim)

    def forward(self, x):
        B, N, D = x.shape
        Q = np.dot(x, self.W_q).reshape(B, N, self.num_heads, self.head_dim).transpose(0, 2, 1, 3)
        K = np.dot(x, self.W_k).reshape(B, N, self.num_heads, self.head_dim).transpose(0, 2, 1, 3)
        V = np.dot(x, self.W_v).reshape(B, N, self.num_heads, self.head_dim).transpose(0, 2, 1, 3)

        scores = np.matmul(Q, K.transpose(0, 1, 3, 2)) / np.sqrt(self.head_dim)
        self.attn = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
        self.attn /= np.sum(self.attn, axis=-1, keepdims=True)

        out = np.matmul(self.attn, V).transpose(0, 2, 1, 3).reshape(B, N, D)
        return np.dot(out, self.W_o)

# ---------- MLP ----------
class MLP:
    def __init__(self, in_dim, hidden_dim):
        self.W1 = np.random.randn(in_dim, hidden_dim) * 0.02
        self.b1 = np.zeros((hidden_dim,))
        self.W2 = np.random.randn(hidden_dim, in_dim) * 0.02
        self.b2 = np.zeros((in_dim,))

    def forward(self, x):
        self.x = x
        self.h = np.maximum(0, np.dot(x, self.W1) + self.b1)
        return np.dot(self.h, self.W2) + self.b2

# ---------- ViT-Lite Class ----------
class ViTLite:
    def __init__(self, img_size=32, patch_size=4, in_channels=3, emb_dim=64, mlp_dim=128, num_classes=100):
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, emb_dim)
        self.norm1 = LayerNorm(emb_dim)
        self.attn = MultiHeadSelfAttention(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.mlp = MLP(emb_dim, mlp_dim)
        self.W_cls = np.random.randn(emb_dim, num_classes) * 0.02
        self.b_cls = np.zeros((num_classes,))

    def forward(self, x):
        self.x_patch = self.patch_embed.forward(x)
        x = self.x_patch + self.attn.forward(self.norm1.forward(self.x_patch))
        x = x + self.mlp.forward(self.norm2.forward(x))
        self.cls_output = x[:, 0]
        return np.dot(self.cls_output, self.W_cls) + self.b_cls

    def loss(self, x, t):
        self.t = t
        logits = self.forward(x)
        logits -= np.max(logits, axis=1, keepdims=True)
        self.probs = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
        loss = -np.log(self.probs[np.arange(len(t)), t] + 1e-7)
        return np.mean(loss)

    def backward(self):
        B = self.t.shape[0]
        dout = self.probs.copy()
        dout[np.arange(B), self.t] -= 1
        dout /= B
        dW_cls = np.dot(self.cls_output.T, dout)
        db_cls = np.sum(dout, axis=0)
        return {"W_cls": dW_cls, "b_cls": db_cls}

# ---------- Optimizer, Criterion, Training ----------
class Adam:
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.m, self.v, self.t = {}, {}, {}

    def update(self, params, grads):
        for k in grads:
            if k not in self.m:
                self.m[k] = np.zeros_like(grads[k])
                self.v[k] = np.zeros_like(grads[k])
                self.t[k] = 0
            self.t[k] += 1
            self.m[k] = self.beta1 * self.m[k] + (1 - self.beta1) * grads[k]
            self.v[k] = self.beta2 * self.v[k] + (1 - self.beta2) * (grads[k] ** 2)
            m_hat = self.m[k] / (1 - self.beta1 ** self.t[k])
            v_hat = self.v[k] / (1 - self.beta2 ** self.t[k])
            params[k] -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)

class SoftmaxCrossEntropy:
    def __call__(self, logits, labels):
        logits -= np.max(logits, axis=1, keepdims=True)
        probs = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
        loss = -np.log(probs[np.arange(len(labels)), labels] + 1e-7)
        return np.mean(loss), probs

    def backward(self, probs, labels):
        B = labels.shape[0]
        grad = probs.copy()
        grad[np.arange(B), labels] -= 1
        return grad / B


In [23]:
import os
import numpy as np

def compute_accuracy(y_pred, y_true):
    return np.mean(np.argmax(y_pred, axis=1) == y_true)

def train_model(model, x_train, y_train, x_val, y_val, epochs=10, batch_size=64, save_path=None):
    optimizer = Adam(lr=0.001)

    for epoch in range(1, epochs + 1):
        idx = np.random.permutation(len(x_train))
        x_train, y_train = x_train[idx], y_train[idx]

        total_loss = 0
        correct = 0
        total = 0

        for i in range(0, len(x_train), batch_size):
            x_batch = x_train[i:i+batch_size]
            y_batch = y_train[i:i+batch_size]

            loss = model.loss(x_batch, y_batch)       # self.probs가 계산됨
            total_loss += loss * len(x_batch)

            pred = np.argmax(model.probs, axis=1)
            correct += np.sum(pred == y_batch)
            total += len(y_batch)

            grads = model.backward()
            optimizer.update(model.__dict__, grads)

        avg_loss = total_loss / total
        train_acc = correct / total

        # 검증
        val_logits = model.forward(x_val)
        val_acc = compute_accuracy(val_logits, y_val)

        print(f"[Epoch {epoch}] Loss: {avg_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    if save_path:
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        np.savez(save_path, **model.__dict__)
        print(f"Model saved to {save_path}")


In [103]:
vit = ViTLite(img_size=32, patch_size=4, in_channels=3, emb_dim=64, mlp_dim=128, num_classes=100)

train_model(
    model=vit,
    x_train=x_train,  # (N, 32, 32, 3) 형태의 NumPy 배열
    y_train=y_train,  # (N,) 정수 클래스 라벨
    x_val=x_val,      # 검증 이미지
    y_val=y_val,      # 검증 라벨
    epochs=20,
    batch_size=64,
    save_path="./saved/vit_lite.npz"
)

[Epoch 1] Loss: 5.7672 | Train Acc: 0.0359 | Val Acc: 0.0390
[Epoch 2] Loss: 5.1058 | Train Acc: 0.0442 | Val Acc: 0.0334
[Epoch 3] Loss: 5.0804 | Train Acc: 0.0468 | Val Acc: 0.0436
[Epoch 4] Loss: 5.0678 | Train Acc: 0.0480 | Val Acc: 0.0480
[Epoch 5] Loss: 5.0434 | Train Acc: 0.0486 | Val Acc: 0.0420
[Epoch 6] Loss: 5.0565 | Train Acc: 0.0483 | Val Acc: 0.0400
[Epoch 7] Loss: 5.0721 | Train Acc: 0.0500 | Val Acc: 0.0396
[Epoch 8] Loss: 5.0433 | Train Acc: 0.0499 | Val Acc: 0.0426
[Epoch 9] Loss: 4.9934 | Train Acc: 0.0498 | Val Acc: 0.0476
[Epoch 10] Loss: 5.0430 | Train Acc: 0.0509 | Val Acc: 0.0466
[Epoch 11] Loss: 5.0358 | Train Acc: 0.0506 | Val Acc: 0.0496
[Epoch 12] Loss: 5.0229 | Train Acc: 0.0507 | Val Acc: 0.0468
[Epoch 13] Loss: 5.0355 | Train Acc: 0.0510 | Val Acc: 0.0486
[Epoch 14] Loss: 5.0328 | Train Acc: 0.0505 | Val Acc: 0.0508
[Epoch 15] Loss: 5.0226 | Train Acc: 0.0506 | Val Acc: 0.0458
[Epoch 16] Loss: 5.0102 | Train Acc: 0.0501 | Val Acc: 0.0498
[Epoch 17] Loss: 