## CIFAR-100 데이터 다운로드 및 전처리

In [2]:
import os
import urllib.request
import tarfile
import pickle
import numpy as np

np.random.seed(42)

def download_cifar100(save_path='cifar-100-python'):
    if os.path.exists(save_path):
        print("CIFAR-100 already downloaded.")
        return

    url = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
    filename = 'cifar-100-python.tar.gz'
    print("Downloading CIFAR-100...")
    urllib.request.urlretrieve(url, filename)

    with tarfile.open(filename, 'r:gz') as tar:
        tar.extractall()
    os.remove(filename)
    print("Download and extraction completed.")

def load_batch(filepath):
    with open(filepath, 'rb') as f:
        data_dict = pickle.load(f, encoding='bytes')
    data = data_dict[b'data']
    fine_labels = np.array(data_dict[b'fine_labels'])
    coarse_labels = np.array(data_dict[b'coarse_labels'])
    data = data.reshape(-1, 3, 32, 32)
    return data, fine_labels, coarse_labels

def normalize_images(images):
    return images.astype(np.float32) / 255.0

def split_validation(images, fine_labels, coarse_labels, val_ratio=0.1):
    num_samples = images.shape[0]
    val_size = int(num_samples * val_ratio)

    idx = np.random.permutation(num_samples)
    images = images[idx]
    fine_labels = fine_labels[idx]
    coarse_labels = coarse_labels[idx]

    val_images = images[:val_size]
    val_fine = fine_labels[:val_size]
    val_coarse = coarse_labels[:val_size]
    train_images = images[val_size:]
    train_fine = fine_labels[val_size:]
    train_coarse = coarse_labels[val_size:]

    return (train_images, train_fine, train_coarse), (val_images, val_fine, val_coarse)

def random_crop(x, crop_size=32, padding=4):
    n, c, h, w = x.shape
    padded = np.pad(x, ((0, 0), (0, 0), (padding, padding), (padding, padding)), mode='reflect')
    cropped = np.empty((n, c, crop_size, crop_size), dtype=x.dtype)
    for i in range(n):
        top = np.random.randint(0, padding * 2 + 1)
        left = np.random.randint(0, padding * 2 + 1)
        cropped[i] = padded[i, :, top:top+crop_size, left:left+crop_size]
    return cropped

def horizontal_flip(x):
    return x[:, :, :, ::-1]

def apply_cutout(x, size=8):
    n, c, h, w = x.shape
    for i in range(n):
        cy = np.random.randint(h)
        cx = np.random.randint(w)
        y1 = np.clip(cy - size // 2, 0, h)
        y2 = np.clip(cy + size // 2, 0, h)
        x1 = np.clip(cx - size // 2, 0, w)
        x2 = np.clip(cx + size // 2, 0, w)
        x[i, :, y1:y2, x1:x2] = 0.0
    return x

def generate_augmented_dataset(images, labels, method, target_size):
    N = images.shape[0]
    augmented_images = []
    augmented_labels = []
    repeat = target_size // N + 1
    for _ in range(repeat):
        imgs = images.copy()
        if method == 'crop':
            imgs = random_crop(imgs)
        elif method == 'crop+flip':
            imgs = horizontal_flip(random_crop(imgs))
        elif method == 'crop+flip+cutout':
            imgs = apply_cutout(horizontal_flip(random_crop(imgs)))
        else:
            raise ValueError(f"Unknown method: {method}")
        augmented_images.append(imgs)
        augmented_labels.append(labels.copy())
        if sum(x.shape[0] for x in augmented_images) >= target_size:
            break
    X = np.concatenate(augmented_images, axis=0)[:target_size]
    y = np.concatenate(augmented_labels, axis=0)[:target_size]
    return X, y

def load_cifar100_dataset():
    download_cifar100()
    train_data, train_fine, train_coarse = load_batch('cifar-100-python/train')
    test_data, test_fine, test_coarse = load_batch('cifar-100-python/test')
    train_data = normalize_images(train_data)
    test_data = normalize_images(test_data)
    return (train_data, train_fine, train_coarse), (test_data, test_fine, test_coarse)

def load_fine_to_coarse_matrix():
    train_data, train_fine, train_coarse = load_batch('cifar-100-python/train')
    mapping = np.zeros((100, 20), dtype=np.float32)
    for fine_label, coarse_label in zip(train_fine, train_coarse):
        mapping[fine_label, coarse_label] = 1.0
    return mapping

def prepare_dataset():
    (full_train_images, full_train_labels, full_train_coarse), (test_images, test_labels, test_coarse_labels) = load_cifar100_dataset()
    print("Generating augmented datasets...")

    # offline 증강 세트 생성
    X_crop, y_crop = generate_augmented_dataset(full_train_images, full_train_labels, method='crop', target_size=100000)
    X_cf, y_cf = generate_augmented_dataset(full_train_images, full_train_labels, method='crop+flip', target_size=150000)
    X_cfco, y_cfco = generate_augmented_dataset(full_train_images, full_train_labels, method='crop+flip+cutout', target_size=150000)

    train_crop, val_crop = split_validation(X_crop, y_crop, y_crop)
    train_cf, val_cf = split_validation(X_cf, y_cf, y_cf)
    train_cfco, val_cfco = split_validation(X_cfco, y_cfco, y_cfco)

    x_train_raw = full_train_images.copy()
    y_train_raw = full_train_labels.copy()
    y_train_coarse_raw = full_train_coarse.copy()
    train_raw, val_raw = split_validation(x_train_raw, y_train_raw, y_train_coarse_raw)

    return {
        'train_raw': train_raw,
        'val_raw': val_raw,
        'train_crop': train_crop,
        'val_crop': val_crop,
        'train_cropflip': train_cf,
        'val_cropflip': val_cf,
        'train_cropflipcutout': train_cfco,
        'val_cropflipcutout': val_cfco,
        'test': (test_images, test_labels, test_coarse_labels),
        'fine_to_coarse_matrix': load_fine_to_coarse_matrix()
    }

data = prepare_dataset()
for k, v in data.items():
    if isinstance(v, tuple):
        print(f"{k}: {[x.shape for x in v]}")


CIFAR-100 already downloaded.
Generating augmented datasets...
train_raw: [(45000, 3, 32, 32), (45000,), (45000,)]
val_raw: [(5000, 3, 32, 32), (5000,), (5000,)]
train_crop: [(90000, 3, 32, 32), (90000,), (90000,)]
val_crop: [(10000, 3, 32, 32), (10000,), (10000,)]
train_cropflip: [(135000, 3, 32, 32), (135000,), (135000,)]
val_cropflip: [(15000, 3, 32, 32), (15000,), (15000,)]
train_cropflipcutout: [(135000, 3, 32, 32), (135000,), (135000,)]
val_cropflipcutout: [(15000, 3, 32, 32), (15000,), (15000,)]
test: [(10000, 3, 32, 32), (10000,), (10000,)]


## 라벨 스무딩

In [4]:
def smooth_labels(y, smoothing=0.1, num_classes=100):
    confidence = 1.0 - smoothing
    label_shape = (y.shape[0], num_classes)
    smooth = np.full(label_shape, smoothing / (num_classes - 1))
    smooth[np.arange(y.shape[0]), y] = confidence
    return smooth

## 모델 구조 정의

In [6]:
from common.layers import Convolution, Affine, Relu, BatchNormalization
from common.functions import softmax, cross_entropy_error
from common.util import im2col, col2im

def fake_quantize(x, num_bits=8):
    qmin = 0.
    qmax = 2.**num_bits - 1.
    x_min = np.min(x)
    x_max = np.max(x)
    
    if x_max == x_min:
        return x  # avoid divide by zero
    
    scale = (x_max - x_min) / (qmax - qmin)
    zero_point = qmin - x_min / scale
    zero_point = np.clip(np.round(zero_point), qmin, qmax)

    q_x = zero_point + x / scale
    q_x = np.clip(np.round(q_x), qmin, qmax)
    fq_x = scale * (q_x - zero_point)
    return fq_x
    
class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        
        self.x = None
        self.original_x_shape = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x

        # Fake Quantization
        W_q = fake_quantize(self.W)
        b_q = fake_quantize(self.b)
        x_q = fake_quantize(self.x)

        out = np.dot(x_q, W_q) + b_q
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        
        dx = dx.reshape(*self.original_x_shape)
        return dx

class Convolution:
    def __init__(self, W, b, stride=1, pad=0):
        self.W = W
        self.b = b
        self.stride = stride
        self.pad = pad
        
        self.x = None
        self.col = None
        self.col_W = None
        self.dW = None
        self.db = None

    def forward(self, x):
        FN, C, FH, FW = self.W.shape
        N, _, H, W = x.shape
        out_h = 1 + int((H + 2*self.pad - FH) / self.stride)
        out_w = 1 + int((W + 2*self.pad - FW) / self.stride)

        # Fake Quantization
        W_q = fake_quantize(self.W)
        b_q = fake_quantize(self.b)
        x_q = fake_quantize(x)

        col = im2col(x_q, FH, FW, self.stride, self.pad)
        col_W = W_q.reshape(FN, -1).T
        out = np.dot(col, col_W) + b_q
        out = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)

        self.x = x
        self.col = col
        self.col_W = col_W

        return out

    def backward(self, dout):
        FN, C, FH, FW = self.W.shape
        dout = dout.transpose(0, 2, 3, 1).reshape(-1, FN)

        self.db = np.sum(dout, axis=0)
        self.dW = np.dot(self.col.T, dout).transpose(1, 0).reshape(FN, C, FH, FW)
        dcol = np.dot(dout, self.col_W.T)
        dx = col2im(dcol, self.x.shape, FH, FW, self.stride, self.pad)

        return dx


class ResidualBlock:
    def __init__(self, in_channels, out_channels, stride=1):
        self.stride = stride
        self.equal_in_out = (in_channels == out_channels and stride == 1)

        self.conv1 = Convolution(
            W=np.random.randn(out_channels, in_channels, 3, 3) * np.sqrt(2. / in_channels),
            b=np.zeros(out_channels),
            stride=stride,
            pad=1
        )
        self.bn1 = BatchNormalization(gamma=np.ones(out_channels), beta=np.zeros(out_channels))
        self.relu1 = Relu()

        self.conv2 = Convolution(
            W=np.random.randn(out_channels, out_channels, 3, 3) * np.sqrt(2. / out_channels),
            b=np.zeros(out_channels),
            stride=1,
            pad=1
        )
        self.bn2 = BatchNormalization(gamma=np.ones(out_channels), beta=np.zeros(out_channels))
        self.relu2 = Relu()

        if not self.equal_in_out:
            self.shortcut = Convolution(
                W=np.random.randn(out_channels, in_channels, 1, 1) * np.sqrt(2. / in_channels),
                b=np.zeros(out_channels),
                stride=stride,
                pad=0
            )
            self.bn_shortcut = BatchNormalization(gamma=np.ones(out_channels), beta=np.zeros(out_channels))

    def forward(self, x, train_flg=True, skip_prob=0.0):
        self.x = x

        if train_flg and np.random.rand() < skip_prob:
            return x  # skip this residual block
        out = self.conv1.forward(x)
        out = self.bn1.forward(out, train_flg)
        out = self.relu1.forward(out)

        out = self.conv2.forward(out)
        out = self.bn2.forward(out, train_flg)
        self.out_main = out

        if self.equal_in_out:
            shortcut = x
        else:
            shortcut = self.shortcut.forward(x)
            shortcut = self.bn_shortcut.forward(shortcut, train_flg)
        self.out_shortcut = shortcut

        out += shortcut
        out = self.relu2.forward(out)

        return out

    def backward(self, dout):
        dout = self.relu2.backward(dout)

        dshortcut = dout.copy()
        dmain = dout.copy()

        dmain = self.bn2.backward(dmain)
        dmain = self.conv2.backward(dmain)

        dmain = self.relu1.backward(dmain)
        dmain = self.bn1.backward(dmain)
        dmain = self.conv1.backward(dmain)

        if not self.equal_in_out:
            dshortcut = self.bn_shortcut.backward(dshortcut)
            dshortcut = self.shortcut.backward(dshortcut)

        dx = dmain + dshortcut
        return dx

class ResNet20:
    def __init__(self, input_dim=(3, 32, 32), num_classes=100):
        self.params = []
        self.trainable_layers = []

        self.conv1 = Convolution(
            W=np.random.randn(16, 3, 3, 3) * np.sqrt(2. / 3),
            b=np.zeros(16),
            stride=1,
            pad=1
        )
        self.bn1 = BatchNormalization(gamma=np.ones(16), beta=np.zeros(16))
        self.relu1 = Relu()

        self.layer1 = [ResidualBlock(16, 16, stride=1) for _ in range(3)]
        self.layer2 = [ResidualBlock(16 if i == 0 else 32, 32, stride=2 if i == 0 else 1) for i in range(3)]
        self.layer3 = [ResidualBlock(32 if i == 0 else 64, 64, stride=2 if i == 0 else 1) for i in range(3)]

        self.fc = Affine(W=np.random.randn(64, num_classes) * np.sqrt(2. / 64), b=np.zeros(num_classes))

    def clip_weights(self, clip_value=1.0):
        self.conv1.W = np.clip(self.conv1.W, -clip_value, clip_value)
        self.fc.W = np.clip(self.fc.W, -clip_value, clip_value)

        for block in self.layer1 + self.layer2 + self.layer3:
            block.conv1.W = np.clip(block.conv1.W, -clip_value, clip_value)
            block.conv2.W = np.clip(block.conv2.W, -clip_value, clip_value)
            if not block.equal_in_out:
                block.shortcut.W = np.clip(block.shortcut.W, -clip_value, clip_value)

    def forward(self, x, train_flg=True, skip_prob=0.0):
        self.input = x

        out = self.conv1.forward(x)
        out = self.bn1.forward(out, train_flg)
        out = self.relu1.forward(out)

        for block in self.layer1:
            out = block.forward(out, train_flg)
        for block in self.layer2:
            out = block.forward(out, train_flg)
        for block in self.layer3:
            out = block.forward(out, train_flg)

        self.feature_map = out

        N, C, H, W = out.shape
        out = out.mean(axis=(2, 3))

        self.pooled = out
        out = self.fc.forward(out)
        return out

    def predict(self, x, batch_size=100):
        y_list = []
        for i in range(0, x.shape[0], batch_size):
            x_batch = x[i:i+batch_size]
            y_batch = self.forward(x_batch, train_flg=False)
            y_list.append(y_batch)
        return np.concatenate(y_list, axis=0)

    def loss(self, x, t):
        y = self.forward(x, train_flg=True)
        return cross_entropy_error(softmax(y), t)

    def accuracy(self, x, t, batch_size=100):
        acc = 0.0
        total = x.shape[0]
        for i in range(0, total, batch_size):
            x_batch = x[i:i+batch_size]
            t_batch = t[i:i+batch_size]

            y = self.predict(x_batch)
            y = np.argmax(y, axis=1)

            if t.ndim != 1:
                t_batch = np.argmax(t_batch, axis=1)

            acc += np.sum(y == t_batch)

        return acc / total

    def backward(self, dout):
        dout = self.fc.backward(dout)
        dout = dout.reshape(self.feature_map.shape[0], self.feature_map.shape[1], 1, 1)
        dout = dout.repeat(self.feature_map.shape[2], axis=2).repeat(self.feature_map.shape[3], axis=3)

        for block in reversed(self.layer3):
            dout = block.backward(dout)
        for block in reversed(self.layer2):
            dout = block.backward(dout)
        for block in reversed(self.layer1):
            dout = block.backward(dout)

        dout = self.relu1.backward(dout)
        dout = self.bn1.backward(dout)
        dout = self.conv1.backward(dout)
        return dout

## 모델 구조 출력

In [8]:
def count_params(layer):
    count = 0
    if hasattr(layer, 'W'):
        count += np.prod(layer.W.shape)
    if hasattr(layer, 'b'):
        count += np.prod(layer.b.shape)
    return count

def print_resnet20_summary(model, input_shape=(1, 3, 32, 32)):
    print("=" * 75, flush=True)
    print(f"{'Layer (type)':<35}{'Output Shape':<25}{'Param #':>10}", flush=True)
    print("=" * 75, flush=True)

    x = np.zeros(input_shape)
    total_params = 0
    layer_idx = 1

    x = model.conv1.forward(x)
    p = count_params(model.conv1)
    print(f"{layer_idx:>2}. {'Conv1':<32}{str(x.shape):<25}{p:>10,}", flush=True)
    total_params += p
    layer_idx += 1

    x = model.bn1.forward(x, train_flg=False)
    x = model.relu1.forward(x)

    for i, layer_block in enumerate([model.layer1, model.layer2, model.layer3]):
        for j, block in enumerate(layer_block):
            residual = x.copy()

            # Conv1
            x = block.conv1.forward(x)
            p = count_params(block.conv1)
            name = f"Block[{i+1}-{j+1}]_Conv1"
            print(f"{layer_idx:>2}. {name:<32}{str(x.shape):<25}{p:>10,}", flush=True)
            total_params += p
            layer_idx += 1

            x = block.bn1.forward(x, train_flg=False)
            x = block.relu1.forward(x)

            x = block.conv2.forward(x)
            p = count_params(block.conv2)
            name = f"Block[{i+1}-{j+1}]_Conv2"
            print(f"{layer_idx:>2}. {name:<32}{str(x.shape):<25}{p:>10,}", flush=True)
            total_params += p
            layer_idx += 1

            x = block.bn2.forward(x, train_flg=False)

            if not block.equal_in_out:
                x_sc = block.shortcut.forward(residual)
                p = count_params(block.shortcut)
                name = f"└─ Shortcut[{i+1}-{j+1}]"
                print(f"{'':>3} {name:<32}{str(x_sc.shape):<25}{p:>10,}", flush=True)
                total_params += p
                x = x + x_sc
                x = block.bn_shortcut.forward(x, train_flg=False)
            else:
                x = x + residual

            x = block.relu2.forward(x)

    x = x.mean(axis=(2, 3))
    print(f"{'':>3} {'GlobalAvgPool':<32}{str(x.shape):<25}{'0':>10}", flush=True)

    x = model.fc.forward(x)
    p = count_params(model.fc)
    print(f"{layer_idx:>2}. {'FC':<32}{str(x.shape):<25}{p:>10,}", flush=True)
    total_params += p

    print("=" * 75, flush=True)
    print(f"{'Total weight layers:':<60}{'20'}", flush=True)
    print(f"{'Total params:':<60}{total_params:,}", flush=True)
    print("=" * 75, flush=True)

model = ResNet20()
print_resnet20_summary(model, input_shape=(1, 3, 32, 32))

Layer (type)                       Output Shape                Param #
 1. Conv1                           (1, 16, 32, 32)                 448
 2. Block[1-1]_Conv1                (1, 16, 32, 32)               2,320
 3. Block[1-1]_Conv2                (1, 16, 32, 32)               2,320
 4. Block[1-2]_Conv1                (1, 16, 32, 32)               2,320
 5. Block[1-2]_Conv2                (1, 16, 32, 32)               2,320
 6. Block[1-3]_Conv1                (1, 16, 32, 32)               2,320
 7. Block[1-3]_Conv2                (1, 16, 32, 32)               2,320
 8. Block[2-1]_Conv1                (1, 32, 16, 16)               4,640
 9. Block[2-1]_Conv2                (1, 32, 16, 16)               9,248
    └─ Shortcut[2-1]                (1, 32, 16, 16)                 544
10. Block[2-2]_Conv1                (1, 32, 16, 16)               9,248
11. Block[2-2]_Conv2                (1, 32, 16, 16)               9,248
12. Block[2-3]_Conv1                (1, 32, 16, 16)              

## 모델 학습

In [10]:
import numpy as np
import time
import pickle
from common.optimizer import Adam
from common.functions import softmax

class Trainer:
    def __init__(self, model, model_name,
                 train_data, val_data, test_data,
                 epochs=20, batch_size=64, lr=0.01,
                 smoothing=0.15, apply_augmentations=None,
                 coarse_train_labels=None, coarse_val_labels=None,
                 mapping_matrix=None):

        self.model = model
        self.model_name = model_name
        self.train_x, self.train_t = train_data
        self.val_x, self.val_t = val_data
        self.test_x, self.test_t = test_data

        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr
        self.smoothing = smoothing
        self.apply_augmentations = apply_augmentations

        self.coarse_train_t = coarse_train_labels
        self.coarse_val_t = coarse_val_labels
        self.mapping_matrix = mapping_matrix

        self.train_size = self.train_x.shape[0]
        self.iter_per_epoch = max(self.train_size // self.batch_size, 1)

        self.optimizer = Adam(lr=lr)

        self.train_loss_list = []
        self.val_loss_list = []
        self.train_acc_list = []
        self.val_acc_list = []
        self.train_coarse_acc_list = []
        self.val_coarse_acc_list = []

    def smooth_labels(self, y, num_classes=100):
        confidence = 1.0 - self.smoothing
        label_shape = (y.shape[0], num_classes)
        smooth = np.full(label_shape, self.smoothing / (num_classes - 1), dtype=np.float32)
        smooth[np.arange(y.shape[0]), y] = confidence
        return smooth

    def loss_grad(self, x, t):
        y = self.model.forward(x, train_flg=True)
        batch_size = x.shape[0]
        if t.size == y.size:
            dx = (softmax(y) - t) / batch_size
        else:
            dx = softmax(y)
            dx[np.arange(batch_size), t] -= 1
            dx /= batch_size
        return dx, y

    def get_param_dict_and_grad(self):
        param_dict, grad_dict = {}, {}
        if hasattr(self.model.fc, 'W'):
            param_dict['fc_W'] = self.model.fc.W
            param_dict['fc_b'] = self.model.fc.b
            grad_dict['fc_W'] = self.model.fc.dW
            grad_dict['fc_b'] = self.model.fc.db

        idx = 0
        for layer in self.model.layer1 + self.model.layer2 + self.model.layer3:
            for attr in ['conv1', 'conv2', 'shortcut']:
                if hasattr(layer, attr):
                    conv = getattr(layer, attr)
                    param_dict[f'{idx}_W'] = conv.W
                    param_dict[f'{idx}_b'] = conv.b
                    grad_dict[f'{idx}_W'] = conv.dW
                    grad_dict[f'{idx}_b'] = conv.db
                    idx += 1
        return param_dict, grad_dict

    def train_step(self):
        batch_mask = np.random.choice(self.train_size, self.batch_size)
        x_batch = self.train_x[batch_mask]
        t_batch = self.train_t[batch_mask]

        if self.apply_augmentations is not None:
            x_batch, t_batch = self.apply_augmentations(x_batch, t_batch)

        if t_batch.ndim == 1:
            t_batch = self.smooth_labels(t_batch)

        loss = self.model.loss(x_batch, t_batch)
        dx, y = self.loss_grad(x_batch, t_batch)
        self.model.backward(dx)

        if hasattr(self.model, 'clip_weights'):
            self.model.clip_weights(clip_value=1.0)

        params, grads = self.get_param_dict_and_grad()
        self.optimizer.update(params, grads)

        return loss

    def train(self):
        for epoch in range(self.epochs):
            print(f"\n[Epoch {epoch + 1}/{self.epochs}]", flush=True)
            epoch_loss = 0
            start_time = time.time()

            for i in range(self.iter_per_epoch):
                loss = self.train_step()
                epoch_loss += loss
                if i % 10 == 0 or i == self.iter_per_epoch - 1:
                    print(f"  Iter {i+1:3d}/{self.iter_per_epoch}: Loss {loss:.4f}", flush=True)

            avg_loss = epoch_loss / self.iter_per_epoch
            self.train_loss_list.append(avg_loss)

            train_acc = self.model.accuracy(self.train_x[:1000], self.train_t[:1000])
            val_acc = self.model.accuracy(self.val_x, self.val_t)

            train_coarse_acc = self.coarse_accuracy_hard_mapping(self.train_x[:1000], self.coarse_train_t[:1000])
            val_coarse_acc = self.coarse_accuracy_hard_mapping(self.val_x, self.coarse_val_t)

            val_loss = self.batched_loss(self.val_x, self.val_t, batch_size=128)

            self.train_acc_list.append(train_acc)
            self.val_acc_list.append(val_acc)
            self.val_loss_list.append(val_loss)
            self.train_coarse_acc_list.append(train_coarse_acc)
            self.val_coarse_acc_list.append(val_coarse_acc)

            elapsed = time.time() - start_time
            print(f"Fine Train Loss: {avg_loss:.4f}, Fine Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Val Loss: {val_loss:.4f}", flush=True)
            print(f"Coarse Train Acc: {train_coarse_acc:.4f}, Coarse Val Acc: {val_coarse_acc:.4f} | Time: {elapsed:.2f}s", flush=True)

            if (epoch + 1) % 5 == 0:
                model_filename = f"{self.model_name}_epoch_{epoch+1}.pkl"
                self.save_model(model_filename)
                print(f">>> Model saved to {model_filename}", flush=True)

    def batched_loss(self, x, t, batch_size=128):
        total_loss = 0.0
        total_count = 0
        for i in range(0, len(x), batch_size):
            x_batch = x[i:i+batch_size]
            t_batch = t[i:i+batch_size]
            loss = self.model.loss(x_batch, t_batch)
            total_loss += loss * len(x_batch)
            total_count += len(x_batch)
        return total_loss / total_count

    def coarse_accuracy_hard_mapping(self, x, y_coarse, batch_size=100):
        fine_to_coarse = np.argmax(self.mapping_matrix, axis=1)
        correct = 0
        total = x.shape[0]
        for i in range(0, total, batch_size):
            x_batch = x[i:i+batch_size]
            y_batch = y_coarse[i:i+batch_size]
            fine_logits = self.model.predict(x_batch)
            pred_fine = np.argmax(fine_logits, axis=1)
            pred_coarse = fine_to_coarse[pred_fine]
            correct += np.sum(pred_coarse == y_batch)
        return correct / total

    def save_model(self, filename):
        params, _ = self.get_param_dict_and_grad()
        model_state = {k: v.copy() for k, v in params.items()}

        model_state['conv1_W'] = self.model.conv1.W.copy()
        model_state['conv1_b'] = self.model.conv1.b.copy()

        def extract_bn_params(model):
            bn_params = {}
            bn_count = 0
            for layer in model.layer1 + model.layer2 + model.layer3:
                for bn_attr in ['bn1', 'bn2']:
                    if hasattr(layer, bn_attr):
                        bn = getattr(layer, bn_attr)
                        bn_params[f'{bn_count}_gamma'] = bn.gamma.copy()
                        bn_params[f'{bn_count}_beta'] = bn.beta.copy()
                        bn_params[f'{bn_count}_running_mean'] = bn.running_mean.copy()
                        bn_params[f'{bn_count}_running_var'] = bn.running_var.copy()
                        bn_count += 1
                if hasattr(layer, 'bn_shortcut'):
                    bn = layer.bn_shortcut
                    bn_params[f'{bn_count}_gamma'] = bn.gamma.copy()
                    bn_params[f'{bn_count}_beta'] = bn.beta.copy()
                    bn_params[f'{bn_count}_running_mean'] = bn.running_mean.copy()
                    bn_params[f'{bn_count}_running_var'] = bn.running_var.copy()
                    bn_count += 1
            bn = model.bn1
            bn_params[f'{bn_count}_gamma'] = bn.gamma.copy()
            bn_params[f'{bn_count}_beta'] = bn.beta.copy()
            bn_params[f'{bn_count}_running_mean'] = bn.running_mean.copy()
            bn_params[f'{bn_count}_running_var'] = bn.running_var.copy()
            return bn_params

        model_state.update(extract_bn_params(self.model))

        optimizer_state = {
            'lr': self.optimizer.lr,
            'beta1': self.optimizer.beta1,
            'beta2': self.optimizer.beta2,
            'm': self.optimizer.m,
            'v': self.optimizer.v,
            't': self.optimizer.iter
        }

        save_data = {
            'model': model_state,
            'optimizer': optimizer_state,
            'train_loss_list': self.train_loss_list,
            'train_acc_list': self.train_acc_list,
            'val_acc_list': self.val_acc_list,
            'val_loss_list': self.val_loss_list,
            'train_coarse_acc_list': self.train_coarse_acc_list,
            'val_coarse_acc_list': self.val_coarse_acc_list
        }

        with open(filename, 'wb') as f:
            pickle.dump(save_data, f)

    def save_log(self, filename='log.npz'):
        np.savez(filename,
                 loss=np.array(self.train_loss_list),
                 train_acc=np.array(self.train_acc_list),
                 val_acc=np.array(self.val_acc_list),
                 val_loss=np.array(self.val_loss_list),
                 coarse_train_acc=np.array(self.train_coarse_acc_list),
                 coarse_val_acc=np.array(self.val_coarse_acc_list))
        print(f"Log saved to {filename}", flush=True)


## 실험 수행

In [12]:
import numpy as np

def apply_mixup(x, y, alpha=1.0):
    lam = np.random.beta(alpha, alpha)
    index = np.random.permutation(x.shape[0])
    x_mix = lam * x + (1 - lam) * x[index]
    if y.ndim == 1:
        y = np.eye(100)[y]
    y_mix = lam * y + (1 - lam) * y[index]
    return x_mix, y_mix

def rand_bbox(h, w, lam):
    cut_rat = np.sqrt(1. - lam)
    cut_h = int(h * cut_rat)
    cut_w = int(w * cut_rat)
    cy = np.random.randint(h)
    cx = np.random.randint(w)
    y1 = np.clip(cy - cut_h // 2, 0, h)
    y2 = np.clip(cy + cut_h // 2, 0, h)
    x1 = np.clip(cx - cut_w // 2, 0, w)
    x2 = np.clip(cx + cut_w // 2, 0, w)
    return x1, y1, x2, y2

def apply_cutmix(x, y, alpha=1.0):
    lam = np.random.beta(alpha, alpha)
    batch_size, _, h, w = x.shape
    index = np.random.permutation(batch_size)
    x1, y1, x2, y2 = rand_bbox(h, w, lam)
    x[:, :, y1:y2, x1:x2] = x[index, :, y1:y2, x1:x2]
    if y.ndim == 1:
        y = np.eye(100)[y]
    y_a, y_b = y, y[index]
    y_mix = lam * y_a + (1 - lam) * y_b
    return x, y_mix

def get_hard_coarse_from_soft(y_soft, fine_to_coarse):
    soft_coarse = np.dot(y_soft, fine_to_coarse)  # (N, 20)
    return np.argmax(soft_coarse, axis=1)

def run_experiments():
    np.random.seed(42)

    data = prepare_dataset()
    fine_to_coarse_matrix = data['fine_to_coarse_matrix']

    experiment_settings = [
        {
            "name": "crop",
            "train_key": "train_crop",
            "val_key": "val_crop",
            "aug": None
        },
        {
            "name": "crop+flip",
            "train_key": "train_cropflip",
            "val_key": "val_cropflip",
            "aug": None
        },
        {
            "name": "crop+flip+cutout",
            "train_key": "train_cropflipcutout",
            "val_key": "val_cropflipcutout",
            "aug": None
        },
        {
            "name": "mixup",
            "train_key": "train_raw",
            "val_key": "val_raw",
            "aug": apply_mixup
        },
        {
            "name": "cutmix",
            "train_key": "train_raw",
            "val_key": "val_raw",
            "aug": apply_cutmix
        }
    ]

    for setting in experiment_settings:
        print(f"\n==== Running {setting['name']} ====")
        model = ResNet20()

        x_train, y_train, y_train_coarse = data[setting['train_key']]
        x_val, y_val, y_val_coarse = data[setting['val_key']]
        x_test, y_test, y_test_coarse = data['test']

        # mixup/cutmix인 경우: soft label → coarse 재지정
        if setting['name'] in ['mixup', 'cutmix']:
            if y_train.ndim == 1:
                y_train = np.eye(100)[y_train]  # label을 one-hot로
            # val은 여전히 정수 라벨이므로 one-hot로 바꿔줌
            y_val_onehot = np.eye(100)[y_val]
            y_train_coarse = get_hard_coarse_from_soft(y_train, fine_to_coarse_matrix)
            y_val_coarse = get_hard_coarse_from_soft(y_val_onehot, fine_to_coarse_matrix)

        trainer = Trainer(
            model=model,
            model_name=setting['name'],
            train_data=(x_train, y_train),
            val_data=(x_val, y_val),
            test_data=(x_test, y_test),
            epochs=10,
            batch_size=64,
            lr=0.01,
            smoothing=0.15,
            apply_augmentations=setting['aug'],
            coarse_train_labels=y_train_coarse,
            coarse_val_labels=y_val_coarse,
            mapping_matrix=fine_to_coarse_matrix
        )
        trainer.train()
        trainer.save_log(f"{setting['name']}_log.npz")

In [13]:
if __name__ == "__main__":
    run_experiments()

CIFAR-100 already downloaded.
Generating augmented datasets...

==== Running crop ====

[Epoch 1/10]
  Iter   1/1406: Loss 5.8696
  Iter  11/1406: Loss 4.9294
  Iter  21/1406: Loss 4.4706
  Iter  31/1406: Loss 4.4262
  Iter  41/1406: Loss 4.5273
  Iter  51/1406: Loss 4.3829
  Iter  61/1406: Loss 4.4288
  Iter  71/1406: Loss 4.4037
  Iter  81/1406: Loss 4.5221
  Iter  91/1406: Loss 4.1132
  Iter 101/1406: Loss 4.2455
  Iter 111/1406: Loss 4.2289
  Iter 121/1406: Loss 4.2468
  Iter 131/1406: Loss 3.9702
  Iter 141/1406: Loss 4.2995
  Iter 151/1406: Loss 4.4077
  Iter 161/1406: Loss 4.0717
  Iter 171/1406: Loss 4.0306
  Iter 181/1406: Loss 4.1878
  Iter 191/1406: Loss 4.1206
  Iter 201/1406: Loss 4.0248
  Iter 211/1406: Loss 4.0344
  Iter 221/1406: Loss 4.2796
  Iter 231/1406: Loss 4.1452
  Iter 241/1406: Loss 3.8822
  Iter 251/1406: Loss 4.3129
  Iter 261/1406: Loss 4.2084
  Iter 271/1406: Loss 4.0923
  Iter 281/1406: Loss 4.0004
  Iter 291/1406: Loss 3.9616
  Iter 301/1406: Loss 3.9885
