In [1]:
import numpy as np 
import matplotlib.pyplot as plt

# install
## numpy
## matplotlib

In [2]:
from google.colab import drive
drive.mount('/content/drive')
!pwd
%cd /content/drive/MyDrive/Colab Notebooks/assign04

Mounted at /content/drive
/content
/content/drive/MyDrive/Colab Notebooks/assign04


## data load & preprocessing

#### 손글씨 데이터셋

In [3]:
from dataset.mnist import load_mnist

(train_raw_img, train_label), (test_raw_img, test_label) = load_mnist(flatten=False, normalize=False)
print(train_raw_img.shape)

(60000, 1, 28, 28)


In [4]:
# preprocessing (train & inference)

train_img = train_raw_img.reshape(len(train_raw_img.squeeze()), -1)
train_label = train_label.reshape(len(train_label), -1)

test_img = test_raw_img.reshape(len(test_raw_img.squeeze()), -1)
test_label = test_label.reshape(len(test_label), -1)

print(train_img.shape)
print(train_label.shape)
print(test_img.shape)
print(test_label.shape)

(60000, 784)
(60000, 1)
(10000, 784)
(10000, 1)


In [5]:
# normalization (set value 0 ~ 1)

train_img = train_img.astype('float')
train_img = train_img/255

test_img = test_img.astype('float')
test_img = test_img/255

## Softmax Regression for Multi Class Single-label Classfication

##### 지난 시간 Classification 결과를 Sigmoid 대신 Softmax로 학습한다.

data -> Linear(784, 100) -> ReLU(100, 100) -> Linear(100, 10) -> softmax(10, 10)

In [6]:
class Linear :
    def __init__(self, input_size=1, hidden_size=1) :
        self.W = np.random.randn(input_size, hidden_size) # input_size와 hidden_size를 가지는 정규분포를 따르는 난수 생성
        self.b = np.zeros(hidden_size)
        self.x = None
        self.dW = None
        self.db = None
    
    def forward(self, x) :
        self.x = x
        out = np.dot(x, self.W) + self.b  # 입력값(x) * 가중치(W) + Bias(b)
        return out
    
    def backward(self, dout, lr) :
        dx = np.dot(dout, self.W.T) # 입력값에 대한 미분값
        self.dW = np.dot(self.x.T, dout)  # 가중치에 대한 미분값
        self.db = np.sum(dout, axis=0)  # Bias에 대한 미분값
        self.W -= lr * self.dW  # 학습률(lr)을 통해 가중치 업데이트
        self.b -= lr * self.db  # 학습률(lr)을 통해 Bias 업데이트
        return dx

In [7]:
class Relu :
    def __init__(self) :
        self.mask = None
    
    def forward(self, x) :
        self.mask = (x < 0)
        out = x.copy()
        out[self.mask] = 0
        return out
    
    def backward(self, dout) :
        dout[self.mask] = 0
        return dout                                                                                                                                                                                                                                                                                                                                                   

In [8]:
class softmax_with_crossEntropy :
    def __init__(self) :
        self.delta = 1e-7
        # softmax
        self.softmax_x = None
        self.softmax_out = None
        # crossEntropy
        self.pred = None
        self.target = None
        
    def softmax_forward(self, x) :
        self.softmax_x = x  # 입력값 x를 softmax_x에 입력
        max_x = np.max(x, axis=1, keepdims=True)
        exp_x = np.exp(x - max_x) # 입력값에서 최대값을 빼주어 지수함수 계산
        self.softmax_out = exp_x / np.sum(exp_x, axis=1, keepdims=True)
        return self.softmax_out # softmax 함수의 출력 값
    
    def crossEntropy_forward(self, pred, target) :
        self.pred = pred  # 모델이 예측한 값 (pred)
        self.target = target  # 실제값 (target)
        batch_size = pred.shape[0]
        loss = -np.sum(np.log(pred + self.delta) * target) / batch_size
        return loss
    
    def backward(self) :
        batch_size = self.target.shape[0]
        dx = (self.pred - self.target) / batch_size # Cross Entropy 함수의 gradient 계산
        return dx

## Train

### base

In [9]:
# one_hot label 만드는 함수

def make_one_hot(labels) :
    a = []
    for label in labels :
        one_hot = np.zeros(10)
        one_hot[label] = 1
        a.append(one_hot)
    a = np.array(a)
    return a

# one_hot_labels = make_one_hot(train_label)
# print(train_label[0])
# print(one_hot_labels[0])

In [10]:
# train version 1

from collections import OrderedDict

def train_MLP(config) :
    lr, num_epoch = config['learning_rate'], config['num_epoch']
    print_loss_interval = 1
    
    layer1 = Linear(784, 100)
    relu = Relu()
    layer2 = Linear(100, 10)
    softmax_with_CE = softmax_with_crossEntropy()
    
    for iter in range(num_epoch) :
        # forward
        x = layer1.forward(train_img)
        x = relu.forward(x)
        x = layer2.forward(x)
        preds = softmax_with_CE.softmax_forward(x)
        
        # loss
        one_hot_labels = make_one_hot(train_label)
        losses = softmax_with_CE.crossEntropy_forward(preds, one_hot_labels)
        loss = losses.sum()/len(preds)
        
        if iter % print_loss_interval == 0:
            print("[epoch %d / %d] average loss : %f" % (iter, num_epoch, loss))
        
        # backward
        dL = softmax_with_CE.backward()
        dL = layer2.backward(dL, lr)
        dL = relu.backward(dL)
        dL = layer1.backward(dL, lr)

    model = OrderedDict()
    model['layer1'] = layer1
    model['relu'] = relu
    model['layer2'] = layer2
    model['softmax_with_CE'] = softmax_with_CE
    
    return model

In [11]:
config = { 'learning_rate' : 0.1,
            'num_epoch' : 100
          }

model = train_MLP(config)

[epoch 0 / 100] average loss : 0.000244
[epoch 1 / 100] average loss : 0.000240
[epoch 2 / 100] average loss : 0.000224
[epoch 3 / 100] average loss : 0.000211
[epoch 4 / 100] average loss : 0.000199
[epoch 5 / 100] average loss : 0.000189
[epoch 6 / 100] average loss : 0.000179
[epoch 7 / 100] average loss : 0.000170
[epoch 8 / 100] average loss : 0.000162
[epoch 9 / 100] average loss : 0.000154
[epoch 10 / 100] average loss : 0.000148
[epoch 11 / 100] average loss : 0.000141
[epoch 12 / 100] average loss : 0.000136
[epoch 13 / 100] average loss : 0.000131
[epoch 14 / 100] average loss : 0.000126
[epoch 15 / 100] average loss : 0.000122
[epoch 16 / 100] average loss : 0.000118
[epoch 17 / 100] average loss : 0.000114
[epoch 18 / 100] average loss : 0.000111
[epoch 19 / 100] average loss : 0.000108
[epoch 20 / 100] average loss : 0.000105
[epoch 21 / 100] average loss : 0.000102
[epoch 22 / 100] average loss : 0.000100
[epoch 23 / 100] average loss : 0.000098
[epoch 24 / 100] average l

In [12]:
def eval(model, train_version = True) :
    if train_version :
        x = train_img
        labels = train_label.squeeze()
        print('In train dataset ... ')
    else : 
        x = test_img
        labels = test_label.squeeze()
        print('\nIn test dataset ... ')
    
    for layer in model.values() :
        if isinstance(layer, softmax_with_crossEntropy) :
            x = layer.softmax_forward(x)
        else :
            x = layer.forward(x)
            
    preds = x.argmax(axis=1)
    acc = np.sum(np.where(preds==labels, True, False))/len(labels)
    return acc

print('\t Accuracy :', eval(model, train_version=True))
print('\t Accuracy :', eval(model, train_version=False))
print()

In train dataset ... 
	 Accuracy : 0.7596333333333334

In test dataset ... 
	 Accuracy : 0.7677



Sigmoid와 Softmax는 학습 관점에서 크게 4가지 차이점이 있다.

1. 출력값 : 두 함수 모두 출력값은 0과 1사이의 실수지만, Sigmoid는 입력값이 커질수록 1에 가까워 지고, 작아질수록 0에 가까워 진다. 반면 Softmax는 모든 클래스에 대한 확률에 합이 1이 되도록 정규화 된다.

2. 사용 용도 : Sigmoid 함수는 주로 이진 분류 문제에 사용된다. 반면 Softmax 함수는 다중 클래스 분류 문제에 사용된다.

3. 의존성 : Sigmoid 함수는 출력값이 독립적이므로 클래스 간 관계가 없지만, Softmax 함수는 모든 클래스에 대한 확률을 정규화하므로 관계가 있다.

4. 다중 클래스 분류 : Sigmoid는 입력값에 대해 각각 클래스에 대한 확률을 독립적으로 계산하여, 클래스를 이진 분류를 수행한다. 반면, Softmax는 입력값에 대해 각 클래스에 대한 확률을 계산하고 클래스별 확률을 정규화하여 다중 클래스 분류를 수행한다.

## L2 Regularization

#### L2 Regularization을 통해 Overfitting을 방지하고 위의 학습한 데이터와 평가 및 차이를 알아보자

In [13]:
class L2_regularization :
    def __init__(self, lamb) :
        self.lamb = lamb
    
    def forward(self, W) :
        self.W = W
        self.regularization_term = 0.5 * self.lamb * np.sum(self.W**2)
        return self.regularization_term

    def backward(self, W) :
        return self.lamb * W

In [17]:
def train(config) :
    lr, num_epoch, weight_decay = config['learning_rate'], config['num_epoch'], config['weight_decay']
    print_loss_interval = 9

    layer1_l2 = Linear(784, 100)
    relu_l2 = Relu()
    layer2_l2 = Linear(100, 10)
    softmax_with_CE_l2 = softmax_with_crossEntropy()

    L2_reg = L2_regularization(weight_decay)  # L2 정규화 적용
    
    for iter in range(num_epoch) :
        # forward
        x = layer1_l2.forward(train_img)
        x = relu_l2.forward(x)
        x = layer2_l2.forward(x)
        preds = softmax_with_CE_l2.softmax_forward(x)

        # loss
        one_hot_labels = make_one_hot(train_label)
        losses = softmax_with_CE_l2.crossEntropy_forward(preds, one_hot_labels)
        loss = losses.sum()/len(preds)

        if iter % print_loss_interval == 0:
            print("[epoch %d / %d] average loss : %f" % (iter, num_epoch, loss))
        
        # backward
        dL = softmax_with_CE_l2.backward()
        dL = layer2_l2.backward(dL, lr)
        dL = relu_l2.backward(dL)
        dL = layer1_l2.backward(dL, lr)

        # weight update
        layer1_l2.W -= lr * L2_reg.backward(layer1_l2.W)
        layer2_l2.W -= lr * L2_reg.backward(layer2_l2.W)

    model_l2 = OrderedDict()
    model_l2['layer1'] = layer1_l2
    model_l2['relu'] = relu_l2
    model_l2['layer2'] = layer2_l2
    model_l2['softmax_with_CE'] = softmax_with_CE_l2
    
    return model_l2

In [24]:
config = {  'learning_rate' : 0.1,
            'num_epoch' : 100,
            'weight_decay' : 0.1
          }

model_l2 = train(config)

[epoch 0 / 100] average loss : 0.000232
[epoch 9 / 100] average loss : 0.000126
[epoch 18 / 100] average loss : 0.000085
[epoch 27 / 100] average loss : 0.000065
[epoch 36 / 100] average loss : 0.000052
[epoch 45 / 100] average loss : 0.000042
[epoch 54 / 100] average loss : 0.000034
[epoch 63 / 100] average loss : 0.000027
[epoch 72 / 100] average loss : 0.000022
[epoch 81 / 100] average loss : 0.000018
[epoch 90 / 100] average loss : 0.000015
[epoch 99 / 100] average loss : 0.000013


In [25]:
print('\t Accuracy :', eval(model, train_version=True))
print('\t Accuracy :', eval(model, train_version=False))
print('\n ---After L2 regularization---')
print('\t Accuracy :', eval(model_l2, train_version=True))
print('\t Accuracy :', eval(model_l2, train_version=False))
print()

In train dataset ... 
	 Accuracy : 0.7596333333333334

In test dataset ... 
	 Accuracy : 0.7677

 ---After L2 regularization---
In train dataset ... 
	 Accuracy : 0.7961166666666667

In test dataset ... 
	 Accuracy : 0.7995



L2 Regularization을 하였을 때는 하지 않았을 때에 비해 Accuracy가 높게 나온다

Weight Decay에 따라 달라지는데 Weight Decay가 너무 크면 모델이 데이터에 충분히 적합하지 못하고, 너무 작으면 정규화 효과를 보지 못한다