<a href="https://colab.research.google.com/github/jhlee508/handson-ml2-study/blob/master/HoM_Ch11_Training_DNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. 그래디언트 소실과 폭주 문제

### 1.1 글로럿과 He 초기화

In [10]:
# He initialization scaled by fan-in (the number of input connections)
tf.keras.layers.Dense(10, activation='relu', kernel_initializer='he_normal')

# He-style initialization scaled by fan-avg, i.e. the average of fan-in and
# fan-out (not fan-out alone), sampling from a uniform distribution
he_avg_init = tf.keras.initializers.VarianceScaling(scale=2., mode='fan_avg',
                                                    distribution='uniform')

tf.keras.layers.Dense(10, activation='relu', kernel_initializer=he_avg_init)

<tensorflow.python.keras.layers.core.Dense at 0x7f173a187390>

Keras는 기본적으로 글로럿 균등 분포 초기화(`glorot_uniform`)를 사용

### 1.2 수렴하지 않는 활성화 함수

In [15]:
# Leaky ReLU added as a standalone layer after a Dense layer without activation
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(10, kernel_initializer="he_normal"))
model.add(tf.keras.layers.LeakyReLU(alpha=0.2))  # the default alpha is 0.3

In [16]:
# PReLU added as a standalone layer after a Dense layer without activation
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(10, kernel_initializer="he_normal"))
model.add(tf.keras.layers.PReLU())

In [17]:
# When using the SELU activation: pair it with LeCun normal initialization
layer = tf.keras.layers.Dense(
    units=10,
    kernel_initializer="lecun_normal",
    activation="selu",
)

### 1.3 배치 정규화

In [18]:
# Classifier for 28x28 inputs with a BatchNormalization layer after the
# flattened input and after each ELU hidden layer
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(units=300, activation="elu", kernel_initializer="he_normal"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(units=100, activation="elu", kernel_initializer="he_normal"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(units=10, activation="softmax"),
])

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 784)               0         
_________________________________________________________________
batch_normalization (BatchNo (None, 784)               3136      
_________________________________________________________________
dense_16 (Dense)             (None, 300)               235500    
_________________________________________________________________
batch_normalization_1 (Batch (None, 300)               1200      
_________________________________________________________________
dense_17 (Dense)             (None, 100)               30100     
_________________________________________________________________
batch_normalization_2 (Batch (None, 100)               400       
_________________________________________________________________
dense_18 (Dense)             (None, 10)                1010      

In [20]:
# (name, trainable) pairs for the variables of the first batch-normalization layer
[(variable.name, variable.trainable) for variable in model.layers[1].variables]

[('batch_normalization/gamma:0', True),
 ('batch_normalization/beta:0', True),
 ('batch_normalization/moving_mean:0', False),
 ('batch_normalization/moving_variance:0', False)]

감마(gamma)와 베타(beta)는 역전파로 훈련되고, 이동 평균(moving_mean)과 이동 분산(moving_variance)은 역전파로 훈련되지 않음(trainable=False)

In [21]:
# Batch normalization is said to work better when placed before the
# activation function. So the hidden Dense layers are created without an
# activation, and the activation is added as a separate layer after each
# BatchNormalization layer.

model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(units=300, kernel_initializer="he_normal"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation("elu"),
    tf.keras.layers.Dense(units=100, kernel_initializer="he_normal"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation("elu"),
    tf.keras.layers.Dense(units=10, activation="softmax"),
])

model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 784)               0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 784)               3136      
_________________________________________________________________
dense_19 (Dense)             (None, 300)               235500    
_________________________________________________________________
batch_normalization_4 (Batch (None, 300)               1200      
_________________________________________________________________
activation (Activation)      (None, 300)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 100)               30100     
_________________________________________________________________
batch_normalization_5 (Batch (None, 100)               400       

### 1.4 그래디언트 클리핑
- 역전파될 때 일정 임곗값을 넘어서지 못하게 그래디언트를 잘라내는 것

In [22]:
# Gradient clipping by value: each gradient component is clipped to [-1.0, 1.0]
optimizer = tf.keras.optimizers.SGD(clipvalue=1.0)
model.compile(optimizer=optimizer, loss="mse")

## 3. 고속 옵티마이저

In [24]:
# Momentum optimization
# (`learning_rate` replaces the deprecated `lr` alias, which Keras 3 rejects)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)

In [25]:
# NAG (Nesterov accelerated gradient)
# (`learning_rate` replaces the deprecated `lr` alias, which Keras 3 rejects)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9, nesterov=True)

In [26]:
# AdaGrad - hard to use for deep neural networks.
# (`learning_rate` replaces the deprecated `lr` alias, which Keras 3 rejects)
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.001)

In [30]:
# RMSProp
# (`learning_rate` replaces the deprecated `lr` alias, which Keras 3 rejects)
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)

In [31]:
# Adam
# (`learning_rate` replaces the deprecated `lr` alias, which Keras 3 rejects)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

In [33]:
# Adamax
# (`learning_rate` replaces the deprecated `lr` alias, which Keras 3 rejects)
optimizer = tf.keras.optimizers.Adamax(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

In [34]:
# Nadam
# (`learning_rate` replaces the deprecated `lr` alias, which Keras 3 rejects)
optimizer = tf.keras.optimizers.Nadam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

### 3.6 학습률 스케줄링

In [37]:
# Power scheduling: lr(step) = 0.01 / (1 + step / 10000).
# This is the same curve the legacy `lr=0.01, decay=1e-4` arguments produced
# (decay is the inverse of s, the number of steps after which the learning
# rate is divided). Current Keras optimizers no longer accept `lr` / `decay`,
# so the schedule is expressed with InverseTimeDecay instead.
optimizer = tf.keras.optimizers.SGD(
    learning_rate=tf.keras.optimizers.schedules.InverseTimeDecay(
        initial_learning_rate=0.01, decay_steps=10000, decay_rate=1.0))

In [38]:
# Exponential scheduling
def exponential_decay(lr0, s):
    """Build an epoch -> learning-rate function for exponential decay.

    The returned function evaluates ``lr0 * 0.1 ** (epoch / s)``: the rate
    starts at ``lr0`` and is divided by 10 every ``s`` epochs.
    """
    def schedule(epoch):
        return lr0 * 0.1 ** (epoch / s)

    return schedule


exponential_decay_fn = exponential_decay(lr0=0.01, s=20)

In [39]:
# LearningRateScheduler callback driven by the exponential decay function
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(
    schedule=exponential_decay_fn,
)

In [None]:
# Actual usage: train with the learning-rate scheduler callback attached.
# (A comma was missing after the validation_data argument, which made this
# cell a SyntaxError.)
history = model.fit(X_train_scaled, y_train, epochs=20,
                    validation_data=(X_valid_scaled, y_valid),
                    callbacks=[lr_scheduler])

In [None]:
# Piecewise constant scheduling
def piecewise_constant_fn(epoch):
    """Return the learning rate for ``epoch``.

    0.01 for epochs below 5, 0.005 for epochs below 15, and 0.001 otherwise.
    """
    if epoch < 5:
        return 0.01
    if epoch < 15:
        return 0.005
    return 0.001

In [36]:
# Performance scheduling: multiply the learning rate by 0.5 once the monitored
# metric has stopped improving for 5 epochs
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    patience=5,
    factor=0.5,
)

In [None]:
# Learning-rate schedule built with tf.keras.optimizers.schedules
# NOTE(review): s is presumably the step count of 20 epochs at batch size 32 — confirm
s = 20 * len(X_train) // 32
learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01, decay_steps=s, decay_rate=0.1)
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)

## 4. 규제를 사용해 과대적합 피하기

### 4.1 L1, L2 규제

In [42]:
# L2 regularization on the layer's kernel, with factor 0.02
layer = tf.keras.layers.Dense(
    units=100,
    activation="relu",
    kernel_initializer="he_normal",
    kernel_regularizer=tf.keras.regularizers.l2(0.02),
)

In [43]:
# Use Python's functools.partial() to avoid repeating the same Dense arguments
from functools import partial

# Dense factory with ReLU, He-normal init and L2 regularization pre-filled.
# The single l2(0.02) object created here is passed to every layer it builds.
RegularizedDense = partial(
    tf.keras.layers.Dense,
    activation="relu",
    kernel_initializer="he_normal",
    kernel_regularizer=tf.keras.regularizers.l2(0.02),
)


model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    RegularizedDense(300),
    RegularizedDense(100),
    # The output layer overrides the pre-filled activation and initializer
    RegularizedDense(10, activation="softmax", kernel_initializer="glorot_uniform"),
])

### 4.2 드롭아웃

In [44]:
# Classifier for 28x28 inputs with 20% dropout applied to the flattened input
# and after each hidden Dense layer
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(300, kernel_initializer='he_normal'),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Activation("elu"),
    # ELU on the second hidden layer as well; without an activation this
    # layer would be purely linear
    tf.keras.layers.Dense(100, activation='elu', kernel_initializer='he_normal'),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(10, activation='softmax'),
])

model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 784)               0         
_________________________________________________________________
dropout (Dropout)            (None, 784)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 300)               235500    
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
activation_2 (Activation)    (None, 300)               0         
_________________________________________________________________
dense_27 (Dense)             (None, 100)               30100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         

### 4.3 몬테 카를로 드롭아웃
- 테스트 시에 드롭아웃 사용

In [None]:
# Monte Carlo dropout: run 100 forward passes with training=True and average
# the stacked predictions over the pass axis
y_probas = np.stack([model(X_test_scaled, training=True) for _ in range(100)])
y_proba = np.mean(y_probas, axis=0)

### 4.4 맥스-노름 규제

In [46]:
# Dense layer whose kernel is constrained with max-norm (max value 1.0)
tf.keras.layers.Dense(
    100,
    activation="elu",
    kernel_initializer="he_normal",
    kernel_constraint=tf.keras.constraints.max_norm(1.),
)

<tensorflow.python.keras.layers.core.Dense at 0x7f17302ff410>