In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
keras.layers.Dense(10, activation="relu", kernel_initializer="he_normal")

<tensorflow.python.keras.layers.core.Dense at 0x18201b5c208>

In [3]:
# Variance-scaling initializer configured with scale=2, fan_avg mode and a uniform distribution.
he_avg_init = keras.initializers.VarianceScaling(
    distribution="uniform",
    mode="fan_avg",
    scale=2.0,
)
# Example layer using the custom initializer; as the cell's last expression its repr is displayed.
keras.layers.Dense(units=10, kernel_initializer=he_avg_init, activation="sigmoid")

<tensorflow.python.keras.layers.core.Dense at 0x18201c1e898>

In [4]:
# Load Fashion-MNIST and divide the pixel values by 255.0.
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train_full, X_test = X_train_full / 255.0, X_test / 255.0

# The first 5,000 training samples become the validation set; the remainder is the training set.
X_valid, y_valid = X_train_full[:5000], y_train_full[:5000]
X_train, y_train = X_train_full[5000:], y_train_full[5000:]

In [5]:
# Small classifier: Flatten -> Dense(10, He-normal init) -> LeakyReLU(0.2) -> 10-way softmax.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=list(X_train.shape[1:])))
# The hidden Dense layer is linear; its activation is the separate LeakyReLU layer that follows.
model.add(keras.layers.Dense(10, kernel_initializer="he_normal"))
model.add(keras.layers.LeakyReLU(alpha=0.2))
model.add(keras.layers.Dense(10, activation="softmax"))

In [6]:
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=1e-3),
              metrics=["accuracy"])

In [7]:
history=model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Train on 55000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
# SELU experiment: Flatten -> Dense(10, SELU, LeCun-normal init) -> 10-way softmax.
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=list(X_train[0].shape)),
    # SELU is the hidden activation here. The LeakyReLU layer carried over from the previous
    # model is removed: stacking it on top of SELU applied two activations in a row, so the
    # cell was not measuring SELU (with its matching lecun_normal initializer) on its own.
    keras.layers.Dense(10, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.Dense(10, activation="softmax")
])

In [9]:
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=1e-3),
              metrics=["accuracy"])

In [10]:
history=model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Train on 55000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# 배치정규화

In [11]:
# Batch-normalised MLP: a BatchNormalization layer right after the input and after each
# hidden Dense layer (the ELU activation is applied inside the Dense layers).
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.BatchNormalization())
for n_units in (300, 100):
    model.add(keras.layers.Dense(n_units, activation="elu", kernel_initializer="he_normal"))
    model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(10, activation="softmax"))

In [12]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 784)               0         
_________________________________________________________________
batch_normalization (BatchNo (None, 784)               3136      
_________________________________________________________________
dense_6 (Dense)              (None, 300)               235500    
_________________________________________________________________
batch_normalization_1 (Batch (None, 300)               1200      
_________________________________________________________________
dense_7 (Dense)              (None, 100)               30100     
_________________________________________________________________
batch_normalization_2 (Batch (None, 100)               400       
_________________________________________________________________
dense_8 (Dense)              (None, 10)               

In [13]:
[(var.name, var.trainable) for var in model.layers[1].variables]

[('batch_normalization/gamma:0', True),
 ('batch_normalization/beta:0', True),
 ('batch_normalization/moving_mean:0', False),
 ('batch_normalization/moving_variance:0', False)]

In [14]:
### updates는 삭제될 예정이라함
#model.layers[1].updates

In [15]:
# Variant with BatchNormalization placed *before* the activation: each hidden block is
# Dense (no bias, no activation) -> BatchNormalization -> ELU.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.BatchNormalization())
for n_units in (300, 100):
    # use_bias=False: the Dense layer's own bias is disabled because BatchNormalization follows it.
    model.add(keras.layers.Dense(n_units, kernel_initializer="he_normal", use_bias=False))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("elu"))
model.add(keras.layers.Dense(10, activation="softmax"))

In [16]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 784)               0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 784)               3136      
_________________________________________________________________
dense_9 (Dense)              (None, 300)               235200    
_________________________________________________________________
batch_normalization_4 (Batch (None, 300)               1200      
_________________________________________________________________
activation (Activation)      (None, 300)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 100)               30000     
_________________________________________________________________
batch_normalization_5 (Batch (None, 100)              

In [17]:
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=1e-3),
              metrics=["accuracy"])

In [18]:
### 배치정규화층은 시간 소모를 높이지만 성능이 월등히 높아짐

history=model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Train on 55000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# 그래디언트 클리핑 

In [19]:
# Fresh small classifier for the gradient-clipping experiments:
# Flatten -> Dense(10, He-normal init) -> LeakyReLU(0.2) -> 10-way softmax.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=list(X_train.shape[1:])))
model.add(keras.layers.Dense(10, kernel_initializer="he_normal"))
model.add(keras.layers.LeakyReLU(alpha=0.2))
model.add(keras.layers.Dense(10, activation="softmax"))

### clipvalue는 임계치를 넘은 성분만 [-임계치, 임계치] 범위로 잘라내므로 그래디언트 벡터의 방향이 바뀔 수 있음
### clipnorm은 그래디언트의 L2 노름이 임계치를 넘으면 벡터 전체를 같은 비율로 축소하므로 방향은 유지되지만, 큰 성분에 비해 작은 성분은 거의 0에 가까워지는 경우가 많음

In [20]:
optimizer = keras.optimizers.SGD(clipvalue = 1.0)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [21]:
history=model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Train on 55000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [51]:
# compile() does not reset weights, so without this line the clipnorm run would continue
# from the weights already trained in the clipvalue run and the two runs could not be compared.
# clone_model rebuilds the same architecture with newly initialised weights.
model = keras.models.clone_model(model)
# clipnorm=1.0: clip by gradient norm instead of by individual component value.
optimizer = keras.optimizers.SGD(clipnorm=1.0)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [52]:
history=model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Train on 55000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


##### 성능차이는 미세하지만 clipnorm의 case가 더 수렴이 빨랐다 
##### 확신은 못하겠지만 이미지 classify의 경우 중요하지 않은 차원을 빼는게 도움이 되기 때문이 아닐까? 라고 생각한다

# 전이학습

In [53]:
### y_5_or_6는 y와 크기가 같은 bool 배열이며, y가 5 또는 6인 위치만 True임. 이 bool 마스크를 이용해 특정 샘플만 선택 가능
### 선택된 샘플(5, 6)을 다시 6과 비교해서 이진 레이블(y_B)을 만듦


def split_dataset(X, y):
    """Split a labelled dataset into task A (all classes except 5 and 6) and task B (classes 5 and 6).

    Args:
        X: array of samples, indexed along its first axis in step with ``y``.
        y: array of integer class labels.

    Returns:
        ``((X_A, y_A), (X_B, y_B))`` where ``y_A`` holds the task-A labels with the
        labels above 6 shifted down by 2 (7, 8, 9 -> 5, 6, 7) so they are contiguous, and
        ``y_B`` is a float32 array that is 1.0 where the label is 6 and 0.0 where it is 5.
    """
    is_task_a = (y != 5) & (y != 6)  # everything that is neither class 5 nor class 6
    is_task_b = ~is_task_a

    # Boolean-mask indexing returns a copy, so the in-place shift below leaves `y` untouched.
    labels_a = y[is_task_a]
    labels_a[labels_a > 6] -= 2

    # Binary target for task B: is the sample class 6?
    labels_b = (y[is_task_b] == 6).astype(np.float32)

    task_a = (X[is_task_a], labels_a)
    task_b = (X[is_task_b], labels_b)
    return (task_a, task_b)

# In short: classes 5 and 6 are separated out as the "_B" sets (with binary labels), everything
# else becomes the "_A" sets, and only the first 200 task-B training samples are kept.

(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(X_train, y_train)
(X_valid_A, y_valid_A), (X_valid_B, y_valid_B) = split_dataset(X_valid, y_valid)
(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(X_test, y_test)
X_train_B, y_train_B = X_train_B[:200], y_train_B[:200]

In [23]:
X_train_A.shape

(43986, 28, 28)

In [24]:
X_train_B.shape

(200, 28, 28)

In [25]:
y_train_B.shape

(200,)

In [26]:
### 5와6을 빼고 빈 공간에 계속 뒤에서 채워넣었으므로 0~7

pd.Series(y_train_A).value_counts()

0    5543
4    5512
6    5510
3    5499
2    5496
7    5494
5    5488
1    5444
dtype: int64

In [27]:
# Task-A classifier: Flatten, five SELU hidden layers, and an 8-way softmax output
# (task A has the 8 classes 0-7).
model_A = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    *[keras.layers.Dense(n_units, activation="selu") for n_units in (300, 100, 50, 50, 50)],
    keras.layers.Dense(8, activation="softmax"),
])

In [28]:
model_A.compile(loss="sparse_categorical_crossentropy",
                optimizer=keras.optimizers.SGD(learning_rate=1e-3),
                metrics=["accuracy"])

In [29]:
history = model_A.fit(X_train_A, y_train_A, epochs=20,
                    validation_data=(X_valid_A, y_valid_A))

Train on 43986 samples, validate on 4014 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [30]:
model_A.save("my_model_A.h5")

In [54]:
model_A = keras.models.load_model("my_model_A.h5")
# Reuse every layer of model_A except its output layer, then append a fresh single-unit
# sigmoid output for the binary task B. The reused layers are the same objects as in model_A.
model_B_on_A = keras.models.Sequential(
    model_A.layers[:-1] + [keras.layers.Dense(1, activation="sigmoid")]
)

In [56]:
# Training model_B_on_A also trains the layers it shares with model_A, so keep a copy of
# model_A beforehand. clone_model is followed by an explicit set_weights to copy the weights over.
# Author's note: this is effectively the same as loading the .h5 file a second time, so it may be unnecessary.

model_A_clone = keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

# 층 동결
### 새로 추가된 출력층의 가중치가 random하게 초기화 되었으므로 어느정도 맞춰준다음 전체학습 하는게 좋다

In [58]:
# Freeze every reused layer (all but the newly added output layer).
for reused_layer in model_B_on_A.layers[:-1]:
    reused_layer.trainable = False

# After freezing or unfreezing layers the model must always be compiled again.
model_B_on_A.compile(optimizer="sgd", loss="binary_crossentropy", metrics=["accuracy"])

In [59]:
# Warm-up: train for a few epochs while the reused layers are still frozen.
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=4, validation_data=(X_valid_B, y_valid_B))

# Unfreeze the reused layers so the whole network can be fine-tuned.
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True

# In transfer learning the learning rate should be lowered once the layers are unfrozen.
# `learning_rate` replaces the deprecated `lr` alias, matching the other cells in this notebook.
optimizer = keras.optimizers.SGD(learning_rate=1e-4)
# Recompile so that the change in `trainable` takes effect.
model_B_on_A.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

Train on 200 samples, validate on 986 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [35]:
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=16, validation_data=(X_valid_B, y_valid_B))

Train on 200 samples, validate on 986 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [36]:
model_B_on_A.evaluate(X_test_B, y_test_B)



[0.10953126209974288, 0.987]

###  model_B데이터만 사용하여 MLP구성해보았는데 애시당초 샘플수의 차이 때문에 성능낮음
### 이미지 분류 같은 경우 미리 많은 샘플로 가중치를 초기화한 모델을 base로 두고 이후 (은닉)+출력층 구성하면
### 샘플 수, 모델 구상 노력대비 성능, 시간적으로 매우 이득

In [37]:
# Baseline for task B trained from scratch: same hidden stack as model_A, single sigmoid output.
model_B_test = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    *[keras.layers.Dense(n_units, activation="selu") for n_units in (300, 100, 50, 50, 50)],
    keras.layers.Dense(1, activation="sigmoid"),
])

In [38]:
# Same SGD learning rate (1e-4) as the fine-tuning phase of model_B_on_A.
# `learning_rate` replaces the deprecated `lr` alias.
optimizer = keras.optimizers.SGD(learning_rate=1e-4)
model_B_test.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [39]:
history = model_B_test.fit(X_train_B, y_train_B, epochs=25, validation_data=(X_valid_B, y_valid_B))

Train on 200 samples, validate on 986 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


# 배치 정규화 이후 건너 뛰고 optimizer 부분을 먼저 살펴보았음

## optimizer만 나와있어서 모델은 적절히 위에서 가져왔음

In [40]:
# Small classifier used for the optimizer comparison: Flatten -> Dense(10, ReLU) -> 10-way softmax.
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=list(X_train[0].shape)),
    # ReLU outputs are never negative, so the LeakyReLU(alpha=0.2) layer that used to follow
    # this Dense layer was an identity mapping; it is removed without changing the model's output.
    keras.layers.Dense(10, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])

In [41]:
# Plain SGD: the accuracy baseline for comparing the other optimizers.
# `learning_rate` replaces the deprecated `lr` alias.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=0.01),
              metrics=["accuracy"])

In [42]:
history=model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Train on 55000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# 모멘텀 최적화
### SGD와 다르게 기울기 벡터에 학습률을 곱해 바로 update하지 않고 모멘텀 벡터에 누적해 주며, 이전 스텝들의 기울기를 일정 비율(momentum) 반영함
### SGD보다 빠르게 수렴하며 보통 모멘텀으로 0.9사용

In [43]:
# compile() keeps the current weights, so re-initialise the model first; otherwise this run
# would continue from the weights trained with the previous optimizer and could not be compared.
model = keras.models.clone_model(model)
# momentum is a keyword argument of keras.optimizers.SGD.
# `learning_rate` replaces the deprecated `lr` alias.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
              metrics=["accuracy"])

In [44]:
history=model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Train on 55000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# 네스테로프 가속경사
### 모멘텀 벡터를 현재 기울기가 아닌 모멘텀 방향으로 조금 앞선 위치의 기울기로 update함
### 모멘텀 최적화보다 빠르며 최소점에서 진동을 줄일 수 있음

In [45]:
# compile() keeps the current weights, so re-initialise the model first; otherwise this run
# would continue from the weights trained with the previous optimizer and could not be compared.
model = keras.models.clone_model(model)
# SGD with momentum and nesterov=True. `learning_rate` replaces the deprecated `lr` alias.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True),
              metrics=["accuracy"])

In [46]:
history=model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Train on 55000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Adagrad
### 각 차원마다 기울기의 제곱을 누적해서 더해가며, 가중치 update 시 기울기를 이 누적값의 제곱근으로 나눠줌
### 이로 인하여 scale이 큰 차원의 학습률이 더 빠르게 감소하며 이를 *적응적 학습률* 이라함

In [47]:
# compile() keeps the current weights, so re-initialise the model first; otherwise this run
# would continue from the weights trained with the previous optimizer and could not be compared.
model = keras.models.clone_model(model)
# `learning_rate` replaces the deprecated `lr` alias.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.Adagrad(learning_rate=0.01),
              metrics=["accuracy"])

In [48]:
history=model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Train on 55000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# RMS Prop
### 학습률을 감쇠시킬때 최근의 기울기만 사용해서 누적하여 학습이 너무 빠르게 끝나는것을 막음

In [60]:
# compile() keeps the current weights, so re-initialise the model first; otherwise this run
# would continue from the weights trained with the previous optimizer and could not be compared.
model = keras.models.clone_model(model)
# `learning_rate` replaces the deprecated `lr` alias.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9),
              metrics=["accuracy"])

In [61]:
history=model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Train on 55000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Adam
### 모멘텀최적화와 RMSProp가 결합한 형태라고 보면된다
### 적응적 학습률 알고리즘이라 학습률 튜닝이 상대적으로 덜 필요하며 보통 기본값 0.001로 시작해도 잘 동작한다 (튜닝이 전혀 필요 없다는 뜻은 아니다)

In [62]:
# compile() keeps the current weights, so re-initialise the model first; otherwise this run
# would continue from the weights trained with the previous optimizer and could not be compared.
model = keras.models.clone_model(model)
# `learning_rate` replaces the deprecated `lr` alias.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999),
              metrics=["accuracy"])

In [63]:
history=model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Train on 55000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Adamax
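### Adam은 그레디언트 제곱의 지수 감소 평균(ℓ2 노름)으로 update를 스케일링하지만, Adamax는 이를 ℓ∞ 노름으로 바꿔 $s \leftarrow \max(\beta_2 s, |\nabla_\theta J(\theta)|)$ 로 누적함. 결과적으로 감쇠된 이전 누적값과 현재 기울기의 절댓값 중 큰 쪽만 사용함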

In [65]:
# compile() keeps the current weights, so re-initialise the model first; otherwise this run
# would continue from the weights trained with the previous optimizer and could not be compared.
model = keras.models.clone_model(model)
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.Adamax(learning_rate=0.001, beta_1=0.9, beta_2=0.999),
              metrics=["accuracy"])

In [66]:
history=model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Train on 55000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# 학습률 스케쥴링 