In [13]:
!pip install einops



In [14]:
#구현하는 모델에서 쓰이는 모든 activation함수는 정의하여 드린 GELU 함수를 사용해야함.
#MultiHeadAttention에서 Head로 나눌때, 이미지를 patch로자른후 sequence로 만들때 Rearrange함수를 사용하면 편리함.(사용하지 않으셔도 됩니다)
#CIFAR10에 대한 test accuracy가 60프로 이상인 ViT모델을 만드시오.
import tensorflow as tf
from einops.layers.tensorflow import Rearrange
from tensorflow.keras.activations import gelu
GELU = lambda x : gelu(x)

In [15]:
#논문[1]에서 설명하는 MultiHeadAttention을 만들어라.
class MultiHeadedAttention(tf.keras.Model):
    #dimension - 모델의 dimension(MHA를 거친 후의 dimension)
    def __init__(self, dimension, heads=8):
        super(MultiHeadedAttention, self).__init__()
        ############Write your code Here############
        self.heads = heads
        self.scale = dimension ** -0.5
        self.to_qkv = tf.keras.layers.Dense(dimension * 3, use_bias=False)
        self.to_out = tf.keras.layers.Dense(dimension)

        self.rearrange_qkv = Rearrange('b n (qkv h d) -> qkv b h n d', qkv = 3, h = self.heads)
        self.rearrange_out = Rearrange('b h n d -> b n (h d)')
        ############################################
    def call(self, inputs):
        output = None
        ############Write your code Here############
        qkv = self.to_qkv(inputs)
        qkv = self.rearrange_qkv(qkv)
        q = qkv[0]
        k = qkv[1]
        v = qkv[2]

        dots = tf.einsum('bhid,bhjd->bhij', q, k) * self.scale
        attn = tf.nn.softmax(dots,axis=-1)

        output = tf.einsum('bhij,bhjd->bhid', attn, v)
        output = self.rearrange_out(output)
        output = self.to_out(output)
        ############################################
        return output
 
#인자로 받은 residual_function을 사용하여 real_function값을 return하여주는 Class를 만들어라.(call함수 참고)
class ResidualBlock(tf.keras.Model):
    def __init__(self, residual_function):
        super(ResidualBlock, self).__init__()
        ############Write your code Here############
        self.residual_function = residual_function
        ############################################
 
    def call(self, inputs):
        return self.residual_function(inputs) + inputs
 
#인자로 받은 normfunction에 들어가기전에 LayerNormalization을 해주는 Class를 만들어라.(call함수 참고)
class NormalizationBlock(tf.keras.Model):
    def __init__(self, norm_function, epsilon=1e-5):
        super(NormalizationBlock, self).__init__()
        ############Write your code Here############
        self.normalize = tf.keras.layers.LayerNormalization(epsilon=1e-5)
        self.norm_function = norm_function
        ############################################
 
    def call(self, inputs):
        return self.norm_function(self.normalize(inputs))
 
#논문[1]에서의 MLPBlock을 만들어라.
class MLPBlock(tf.keras.Model):
    #output_dimension - MLPBlock의 output dimension
    #hidden_dimension - MLPBlock의 hidden layer dimension
    def __init__(self, output_dimension, hidden_dimension):
        super(MLPBlock, self).__init__()
        ############Write your code Here############
        self.net_fwd = tf.keras.Sequential()
        self.net_fwd.add(tf.keras.layers.Dense(hidden_dimension, activation=GELU)) 
        self.net_fwd.add(tf.keras.layers.Dense(output_dimension))
        self.net_fwd.add(tf.keras.layers.Dropout(.2, input_shape=(output_dimension,)))
        ############################################
 
    def call(self, inputs):
        output = None
        ############Write your code Here############
        output=self.net_fwd(inputs)
        ############################################
        return output
 
#논문[1]을 읽고 TransformerEncoder를 위에서 정의한 class들을 사용하여 만들어라.
class TransformerEncoder(tf.keras.Model):
    #dimension - 모델의 dimension(MHA를 거친 후의 dimension), heads - MHA에서 head의 개수
    #depth - encoder layer의 개수, mlp_dimension - MLP block의 hidden layer의 dimension
    def __init__(self, dimension, depth, heads, mlp_dimension): 
        super(TransformerEncoder, self).__init__()
        layers_ = []
        for _ in range(depth):
            ############Write your code Here############
            layers_.extend([
                ResidualBlock(NormalizationBlock(MultiHeadedAttention(dimension, heads = heads))),
                ResidualBlock(NormalizationBlock(MLPBlock(dimension, mlp_dimension)))
            ])
            ############################################
        self.layers_ = tf.keras.Sequential(layers_)
 
    def call(self, inputs):
        return self.layers_(inputs)
 
#논문[2]를 읽고 ViT모델을 위에서 정의한 class들을 사용하여 만들어라.
class ImageTransformer(tf.keras.Model):
    #image_size - 이미지의 W==H의 크기(int), patch_size - 이미지를 쪼갤 patch의 크기(int)
    #n_classes - 최종 class의 개수, batch_size - 배치사이즈
    #dimension - 모델의 dimension(MHA를 거친 후의 dimension), depth - encoder layer의 개수
    #heads - MHA에서 head의 개수, mlp_dimension - MLP block의 hidden layer의 dimension
    #channel - input image에 대한 channel의 수
    def __init__(
            self, image_size, patch_size, n_classes, batch_size,
            dimension, depth, heads, mlp_dimension, channels=3):
        super(ImageTransformer, self).__init__()
        assert image_size % patch_size == 0, 'invalid patch size for image size'
 
        num_patches = (image_size // patch_size) ** 2
        self.patch_size = patch_size
        self.dimension = dimension
        self.batch_size = batch_size
       
        self.positional_embedding = self.add_weight(
            "position_embeddings", shape=[num_patches + 1, dimension],
            initializer=tf.keras.initializers.RandomNormal(), dtype=tf.float32
        )
        self.classification_token = self.add_weight(
            "classification_token", shape=[1, 1, dimension],
            initializer=tf.keras.initializers.RandomNormal(), dtype=tf.float32
        )
        ############Write your code Here############
        self.patch_to_embedding = tf.keras.layers.Dense(dimension)
        self.rearrange = Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=self.patch_size, p2=self.patch_size)
        self.transformer = TransformerEncoder(dimension, depth, heads, mlp_dimension)
        self.identity = tf.identity
        self.mlp_head = tf.keras.Sequential([tf.keras.layers.Dense(mlp_dimension, activation=GELU), tf.keras.layers.Dense(n_classes)])
        ############################################
 
    def call(self, inputs):
        output = None
        ############Write your code Here############
        shapes = tf.shape(inputs)

        x = self.rearrange(inputs)
        x = self.patch_to_embedding(x)
        
        classification_token = tf.broadcast_to(self.classification_token,(shapes[0],1,self.dimension))
        x = tf.concat((classification_token, x), axis=1)
        x += self.positional_embedding
        x = self.transformer(x)

        x = self.identity(x[:, 0])
        output = self.mlp_head(x)
        ############################################
        return output

In [16]:
from tensorflow.keras import datasets
# Download and prepare the CIFAR10 dataset
(train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()
# Normalize pixel values to be between 0 and 1
############Write your code Here############
train_images = train_images /255
test_images = test_images /255
############################################
# Make image shape (BS, H, W, C) to (BS, C, H, W)
############Write your code Here############
import numpy as np
from tensorflow import keras
train_images= np.transpose(train_images,(0,3,1,2))
test_images= np.transpose(test_images,(0,3,1,2))
############################################

#Initialize your model
#Initialize optimizer and loss and compile it to the model
############Write your code Here############
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

image_size=32
patch_size=4
n_classes=10
batch_size=256
dimension=64
depth=10
heads=4
mlp_dimension=256
channels=3

model =ImageTransformer(image_size,patch_size,n_classes,batch_size,dimension,depth,heads,mlp_dimension,channels)
model.compile(
        optimizer=optimizer,
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[
            keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
            keras.metrics.SparseTopKCategoricalAccuracy(5, name="top-5-accuracy"),
        ],
    )
############################################

#Train your model
############Write your code Here############
model.fit(
        x=train_images,
        y=train_labels,
        batch_size=256,
        epochs=50,
        validation_split=0.2
    )
############################################
print('==============Training Finished===============')

#Evaluate your test samples
accuracy = 0
############Write your code Here############
result = model.evaluate(test_images,test_labels,256)
accuracy=result[1]
############################################

print('Test Accuracy :', accuracy)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy : 0.6208000183105469
