In [3]:
import tensorflow as tf
import keras
from tensorflow.keras import Sequential,Model,initializers,layers,Input

def create_padding_mask(x):
  mask = tf.cast(tf.math.equal(x, 0), tf.float32)
  # (batch_size, 1, 1, key의 문장 길이)
  return mask[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(x):
  seq_len = tf.shape(x)[1]
  look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) # input equals to zero->하삼각영행렬 생성
  padding_mask = create_padding_mask(x) # 패딩 마스크도 포함
  return tf.maximum(look_ahead_mask, padding_mask)


In [72]:
vocab_size = 20000  # Only consider the top 20k words
maxlen = 200  # Only consider the first 200 words of each movie review
(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size)
print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
25000 Training sequences
25000 Validation sequences


AttributeError: module 'keras.preprocessing.sequence' has no attribute 'pad_sequences'

In [69]:
import tensorflow as tf
from tensorflow.keras import layers,Model


class Multi_Head_Attention(layers.Layer):
    
    def __init__(self,d_model,num_heads,dropout=0,mask=None):
        super(Multi_Head_Attention,self).__init__()
        self.d_model=d_model #512
        self.num_heads=num_heads #8
        self.mask=mask
        assert d_model % self.num_heads == 0
        self.d_k=self.d_model//num_heads
        self.dropout=dropout
        self.wq = layers.Dense(self.d_model)
        self.wk = layers.Dense(self.d_model)
        self.wv = layers.Dense(self.d_model)
        self.dense = layers.Dense(self.d_model)

    def call(self,query,key,value):

        def Split_Heads(input):
            batch_size=tf.shape(input)[0]
            seq_length=tf.shape(input)[1]
            a=tf.reshape(input, [batch_size,seq_length, self.num_heads, self.d_k])
            output=tf.transpose(a, perm=[0,2,1,3])
            return output

        def Scaled_Dot_Product_Attnetion(query,key,value):

            key_dim=tf.cast(tf.shape(key)[-1],tf.float32)
            
            query = tf.multiply(query, 1.0 / tf.sqrt(float(key_dim)))
            
            attention_score=tf.matmul(query,key,transpose_a=False,transpose_b=True)
            if self.mask is not None:
                attention_score += (self.mask * -1e9)
                print(attention_score.shape)

            attention_prob=tf.nn.softmax(attention_score,axis=-1)
            attention_prob=tf.nn.dropout(attention_prob,self.dropout)
            attention_value=tf.matmul(attention_prob,value)
            return attention_value # 원문에서는 드랍아웃을 추가해서 Dropout 된거(attention output) 안된거(attention score) 둘다 리턴


        batch_size=tf.shape(key)[0]
        seq_length=tf.shape(key)[1]
   
        q: tf.Tensor = self.wq(query)
        k: tf.Tensor = self.wk(key)
        v: tf.Tensor = self.wv(value)
  
        query=Split_Heads(q)
        key=Split_Heads(k)
        value=Split_Heads(v)
    
        concat_attention=Scaled_Dot_Product_Attnetion(query,key,value)
     
        concat_attention=tf.transpose(concat_attention,perm=[0,2,1,3])

        concat_attention=tf.reshape(concat_attention,shape=(batch_size,seq_length,self.d_model))
    
        output=self.dense(concat_attention)

    
        return output

class MLP(layers.Layer):
    def __init__(self,units,dropout_rate):
        self.units=units
        self.dropout_rate=dropout_rate
        self.dropout=layers.Dropout(self.dropout_rate)
        self.mlp1=layers.Dense(units=2*self.units,activation=tf.nn.relu)
        self.mlp2=layers.Dense(units=self.units,activation=tf.nn.relu)
        super(MLP,self).__init__()

    def call(self,input):
        x=self.mlp1(input)
        x=self.dropout(x)
        x=self.mlp2(x)
        x=self.dropout(x)
    
        return x

In [12]:
import tensorflow as tf
from tensorflow.keras import layers,Model,Input
#from Modules import *
#from Data_Prerposessing import *

class Transformer():
    def __init__(self,input_size,num_classes,patch_size,projection_dim,transformer_layers,d_model,num_heads,dropout_MHA,dropout_MLP,mask):
        self.input_size= input_size
        self.num_classes=num_classes
        self.image_size=self.input_size[0]
        self.patch_size=patch_size
        self.num_patches=(self.image_size//self.patch_size)**2
        self.projection_dim=projection_dim
        self.transformer_layers=transformer_layers
        self.MLP_units=[self.projection_dim*2,self.projection_dim]
        self.d_model=d_model
        self.num_heads=num_heads
        self.dropout_MHA=dropout_MHA
        self.dropout_MLP=dropout_MLP
        self.padding_mask=Input(shape=(1,1,None))
        self.look_ahead_mask=Input(shape=(1,None,None))
        self.Encoder_Multi_Head_Attention=Multi_Head_Attention(d_model=self.d_model,num_heads=self.num_heads,mask=self.padding_mask)
        self.Decoder_Multi_Head_Attention=Multi_Head_Attention(d_model=self.d_model,num_heads=self.num_heads,mask=self.look_ahead_mask)

        self.MLP=MLP(units=self.d_model,dropout_rate=self.dropout_MLP)
        self.output=layers.Dense(units=self.num_classes)

    def Encoder_Block(self,input):

        x1=layers.LayerNormalization(epsilon=1e-6)(input)
        x1=self.Encoder_Multi_Head_Attention(x1,x1,x1)
        x2=layers.Add()([x1,input])
        x3=layers.LayerNormalization(epsilon=1e-6)(x2)
        
        x3=MLP(x3)
        
        x4=layers.Add()([x3,x2])
        
        return x4


    def Decoder_Block(self,input,enc_out):
        
        x=self.Decoder_Multi_Head_Attention(input,input,input)#query,key,value
        x=layers.LayerNormalization(epsilon=1e-6)(x)
        x1=layers.Add()([x,input])
                
        x2=self.Decoder_Multi_Head_Attention(x1,enc_out,enc_out)
        x2=layers.LayerNormalization(epsilon=1e-6)(x2)
        x3=layers.Add()([x1,x2])

        x4=MLP(units=self.d_model,dropout_rate=self.dropout_MLP)(x3)
        
        x4=layers.LayerNormalization(epsilon=1e-6)(x4)
        
        x4=layers.Add()([x3,x4])
        
        return x4


    def call(self):
        # input=Input(shape=(self.input_size))

        # x=self.Encoder_Block(x)
        # for i in range(self.transformer_layers-1):
        #     x=self.Encoder_Block(x)
        
        # # word embedding 단계

        # x=self.Decoder_Block()
        

        # model=Model(inputs=input,outputs=output)
        pass

        return None



In [20]:
model=Transformer( 
    input_size=(224,224,3),
    num_classes=100,
    patch_size=8,
    projection_dim=64,
    transformer_layers=8,
    d_model=64,
    num_heads=4,
    dropout_MHA=0.1,
    dropout_MLP=0.1).call()

In [None]:
model.summary()

In [69]:
lr_rate=tf.keras.optimizers.schedules.ExponentialDecay(1e-3, 10000, 0.97, staircase=False, name=None)

    
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_rate),loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),metrics=['acc'])

In [2]:
import tensorflow as tf

In [7]:
a=tf.ones(shape=(16,56,56,48))
b=tf.ones(shape=(16,57,48,56))

c=tf.matmul(a,b)
c.shape

InvalidArgumentError: {{function_node __wrapped__BatchMatMulV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} In[0] and In[1] must have compatible batch dimensions: [16,56,56,48] vs. [16,57,48,56] [Op:BatchMatMulV2]