## Transformers - Group Members

- Abhijit
- Biyun
- Hayoung Kim
- Karthick Vel Kathirvel
- Rahull Borana

In [None]:
import numpy as np
import tensorflow as tf

In [None]:
n = 7  # number of rows of blocks per image
m = 7  # number of columns of blocks per image
block_size = 16  # values per flattened block (the patch cells below cut 4x4 tiles)
hidden_dim = 64  # width of the per-block vectors inside the transformer
num_layers = 6  # number of stacked attention + MLP layers in build_ViT
num_heads = 8  # attention heads per MultiHeadAttention layer
key_dim = 8  # per-head size; build_ViT passes it as both key_dim and value_dim
mlp_dim = 128  # width of the hidden Dense layer inside each MLP sub-block
dropout_rate = 0.05  # dropout rate applied inside each MLP sub-block
num_classes = 10  # size of the final softmax output


In [None]:
class ClassToken(tf.keras.layers.Layer):
    """Learnable class token, written as a Keras layer.

    The layer owns a single trainable vector whose width equals the last axis
    of its input. The vector is initialised randomly and updated by gradient
    descent like any other weight.

    Its purpose is to be placed at the beginning of the sequence of vectors
    fed into the transformer. After the transformer has run over the whole
    sequence, the zero-th output vector (the class token) is the one handed to
    the classification head.

    Calling the layer does not modify its input: it returns the token repeated
    once per batch element, with shape (batch, 1, width), ready to be
    concatenated in front of the sequence.
    """

    def __init__(self, **kwargs):
        # Forward the standard Layer arguments (name, dtype, trainable, ...)
        # so the layer can be rebuilt from a saved config by load_model.
        super().__init__(**kwargs)

    def build(self, input_shape):
        # add_weight registers the token with the layer under a stable name,
        # matching the definition used when the saved model is reloaded.
        self.w = self.add_weight(
            shape=(1, 1, input_shape[-1]),
            initializer=tf.random_normal_initializer(),
            trainable=True,
            name='cls_token',
        )

    def call(self, inputs):
        # The batch size is only known at run time, so read it from the tensor.
        batch_size = tf.shape(inputs)[0]
        hidden_dim = self.w.shape[-1]

        cls = tf.broadcast_to(self.w, [batch_size, 1, hidden_dim])
        return tf.cast(cls, dtype=inputs.dtype)

In [None]:
def build_ViT(n,m,block_size,hidden_dim,num_layers,num_heads,key_dim,mlp_dim,dropout_rate,num_classes):
    """Build and compile a small Vision-Transformer-style classifier.

    The model takes two inputs, in this order:
      1. the flattened blocks of each image, shape (batch, n*m, block_size);
      2. the integer position of every block, shape (batch, n*m), used to
         look up a learned positional embedding.

    Args:
        n: number of rows of blocks.
        m: number of cols of blocks.
        block_size: number of values in each flattened block.
        hidden_dim: width of the vectors flowing through the transformer.
        num_layers: number of attention + MLP layers to stack.
        num_heads: attention heads per MultiHeadAttention layer.
        key_dim: per-head size, used for both key_dim and value_dim.
        mlp_dim: width of the hidden Dense layer in each MLP sub-block.
        dropout_rate: dropout rate applied inside each MLP sub-block.
        num_classes: number of output classes (size of the softmax).

    Returns:
        A tf.keras Model compiled with the 'adam' optimizer,
        'sparse_categorical_crossentropy' loss and the 'accuracy' metric.
    """
    inp = tf.keras.layers.Input(shape=(n*m,block_size))
    # shape has to be a tuple; the previous `(n*m)` was a bare int
    inp2 = tf.keras.layers.Input(shape=(n*m,))
    mid = tf.keras.layers.Dense(hidden_dim)(inp) # transform to vectors with different dimension
    # learned positional embedding for each of the n*m possible positions
    emb = tf.keras.layers.Embedding(input_dim=n*m, output_dim=hidden_dim)(inp2)
    mid = mid + emb # for some reason, tf.keras.layers.Add causes an error, but + doesn't?
    # create the class token and put it at the beginning of the sequence
    token = ClassToken()(mid)
    mid = tf.keras.layers.Concatenate(axis=1)([token, mid])

    for _ in range(num_layers): # one pre-norm transformer layer per iteration
        # self-attention sub-block: normalize, attend, add the residual
        ln  = tf.keras.layers.LayerNormalization()(mid)
        mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads,key_dim=key_dim,value_dim=key_dim)(ln,ln,ln)
        add = tf.keras.layers.Add()([mid,mha])
        # MLP sub-block: normalize, expand to mlp_dim, project back, add the residual
        ln  = tf.keras.layers.LayerNormalization()(add)
        den = tf.keras.layers.Dense(mlp_dim,activation='gelu')(ln) # maybe should be relu...who knows...
        den = tf.keras.layers.Dropout(dropout_rate)(den) # regularization
        den = tf.keras.layers.Dense(hidden_dim)(den) # back to the right dimensional space
        den = tf.keras.layers.Dropout(dropout_rate)(den)
        mid = tf.keras.layers.Add()([den,add])
    ln = tf.keras.layers.LayerNormalization()(mid)
    fl = ln[:,0,:] # just grab the class token for each image in batch
    clas = tf.keras.layers.Dense(num_classes,activation='softmax')(fl) # probability that the image is in each category
    mod = tf.keras.models.Model([inp,inp2],clas)
    mod.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
    return mod

In [None]:
# Instantiate the model from the hyperparameters in the config cell,
# then print its layer table.
trans = build_ViT(
    n=n,
    m=m,
    block_size=block_size,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_heads=num_heads,
    key_dim=key_dim,
    mlp_dim=mlp_dim,
    dropout_rate=dropout_rate,
    num_classes=num_classes,
)
trans.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 49, 16)]     0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 49)]         0           []                               
                                                                                                  
 dense (Dense)                  (None, 49, 64)       1088        ['input_1[0][0]']                
                                                                                                  
 embedding (Embedding)          (None, 49, 64)       3136        ['input_2[0][0]']                
                                                                                              

In [None]:
# Fetch the MNIST train/test splits through Keras.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Divide the pixel values by 255.0 (this also converts the images to floats).
x_train = x_train / 255.0
x_test = x_test / 255.0

# Number of images in each split.
ndata_train = len(x_train)
ndata_test = len(x_test)

In [None]:
# Shape check on the loaded training images (the recorded output is (60000, 28, 28)).
x_train.shape

(60000, 28, 28)

In [None]:
# Cut every training image into an n x m grid of square patches and flatten
# each patch row-major, giving an array of shape (ndata_train, n*m, block_size).
# Patches are ordered row by row, left to right, exactly as the previous
# nested loops produced them, but built with one reshape/transpose instead of
# a Python loop over every image and patch.
patch_side = int(round(np.sqrt(block_size)))  # 4 for block_size = 16
assert patch_side * patch_side == block_size, "block_size must be a perfect square"

x_train_ravel = (
    x_train[:, :n * patch_side, :m * patch_side]            # keep the top-left n x m patches
    .reshape(ndata_train, n, patch_side, m, patch_side)      # (img, patch_row, y, patch_col, x)
    .transpose(0, 1, 3, 2, 4)                                # (img, patch_row, patch_col, y, x)
    .reshape(ndata_train, n * m, block_size)                 # flatten the grid and each patch
    .astype(np.float64, copy=False)                          # same dtype as the old np.zeros buffer
)
            

In [None]:
# Cut every test image into an n x m grid of square patches and flatten each
# patch row-major, giving an array of shape (ndata_test, n*m, block_size).
# Patches are ordered row by row, left to right, exactly as the previous
# nested loops produced them, but built with one reshape/transpose instead of
# a Python loop over every image and patch.
patch_side = int(round(np.sqrt(block_size)))  # 4 for block_size = 16
assert patch_side * patch_side == block_size, "block_size must be a perfect square"

x_test_ravel = (
    x_test[:, :n * patch_side, :m * patch_side]             # keep the top-left n x m patches
    .reshape(ndata_test, n, patch_side, m, patch_side)       # (img, patch_row, y, patch_col, x)
    .transpose(0, 1, 3, 2, 4)                                # (img, patch_row, patch_col, y, x)
    .reshape(ndata_test, n * m, block_size)                  # flatten the grid and each patch
    .astype(np.float64, copy=False)                          # same dtype as the old np.zeros buffer
)

In [None]:
# One row of patch indices 0 .. n*m-1 per image; these are the inputs to the
# model's positional-embedding branch. np.tile builds the (ndata, n*m) array
# directly instead of first materialising a Python list of lists.
pos_feed_train = np.tile(np.arange(n * m), (ndata_train, 1))
pos_feed_test = np.tile(np.arange(n * m), (ndata_test, 1))

In [None]:
# Train on the patch sequences together with their position indices,
# keeping 20% of the training data aside for validation.
trans.fit(
    x=[x_train_ravel, pos_feed_train],
    y=y_train,
    batch_size=40,
    epochs=40,
    validation_split=0.2,
)

In [None]:
import numpy as np

In [None]:
import tensorflow as tf

class ClassToken(tf.keras.layers.Layer):
    """Keras layer holding one trainable class-token vector.

    The token has shape (1, 1, width), where width is the last dimension of
    the layer's input. Calling the layer returns the token repeated once per
    batch element, cast to the input's dtype; the input values themselves are
    not used.
    """

    def __init__(self, **kwargs):
        # Pass the standard Layer keyword arguments through to the base class.
        super().__init__(**kwargs)

    def build(self, input_shape):
        token_width = input_shape[-1]
        self.w = self.add_weight(
            name='cls_token',
            shape=(1, 1, token_width),
            initializer=tf.random_normal_initializer(),
            trainable=True,
        )

    def call(self, inputs):
        # Dynamic batch size, read from the incoming tensor.
        n_samples = tf.shape(inputs)[0]
        tiled = tf.broadcast_to(self.w, [n_samples, 1, self.w.shape[-1]])
        return tf.cast(tiled, dtype=inputs.dtype)

# Path to your model file
model_path = 'ViT_validationAcc9867_updated.h5'
#model_path = 'ViT_validationAcc9867.h5'  # Path to your model file

# ClassToken is a custom layer, so it is supplied to Keras through custom_objects.
loaded_model = tf.keras.models.load_model(
    model_path,
    custom_objects={'ClassToken': ClassToken},
)

In [None]:
n = 7  # number of rows of blocks per image
m = 7  # number of columns of blocks per image
block_size = 16  # values per flattened block (the patch cells below cut 4x4 tiles)
hidden_dim = 64  # width of the per-block vectors inside the transformer
num_layers = 6  # number of stacked attention + MLP layers in build_ViT
num_heads = 8  # attention heads per MultiHeadAttention layer
key_dim = 8  # per-head size; build_ViT passes it as both key_dim and value_dim
mlp_dim = 128  # width of the hidden Dense layer inside each MLP sub-block
dropout_rate = 0.05  # dropout rate applied inside each MLP sub-block
num_classes = 10  # size of the final softmax output


In [None]:
# Fetch the MNIST train/test splits through Keras.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Divide the pixel values by 255.0 (this also converts the images to floats).
x_train = x_train / 255.0
x_test = x_test / 255.0

# Number of images in each split.
ndata_train = len(x_train)
ndata_test = len(x_test)

In [None]:
# Shape check on the loaded training images (the recorded output is (60000, 28, 28)).
x_train.shape

(60000, 28, 28)

In [None]:
# Cut every training image into an n x m grid of square patches and flatten
# each patch row-major, giving an array of shape (ndata_train, n*m, block_size).
# Patches are ordered row by row, left to right, exactly as the previous
# nested loops produced them, but built with one reshape/transpose instead of
# a Python loop over every image and patch.
patch_side = int(round(np.sqrt(block_size)))  # 4 for block_size = 16
assert patch_side * patch_side == block_size, "block_size must be a perfect square"

x_train_ravel = (
    x_train[:, :n * patch_side, :m * patch_side]            # keep the top-left n x m patches
    .reshape(ndata_train, n, patch_side, m, patch_side)      # (img, patch_row, y, patch_col, x)
    .transpose(0, 1, 3, 2, 4)                                # (img, patch_row, patch_col, y, x)
    .reshape(ndata_train, n * m, block_size)                 # flatten the grid and each patch
    .astype(np.float64, copy=False)                          # same dtype as the old np.zeros buffer
)
            

In [None]:
# Cut every test image into an n x m grid of square patches and flatten each
# patch row-major, giving an array of shape (ndata_test, n*m, block_size).
# Patches are ordered row by row, left to right, exactly as the previous
# nested loops produced them, but built with one reshape/transpose instead of
# a Python loop over every image and patch.
patch_side = int(round(np.sqrt(block_size)))  # 4 for block_size = 16
assert patch_side * patch_side == block_size, "block_size must be a perfect square"

x_test_ravel = (
    x_test[:, :n * patch_side, :m * patch_side]             # keep the top-left n x m patches
    .reshape(ndata_test, n, patch_side, m, patch_side)       # (img, patch_row, y, patch_col, x)
    .transpose(0, 1, 3, 2, 4)                                # (img, patch_row, patch_col, y, x)
    .reshape(ndata_test, n * m, block_size)                  # flatten the grid and each patch
    .astype(np.float64, copy=False)                          # same dtype as the old np.zeros buffer
)

In [None]:
# One row of patch indices 0 .. n*m-1 per image; these are fed to the model
# alongside the patch data. np.tile builds the (ndata, n*m) array directly
# instead of first materialising a Python list of lists.
pos_feed_train = np.tile(np.arange(n * m), (ndata_train, 1))
pos_feed_test = np.tile(np.arange(n * m), (ndata_test, 1))

In [None]:
# Continue training the reloaded model on the whole training set
# (validation_split=0, so nothing is held out).
loaded_model.fit(
    x=[x_train_ravel, pos_feed_train],
    y=y_train,
    batch_size=40,
    epochs=40,
    validation_split=0,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
  58/1500 [>.............................] - ETA: 14s - loss: 0.0053 - accuracy: 0.9991

KeyboardInterrupt: 

In [None]:
# Score the model on the MNIST test split; `out` holds what evaluate() returns.
out = loaded_model.evaluate(
    x=[x_test_ravel, pos_feed_test],
    y=y_test,
)



In [None]:
# Save the fine-tuned model (loaded_model); uncommenting the next line overwrites the checkpoint file loaded above.
# loaded_model.save('ViT_validationAcc9867_updated.h5')