In [None]:
!cp '/content/drive/MyDrive/MS DL NLP Final Project/Dataset/MaxPooledFeatures.zip' /content/
!unzip MaxPooledFeatures.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: content/MaxPooledFeatures/515684832.npz  
  inflating: content/MaxPooledFeatures/267325341.npz  
  inflating: content/MaxPooledFeatures/7199344342.npz  
  inflating: content/MaxPooledFeatures/6899246013.npz  
  inflating: content/MaxPooledFeatures/5687835693.npz  
  inflating: content/MaxPooledFeatures/1547327107.npz  
  inflating: content/MaxPooledFeatures/1690350208.npz  
  inflating: content/MaxPooledFeatures/2148916767.npz  
  inflating: content/MaxPooledFeatures/241345770.npz  
  inflating: content/MaxPooledFeatures/4929777339.npz  
  inflating: content/MaxPooledFeatures/3117602933.npz  
  inflating: content/MaxPooledFeatures/6960287365.npz  
  inflating: content/MaxPooledFeatures/4805425261.npz  
  inflating: content/MaxPooledFeatures/459560684.npz  
  inflating: content/MaxPooledFeatures/2163780897.npz  
  inflating: content/MaxPooledFeatures/4950996568.npz  
  inflating: content/MaxPooledFeatures/3505

#### Imports

In [1]:
# Imports
import numpy as np
import tensorflow as tf

#### Util Functions

In [2]:
# Util function for Positional Encoding
def positional_encoding(length, depth):
    """Build a sinusoidal positional-encoding table.

    The first half of the feature axis holds the sines and the second half the
    cosines (concatenated, not interleaved).

    Args:
        length: Number of positions (rows) to generate.
        depth: Feature width of the encoding. May be odd.

    Returns:
        A float32 tensor of shape (length, depth).
    """
    depth = int(depth)
    # Use an integer number of frequencies. The former `depth / 2` was a float, so an odd
    # `depth` produced depth + 1 columns and could not be added to d_model-wide embeddings.
    half_depth = (depth + 1) // 2
    positions = np.arange(length)[:, np.newaxis]                # (length, 1)
    depths = np.arange(half_depth)[np.newaxis, :] / half_depth  # (1, half_depth)
    angle_rates = 1 / (10000 ** depths)                         # (1, half_depth)
    angle_rads = positions * angle_rates                        # (length, half_depth)
    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis = -1)
    # Drop the surplus trailing cosine column that appears when depth is odd (no-op when even).
    pos_encoding = pos_encoding[:, :depth]
    return tf.cast(pos_encoding, dtype = tf.float32)

In [3]:
# Positional Embedding
class PositionalEmbedding(tf.keras.layers.Layer):
  """Token embedding followed by the addition of a fixed sinusoidal positional encoding.

  Args:
    vocab_size: Number of rows in the embedding table.
    d_model: Width of the embedding and of the positional encoding.
    max_length: Number of positions precomputed in the encoding table (default 2048,
      the value that used to be hard-coded). Sequences longer than this cannot be
      broadcast against the table.
  """
  def __init__(self, vocab_size, d_model, max_length = 2048):
    super().__init__()
    self.d_model = d_model
    self.max_length = max_length
    # mask_zero = True lets the embedding emit a mask for token id 0.
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero = True)
    self.pos_encoding = positional_encoding(length = max_length, depth = d_model)

  def compute_mask(self, *args, **kwargs):
    """Delegate mask computation to the wrapped embedding layer."""
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    """Embed token ids `x` (indexed as (batch, seq_len)) and add the positional encoding."""
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positional_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    # Slice the precomputed table down to the current sequence length and broadcast over the batch.
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x

In [None]:
# Base Attention - Used to define child (cross / global / masked) attention layers
class BaseAttention(tf.keras.layers.Layer):
  """Holds the sub-layers shared by every attention block: MHA, residual Add and LayerNorm.

  Subclasses supply `call`. All keyword arguments are forwarded unchanged to
  `tf.keras.layers.MultiHeadAttention`.
  """
  def __init__(self, **kwargs):
    super(BaseAttention, self).__init__()
    # Multi-head attention configured entirely by the caller's kwargs.
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    # Normalisation applied after the residual connection.
    self.layernorm = tf.keras.layers.LayerNormalization()
    # Add layer (rather than `+`) used for the residual sum.
    self.add = tf.keras.layers.Add()

# Global Self Attention - Used in Encoder
class GlobalSelfAttention(BaseAttention):
  """Unmasked self-attention: `x` serves as query, key and value."""
  def call(self, x):
    """Return LayerNorm(x + MHA(x, x, x))."""
    attended = self.mha(query = x, value = x, key = x)
    residual = self.add([x, attended])
    return self.layernorm(residual)

# Causal (Masked) Self Attention - Used in Decoder
class CausalSelfAttention(BaseAttention):
  """Self-attention with `use_causal_mask = True`, followed by residual add and LayerNorm."""
  def call(self, x):
    """Return LayerNorm(x + causally-masked MHA(x, x, x))."""
    attended = self.mha(query = x, value = x, key = x, use_causal_mask = True)
    residual = self.add([x, attended])
    return self.layernorm(residual)

# Cross Attention - Used in Decoder
class CrossAttention(BaseAttention):
  """Attention where `x` is the query and `context` supplies both keys and values."""
  def call(self, x, context):
    """Return LayerNorm(x + MHA(x, context, context)) and remember the attention scores."""
    attended, scores = self.mha(query = x, key = context, value = context, return_attention_scores = True)
    # Keep the scores from the most recent call on the layer instance.
    self.last_attn_scores = scores
    residual = self.add([x, attended])
    return self.layernorm(residual)

In [None]:
# Feed Forward Layers - Used in both Encoder and Decoder
class FeedForward(tf.keras.layers.Layer):
  """Two-layer feed-forward network with dropout, wrapped in a residual add and LayerNorm.

  Args:
    d_model: Output width of the second Dense layer.
    dff: Width of the hidden (ReLU) Dense layer.
    dropout_rate: Rate of the Dropout applied after the second Dense layer.
  """
  def __init__(self, d_model, dff, dropout_rate = 0.1):
    super().__init__()
    ffn_stack = [
      tf.keras.layers.Dense(dff, activation = 'relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate),
    ]
    self.seq = tf.keras.Sequential(ffn_stack)
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    """Return LayerNorm(x + FFN(x))."""
    residual = self.add([x, self.seq(x)])
    return self.layer_norm(residual)

In [None]:
# Encoder Layer
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: global self-attention followed by a feed-forward network.

  Args:
    d_model: Model width; also passed as `key_dim` to the attention layer.
    num_heads: Number of attention heads.
    dff: Hidden width of the feed-forward network.
    dropout_rate: Dropout rate used by both the attention layer and the feed-forward network.
  """
  def __init__(self,*, d_model, num_heads, dff, dropout_rate = 0.1):
    super().__init__()
    self.self_attention = GlobalSelfAttention(num_heads = num_heads, key_dim = d_model, dropout = dropout_rate)
    # Forward dropout_rate: previously it was dropped here, so the FFN always used its 0.1
    # default regardless of what the caller asked for.
    self.ffn = FeedForward(d_model, dff, dropout_rate)

  def call(self, x):
    """Apply self-attention, then the feed-forward network, to `x`."""
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

In [None]:
# Encoder
class Encoder(tf.keras.layers.Layer):
  """Positional embedding, dropout, and a stack of `num_layers` EncoderLayer blocks."""
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate = 0.1):
    super().__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.pos_embedding = PositionalEmbedding(vocab_size = vocab_size, d_model = d_model)
    self.enc_layers = [
      EncoderLayer(d_model = d_model, num_heads = num_heads, dff = dff, dropout_rate = dropout_rate)
      for _ in range(num_layers)
    ]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    """Encode token ids `x` of shape (batch, seq_len) into (batch_size, seq_len, d_model)."""
    # Token ids -> embeddings with positional information, then dropout.
    hidden = self.dropout(self.pos_embedding(x))

    # Run the encoder blocks in order.
    for layer_idx in range(self.num_layers):
      hidden = self.enc_layers[layer_idx](hidden)

    return hidden

In [None]:
# Decoder Layer
class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: causal self-attention, cross-attention over `context`, then feed-forward.

  Args:
    d_model: Model width; also passed as `key_dim` to both attention layers.
    num_heads: Number of attention heads.
    dff: Hidden width of the feed-forward network.
    dropout_rate: Dropout rate used by the attention layers and the feed-forward network.
  """
  def __init__(self, *, d_model, num_heads, dff, dropout_rate = 0.1):
    super(DecoderLayer, self).__init__()
    self.causal_self_attention = CausalSelfAttention(num_heads = num_heads, key_dim = d_model, dropout = dropout_rate)
    self.cross_attention = CrossAttention(num_heads = num_heads, key_dim = d_model, dropout = dropout_rate)
    # Forward dropout_rate: previously it was dropped here, so the FFN always used its 0.1
    # default regardless of what the caller asked for.
    self.ffn = FeedForward(d_model, dff, dropout_rate)

  def call(self, x, context):
    """Run the block on decoder states `x`, attending to `context`."""
    x = self.causal_self_attention(x = x)
    x = self.cross_attention(x = x, context = context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

In [None]:
# Decoder
class Decoder(tf.keras.layers.Layer):
  """Positional embedding, dropout, and a stack of `num_layers` DecoderLayer blocks."""
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate = 0.1):
    super(Decoder, self).__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.pos_embedding = PositionalEmbedding(vocab_size = vocab_size, d_model = d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
      DecoderLayer(d_model = d_model, num_heads = num_heads, dff = dff, dropout_rate = dropout_rate)
      for _ in range(num_layers)
    ]
    # Filled in by `call` with the cross-attention scores of the final decoder block.
    self.last_attn_scores = None

  def call(self, x, context):
    """Decode token ids `x` (batch, target_seq_len) against `context`.

    Returns a tensor of shape (batch_size, target_seq_len, d_model).
    """
    # Token ids -> embeddings with positional information, then dropout.
    hidden = self.dropout(self.pos_embedding(x))

    # Every block attends to the same context.
    for layer_idx in range(self.num_layers):
      hidden = self.dec_layers[layer_idx](hidden, context)

    # Expose the last block's cross-attention scores on the decoder itself.
    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    return hidden

#### Model

In [4]:
# Name of the Model - EfficientDecoder
class EfficientDecoder(tf.keras.Model):
  """Decoder-only captioning model: projects image encodings and decodes token logits.

  Only the Decoder half of the Transformer is used; EfficientNetB4 acts as the image
  encoder outside this model. For training, the image encodings are extracted ahead of
  time and stored as numpy arrays to save computation. At inference the pipeline is
  Inputs -> EfficientNetB4 -> EfficientDecoder -> Output.
  """
  def __init__(self, *, num_layers, d_model, num_heads, dff, target_vocab_size, dropout_rate = 0.1):
    super().__init__()

    # Projects the image encodings to width d_model (with ReLU).
    self.dense = tf.keras.layers.Dense(d_model, activation = 'relu')

    self.decoder = Decoder(
      num_layers = num_layers,
      d_model = d_model,
      num_heads = num_heads,
      dff = dff,
      vocab_size = target_vocab_size,
      dropout_rate = dropout_rate,
    )

    # Maps decoder states to one logit per target-vocabulary entry.
    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    """Compute logits of shape (bs, seq_len, target_vocab_size).

    `inputs` is a pair `(context, x)` because Keras `.fit` requires all inputs in the
    first argument: `context` is the image encodings (bs, seq_len, img_enc) and `x` is
    the decoder input sequence (bs, seq_len).
    """
    image_features, token_ids = inputs
    projected = self.dense(image_features)          # (bs, seq_len, d_model)
    decoded = self.decoder(token_ids, projected)    # (bs, seq_len, d_model)
    logits = self.final_layer(decoded)              # (bs, seq_len, target_vocab_size)

    try:
      # Drop the keras mask, so it doesn't scale the losses/metrics.
      # b/250038731
      del logits._keras_mask
    except AttributeError:
      pass

    # Only the logits are returned.
    return logits

#### Unit Test

In [None]:
# Model Parameters (defined first because the dummy caption depends on the vocab size)
num_layers = 3
d_model = 512
num_heads = 4
dff = 2048
target_vocab_size = 1000
seq_len = 50

# Dummy Caption - numerically encoded token IDs from the target vocab.
# Embedding indices must be integers in [0, target_vocab_size); the previous np.random.randn
# produced floats (including negatives), which are not valid token IDs.
# IDs start at 1 because the embedding is built with mask_zero = True (0 is the masked id).
rng = np.random.default_rng(0)  # seeded so the check is reproducible
x = rng.integers(1, target_vocab_size, size = (1, seq_len)) # (bs, seq_len)

# Load Image (Context); the context manager closes the .npz file once the array is read
file_path = '/content/MaxPooledFeatures/36979.npz'
key = 'arr_0'
with np.load(file_path) as image_encodings:
  context = image_encodings[key]

# Assert Shape
assert context.shape == (1792, )

# Reshape for Model: repeat the single image vector along the sequence axis
context = np.tile(np.expand_dims(context, axis = 0), (1, seq_len, 1)) # (bs, seq_len, img_enc)

# Define Model
model = EfficientDecoder(num_layers = num_layers, d_model = d_model, num_heads = num_heads, target_vocab_size = target_vocab_size, dff = dff)
op = model((context, x))

# One logit vector per target position
assert tuple(op.shape) == (1, seq_len, target_vocab_size)
print(op.shape)

(1, 50, 1000)


#### Check Git (verify the repository copy of the model matches the notebook)

In [None]:
# Clone the project repository into the current working directory (creates ./dlnlpfinalproject).
!git clone https://github.com/jainamshah17/dlnlpfinalproject.git

Cloning into 'dlnlpfinalproject'...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 5 (delta 0), reused 5 (delta 0), pack-reused 0[K
Receiving objects: 100% (5/5), done.


In [None]:
# Import
import sys
sys.path.append('/content/dlnlpfinalproject/')

from model import EfficientDecoder as EDGit

In [None]:
# Run the repository's own unit-test script in a separate Python process.
!python /content/dlnlpfinalproject/unit_test.py

2023-11-23 21:37:10.687597: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-23 21:37:10.687658: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-23 21:37:10.687702: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-23 21:37:15.274895: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
(1, 50, 1000)


In [None]:
# Define Model from Git
# Builds the repository's EfficientDecoder (imported as EDGit) with the same hyper-parameters
# and feeds it the same (context, x) inputs as the notebook model.
# Note: this rebinds `model` and `op`, replacing the notebook-defined model from the earlier cell.
model = EDGit(num_layers = num_layers, d_model = d_model, num_heads = num_heads, target_vocab_size = target_vocab_size, dff = dff)
op = model((context, x))
print(op.shape)

(1, 50, 1000)
