# ML 101

In [None]:
!pip install tensorflow tf-keras

In [None]:
# https://github.com/huggingface/transformers/issues/29470
!pip install transformers==4.37.2

# The encoder and decoder layers

## Encoder layer

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dense

class TransformerEncoderLayer(tf.keras.layers.Layer):
  """One Transformer encoder block: self-attention, then a feed-forward network.

  Each sub-layer is applied as ``LayerNorm(x + Dropout(sublayer(x)))``.

  Args:
    d_model: Output width of the feed-forward network. It must equal the last
      dimension of the input so that the residual additions are valid. It is
      also handed to ``MultiHeadAttention`` as ``key_dim``, i.e. the
      projection size of *each* head.
    num_heads: Number of attention heads.
    dff: Hidden width of the feed-forward network.
    rate: Dropout rate applied to the output of each sub-layer.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    # Keyword arguments make it explicit which value is the head count and
    # which is the per-head size.
    self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
    self.ffn = tf.keras.Sequential([
        Dense(dff, activation='relu'),
        Dense(d_model)
    ])

    self.layernorm1 = LayerNormalization(epsilon=1e-6)
    self.layernorm2 = LayerNormalization(epsilon=1e-6)
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training=None, padding_mask=None):
    """Run the encoder block.

    Args:
      x: Input sequence tensor whose last dimension is ``d_model``.
      training: Whether dropout is active. ``None`` lets Keras decide.
      padding_mask: Optional attention mask forwarded to
        ``MultiHeadAttention`` as ``attention_mask``. Keras expects a mask
        broadcastable to ``(batch, query_len, key_len)`` in which 1/True marks
        positions that MAY be attended to. ``None`` means no masking.

    Returns:
      Tensor with the same shape as ``x``.
    """
    # Self-attention: query, value and key are all the layer input.
    attn_output = self.mha(x, x, x, attention_mask=padding_mask)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)

    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)

    return out2

## Decoder layer

In [4]:
class TransformerDecoderLayer(tf.keras.layers.Layer):
  """One Transformer decoder block.

  Three sub-layers, each applied as ``LayerNorm(x + Dropout(sublayer(x)))``:
  masked self-attention over the target, cross-attention from the target to
  the encoder output, and a feed-forward network.

  Args:
    d_model: Output width of the feed-forward network. It must equal the last
      dimension of the decoder input so that the residual additions are valid.
      It is also handed to ``MultiHeadAttention`` as ``key_dim``, i.e. the
      projection size of *each* head.
    num_heads: Number of attention heads.
    dff: Hidden width of the feed-forward network.
    rate: Dropout rate applied to the output of each sub-layer.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()
    self.mha1 = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
    self.mha2 = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)

    self.ffn = tf.keras.Sequential([
        Dense(dff, activation='relu'),
        Dense(d_model)
    ])

    self.layernorm1 = LayerNormalization(epsilon=1e-6)
    self.layernorm2 = LayerNormalization(epsilon=1e-6)
    self.layernorm3 = LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training=None, look_ahead_mask=None, padding_mask=None):
    """Run the decoder block.

    Args:
      x: Target-side input tensor whose last dimension is ``d_model``.
      enc_output: Encoder output attended to by the cross-attention sub-layer.
      training: Whether dropout is active. ``None`` lets Keras decide.
      look_ahead_mask: Optional attention mask for the self-attention
        sub-layer.
      padding_mask: Optional attention mask for the cross-attention sub-layer.

      Both masks are forwarded to ``MultiHeadAttention`` as ``attention_mask``.
      Keras expects 1/True at positions that MAY be attended to.
      NOTE(review): masks built with the opposite convention (1 = blocked)
      must be inverted by the caller — confirm against the mask helpers.

    Returns:
      A 3-tuple ``(output, self_attention_scores, cross_attention_scores)``.
      ``output`` has the same shape as ``x``.
    """
    # return_attention_scores=True is required: without it the layer returns a
    # single tensor and the two-value unpacking below would be wrong.
    attn1, attn_weights_block1 = self.mha1(
        x, x, x,
        attention_mask=look_ahead_mask,
        return_attention_scores=True)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)

    # Cross-attention: the decoder state is the QUERY; the encoder output
    # supplies value and key. Keras' positional order is (query, value, key),
    # so the result keeps the target sequence length and can be added to out1.
    attn2, attn_weights_block2 = self.mha2(
        out1,
        value=enc_output,
        key=enc_output,
        attention_mask=padding_mask,
        return_attention_scores=True)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)

    ffn_output = self.ffn(out2)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)

    return out3, attn_weights_block1, attn_weights_block2

# Full Transformer

In [5]:
class Transformer(tf.keras.Model):
  """Encoder-decoder Transformer built from the layer classes of this notebook.

  Source and target token ids are embedded, scaled by ``sqrt(d_model)``,
  summed with sinusoidal position encodings and passed through ``num_layers``
  ``TransformerEncoderLayer`` / ``TransformerDecoderLayer`` blocks. A final
  ``Dense`` layer projects the decoder output to target-vocabulary logits.

  Args:
    num_layers: Number of encoder blocks and number of decoder blocks.
    d_model: Embedding width and model width of every block.
    num_heads: Number of attention heads per block.
    dff: Hidden width of the feed-forward network in each block.
    input_vocab_size: Size of the source vocabulary.
    target_vocab_size: Size of the target vocabulary (width of the logits).
    pe_input: Number of source positions for which encodings are precomputed.
      Source sequences must not be longer than this.
    pe_target: Same as ``pe_input`` for the target side.
    rate: Dropout rate used after the embeddings and inside every block.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
    super().__init__()
    self.d_model = d_model

    self.input_embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.target_embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.input_pos_encoding = self._positional_encoding(pe_input, d_model)
    self.target_pos_encoding = self._positional_encoding(pe_target, d_model)
    self.input_dropout = tf.keras.layers.Dropout(rate)
    self.target_dropout = tf.keras.layers.Dropout(rate)

    # The layer classes take (d_model, num_heads, dff, rate); depth comes from
    # stacking num_layers of them.
    self.encoder_layers = [
        TransformerEncoderLayer(d_model, num_heads, dff, rate)
        for _ in range(num_layers)
    ]
    self.decoder_layers = [
        TransformerDecoderLayer(d_model, num_heads, dff, rate)
        for _ in range(num_layers)
    ]

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  @staticmethod
  def _positional_encoding(max_len, d_model):
    """Return sinusoidal position encodings of shape ``(1, max_len, d_model)``."""
    positions = tf.range(max_len, dtype=tf.float32)[:, tf.newaxis]
    dims = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]
    # Dimensions 2k and 2k+1 share the frequency 1 / 10000^(2k / d_model).
    angle_rates = tf.pow(10000.0, -(2.0 * tf.floor(dims / 2.0)) / float(d_model))
    angles = positions * angle_rates
    # sin on even feature indices, cos on odd ones.
    is_even = tf.cast(tf.equal(tf.range(d_model) % 2, 0), tf.float32)
    encoding = is_even * tf.sin(angles) + (1.0 - is_even) * tf.cos(angles)
    return encoding[tf.newaxis, ...]

  def call(self, inp, tar, training=None, enc_padding_mask=None, look_ahead_mask=None, dec_padding_mask=None):
    """Compute target-vocabulary logits for a batch.

    Args:
      inp: Integer source token ids, shape ``(batch, inp_len)``.
      tar: Integer target token ids, shape ``(batch, tar_len)``.
      training: Whether dropout is active. ``None`` lets Keras decide.
      enc_padding_mask: Accepted for interface compatibility.
        NOTE(review): not forwarded to the encoder layers, which are invoked
        here with ``(x, training)`` only — padded source positions are
        therefore not masked in encoder self-attention.
      look_ahead_mask: Passed to every decoder block's self-attention.
      dec_padding_mask: Passed to every decoder block's cross-attention.

    Returns:
      ``(final_output, attention_weights)``. ``final_output`` holds logits of
      shape ``(batch, tar_len, target_vocab_size)``. ``attention_weights`` is
      a dict with keys ``decoder_layer{i}_block1`` (self-attention) and
      ``decoder_layer{i}_block2`` (cross-attention) for ``i`` from 1.
    """
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))

    # Encoder stack.
    inp_len = tf.shape(inp)[1]
    x = self.input_embedding(inp) * scale
    x = x + self.input_pos_encoding[:, :inp_len, :]
    x = self.input_dropout(x, training=training)
    for layer in self.encoder_layers:
      x = layer(x, training=training)
    enc_output = x

    # Decoder stack.
    tar_len = tf.shape(tar)[1]
    y = self.target_embedding(tar) * scale
    y = y + self.target_pos_encoding[:, :tar_len, :]
    y = self.target_dropout(y, training=training)

    attention_weights = {}
    for i, layer in enumerate(self.decoder_layers, start=1):
      # Each decoder block returns (output, self-attn scores, cross-attn scores).
      y, block1, block2 = layer(
          y, enc_output,
          training=training,
          look_ahead_mask=look_ahead_mask,
          padding_mask=dec_padding_mask)
      attention_weights[f'decoder_layer{i}_block1'] = block1
      attention_weights[f'decoder_layer{i}_block2'] = block2

    final_output = self.final_layer(y)

    return final_output, attention_weights

# Training loop with Hugging Face

In [None]:
from transformers import BertTokenizer, TFBertModel

# Tokenizer and TensorFlow model are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

# Two toy sentences used as the whole data set further down.
sentences = [
    "I love this product!",
    "This is a bad product.",
]

# Tokenize the batch with padding and truncation enabled, requesting
# TensorFlow tensors back.
inputs = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors="tf",
)

# Fine-Tuning

In [7]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Symbolic inputs; the names match the keys produced by the tokenizer.
input_ids = Input(shape=(None,), dtype='int32', name="input_ids")
attention_mask = Input(shape=(None,), dtype='int32', name="attention_mask")

bert = model(input_ids, attention_mask=attention_mask)

# Classification head on the hidden state of the first token of each sequence.
x = bert.last_hidden_state[:, 0, :]
x = Dense(128, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)

fine_tuned_model = Model(inputs=[input_ids, attention_mask], outputs=[output])
fine_tuned_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

labels = [1,0]

# Keras' fit() cannot consume the tokenizer's BatchEncoding together with a
# plain Python list of labels; doing so raises
# "ValueError: Failed to find data adapter that can handle input".
# Pass the tensors explicitly, in the same order as the Model inputs, and turn
# the labels into a float tensor shaped like the model output (batch, 1).
train_features = [inputs["input_ids"], inputs["attention_mask"]]
train_labels = tf.constant(labels, dtype=tf.float32)[:, tf.newaxis]

fine_tuned_model.fit(train_features, train_labels, epochs=3, batch_size=32)