# Setting

In [2]:
# !pip install -q tensorflow-models
!pip install tf-models-official
!pip install transformers[sentencepiece]
!pip install datasets evaluate transformers[sentencepiece]

Collecting tf-models-official
  Downloading tf_models_official-2.13.1-py2.py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting immutabledict (from tf-models-official)
  Downloading immutabledict-3.0.0-py3-none-any.whl (4.0 kB)
Collecting pyyaml<5.4.0,>=5.1 (from tf-models-official)
  Downloading PyYAML-5.3.1.tar.gz (269 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.4/269.4 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu (from tf-models-official)
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from tf-models-official)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [3]:
import os

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_models as tfm
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as tf_text
from datasets import load_dataset
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer,AutoModel,TFGPT2LMHeadModel,AutoConfig,DataCollatorForLanguageModeling
from transformers import create_optimizer,pipeline,TFAutoModel

# Process Data

In [5]:
dataset = load_dataset("csv",data_files="Pushkin.csv",encoding='latin-1')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [8]:
x = dataset['train']['Content'][27]

In [9]:
tokenizer.pad_token = 0
total_word=len(tokenizer)
seq_len=128

In [10]:
xs = []
for element in dataset['train']['Content']:
    token = tokenizer(
        element,
        truncation=True,
        max_length=seq_len,
        return_overflowing_tokens=True,
        return_length=True)
    for length, input_ids in zip(token['length'], token['input_ids']):
        if length == seq_len:
            xs.append(input_ids)
        else:
            input_ids = [0] * (seq_len - length) + input_ids[:length]  # Add padding at the beginning
            xs.append(input_ids)

# Data Stream

In [11]:
split_point = int(len(xs) * 0.7)

# Split the array into two portions
train_data = xs[:split_point]
val_data = xs[split_point:]

In [12]:
# Define the generator function to read data from the CSV file
def train_data_generator():
   for i in train_data:
            yield tf.convert_to_tensor(i[:-1]),tf.convert_to_tensor(i[1:])

def val_data_generator():
   for i in val_data:
            yield tf.convert_to_tensor(i[:-1]),tf.convert_to_tensor(i[1:])


train_data1 = tf.data.Dataset.from_generator(
    train_data_generator,
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.int64),
        tf.TensorSpec(shape=(None,), dtype=tf.int64)
    )
)
val_data1 = tf.data.Dataset.from_generator(
    val_data_generator,
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.int64),
        tf.TensorSpec(shape=(None,), dtype=tf.int64)
    )
)


In [13]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64
def make_batches(ds):
  return (
      ds
      .shuffle(BUFFER_SIZE)
      .batch(BATCH_SIZE)
      .prefetch(buffer_size=tf.data.AUTOTUNE)
      )

In [14]:
# Create training and validation set batches.
train_batches = make_batches(train_data1)
val_batches = make_batches(val_data1)

In [15]:
for i in train_batches:
  print(i)
  break

(<tf.Tensor: shape=(64, 127), dtype=int64, numpy=
array([[ 1813,    11,   198, ...,  1290,    11,   198],
       [ 2215,   262,  7812, ...,  3152,   645,   271],
       [  198,    51,   359, ...,   198,  1722, 33424],
       ...,
       [  198,  2202,   465, ...,   314, 11747,   451],
       [    0,     0,     0, ...,   258,  8228,  2347],
       [ 1299, 26625, 22655, ...,  7666, 13468,   284]])>, <tf.Tensor: shape=(64, 127), dtype=int64, numpy=
array([[   11,   198,  1870, ...,    11,   198,  1870],
       [  262,  7812,  1110, ...,   645,   271,  5321],
       [   51,   359, 30092, ...,  1722, 33424, 20218],
       ...,
       [ 2202,   465,  5814, ..., 11747,   451, 43856],
       [    0,     0,     0, ...,  8228,  2347,    13],
       [26625, 22655,   198, ..., 13468,   284, 16352]])>)


In [16]:
for pt, en in train_batches.take(1):
  break

print(pt)
print(en.shape)

tf.Tensor(
[[  198  6653   691 ... 14682    11   198]
 [ 8496  5465   290 ...   287   262  5405]
 [  523     0   887 ... 10980   351 10195]
 ...
 [18273   339 23180 ...  1182   198  8421]
 [  403   794 16860 ...  1666 15360    13]
 [    0     0     0 ...  8169   560 21213]], shape=(64, 127), dtype=int64)
(64, 127)


# Architect

In [17]:
model = TFAutoModel.from_pretrained('gpt2')

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFGPT2Model.

All the weights of TFGPT2Model were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


In [18]:
embedding_layer = model.get_input_embeddings()
embedding_weights = embedding_layer.weights[0].numpy()

In [19]:
type(embedding_weights)

numpy.ndarray

In [20]:
def positional_encoding(length, depth):
    depth = depth / 2

    positions = np.arange(length)[:, np.newaxis]  # (seq, 1)
    depths = np.arange(depth)[np.newaxis, :] / depth  # (1, depth)

    angle_rates = 1 / (10000 ** depths)  # (1, depth)
    angle_rads = positions * angle_rates  # (pos, depth)

    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis=-1)

    return tf.cast(pos_encoding, dtype=tf.float32)

In [21]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size,d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(embedding_weights.shape[0],
                                        embedding_weights.shape[1],
                                        weights=[embedding_weights],
                                        trainable=False,
                                        mask_zero=True)
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)
        self.dense = tf.keras.layers.Dense(d_model, activation='relu')

    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)
        x = self.dense(x)
        # This factor sets the relative scale of the embedding and positonal_encoding.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

In [22]:
embed = PositionalEmbedding(vocab_size=10, d_model=512)

print(pt)
en_emb = embed(pt)

tf.Tensor(
[[  198  6653   691 ... 14682    11   198]
 [ 8496  5465   290 ...   287   262  5405]
 [  523     0   887 ... 10980   351 10195]
 ...
 [18273   339 23180 ...  1182   198  8421]
 [  403   794 16860 ...  1666 15360    13]
 [    0     0     0 ...  8169   560 21213]], shape=(64, 127), dtype=int64)


In [23]:
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

In [24]:
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [25]:
sample_csa = CausalSelfAttention(num_heads=2, key_dim=512)

print(sample_csa(en_emb).shape)

(64, 127, 512)


In [26]:
class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
            tf.keras.layers.Dropout(dropout_rate)
        ])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x)
        return x

In [27]:
sample_ffn = FeedForward(512, 2048)

print(en_emb.shape)
print(sample_ffn(en_emb).shape)

(64, 127, 512)
(64, 127, 512)


In [28]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.causal_self_attention(x=x)

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

In [29]:
sample_decoder_layer = DecoderLayer(d_model=512, num_heads=8, dff=2048)

sample_decoder_layer_output = sample_decoder_layer(
    x=en_emb)

print(en_emb.shape)
print(sample_decoder_layer_output.shape)  # `(batch_size, seq_len, d_model)`

(64, 127, 512)
(64, 127, 512)


In [30]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,d_model=d_model)

    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]


  def call(self, x):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x)

    # The shape of x is (batch_size, target_seq_len, d_model).
    return x

In [31]:
# Instantiate the decoder.
sample_decoder = Decoder(num_layers=4,
                         d_model=512,
                         num_heads=8,
                         dff=2048,
                         vocab_size=8000)

cnt=0
output = sample_decoder(x=pt)
cnt=cnt+1
  # Print the shapes.
print(cnt)

1


In [32]:
print(pt.shape)
print(sample_decoder_layer_output.shape)

(64, 127)
(64, 127, 512)


In [33]:
class Transformer(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               vocab_size, dropout_rate=0.1):
    super().__init__()

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs):
    # To use a Keras model with `.fit` you must pass all your inputs in the
    # first argument.
    x = self.decoder(inputs)  # (batch_size, target_len, d_model)

    # Final linear layer output.
    logits = self.final_layer(x)  # (batch_size, target_len, vocab_size)

    # try:
    #   # Drop the keras mask, so it doesn't scale the losses/metrics.
    #   # b/250038731
    #   del logits._keras_mask
    # except AttributeError:
    #   pass

    # Return the final output and the attention weights.
    return logits

# Training

In [34]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

In [35]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    vocab_size=total_word,
    dropout_rate=dropout_rate)

In [36]:
output = transformer(pt)

print(en.shape)
print(pt.shape)
print(output.shape)

(64, 127)
(64, 127)
(64, 127, 50257)


In [37]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)
    self.initial_learning_rate = 0.001
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
        return {"initial_learning_rate": self.initial_learning_rate}

In [38]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [39]:
def masked_loss(label, pred):
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.constant(1, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [40]:
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [49]:
history=transformer.fit(train_batches,
                epochs=350,validation_data=val_batches,verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50

KeyboardInterrupt: ignored

In [42]:
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.show()

plot_graphs(history, "masked_accuracy")
plot_graphs(history, "loss")

# Pridict

In [43]:
class Translator(tf.Module):
  def __init__(self, tokenizer, transformer):
    self.tokenizer = tokenizer
    self.transformer = transformer

  def __call__(self, sentences, max_length=seq_len,num_gen=20):

    sentences=sentences.numpy()
    if type(sentences)==np.ndarray:
      sentences=[s.decode() for s in sentences]
    else:
      sentences=sentences.decode()

    if type(sentences)==str:
      sentences=[sentences]

    tokens = [self.tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences]

    for i in range(num_gen):
      inputs=pad_sequences(tokens, maxlen=max_length, padding='pre',truncating='pre')

      predictions = self.transformer(tf.convert_to_tensor(inputs), training=False)

      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)

      for j in range(len(sentences)):
        tokens[j] = tokens[j]+[predicted_id.numpy()[j][0]]

    return [self.tokenizer.decode(token, add_special_tokens=False) for token in tokens]

In [44]:
class Beam_Search(tf.Module):
  def __init__(self, tokenizer, transformer):
    self.tokenizer = tokenizer
    self.transformer = transformer

  def __call__(self, sentences, max_length=seq_len,num_gen=20):

    sentences=sentences.numpy()
    if type(sentences)==np.ndarray:
      sentences=[s.decode() for s in sentences]
    else:
      sentences=sentences.decode()

    if type(sentences)==str:
      sentences=[sentences]

    band_width=3
    tokens = [self.tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences]

    cur_seq=[]
    cur_prob=[]
    cur_len = []
    batch_size=len(sentences)
    for j in range(batch_size):
        cur_seq=cur_seq+[tokens[j]]*band_width

        cur_prob=cur_prob+[0.]*band_width

        cur_len=cur_len+[0]*band_width

    for i in range(num_gen):
      inputs=pad_sequences(cur_seq, maxlen=max_length, padding='pre',truncating='pre')

      predictions = self.transformer(tf.convert_to_tensor(inputs), training=False)

      predictions = predictions[:, -1:, :]  # Shape `(batch_size*band_width, 1, vocab_size)`.

      # predicted_id = tf.argmax(predictions, axis=-1)

      _,predicted_id = tf.math.top_k(predictions, k=band_width)

      for j in range(0,batch_size*band_width,band_width):
        candiate_list=[]
        for k in range(j,j+band_width):
          for t in range(band_width):
            idx=predicted_id.numpy()[k][0][t]
            next_list = cur_seq[k]+[idx]
            next_prob = cur_prob[k]+np.log(predictions.numpy()[k][0][idx])
            next_len = cur_len[k]+1
            mean_pob= next_prob/next_len

            candiate_list=candiate_list+[(next_list,next_prob,next_len,mean_pob)]

        sorted_list = sorted(candiate_list, key=lambda x: x[-1], reverse=True)
        for k in range(j,j+band_width):
          cur_seq[k]=sorted_list[k-j][0]
          cur_prob[k]=sorted_list[k-j][1]
          cur_len[k]=sorted_list[k-j][2]


    gen_text=[]

    for i in range(0,batch_size*band_width,band_width):
        tmp = cur_prob[i:i+band_width]
        idx = tmp.index(max(tmp))
        gen_text=gen_text+[cur_seq[i+idx]]
    # return tokenizer.sequences_to_texts(cur_seq)
    return [self.tokenizer.decode(gen_text_element) for gen_text_element in gen_text]

In [50]:
translator = Beam_Search(tokenizer, transformer)
# translator = Translator(tokenizer, transformer)
sentence = ['I love you','the blue flower']

translator(tf.constant(sentence))

['I love you.\nI know you you my tears. my tears.\nI know you you you you you',
 "the blue flower.\nAnd I'm yours.\nAnd my love. I remember my tears.\nAnd I"]

# Export

In [46]:
class ExportTranslator(tf.Module):
  def __init__(self, translator):
    self.translator = translator

  @tf.function(input_signature=[tf.TensorSpec(shape=[], dtype=tf.string)])
  def __call__(self, sentence):
    (result)= self.translator(sentence, max_length=seq_len, num_gen=20)

    return result

In [47]:
translator = ExportTranslator(translator)

In [48]:
translator('i love you')

AttributeError: ignored

In [None]:
tf.saved_model.save(translator, export_dir='translator')

In [None]:
reloaded = tf.saved_model.load('translator')

In [None]:
# Inspect the loaded object
print(dir(reloaded))

In [None]:
reloaded.translator('i love you').numpy()

In [None]:
transformer.save('/content')