# Setting

In [None]:
# !pip install -q tensorflow-models
!pip install tf-models-official
!pip install transformers[sentencepiece]
!pip install datasets evaluate transformers[sentencepiece]

[31mERROR: Could not find a version that satisfies the requirement tensorflow-models (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow-models[0m[31m


In [None]:
import os

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_models as tfm
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as tf_text
from datasets import load_dataset
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Process Data

In [None]:
dataset = load_dataset("csv",data_files="Pushkin.csv",encoding='latin-1')

tokenizer = keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+,/:;<=>?@[\\]^_`{|}~\t',split=' ')
for text in dataset['train']['Content']:
  tokenizer.fit_on_texts([text.strip()])

In [None]:
x = dataset['train']['Content'][27]

In [None]:
total_word=len(tokenizer.word_index) + 1
seq_len=128

input_sequences = []
for line in  dataset['train']['Content']:
	token_list = tokenizer.texts_to_sequences([line])[0]
	length=len(token_list)
	num_seq=int(np.ceil(length/seq_len))
	if num_seq!=1:
		for i in range(num_seq-1):
			n_gram_sequence = token_list[seq_len*i:seq_len*(i+1)]
			input_sequences.append(n_gram_sequence)
	if num_seq*seq_len<length:
		n_gram_sequence = token_list[seq_len*(num_seq-1):]
		input_sequences.append(n_gram_sequence)


# pad sequences
xs = np.array(pad_sequences(input_sequences, maxlen=seq_len, padding='pre'))

In [None]:
split_point = int(len(xs) * 0.7)

# Split the array into two portions
train_data = xs[:split_point]
val_data = xs[split_point:]

In [None]:
# Define the generator function to read data from the CSV file
def train_data_generator():
   for i in train_data:
            yield tf.convert_to_tensor(i[:-1]),tf.convert_to_tensor(i[1:])

def val_data_generator():
   for i in val_data:
            yield tf.convert_to_tensor(i[:-1]),tf.convert_to_tensor(i[1:])


train_data1 = tf.data.Dataset.from_generator(
    train_data_generator,
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.int64),
        tf.TensorSpec(shape=(None,), dtype=tf.int64)
    )
)
val_data1 = tf.data.Dataset.from_generator(
    val_data_generator,
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.int64),
        tf.TensorSpec(shape=(None,), dtype=tf.int64)
    )
)


In [None]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64
def make_batches(ds):
  return (
      ds
      .shuffle(BUFFER_SIZE)
      .batch(BATCH_SIZE)
      .prefetch(buffer_size=tf.data.AUTOTUNE)
      )

In [None]:
# Create training and validation set batches.
train_batches = make_batches(train_data1)
val_batches = make_batches(val_data1)

In [None]:
for i in train_batches:
  print(i)
  break

(<tf.Tensor: shape=(48, 127), dtype=int64, numpy=
array([[ 407, 3111,    1, ...,    8,  127,   20],
       [  25,    3,  497, ...,  656, 2460,   10],
       [  10,   61,   49, ..., 2425, 2426,  100],
       ...,
       [1907, 1908,   28, ...,   22,    1,  307],
       [2842,    1,  266, ...,   16,    4,   41],
       [2574,   21,  417, ..., 2613,   21,   37]])>, <tf.Tensor: shape=(48, 127), dtype=int64, numpy=
array([[3111,    1,  703, ...,  127,   20,   37],
       [   3,  497, 1096, ..., 2460,   10, 2461],
       [  61,   49,   27, ..., 2426,  100, 2427],
       ...,
       [1908,   28,    4, ...,    1,  307,  618],
       [   1,  266, 1151, ...,    4,   41, 1203],
       [  21,  417,  204, ...,   21,   37, 2614]])>)


In [None]:
for pt, en in train_batches.take(1):
  break

print(pt)
print(en.shape)

tf.Tensor(
[[ 618 1642   15 ...    1 1676 1677]
 [1948 1949  311 ...  264 1979  395]
 [ 183  717  145 ...   31 2488    3]
 ...
 [ 106    4  987 ...   14  395 2010]
 [ 856   12   57 ...   78 1538    4]
 [1255   20 3213 ...  367   19 3238]], shape=(48, 127), dtype=int64)
(48, 127)


# Architect

In [None]:
def positional_encoding(length, depth):
    depth = depth / 2

    positions = np.arange(length)[:, np.newaxis]  # (seq, 1)
    depths = np.arange(depth)[np.newaxis, :] / depth  # (1, depth)

    angle_rates = 1 / (10000 ** depths)  # (1, depth)
    angle_rads = positions * angle_rates  # (pos, depth)

    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis=-1)

    return tf.cast(pos_encoding, dtype=tf.float32)

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model,mask_zero=True)
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)
        # This factor sets the relative scale of the embedding and positonal_encoding.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

In [None]:
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

In [None]:
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [None]:
embed = PositionalEmbedding(vocab_size=total_word, d_model=512)

print(pt)
en_emb = embed(pt)

tf.Tensor(
[[ 618 1642   15 ...    1 1676 1677]
 [1948 1949  311 ...  264 1979  395]
 [ 183  717  145 ...   31 2488    3]
 ...
 [ 106    4  987 ...   14  395 2010]
 [ 856   12   57 ...   78 1538    4]
 [1255   20 3213 ...  367   19 3238]], shape=(48, 127), dtype=int64)


In [None]:
sample_csa = CausalSelfAttention(num_heads=2, key_dim=512)

print(en_emb.shape)
print(sample_csa(en_emb).shape)

(48, 127, 512)
(48, 127, 512)


In [None]:
class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
            tf.keras.layers.Dropout(dropout_rate)
        ])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x)
        return x

In [None]:
sample_ffn = FeedForward(512, 2048)

print(en_emb.shape)
print(sample_ffn(en_emb).shape)

(48, 127, 512)
(48, 127, 512)


In [None]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.causal_self_attention(x=x)

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

In [None]:
sample_decoder_layer = DecoderLayer(d_model=512, num_heads=8, dff=2048)

sample_decoder_layer_output = sample_decoder_layer(
    x=en_emb)

print(en_emb.shape)
print(sample_decoder_layer_output.shape)  # `(batch_size, seq_len, d_model)`

(48, 127, 512)
(48, 127, 512)


In [None]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,d_model=d_model)

    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]


  def call(self, x):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x)

    # The shape of x is (batch_size, target_seq_len, d_model).
    return x

In [None]:
# Instantiate the decoder.
sample_decoder = Decoder(num_layers=4,
                         d_model=512,
                         num_heads=8,
                         dff=2048,
                         vocab_size=8000)

cnt=0
output = sample_decoder(x=pt)
cnt=cnt+1
  # Print the shapes.
print(cnt)

1


In [None]:
print(pt.shape)
print(sample_decoder_layer_output.shape)

(48, 127)
(48, 127, 512)


In [None]:
class Transformer(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               vocab_size, dropout_rate=0.1):
    super().__init__()

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs):
    # To use a Keras model with `.fit` you must pass all your inputs in the
    # first argument.
    x = self.decoder(inputs)  # (batch_size, target_len, d_model)

    # Final linear layer output.
    logits = self.final_layer(x)  # (batch_size, target_len, vocab_size)

    # try:
    #   # Drop the keras mask, so it doesn't scale the losses/metrics.
    #   # b/250038731
    #   del logits._keras_mask
    # except AttributeError:
    #   pass

    # Return the final output and the attention weights.
    return logits

# Training

In [None]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

In [None]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    vocab_size=total_word,
    dropout_rate=dropout_rate)

In [None]:
output = transformer(pt)

print(en.shape)
print(pt.shape)
print(output.shape)

(48, 127)
(48, 127)
(48, 127, 4751)


In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)
    self.initial_learning_rate = 0.01
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
        return {"initial_learning_rate": self.initial_learning_rate}

In [None]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [None]:
def masked_loss(label, pred):
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.constant(1, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [None]:
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [None]:
history=transformer.fit(train_batches,
                epochs=10,validation_data=val_batches,verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.show()

plot_graphs(history, "masked_accuracy")
plot_graphs(history, "loss")

# Pridict

In [None]:
class Translator(tf.Module):
  def __init__(self, tokenizer, transformer):
    self.tokenizer = tokenizer
    self.transformer = transformer

  def __call__(self, sentences, max_length=seq_len,num_gen=20):

    sentences=sentences.numpy()
    if type(sentences)==np.ndarray:
      sentences=[s.decode() for s in sentences]
    else:
      sentences=sentences.decode()

    if type(sentences)==str:
      sentences=[sentences]

    tokens = self.tokenizer.texts_to_sequences(sentences)
    for i in range(num_gen):
      inputs=pad_sequences(tokens, maxlen=max_length, padding='pre',truncating='pre')

      predictions = self.transformer(tf.convert_to_tensor(inputs), training=False)

      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)

      for j in range(len(sentences)):
        tokens[j] = tokens[j]+[predicted_id.numpy()[j][0]]

    return self.tokenizer.sequences_to_texts(tokens)

In [None]:
class Beam_Search(tf.Module):
  def __init__(self, tokenizer, transformer):
    self.tokenizer = tokenizer
    self.transformer = transformer

  def __call__(self, sentences, max_length=seq_len,num_gen=20):

    sentences=sentences.numpy()
    if type(sentences)==np.ndarray:
      sentences=[s.decode() for s in sentences]
    else:
      sentences=sentences.decode()

    if type(sentences)==str:
      sentences=[sentences]

    band_width=3
    tokens = self.tokenizer.texts_to_sequences(sentences)
    cur_seq=[]
    cur_prob=[]
    cur_len = []
    batch_size=len(sentences)
    for j in range(batch_size):
        cur_seq=cur_seq+[tokens[j]]*band_width

        cur_prob=cur_prob+[0.]*band_width

        cur_len=cur_len+[0]*band_width

    for i in range(num_gen):
      inputs=pad_sequences(cur_seq, maxlen=max_length, padding='pre',truncating='pre')

      predictions = self.transformer(tf.convert_to_tensor(inputs), training=False)

      predictions = predictions[:, -1:, :]  # Shape `(batch_size*band_width, 1, vocab_size)`.

      # predicted_id = tf.argmax(predictions, axis=-1)

      _,predicted_id = tf.math.top_k(predictions, k=band_width)

      for j in range(0,batch_size*band_width,band_width):
        candiate_list=[]
        for k in range(j,j+band_width):
          for t in range(band_width):
            idx=predicted_id.numpy()[k][0][t]
            next_list = cur_seq[k]+[idx]
            next_prob = cur_prob[k]+np.log(predictions.numpy()[k][0][idx])
            next_len = cur_len[k]+1
            mean_pob= next_prob/next_len

            candiate_list=candiate_list+[(next_list,next_prob,next_len,mean_pob)]

        sorted_list = sorted(candiate_list, key=lambda x: x[-1], reverse=True)
        for k in range(j,j+band_width):
          cur_seq[k]=sorted_list[k-j][0]
          cur_prob[k]=sorted_list[k-j][1]
          cur_len[k]=sorted_list[k-j][2]


    gen_text=[]

    for i in range(0,batch_size*band_width,band_width):
        tmp = cur_prob[i:i+band_width]
        idx = tmp.index(max(tmp))
        print(cur_len[i+idx])
        gen_text=gen_text+[cur_seq[i+idx]]
    # return tokenizer.sequences_to_texts(cur_seq)
    return self.tokenizer.sequences_to_texts(gen_text)

In [None]:
translator = Beam_Search(tokenizer, transformer)
# translator = Translator(tokenizer, transformer)
sentence = ['I love you','i like you']

translator(tf.constant(sentence))

20
20


["i love you come lying\nthe eunuch might efface\nremembrance of amber chose there master the earth's the earth's the earth's the earth's a fountain's",
 "i like you come splashed free.\nshe played and splashed free.\nshe played and splashed her rays of senses bereft at a fountain's a fountain's"]

# Export

In [None]:
class ExportTranslator(tf.Module):
  def __init__(self, translator):
    self.translator = translator

  @tf.function(input_signature=[tf.TensorSpec(shape=[], dtype=tf.string)])
  def __call__(self, sentence):
    (result)= self.translator(sentence, max_length=seq_len, num_gen=20)

    return result

In [None]:
translator = ExportTranslator(translator)

In [None]:
translator('i love you')

AttributeError: ignored

In [None]:
tf.saved_model.save(translator, export_dir='translator')

In [None]:
reloaded = tf.saved_model.load('translator')

In [None]:
# Inspect the loaded object
print(dir(reloaded))

In [None]:
reloaded.translator('i love you').numpy()

In [None]:
transformer.save('/content')