In [40]:
import pathlib

import evaluate
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from huggingface_hub import notebook_login
from transformers import AdamWeightDecay
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import TFAutoModelForSeq2SeqLM
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback

In [41]:
# Only let TensorFlow's Python logger emit records at ERROR level or above.
tf.get_logger().setLevel('ERROR')
# Kernel working directory as a pathlib.Path (not referenced by the cells shown here).
pwd = pathlib.Path.cwd()

In [None]:
# Log in to the Hugging Face Hub from inside the notebook. A later cell builds
# a PushToHubCallback, which presumably relies on these credentials.
notebook_login()

In [3]:
# Fetch the "dot-ammar/AR-dotless-small" dataset by its Hub identifier.
# Later cells read its "train" split and its "dotless" / "clean" columns.
dataset = load_dataset("dot-ammar/AR-dotless-small")

In [42]:
# Pretrained checkpoint name shared by the tokenizer and the model cells.
# NOTE(review): the stock T5 vocabulary may not cover Arabic script (inputs
# could collapse to unknown tokens) - verify by tokenizing a sample row, and
# consider a multilingual checkpoint if so.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [43]:
# Dataset column whose text is fed to the model as input.
source_col = "dotless"
# Dataset column whose text is used as the training target.
target_col = "clean"
# Task prefix that preprocess_function prepends to every input string.
prefix = "translate between dotless and dotted Arabic text: "


def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        examples: Batch mapping column names to lists of values; the
            ``source_col`` and ``target_col`` columns are read.

    Returns:
        The tokenizer output for the prefixed source texts, with the target
        texts passed as ``text_target``. Both sides are truncated to at most
        128 tokens.
    """
    prefixed_sources = [prefix + text for text in examples[source_col]]
    target_texts = list(examples[target_col])
    return tokenizer(
        prefixed_sources,
        text_target=target_texts,
        max_length=128,
        truncation=True,
    )

In [44]:
# Run preprocess_function over the dataset in batches (batched=True), so the
# function receives lists of column values rather than single rows.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [9]:
# Collator that batches tokenized examples and returns TensorFlow tensors.
# NOTE(review): `model` receives the checkpoint *name* (a str), not a model
# instance - the model object is only created in a later cell. Confirm this is
# intended; any collator feature that needs the real model would be skipped.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

In [15]:
# Load the "sacrebleu" metric; compute_metrics reads its "score" field as BLEU.
metric = evaluate.load("sacrebleu")

In [16]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape texts for the metric.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple where ``preds`` is a list of stripped
        strings and ``labels`` is a list of single-element lists, each
        holding one stripped reference string.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    # Each prediction gets its own list of references (here: exactly one).
    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for a set of predictions.

    Args:
        eval_preds: A ``(preds, labels)`` pair of token-id arrays. ``preds``
            may also be a tuple, in which case its first element is used.
            Assumes both are rectangular integer arrays - TODO confirm
            against the caller.

    Returns:
        dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"``
        (mean count of non-pad tokens per prediction), rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a valid token id and cannot be decoded. Labels use it as a
    # fill value, and predictions may carry it too (e.g. when batches of
    # different lengths are padded together), so map it to the pad token on
    # both sides before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Length is measured in non-pad tokens; the -100 fill was already mapped
    # to the pad id above, so it is not counted as generated content.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [47]:
# AdamWeightDecay is provided by the `transformers` package and must be
# imported with the other dependencies; without that import this cell raises
# NameError on a fresh kernel.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [None]:
# Load the TensorFlow seq2seq model from the same checkpoint name as the tokenizer.
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [20]:
# Choose a held-out split for evaluation. Evaluating on the training split
# (as this cell previously did) reports metrics on data the model has already
# seen, so validation loss and BLEU would be misleadingly optimistic.
if "test" in tokenized_dataset:
    train_split = tokenized_dataset["train"]
    eval_split = tokenized_dataset["test"]
elif "validation" in tokenized_dataset:
    train_split = tokenized_dataset["train"]
    eval_split = tokenized_dataset["validation"]
else:
    # The dataset ships no held-out split: carve out 10% of "train" with a
    # fixed seed so the partition is reproducible across kernel restarts.
    holdout = tokenized_dataset["train"].train_test_split(test_size=0.1, seed=42)
    train_split = holdout["train"]
    eval_split = holdout["test"]

# Training batches are shuffled.
tf_train_set = model.prepare_tf_dataset(
    train_split,
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

# Evaluation batches keep their order (shuffle=False).
tf_test_set = model.prepare_tf_dataset(
    eval_split,
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [45]:
# Compile with the optimizer only; no `loss` argument is passed here, so
# presumably the model's own built-in loss is used - confirm.
model.compile(optimizer=optimizer)

In [26]:
# compute_metrics decodes predictions as token ids and reports a generation
# length, so the callback must produce generated sequences rather than raw
# forward-pass outputs: enable predict_with_generate.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_test_set,
    predict_with_generate=True,
)

In [34]:
# Callback that pushes the model to the Hub during training, using the local
# directory "dotless_0.01". The captured output shows a Hub repository of the
# same name being cloned into it. The tokenizer is passed along as well.
push_to_hub_callback = PushToHubCallback(
    output_dir="dotless_0.01",
    tokenizer=tokenizer,
)

Cloning https://huggingface.co/kaddu341/dotless_0.01 into local empty directory.


In [35]:
# Callbacks handed to model.fit: metric computation first, then the Hub push.
callbacks = [metric_callback, push_to_hub_callback]

In [None]:
# Train for 9 epochs on tf_train_set, validating on tf_test_set and running
# the callbacks defined above.
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=9, callbacks=callbacks)