In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tokenizers import *
from datasets import *
from transformers import DataCollatorForSeq2Seq
import evaluate
from transformers import AdamWeightDecay
import numpy as np
from transformers import TFAutoModelForSeq2SeqLM
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback
from transformers import pipeline

In [None]:

# Name of the base seq2seq checkpoint; reused later when the model is loaded.
checkpoint = "google/t5-v1_1-small"

# The tokenizer is loaded under its own identifier, not from `checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(
    "dot-ammar/AR-dotless-tokenizer",
)


In [None]:
# Load the corpus by name. For a local copy, the alternative is
# load_from_disk("AR-dotless-small-arrow").
dataset = load_dataset(
    "dot-ammar/AR-dotless-small",
)

In [None]:
# Hold out 20% of the "train" split for evaluation. The fixed seed makes the
# split identical after a kernel restart, so metrics stay comparable across runs.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)


In [None]:
# Show the split dataset's repr as the cell output.
dataset

In [None]:
# Tiny hand-written batch (column name -> list of strings) used to smoke-test
# the preprocessing function.
example1 = dict(
    clean=['نهائي', 'لكوينز'],
    dotless=['نهاىى', 'لكوىنر'],
)


In [None]:
source_col = "dotless"
target_col = "clean"


def preprocess_function(examples):
    """Tokenize one batch of examples into model inputs with target labels.

    The `source_col` column supplies the input texts and the `target_col`
    column supplies the target texts. The tokenizer is called with
    max_length=128 and truncation enabled.

    Args:
        examples: Mapping from column name to a list of strings (one batch).

    Returns:
        The tokenizer's output for the batch.
    """
    source_texts = list(examples[source_col])
    target_texts = list(examples[target_col])
    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=128,
        truncation=True,
    )

In [None]:
# Smoke-test the preprocessing on the hand-written batch and inspect the result.
test = preprocess_function(example1)
print(test)

In [None]:
# Apply the preprocessing to the dataset; batched=True passes whole batches
# (lists per column) to preprocess_function instead of single rows.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
)

In [None]:
# Show the tokenized dataset's repr as the cell output.
tokenized_dataset

In [None]:
# Inspect the first tokenized training row.
print(tokenized_dataset['train'][0])

In [25]:
# Seq2seq collator that returns TensorFlow tensors.
# NOTE(review): `model` receives the checkpoint name (a str), not a model
# instance — confirm the collator does not need the actual model here.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=checkpoint,
    return_tensors="tf",
)
# Display the tokenizer held by the collator as the cell output.
data_collator.tokenizer


T5TokenizerFast(name_or_path='dot-ammar/AR-dotless-tokenizer', vocab_size=52000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<ex

In [None]:
# Load the "sacrebleu" metric; compute_metrics calls metric.compute on it.
metric = evaluate.load("sacrebleu")

In [None]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace and wrap each label in its own list.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        Tuple `(preds, labels)`: a list of stripped predictions, and a list of
        single-element lists each holding one stripped reference.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # Each reference is nested in a one-element list.
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute BLEU and mean prediction length for one evaluation pass.

    Args:
        eval_preds: Pair `(preds, labels)` of token-id arrays. If `preds` is a
            tuple, its first element is used.

    Returns:
        Dict with "bleu" (the "score" field of the sacrebleu result) and
        "gen_len" (mean count of non-pad tokens per prediction), both rounded
        to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore sentinel, not a real token id, so replace it with
    # the pad token before decoding. Predictions are sanitized the same way as
    # labels because batches of different lengths may be padded with -100 when
    # they are concatenated.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Counted after the -100 replacement so padding is never counted as output.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

## Fine-tuning with Keras/TensorFlow


In [None]:
# Optimizer handed to model.compile for fine-tuning.
optimizer = AdamWeightDecay(
    learning_rate=2e-5,
    weight_decay_rate=0.01,
)

In [39]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# The tokenizer is loaded separately from the checkpoint, so its vocabulary can
# be larger than the model's embedding table. Grow the embeddings when needed so
# that every token id the tokenizer can emit is a valid embedding index.
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at google/t5-v1_1-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [None]:
# Settings shared by the training and evaluation input pipelines.
_tf_dataset_kwargs = dict(batch_size=16, collate_fn=data_collator)

# Training data is shuffled; the test data is not.
tf_train_set = model.prepare_tf_dataset(
    tokenized_dataset["train"], shuffle=True, **_tf_dataset_kwargs
)
tf_test_set = model.prepare_tf_dataset(
    tokenized_dataset["test"], shuffle=False, **_tf_dataset_kwargs
)

In [None]:
# Only the optimizer is passed to compile; no loss argument is given here.
model.compile(optimizer=optimizer)

In [29]:
# compute_metrics decodes token ids, so the callback must feed it generated
# sequences (model.generate) rather than raw forward-pass outputs.
# NOTE(review): generation length falls back to the model's default generation
# settings — pass generate_kwargs if the targets can be long.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_test_set,
    predict_with_generate=True,
)



In [37]:
# Callback that pushes to the Hub; output_dir is also the local directory that
# holds the repository clone. The tokenizer is passed along with the model.
push_to_hub_callback = PushToHubCallback(output_dir="dot-ammar/dotless_model", tokenizer=tokenizer)

/Users/ammar/Developer/git-repos/dotless/Models/dot-ammar/dotless_model is already a clone of https://huggingface.co/dot-ammar/dotless_model. Make sure you pull the latest changes with `repo.git_pull()`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [38]:
# Callbacks passed to model.fit: evaluation metrics, then the Hub upload.
callbacks = [
    metric_callback,
    push_to_hub_callback,
]

In [None]:
# Fine-tune for three epochs, using the test split as validation data.
model.fit(
    x=tf_train_set,
    validation_data=tf_test_set,
    epochs=3,
    callbacks=callbacks,
)