In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tokenizers import *
from datasets import *
from transformers import DataCollatorForSeq2Seq
import evaluate
from transformers import AdamWeightDecay
import numpy as np
from transformers import TFAutoModelForSeq2SeqLM
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback
from transformers import pipeline
from huggingface_hub import notebook_login

In [None]:

# Tokenizer trained for the dotless-Arabic task, loaded from the Hub.
# NOTE(review): the tokenizer and the model checkpoint below come from different
# repositories — confirm their vocabulary sizes are compatible before training.
tokenizer = AutoTokenizer.from_pretrained("dot-ammar/AR-dotless-tokenizer")

# Base seq2seq checkpoint that is fine-tuned later in the notebook.
checkpoint = "google/t5-v1_1-small"


In [None]:
# Display the tokenizer's vocabulary size as the cell output.
tokenizer.vocab_size

In [14]:
# Load the parallel dataset; later cells read its "dotless" and "clean" columns.
dataset = load_dataset("dot-ammar/AR-dotless-mediumPlus")

In [None]:
# Split the original "train" split 90/10 with a fixed seed so the split is reproducible.
# NOTE(review): this rebinds `dataset`, so re-running the cell splits the already-reduced
# training set again — restart from the load cell when re-executing.
dataset = dataset["train"].train_test_split(train_size=0.9, seed=20)
dataset

In [None]:
# Rename the held-out "test" split to "validation", the key used by the rest of the notebook.
# NOTE(review): not idempotent — a second run presumably fails because "test" has
# already been removed by the first run.
dataset["validation"] = dataset.pop("test")
dataset

In [None]:
# Tiny hand-written batch with the same two columns as the dataset ("clean" and
# "dotless"); used below to smoke-test preprocess_function.
example1 = {'clean': ['نهائي', 'لكوينز'], 'dotless': ['نهاىى', 'لكوىنر']}


In [None]:
# Column holding the model input (dotless text) and column holding the expected output.
source_col = "dotless"
target_col = "clean"


def preprocess_function(examples):
    """Tokenize a batch of source texts together with their target texts.

    Args:
        examples: Mapping from column name to a sequence of strings; must contain
            the ``source_col`` and ``target_col`` columns.

    Returns:
        The tokenizer output for the batch. Sources are passed as the main input
        and targets via ``text_target``; both are truncated to 128 tokens.
    """
    source_texts = list(examples[source_col])
    target_texts = list(examples[target_col])
    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=128,
        truncation=True,
    )

In [None]:
# Smoke test: tokenize the hand-written example batch and print the raw encoding.
test = preprocess_function(example1)
print(test)

In [None]:
# Tokenize every split; batched=True hands preprocess_function a batch of rows per call.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Display the tokenized dataset as the cell output.
tokenized_dataset

In [None]:
#tokenized_dataset.save_to_disk("AR-dotless-mediumPlus-tokenized-arrow")


In [None]:
#tokenized_dataset.push_to_hub("dot-ammar/AR-dotless-mediumPlus-tokenized")

In [None]:
# Print the first tokenized training example to inspect its fields.
print(tokenized_dataset['train'][0])

## Fine-tuning with Keras/TensorFlow


In [None]:
# Load the TensorFlow seq2seq model from the base checkpoint chosen earlier.
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Collator used to batch tokenized examples for training; returns TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
# Collate the training examples at indices 1 and 2 into a sample batch and list its keys.
batch = data_collator([tokenized_dataset["train"][i] for i in range(1, 3)])
batch.keys()

In [None]:
# Inspect the "labels" entry of the collated sample batch.
batch["labels"]

In [None]:
# Inspect the "decoder_input_ids" entry of the collated sample batch.
batch["decoder_input_ids"]

In [None]:
# Print the un-collated labels of the same two examples (indices 1 and 2) so they
# can be compared with the collated batch shown above.
for i in range(1, 3):
    print(tokenized_dataset["train"][i]["labels"])

In [None]:
# Training pipeline: shuffled, batches of 32, collated with data_collator.
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)
# Validation pipeline: fixed order, batches of 16, same collator.
tf_eval_dataset = model.prepare_tf_dataset(
    tokenized_dataset["validation"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

In [None]:
# Load the SacreBLEU metric; its "score" field is what gets reported as "bleu" below.
metric = evaluate.load("sacrebleu")

In [None]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# Separate collator for generation-time evaluation; unlike the training collator it
# pads to a multiple of 128.
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128
)

# Validation pipeline consumed by compute_metrics: fixed order, batches of 8.
tf_generate_dataset = model.prepare_tf_dataset(
    tokenized_dataset["validation"],
    collate_fn=generation_data_collator,
    shuffle=False,
    batch_size=8,
)



def generate_with_xla(batch):
    """Generate output token ids for one batch of encoder inputs.

    Despite the name, nothing in this function requests XLA compilation; it is a
    plain call to ``model.generate``.

    Args:
        batch: Mapping that provides ``"input_ids"`` and ``"attention_mask"``.

    Returns:
        Whatever ``model.generate`` returns for the batch, limited to 128 new tokens.
    """
    generation_inputs = {
        "input_ids": batch["input_ids"],
        "attention_mask": batch["attention_mask"],
    }
    return model.generate(**generation_inputs, max_new_tokens=128)


def compute_metrics():
    """Generate predictions for ``tf_generate_dataset`` and score them with ``metric``.

    Takes no arguments: it iterates the module-level ``tf_generate_dataset`` itself,
    so it is meant to be called directly rather than handed predictions.

    Returns:
        ``{"bleu": <score>}`` where the value is the ``"score"`` field of the
        metric result.
    """
    hypotheses = []
    references = []

    for features, label_ids in tqdm(tf_generate_dataset):
        generated = generate_with_xla(features)
        hypothesis_texts = tokenizer.batch_decode(generated, skip_special_tokens=True)

        # -100 marks ignored label positions; swap in the pad id so decoding works.
        label_array = label_ids.numpy()
        label_array = np.where(label_array != -100, label_array, tokenizer.pad_token_id)
        reference_texts = tokenizer.batch_decode(label_array, skip_special_tokens=True)

        hypotheses += [text.strip() for text in hypothesis_texts]
        # Each prediction gets a list of references (here exactly one).
        references += [[text.strip()] for text in reference_texts]

    bleu = metric.compute(predictions=hypotheses, references=references)
    return {"bleu": bleu["score"]}

In [None]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 2
# Only referenced by the disabled create_optimizer call in the string literal below.
num_train_steps = len(tf_train_dataset) * num_epochs
# The triple-quoted string below is disabled code kept for reference: it is a bare
# string expression, so create_optimizer is never called.
'''
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
'''
# Active optimizer: AdamWeightDecay with a learning rate of 2e-5 and no schedule object.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
# No loss is passed to compile(); presumably the model's built-in loss is used — TODO confirm.
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
# NOTE(review): the policy is set after the model was loaded and compiled. Keras
# generally expects the global policy to be set before layers are created — confirm
# whether mixed precision actually takes effect here, and whether float16 is
# numerically safe for this checkpoint, before reordering.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
from transformers.keras_callbacks import KerasMetricCallback


def compute_bleu_from_predictions(eval_predictions, labels=None):
    """Metric function for KerasMetricCallback: BLEU from generated ids and label ids.

    KerasMetricCallback hands the collected predictions and labels to ``metric_fn``;
    a zero-argument function raises ``TypeError`` at the end of the first epoch.
    Both calling conventions are accepted: a single ``(predictions, labels)`` tuple,
    or predictions and labels as two positional arguments.

    Args:
        eval_predictions: Either a ``(predictions, labels)`` pair or the predictions.
        labels: Label ids when predictions and labels are passed separately.

    Returns:
        ``{"bleu": <sacrebleu score>}``.
    """
    if labels is None:
        predictions, labels = eval_predictions
    else:
        predictions = eval_predictions

    # -100 is used both for ignored label positions and (presumably) as padding when
    # the callback joins batches of different lengths; map it back to the pad id so
    # the tokenizer can decode every position.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    # sacrebleu expects a list of references per prediction.
    decoded_labels = [
        [text.strip()]
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}


# predict_with_generate=True makes the callback call model.generate(), so the metric
# function receives token ids rather than raw logits; max_new_tokens matches the
# limit used by the manual evaluation loop.
metric_callback = KerasMetricCallback(
    metric_fn=compute_bleu_from_predictions,
    eval_dataset=tf_eval_dataset,
    predict_with_generate=True,
    generate_kwargs={"max_new_tokens": 128},
)

In [None]:
import os

from transformers.keras_callbacks import PushToHubCallback

# SECURITY: never hard-code a Hub access token in a notebook. The token that was
# previously committed here must be treated as leaked and revoked in the Hugging Face
# account settings. The token is now read from the HF_TOKEN environment variable; when
# it is unset, None is passed and the callback falls back to the credentials cached by
# `huggingface-cli login` / `notebook_login()`.
push_to_hub_callback = PushToHubCallback(
    output_dir="dotless_model",
    tokenizer=tokenizer,
    hub_token=os.environ.get("HF_TOKEN"),
)

In [None]:
# Callbacks passed to model.fit: metric evaluation first, then pushing to the Hub.
callbacks = [metric_callback, push_to_hub_callback]

In [None]:

# Fine-tune for num_epochs epochs, validating on tf_eval_dataset and running the
# callbacks defined above.
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    callbacks=callbacks,
    epochs=num_epochs,
)