**Fine-tuning a BERT model on AR-dotted-mediumPlus for masked language modeling**

Adapted from: https://huggingface.co/learn/nlp-course/chapter7/3?fw=tf

---



In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForMaskedLM
from datasets import load_dataset
import numpy as np

In [None]:
# Base checkpoint to fine-tune; the tokenizer and the masked-LM model are
# both loaded from this same identifier.
checkpoint = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForMaskedLM.from_pretrained(checkpoint)

In [None]:
# Load only the "train" split of the AR-dotted-mediumPlus dataset.
dataset = load_dataset(
    "dot-ammar/AR-dotted-mediumPlus",
    split="train",
)

In [None]:
# Display the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['clean'],
    num_rows: 1625508
})

In [None]:
# Peek at the first record to see what a raw example looks like.
dataset[0]

{'clean': 'يا في مواجهة حالات الغلو والتطرف والتوحش التي تنتشر في العديد من دول المنطقةلقد اختل الأمن القومي العربي في أواخر السبعينات بسبب'}

In [None]:
def tokenize_function(examples):
    """Tokenize the "clean" column of a batch of examples.

    When the global `tokenizer` reports `is_fast`, a "word_ids" entry is
    added holding, for every sequence in the batch, the value of
    `word_ids(i)` for that sequence index.

    Args:
        examples: batch mapping that contains a "clean" entry.

    Returns:
        The tokenizer output, with "word_ids" added for fast tokenizers.
    """
    encoded = tokenizer(examples["clean"])
    if not tokenizer.is_fast:
        # Without a fast tokenizer the output is returned unchanged.
        return encoded

    num_sequences = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(num_sequences)]
    return encoded

In [None]:
# Tokenize the corpus in batches and drop the raw "clean" text column,
# then display the resulting dataset summary.
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["clean"]
)
tokenized_datasets

Map:   0%|          | 0/1625508 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
    num_rows: 1625508
})

In [None]:
# Length of each fixed-size chunk produced by group_texts.
chunk_size = 128

In [None]:
def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into chunks.

    Every column of `examples` is flattened into one long sequence, the tail
    that does not fill a whole chunk is dropped, and the rest is cut into
    consecutive pieces of `chunk_size` items (module-level setting). A
    "labels" column is added as a copy of the chunked "input_ids".

    Args:
        examples: batch mapping of column name -> list of per-example
            sequences; must contain "input_ids".

    Returns:
        dict mapping each column (plus "labels") to a list of chunks.
    """
    from itertools import chain

    # Flatten each column in linear time; `sum(lists, [])` re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Compute length of concatenated texts (taken from the first column)
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Regroup the tokenized examples into fixed-size chunks (batched, so each
# call to group_texts sees many examples at once), then show the summary.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/1625508 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 405888
})

In [None]:
from transformers import DataCollatorForLanguageModeling

# Masked-language-modeling collator built around the tokenizer, configured
# with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
import collections
from transformers.data.data_collator import tf_default_data_collator

# Per-word selection probability used by whole_word_masking_data_collator
# when it draws its random mask.
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Collate `features` after masking whole words at random.

    Tokens that share a word id are grouped together, each word is selected
    with probability `wwm_probability`, and every token of a selected word
    is replaced by `tokenizer.mask_token_id`. The labels keep the original
    token id at masked positions and are -100 everywhere else.

    The caller's feature mappings and their "input_ids" sequences are not
    modified: masking is applied to copies, so the same samples can be
    collated more than once.

    Args:
        features: iterable of mappings that each contain "input_ids",
            "labels" and "word_ids".

    Returns:
        The result of `tf_default_data_collator` on the masked features.
    """
    import copy

    masked_features = []
    for feature in features:
        # Shallow copy so that dropping "word_ids" leaves the input intact.
        feature = dict(feature)
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        # Copy the ids before overwriting so the caller's sequence survives.
        input_ids = copy.copy(feature["input_ids"])
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["input_ids"] = input_ids
        feature["labels"] = new_labels
        masked_features.append(feature)

    return tf_default_data_collator(masked_features)

In [None]:
# Run the whole-word masking collator on the first two chunks and print the
# decoded result of each one.
samples = [lm_datasets[row] for row in (0, 1)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    decoded_chunk = tokenizer.decode(chunk)
    print(f"\n'>>> {decoded_chunk}'")


'>>> [CLS] يا في مواجهة [MASK] الغلو والتطرف [MASK] [MASK] التي [MASK] في العديد من دول المنطقةلقد اختل الأمن [MASK] العربي [MASK] أواخر السبعينات [MASK] [SEP] [CLS] [MASK] التحديات فاتخذت الخطوات اللازمة لتحسين عملية التنفيذ [MASK] هنا سن قانون في كانون الأول ديسمبر الماضي للتعامل مع مزيد من [MASK] [MASK] [MASK] [SEP] [CLS] أعمال الشركة ومركزها المالي عن السنة المالية المنتهية [MASK] ديسمبر هو الأستاذ محمد حامد أبو النصر ولد في [MASK] منفلوط التابعة لمحا [SEP] [CLS] ة النهار في [MASK] [MASK] بكا بسبب تفوق ابنتها في الثانوية [MASK] [MASK] [MASK] بوابة الوطن خدمة معرفة نتيجة الثانوية [MASK] بفرعيها [MASK] [MASK] [MASK] [SEP] [CLS] اتها الى مركز [MASK] [MASK] في الخرطوم والذي لا يقبل'

'>>> المساس حتى بفتاتهاليس [MASK] مجال للمقارنة في امكانيات السودان من الموارد البشرية وال [SEP] [CLS] [MASK] [MASK] وتقام [MASK] العودة بين الفريقين يوم الخميس المقبل على ملعب زالغيريس ستاديوم [MASK] مدينة فيلنيوس الليتوانيةويحمل اشبيلية الرقم ا [SEP] [CLS] شاطر تتوكل [MASK] الله بعد أول صلاة [MASK] بها 

In [None]:
# Number of chunks used for training; the test set is sized at 10% of that.
train_size = 360_000
test_size = int(0.1 * train_size)

# Fixed seed so the split is reproducible.
# NOTE(review): this rebinds `dataset`, which an earlier cell assigned from
# load_dataset(...); re-running earlier cells after this one will see the
# split object instead. Consider a distinct name.
dataset = lm_datasets.train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 360000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 36000
    })
})

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; presumably required so the
# PushToHubCallback configured later can upload — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Both splits use the same collator and batch size; only shuffling differs.
_shared_tf_dataset_kwargs = {"collate_fn": data_collator, "batch_size": 32}

tf_train_dataset = model.prepare_tf_dataset(
    dataset["train"], shuffle=True, **_shared_tf_dataset_kwargs
)

tf_eval_dataset = model.prepare_tf_dataset(
    dataset["test"], shuffle=False, **_shared_tf_dataset_kwargs
)

In [None]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback

# The schedule length is taken from the length of the training dataset.
num_train_steps = len(tf_train_dataset)
# Optimizer and learning-rate schedule: initial LR 2e-5, 1,000 warmup steps,
# weight decay rate 0.01.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# No loss is passed to compile; only the optimizer is set here.
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
# NOTE(review): the global policy is set after `model` was already created
# and compiled. Presumably Keras applies a dtype policy only to layers built
# after this call — confirm that it affects this training run, and whether it
# changes models loaded later in the notebook.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Output directory name is derived from the last path component of `checkpoint`.
model_name = checkpoint.split("/")[-1]
callback = PushToHubCallback(
    output_dir=f"{model_name}-finetuned-AR-dotted-mediumPlus", tokenizer=tokenizer
)

In [None]:
# Train on the training set, evaluate on the held-out set, and run the
# push-to-hub callback; no `epochs` argument is passed.
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, callbacks=[callback])

Cause: for/else statement not yet supported


Cause: for/else statement not yet supported


**TFLite Model**



---



In [None]:
# Load the fine-tuned checkpoint for conversion.
# NOTE(review): this rebinds `tokenizer` and `model`, replacing the objects
# used in the training cells above.
new_checkpoint = "awwab-ahmed/bert-base-arabic-camelbert-mix-finetuned-AR-dotted-mediumPlus"
tokenizer = AutoTokenizer.from_pretrained(new_checkpoint)
model = TFAutoModelForMaskedLM.from_pretrained(new_checkpoint)

In [None]:
# Build a TFLite converter from the Keras model and run the conversion.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Write the converted model to "model.tflite" in binary mode.
with open("model.tflite", "wb") as file:
    file.write(tflite_model)