### Convert model to TFLite
- Credit: @awwab-ahmad
- Model: https://huggingface.co/awwab-ahmed/bert-base-arabic-camelbert-mix-finetuned-AR-dotted-mediumPlus

In [1]:
# %pip install tensorflow transformers
# %pip install tf-keras

In [1]:
# imports
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForMaskedLM

In [None]:
# initialize tokenizer and model
checkpoint = "awwab-ahmed/bert-base-arabic-camelbert-mix-finetuned-AR-dotted-mediumPlus"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForMaskedLM.from_pretrained(checkpoint)

In [4]:
model.save_pretrained("tf-model")
tokenizer.save_pretrained("tf-model")

In [None]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]  # dynamic range quantization to reduce size
tflite_model = converter.convert()

with open("quant_model.tflite", "wb") as file:
  file.write(tflite_model)

### Playground

In [3]:
import tensorflow_text as tf_text

In [4]:
# get vocabulary
with open("tf-model/vocab.txt", "r") as file:
    vocab = [line.rstrip() for line in file]

In [36]:
#test
test_str = "طالما أشكو غرامي يا نور الوجود"
tokenized_text = tokenizer.tokenize(test_str)
tokens = tokenizer(test_str)
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
input_ids = tf.constant(input_ids, dtype=tf.int32)
print(input_ids)
print(tokens)
input_ids = tf.constant(input_ids, dtype=tf.int32)
print(input_ids)

tf.Tensor([10864 28995  1012  9111  1013  2104  3772 10675], shape=(8,), dtype=int32)
{'input_ids': [2, 10864, 28995, 1012, 9111, 1013, 2104, 3772, 10675, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
tf.Tensor([10864 28995  1012  9111  1013  2104  3772 10675], shape=(8,), dtype=int32)


In [22]:
def encode_word(word):
    tokens = []
    while len(word) > 0:
        i = len(word)
        while i > 0 and word[:i] not in vocab:
            i -= 1
        if i == 0:
            return ["[UNK]"]
        tokens.append(word[:i])
        word = word[i:]
        if len(word) > 0:
            word = f"##{word}"
    return tokens

print(encode_word("أفإستسقيناكموهما"))
print(tokenizer.tokenize("أفإستسقيناكموهما"))


أف
['أف', '##إ', '##ست', '##س', '##قين', '##اكم', '##وهم', '##ا']


In [16]:
test_str = "البيت الكبير [MASK]"
tokenized_text = tokenizer.tokenize(test_str)
masked_word_index = tokenized_text.index('[MASK]')
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
input_ids = tf.constant([input_ids], dtype=tf.int32)
output = model(input_ids)
print(output)
predictions = tf.nn.softmax(output.logits[0, masked_word_index])
print(predictions.shape)

TFMaskedLMOutput(loss=None, logits=<tf.Tensor: shape=(1, 3, 30000), dtype=float32, numpy=
array([[[-3.145209  , -0.57819   , -1.1382166 , ..., -2.4511776 ,
         -3.7814074 , -2.852653  ],
        [-5.1905065 , -3.964841  , -0.97261924, ..., -3.3409166 ,
         -6.081559  , -5.9540405 ],
        [-6.482839  , -3.3633132 , -1.3157046 , ..., -5.052669  ,
         -5.12359   , -7.425818  ]]], dtype=float32)>, hidden_states=None, attentions=None)
(30000,)


In [17]:
predictions = tf.math.softmax(output.logits, axis=-1)
print(predictions)

tf.Tensor(
[[[1.8296893e-06 2.3835250e-05 1.3614548e-05 ... 3.6626166e-06
   9.6845622e-07 2.4515068e-06]
  [1.5803280e-08 5.3832835e-08 1.0728837e-06 ... 1.0046484e-07
   6.4828827e-09 7.3646009e-09]
  [6.3258192e-08 1.4318900e-06 1.1096222e-05 ... 2.6438192e-07
   2.4628113e-07 2.4636934e-08]]], shape=(1, 3, 30000), dtype=float32)


In [8]:
model.config.id2label

{0: 'LABEL_0', 1: 'LABEL_1'}

In [9]:
# test
from transformers import pipeline
unmasker = pipeline('fill-mask', model='awwab-ahmed/bert-base-arabic-camelbert-mix-finetuned-AR-dotted-mediumPlus')
unmasker("الهدف من الحياة هو [MASK] .")

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at awwab-ahmed/bert-base-arabic-camelbert-mix-finetuned-AR-dotted-mediumPlus.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


[{'score': 0.05004889890551567,
  'token': 2854,
  'token_str': 'العمل',
  'sequence': 'الهدف من الحياة هو العمل.'},
 {'score': 0.04471169784665108,
  'token': 3696,
  'token_str': 'الحياة',
  'sequence': 'الهدف من الحياة هو الحياة.'},
 {'score': 0.022871755063533783,
  'token': 7908,
  'token_str': 'التفكير',
  'sequence': 'الهدف من الحياة هو التفكير.'},
 {'score': 0.01748461276292801,
  'token': 7676,
  'token_str': 'العودة',
  'sequence': 'الهدف من الحياة هو العودة.'},
 {'score': 0.016479946672916412,
  'token': 1979,
  'token_str': 'أن',
  'sequence': 'الهدف من الحياة هو أن.'}]