In [1]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForMaskedLM
from processingDatasetDotless import *
import numpy as np

In [2]:
# Hugging Face Hub identifier of the fine-tuned masked-language-model checkpoint.
new_checkpoint = "awwab-ahmed/bert-base-arabic-camelbert-mix-finetuned-AR-dotted-mediumPlus"
# Tokenizer and TensorFlow masked-LM model are both loaded from the same
# checkpoint; they are module-level globals used by the scoring function below.
tokenizer = AutoTokenizer.from_pretrained(new_checkpoint)
model = TFAutoModelForMaskedLM.from_pretrained(new_checkpoint)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/305k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/776k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]




tf_model.h5:   0%|          | 0.00/530M [00:00<?, ?B/s]




All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at awwab-ahmed/bert-base-arabic-camelbert-mix-finetuned-AR-dotted-mediumPlus.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [3]:
from tashaphyne.stemming import ArabicLightStemmer
# Single shared stemmer instance; get_candidate_word_probabilities() uses it to
# derive a stem/root for candidates that the tokenizer splits into several pieces.
ArListem = ArabicLightStemmer()

def get_candidate_word_probabilities(input_text, candidate_words):
    """Score candidate words for the ``[MASK]`` position of ``input_text``.

    The masked-LM ``model`` is run once on ``input_text`` and the logits at the
    first ``[MASK]`` token are turned into a probability distribution over the
    vocabulary. Each candidate is then mapped to a single vocabulary token:

    * if the tokenizer keeps the candidate as one token, that token is used;
    * otherwise the candidate's light stem, and failing that its root (both
      from ``ArListem``), is used if it tokenises to exactly one token;
    * candidates for which none of these is a single token are skipped.

    Args:
        input_text: Sentence containing a ``[MASK]`` token.
        candidate_words: Iterable of candidate replacement words (strings).

    Returns:
        dict mapping each scorable candidate word (the original string, not
        its stem/root) to the value of ``.numpy()`` on its token probability.
        Insertion order follows ``candidate_words``; a repeated candidate keeps
        its first position.

    Raises:
        ValueError: if the tokenised ``input_text`` contains no ``'[MASK]'``.
    """
    tokenized_text = tokenizer.tokenize(input_text)
    masked_word_index = tokenized_text.index('[MASK]')
    input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

    # NOTE(review): tokenize() + convert_tokens_to_ids() does not add [CLS]/[SEP];
    # confirm that feeding the bare sequence is intended for this checkpoint.
    # The extra list around `input_ids` is what supplies the batch dimension.
    input_ids = tf.constant([input_ids], dtype=tf.int32)

    # Perform prediction
    output = model(input_ids)

    # Extract logits for the masked word and apply softmax
    predictions = tf.nn.softmax(output.logits[0, masked_word_index])

    candidate_probabilities = {}
    # Iterate over the candidates themselves. The previous implementation
    # recovered each word via `list.index()` on its token list, which returns
    # the FIRST match: candidates sharing a token list (duplicates, or words
    # collapsing to the same single token) were attributed to the wrong word.
    for candidate in candidate_words:
        pieces = tokenizer.tokenize(candidate)

        if len(pieces) == 1:
            # NOTE(review): a lone unknown-token piece is scored like any other
            # token here, as before; filter it out if that is not wanted.
            token = pieces[0]
        else:
            # get_root() is read immediately after light_stem(), as in the
            # original code — presumably it reports the root of the word that
            # was just stemmed; confirm against the tashaphyne documentation.
            stem = ArListem.light_stem(candidate)
            root = ArListem.get_root()

            stem_pieces = tokenizer.tokenize(stem)
            if len(stem_pieces) == 1:
                token = stem_pieces[0]
            else:
                root_pieces = tokenizer.tokenize(root)
                if len(root_pieces) != 1:
                    # No single-token representation: candidate cannot be scored.
                    continue
                token = root_pieces[0]

        # A single token string converts to a single integer id, which indexes
        # the 1-D probability vector directly.
        token_id = tokenizer.convert_tokens_to_ids(token)
        candidate_probabilities[candidate] = predictions[token_id].numpy()

    return candidate_probabilities

In [4]:
def generate_probabilties(example, gen_prob_func=get_candidate_word_probabilities):
    """Rank the candidate words of one example by their predicted probability.

    Args:
        example: Mapping providing the masked sentence under ``"Masked"`` and
            the candidate words under ``"Options"``.
        gen_prob_func: Callable ``(masked_text, candidates) -> {word: score}``
            used to score the candidates.

    Returns:
        A 3-tuple ``(scores, ranking, best)``: the mapping returned by
        ``gen_prob_func``, its keys ordered from highest to lowest score, and
        the top-ranked word (``None`` when no candidate could be scored).
    """
    scores = gen_prob_func(example["Masked"], example["Options"])

    # Highest score first; ties stay in the mapping's iteration order.
    ranking = sorted(scores, key=lambda candidate: scores.get(candidate), reverse=True)

    best = ranking[0] if ranking else None
    return scores, ranking, best

In [5]:
def single_test(specific_string=None, specific_index=None, num_eg=None, gen_prob_func=get_candidate_word_probabilities, example = None):
    """Run the ranking on one example and print a human-readable report.

    The example is either built from ``specific_string``/``specific_index`` via
    ``mask_word`` (this takes precedence) or supplied ready-made as ``example``.

    Args:
        specific_string: Sentence to build an example from, or ``None``.
        specific_index: Passed to ``mask_word`` together with ``specific_string``.
        num_eg: Unused; kept so existing calls remain valid.
        gen_prob_func: Scoring function forwarded to ``generate_probabilties``.
        example: Pre-built example providing ``"Masked"``, ``"Options"`` and
            ``"Target"``; ignored when ``specific_string`` is given.

    Raises:
        ValueError: if neither ``specific_string`` nor ``example`` is provided.
    """
    # Identity checks instead of `!= None`: objects that overload `!=`
    # (array-like rows, for instance) would not yield a plain bool.
    if specific_string is not None:
        example = mask_word(specific_string, specific_index)
    if example is None:
        raise ValueError("single_test requires either `specific_string` or `example`.")

    word_probabilities, sorted_words, most_probable_word = generate_probabilties(example, gen_prob_func)
    print("Length of words:", len(sorted_words))
    for word in sorted_words:
        probability = word_probabilities[word]
        print(f"Word: '{word}', Probability: {probability:.10f}")

    print()

    print("Most probable word:", most_probable_word)
    print("Target word:", example["Target"])
    print("------------------------------------------")

    # Report the 0-based rank at which the target appears, if it was scored.
    for level, word in enumerate(sorted_words):
        if word == example["Target"]:
            print("Sucess at probability level:", level)
            break
    else:
        print("Not found.")

    print("Masked:", example["Masked"])
    print("Options:", example["Options"])
    print("Target:", example["Target"])


In [6]:
# Demo run: build an example from this sentence with index 4 and print the ranking
# (in the recorded output the masked sentence ends in [MASK], i.e. the last word).
single_test('السلام عليكم ورحمة الله وبركاته', 4)

Length of words: 6
Word: 'وبركاته', Probability: 0.9642794132
Word: 'وتركابة', Probability: 0.0000004786
Word: 'وتركابه', Probability: 0.0000002465
Word: 'وبركابه', Probability: 0.0000002465
Word: 'وبركابة', Probability: 0.0000002465
Word: 'وتركاته', Probability: 0.0000000292

Most probable word: وبركاته
Target word: وبركاته
------------------------------------------
Sucess at probability level: 0
Masked: السلام عليكم ورحمة الله [MASK]
Options: ['ؤثزكءثه', 'ؤثزكءثة', 'ؤثزكءته', 'ؤثزكءتة', 'ؤثزكءبه', 'ؤثزكءبة', 'ؤثزكآثه', 'ؤثزكآثة', 'ؤثزكآته', 'ؤثزكآتة', 'ؤثزكآبه', 'ؤثزكآبة', 'ؤثزكإثه', 'ؤثزكإثة', 'ؤثزكإته', 'ؤثزكإتة', 'ؤثزكإبه', 'ؤثزكإبة', 'ؤثزكأثه', 'ؤثزكأثة', 'ؤثزكأته', 'ؤثزكأتة', 'ؤثزكأبه', 'ؤثزكأبة', 'ؤثزكاثه', 'ؤثزكاثة', 'ؤثزكاته', 'ؤثزكاتة', 'ؤثزكابه', 'ؤثزكابة', 'ؤثركءثه', 'ؤثركءثة', 'ؤثركءته', 'ؤثركءتة', 'ؤثركءبه', 'ؤثركءبة', 'ؤثركآثه', 'ؤثركآثة', 'ؤثركآته', 'ؤثركآتة', 'ؤثركآبه', 'ؤثركآبة', 'ؤثركإثه', 'ؤثركإثة', 'ؤثركإته', 'ؤثركإتة', 'ؤثركإبه', 'ؤثركإبة', 'ؤثركأثه', 'ؤثركأثة', 