In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
import torch
from datasets import Dataset
from collections import Counter

2024-04-30 19:51:40.467481: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-30 19:51:40.467586: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-30 19:51:40.646173: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Load Model

In [2]:
tokenizer = AutoTokenizer.from_pretrained("Twitter/twhin-bert-large")
model = AutoModelForMaskedLM.from_pretrained("Twitter/twhin-bert-large").to('cuda')

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors='pt')

print(model)

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/634 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.25G [00:00<?, ?B/s]

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
              (distance_embedding): Embedding(1023, 64)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)


## Run inference

In [3]:
# Input text with masked tokens
input_text = "انا اسمي علي, اذهب كل يوم الي <mask> لكي اتعلم."
# input_text = "My name is ali, eachday i go to <mask>."

# Tokenize input text
tokenized_input = tokenizer(input_text, return_tensors='pt').to('cuda')

# Mask the token corresponding to the masked word
mask_token_index = torch.where(tokenized_input['input_ids'] == tokenizer.mask_token_id)
input_ids = tokenized_input['input_ids'].clone()
input_ids[mask_token_index] = tokenizer.mask_token_id

# Generate predictions
with torch.no_grad():
    outputs = model(input_ids) # fORWARD PASS

# Extract predictions for the masked tokens
masked_token_logits = outputs.logits[mask_token_index]

# Get the top-n predicted token IDs and scores
n = 5  # Number of top predictions to retrieve
top_n_scores, top_n_indices = torch.topk(masked_token_logits, n, dim=-1)

# Decode the top-n predicted token IDs into text
top_n_predicted_tokens = []
for i in range(len(mask_token_index[0])):
    top_n_predicted_tokens.append([tokenizer.decode(token_id.item()) for token_id in top_n_indices[i]])

# Print the top-n predicted tokens and their scores
for i, (tokens, scores) in enumerate(zip(top_n_predicted_tokens, top_n_scores)):
    print(f"Masked token {i+1}:")
    for token, score in zip(tokens, scores):
        print(f"Token: {token}, Score: {score.item()}")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Masked token 1:
Token: المدرسة, Score: 21.685152053833008
Token: السماء, Score: 21.205638885498047
Token: كم, Score: 20.726768493652344
Token: الأمام, Score: 20.532899856567383
Token: نفسي, Score: 20.517539978027344


# Load Dataset

In [4]:
df = pd.read_csv('/kaggle/input/arabic-classification/arabic_dataset_classifiction.csv/arabic_dataset_classifiction.csv')
df

Unnamed: 0,text,targe
0,بين أستوديوهات ورزازات وصحراء مرزوكة وآثار ولي...,0
1,قررت النجمة الأمريكية أوبرا وينفري ألا يقتصر ع...,0
2,أخبارنا المغربية الوزاني تصوير الشملالي ألهب ا...,0
3,اخبارنا المغربية قال ابراهيم الراشدي محامي سعد...,0
4,تزال صناعة الجلود في المغرب تتبع الطريقة التقل...,0
...,...,...
111723,اللاعب تأخر في العودة إلى التداريب والمدرب غاض...,4
111724,المشرف العام لحسنية أكادير قال إنه سيغادر الفر...,4
111725,نسب إليه نتائج الوداد وصحوة الرجاء وآخر صيحاته...,4
111726,ستحتضن الرباط في الفترة مابين يوليوز المقبل دو...,4


# Preprocessing

In [5]:
def preprocess(sentence: str):
    sentence = sentence.replace('أ', 'ا').replace('إ', 'ا').replace('آ', 'ا')
    
    for char in sentence:
        if char not in 'ابتثجحخدذرزسشصضطظعغفقكلمنهويءئؤىة ':
            sentence = sentence.replace(char, "")

    return sentence

def tokenize(x):
    tokenized_inputs = tokenizer(x["text"], truncation=True, padding="max_length", max_length=128) # input
#     tokenized_inputs["word_ids"] = [tokenized_inputs.word_ids(i) for i in range(len(tokenized_inputs["input_ids"]))]
    
    return {**tokenized_inputs}

def adjust_threshold(my_dict:dict):
    words_freq = list(my_dict.values())
    mean = np.mean(words_freq)
    std_dev = np.std(words_freq)
    k = 1
    threshold = mean + 0.01 * std_dev

    return int(threshold)


def preprocess_input(input_text:str,my_dict:dict,threshold):
    words = input_text.split()
    masked_arr = []
    
    #generate mask 
    for idx,word in enumerate(words):
        freq = my_dict.get(word,0)
        if freq < threshold:
            arr = words.copy()
            arr[idx] = "<mask>"
            sentence = " ".join(arr)
            masked_arr.append(sentence)

    return masked_arr 

def data_vocab(dataframe):
    words_freq = Counter()
    for index, row in dataframe.iterrows():
        sentence = row['text']
        words = sentence.split()
        words_freq.update(words)
    return words_freq

#conatenate 
def concatenate(sentences,masked):
    true_sentence = []
    masks_ids = {}

    #get mask index in each sentence
    for idx_sentence,sentence in enumerate(sentences):
        words = sentence.split()
        for idx_word,word in enumerate(words):
            if word == '<mask>':
                masks_ids[idx_sentence] = idx_word 
                
    #concatenate the true sentence
    for idx_sentence,sentence in enumerate(sentences):
        words = sentence.split()
        for idx_word,word in enumerate(words):
            if words[idx_word] not in true_sentence:
                if idx_word not in masks_ids.values():
                    true_sentence += [words[idx_word]]
                else:
                    if masked:  # Check if masked list is not empty
                        true_sentence.append(masked.pop(0))
    true_sentence = ' '.join(true_sentence)
    
    return true_sentence

In [6]:
# Preprocess
df = df.drop(columns=['targe'], axis=1)

# drop null values
df = df.dropna()

# drop duplicates
df = df.drop_duplicates()

# remove any letter but arabic
df['text'] = df['text'].apply(lambda x: preprocess(x))

# remove sentences that are less than 5 words
df['text'] = df['text'].apply(lambda x: x if len(x.split()) > 5 else None)
df = df.dropna().reset_index(drop=True)

# Vocab of the dataset
words_freq = data_vocab(df)

# Convert to hugging face Dataset
dataset = Dataset.from_pandas(df[:1000]) # 10K rows only

# Mapping columns
dataset = dataset.map(tokenize, batched=True, remove_columns=['text'])

dataset = dataset.train_test_split(test_size=0.2, shuffle=True)

dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [7]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./model",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=5e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    push_to_hub=True,
    logging_steps=10,
    eval_steps=10,
    push_to_hub_token="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",
    report_to="wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
)

In [8]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,2.5748,2.238981
2,2.3873,2.345582
3,2.4142,2.285141


TrainOutput(global_step=150, training_loss=2.4738434982299804, metrics={'train_runtime': 373.6758, 'train_samples_per_second': 6.423, 'train_steps_per_second': 0.401, 'total_flos': 559619617996800.0, 'train_loss': 2.4738434982299804, 'epoch': 3.0})

In [9]:
trainer.save_model("/kaggle/working/model")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.25G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

In [10]:
model = AutoModelForMaskedLM.from_pretrained("/kaggle/working/model").to('cuda')

In [None]:
threshold = adjust_threshold(words_freq)
def predict(input_text):
    # Tokenize input text
    tokenized_input = tokenizer(input_text, return_tensors='pt').to('cuda')

    # Mask the token corresponding to the masked word
    mask_token_index = torch.where(tokenized_input['input_ids'] == tokenizer.mask_token_id)

    # Generate predictions
    with torch.no_grad():
        outputs = model(**tokenized_input) # fORWARD PASS

    # Extract predicted token ID
    predicted_token_id = torch.argmax(outputs.logits[mask_token_index]).item()

    # Decode the predicted token ID
    predicted_token = tokenizer.decode(predicted_token_id)
    
    return predicted_token

def pipeline(input_text):
    arr_input = preprocess_input(input_text,words_freq,threshold)
    masked = []
    for sentence in arr_input:
        mask = predict(sentence)
        masked.append(mask)
    
    true_sentence = concatenate(arr_input,masked)
    
    return true_sentence
    
        

In [20]:
# Test the model
input_text = "قررت النجمة الامركية اوبرا وينفري الا يقتر عملها على الفن"
true_sentence = pipeline(input_text)

print(true_sentence)

المصرية
