## Check out these papers:

[mMARCO: A Multilingual Version of the MS MARCO Passage Ranking Dataset](https://arxiv.org/abs/2108.13897)

[A cost-benefit analysis of cross-lingual transfer methods](https://arxiv.org/abs/2105.06813)


In [None]:
# Load mMARCO, a multilingual version of the MS MARCO passage ranking dataset,
# from the Hugging Face Hub: https://huggingface.co/datasets/unicamp-dl/mmarco
from datasets import load_dataset
# Request the 'arabic' configuration of the dataset.
dataset = load_dataset('unicamp-dl/mmarco', 'arabic')
# Bare expression: in a notebook cell this displays the loaded object.
dataset

In [None]:
# https://huggingface.co/aubmindlab/araelectra-base-discriminator
# The authors of AraELECTRA and AraBERT recommend preprocessing the text before training or testing on any dataset.
!pip install arabert -q
from arabert.preprocess import ArabertPreprocessor

# Alternative preprocessing profile, kept for reference:
#model_name="araelectra-base"
# NOTE(review): the preprocessor is configured for "bert-base-arabertv2", while the
# tokenizer loaded later in this file is "aubmindlab/araelectra-base-discriminator".
# Confirm that this pairing is intended.
model_name="bert-base-arabertv2"
# Shared preprocessor instance; split_examples() calls arabert_prep.preprocess() on every text.
arabert_prep = ArabertPreprocessor(model_name=model_name)

#text = "و لن نبالغ إذا قلنا إن الهاتف أ و كمبيوتر  المكتب في زمننا هذا ضروري"
#arabert_prep.preprocess(text)

In [None]:
# Carve two disjoint slices out of the 'train' split
# (the original note states that the split holds about 39M samples):
#   - rows 5,000,000 .. 5,004,999 (5,000 rows) are held out for evaluation;
#   - rows 0 .. 4,999,999 (5,000,000 rows) are used for training.
dataset_eval = dataset['train'].select(range(5000000, 5005000))
dataset_train = dataset['train'].select(range(0, 5000000))

In [None]:
# The dataset is in the form (query, positive passage, negative passage).
# We split it into the forms (query, positive passage, label=1) and (query, negative passage, label=0)
# and preprocess it. Preprocessing with bert-base-arabertv2 takes more than 12 hours, which exceeds the limit of a Kaggle Notebook.
# You can run this code on a range of 2.5M samples at a time, then concatenate the resulting datasets:
#from datasets import load_dataset, load_from_disk, concatenate_datasets
#dataset0 = load_from_disk('path to dataset0')
#dataset1 = load_from_disk('path to dataset1')
#dataset = concatenate_datasets([dataset0, dataset1])
def split_examples(batch):
    """Turn a batch of (query, positive, negative) triples into labelled pairs.

    Every input row yields two output rows: (query, positive passage, 1) and
    (query, negative passage, 0). All positive pairs of the batch come first,
    followed by all negative pairs, each group in the batch's row order.

    Args:
        batch: Mapping of column name to list of strings with the keys
            "query", "positive" and "negative" (the shape passed by
            ``Dataset.map(..., batched=True)``).

    Returns:
        A dict with the columns "query", "passage" and "label", where the
        texts have been run through ``arabert_prep.preprocess`` and the label
        is 1 for positive passages and 0 for negative ones.
    """
    # Each query is paired with both its positive and its negative passage.
    # Preprocess it once and reuse the result instead of repeating the
    # (expensive) preprocessing call in both passes.
    clean_queries = [arabert_prep.preprocess(query) for query in batch["query"]]

    queries = []
    passages = []
    labels = []
    for column, label in (("positive", 1), ("negative", 0)):
        for clean_query, passage in zip(clean_queries, batch[column]):
            queries.append(clean_query)
            passages.append(arabert_prep.preprocess(passage))
            labels.append(label)
    return {"query": queries, "passage": passages, "label": labels}

# Apply split_examples batch-wise. The original "positive"/"negative" columns are
# dropped; "query" is replaced by the column that split_examples returns, alongside
# the new "passage" and "label" columns.
dataset_train = dataset_train.map(split_examples, batched=True, remove_columns=["positive", "negative"])
dataset_eval = dataset_eval.map(split_examples, batched=True, remove_columns=["positive", "negative"])


In [None]:
# Tokenization step: load the tokenizer used to encode the (query, passage) pairs.
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
# Hugging Face model id whose tokenizer is used below.
args_model="aubmindlab/araelectra-base-discriminator"
tokenizer = AutoTokenizer.from_pretrained(args_model)

def tokenize(batch):
    """Encode a batch of (query, passage) pairs and attach float labels.

    Args:
        batch: Mapping with the list-valued columns "query", "passage" and
            "label".

    Returns:
        The tokenizer output for the query/passage pairs, extended with a
        "labels" entry that holds one single-element list ``[float(label)]``
        per example.
    """
    # Queries go in as the first sequence and passages as the second;
    # truncation="only_second" restricts truncation to the passage side.
    encoded = tokenizer(
        batch["query"],
        batch["passage"],
        truncation="only_second",
        max_length=512,
        padding=True,
    )

    # Each label is wrapped in its own list and cast to float.
    label_rows = []
    for value in batch["label"]:
        label_rows.append([float(value)])
    encoded["labels"] = label_rows
    return encoded

In [None]:
# Tokenize both splits batch-wise. The raw text and integer label columns are dropped,
# leaving only the columns returned by tokenize(). set_format("torch") then switches
# the datasets' output format to torch.
dataset_train = dataset_train.map(tokenize, batched=True, remove_columns=["query", "passage", "label"])
dataset_train.set_format("torch")
dataset_eval = dataset_eval.map(tokenize, batched=True, remove_columns=["query", "passage", "label"])
dataset_eval.set_format("torch")

In [None]:
# Save both processed datasets locally, each into its own directory.
# The directory names are reused as config names when uploading to and loading from the Hub.
dataset_train.save_to_disk("mmarco_train10M_preprossesd_for_AraBERT")
dataset_eval.save_to_disk("mmarco_eval10k_preprossesd_for_AraBERT")

In [None]:
import os

import huggingface_hub

hf = huggingface_hub.HfFolder()

# Target dataset repository on the Hugging Face Hub.
# To push the dataset to your own repository, change organization_dataset_id and
# export your own access token as the HF_TOKEN environment variable.
# It is assigned before the token check so that cells which only read from the
# repository still find it defined.
organization_dataset_id = "hatemestinbejaia/RARAELECTRAandRARABERTusedDATASET"

# SECURITY: never hard-code an access token in a notebook or source file.
# A token that was ever committed must be treated as leaked and revoked in the
# Hugging Face account settings. The token is read from the environment instead.
access_token = os.environ.get("HF_TOKEN")
if not access_token:
    raise RuntimeError(
        "No Hugging Face access token found: set the HF_TOKEN environment "
        "variable to a token with write access before pushing the datasets."
    )

# Store the token locally so that the uploads below are authenticated.
hf.save_token(access_token)
# Upload each split to the same repository, passing its name as the second argument.
dataset_train.push_to_hub(organization_dataset_id, "mmarco_train10M_preprossesd_for_AraBERT")
dataset_eval.push_to_hub(organization_dataset_id, "mmarco_eval10k_preprossesd_for_AraBERT")

In [None]:
# You can load the processed dataset directly from our repository to fine-tune your own
# AraELECTRA-based model, using the code below.
# NOTE: `organization_dataset_id` must already be defined (it is assigned in the upload cell earlier in this file).
from datasets import load_dataset
# NOTE(review): no `split` is passed, so these calls presumably return DatasetDict objects
# rather than a single split. Confirm before indexing rows directly.
dataset_train = load_dataset(organization_dataset_id, 'mmarco_train10M_preprossesd_for_AraBERT')
dataset_eval = load_dataset(organization_dataset_id, 'mmarco_eval10k_preprossesd_for_AraBERT')