In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import json
import pandas as pd

def _iter_qa_records(squad):
    """Yield one flat record per (context, question, answer) triple.

    Walks the nested ``data -> paragraphs -> qas -> answers`` structure of the
    loaded JSON and emits a dict with the keys ``context``, ``question`` and
    ``answer`` for every answer entry.
    """
    for article in squad["data"]:
        for para in article["paragraphs"]:
            passage = para["context"]
            for qa_entry in para["qas"]:
                query = qa_entry["question"]
                for ans in qa_entry["answers"]:
                    yield {
                        "context": passage,
                        "question": query,
                        "answer": ans["text"],
                    }


# Load the JSON dataset from disk.
with open("squad_arabe_enrichi_cleaned.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten the nested structure into one row per answer, then build the DataFrame.
records = list(_iter_qa_records(data))
df = pd.DataFrame(records)

In [None]:
# Split the flattened records into training and evaluation sets
# (80 % / 20 %, fixed seed so the split is reproducible).
# NOTE(review): the split is done per record, so rows sharing the same context
# or question can land on both sides — confirm whether a grouped split is wanted.
train_data, eval_data = train_test_split(df, test_size=0.2, random_state=42)

# Convert to Hugging Face Datasets. preserve_index=False keeps the shuffled
# pandas index from being stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_data, preserve_index=False)

In [None]:
# Load the AraBERT tokenizer and the model with a question-answering head.
# The checkpoint does not ship QA weights: the loading warning reports that
# `qa_outputs.weight` / `qa_outputs.bias` are newly initialized, so the model
# must be fine-tuned before its predictions are usable.
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenization function
def _char_span_to_token_span(offsets, sequence_ids, start_char, end_char):
    """Convert a character span of the context into inclusive token indices.

    Args:
        offsets: per-token ``(char_start, char_end)`` pairs from the tokenizer.
        sequence_ids: per-token sequence id (``1`` marks context tokens,
            ``None`` marks special/padding tokens).
        start_char: index of the first answer character in the context.
        end_char: index one past the last answer character.

    Returns:
        ``(start_token, end_token)``; ``(0, 0)`` when the span is empty, was
        not found, or does not fully lie in the (possibly truncated) context.
    """
    if start_char < 0 or end_char <= start_char:
        return 0, 0

    context_tokens = [i for i, sid in enumerate(sequence_ids) if sid == 1]
    if not context_tokens:
        return 0, 0
    first, last = context_tokens[0], context_tokens[-1]

    # The answer must be entirely inside the part of the context that survived truncation.
    if offsets[first][0] > start_char or offsets[last][1] < end_char:
        return 0, 0

    # First context token that ends after the answer's first character.
    start_token = first
    while start_token <= last and offsets[start_token][1] <= start_char:
        start_token += 1

    # Last context token that starts before the answer's end.
    end_token = last
    while end_token >= first and offsets[end_token][0] >= end_char:
        end_token -= 1

    if start_token > end_token:
        return 0, 0
    return start_token, end_token


def preprocess_data(example):
    """Tokenize one (question, context) pair and attach token-level answer labels.

    The question-answering head is trained on *token* indices, so the character
    position of the answer inside the context is translated to token positions
    using the tokenizer's offset mapping (this requires a fast tokenizer).

    Args:
        example: mapping with the string fields ``context``, ``question`` and
            ``answer``.

    Returns:
        The tokenizer encoding extended with ``start_positions`` and
        ``end_positions`` (inclusive token indices). Both are ``0`` when the
        answer text is not found in the context or was truncated away.
    """
    # Question first, context second; only the context may be truncated so the
    # question is always kept intact.
    encoding = tokenizer(
        example["question"],
        example["context"],
        truncation="only_second",
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True,
    )
    # The offsets are only needed to build the labels; they are not a model input.
    offsets = encoding.pop("offset_mapping")
    sequence_ids = encoding.sequence_ids()

    # str.find returns -1 when the answer text does not occur in the context.
    # NOTE(review): this picks the first occurrence; if the dataset provides an
    # explicit answer offset, prefer it — confirm against the JSON schema.
    answer_start = example["context"].find(example["answer"])
    answer_end = answer_start + len(example["answer"])

    start_token, end_token = _char_span_to_token_span(
        offsets, sequence_ids, answer_start, answer_end
    )
    encoding["start_positions"] = start_token
    encoding["end_positions"] = end_token
    return encoding

# Tokenize both splits example by example (training split first, then evaluation).
train_dataset, eval_dataset = (
    split.map(preprocess_data, batched=False)
    for split in (train_dataset, eval_dataset)
)

Map:   0%|          | 0/347 [00:00<?, ? examples/s]

Map:   0%|          | 0/87 [00:00<?, ? examples/s]

In [None]:
# Expose only the tensors the model consumes. `token_type_ids` is included
# (when the tokenizer produced it) so that training sees the same segment ids
# that the tokenizer passes to the model at inference time.
model_columns = [
    column
    for column in (
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "start_positions",
        "end_positions",
    )
    if column in train_dataset.column_names
]
train_dataset.set_format(type="torch", columns=model_columns)
eval_dataset.set_format(type="torch", columns=model_columns)

# Training configuration
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # evaluate after every epoch
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",  # checkpoint after every epoch (must match the evaluation strategy)
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    # Reload the checkpoint with the lowest validation loss once training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Set up the Trainer.
# NOTE(review): the recorded run emitted a warning pointing at this call —
# presumably the `tokenizer=` argument being deprecated; confirm against the
# installed transformers version before renaming it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Run fine-tuning
trainer.train()

# Save the fine-tuned model and its tokenizer side by side
model.save_pretrained("./arabert-qa-model")
tokenizer.save_pretrained("./arabert-qa-model")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.3647,2.944973
2,2.786,2.801585
3,2.4675,3.369368
4,2.4177,3.719101
5,2.2118,3.325721
6,2.1959,3.739844
7,2.4044,3.567962
8,2.1325,3.738425
9,2.3874,3.535389
10,2.1742,3.532704


('./arabert-qa-model/tokenizer_config.json',
 './arabert-qa-model/special_tokens_map.json',
 './arabert-qa-model/vocab.txt',
 './arabert-qa-model/added_tokens.json',
 './arabert-qa-model/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load the fine-tuned model and tokenizer
model = AutoModelForQuestionAnswering.from_pretrained("./arabert-qa-model")
tokenizer = AutoTokenizer.from_pretrained("./arabert-qa-model")

# Example context and question
context = "هذا نص عام يُستخدم كخلفية لهذه السؤال."
question = "ما هو الفرق الرئيسي الرئيسي بين التعلم تحت الإشراف وغير المنظم للآلات؟"

# Longest answer span (in tokens) that will be considered.
MAX_ANSWER_TOKENS = 30

# Tokenize the pair. Only the context may be truncated, and character offsets
# are requested so the answer can be cut out of the original context string.
inputs = tokenizer(
    question,
    context,
    return_tensors="pt",
    truncation="only_second",
    padding=True,
    max_length=512,
    return_offsets_mapping=True,
)
# The offsets are not a model input, so remove them before the forward pass.
offset_mapping = inputs.pop("offset_mapping")[0]
sequence_ids = inputs.sequence_ids(0)

# Get the start/end scores for every token
with torch.no_grad():
    outputs = model(**inputs)

start_scores = outputs.start_logits
end_scores = outputs.end_logits

# Score every (start, end) pair jointly and keep only spans that
#   * lie entirely inside the context (sequence id 1), never in the question
#     or on special/padding tokens,
#   * have start <= end, and
#   * are at most MAX_ANSWER_TOKENS long.
# Taking two independent argmaxes can return reversed indices or select the
# question itself; a one-token span (start == end) is a legitimate answer.
context_mask = torch.tensor([sid == 1 for sid in sequence_ids], dtype=torch.bool)
positions = torch.arange(context_mask.numel())
span_length = positions[None, :] - positions[:, None] + 1
valid_span = (
    context_mask[:, None]
    & context_mask[None, :]
    & (span_length >= 1)
    & (span_length <= MAX_ANSWER_TOKENS)
)
span_scores = (start_scores[0][:, None] + end_scores[0][None, :]).masked_fill(
    ~valid_span, float("-inf")
)

if valid_span.any():
    start_index, end_index = divmod(int(torch.argmax(span_scores)), span_scores.size(1))
    # Debugging: show the selected token span
    print(f"Start index: {start_index}, End index: {end_index}")
    # Slice the original text rather than decoding token ids, so the answer
    # keeps the exact characters of the context.
    char_start = int(offset_mapping[start_index][0])
    char_end = int(offset_mapping[end_index][1])
    answer = context[char_start:char_end]
else:
    # No context token survived tokenization/truncation.
    answer = "Aucune réponse trouvée."

print(f"Question: {question}")
print(f"Réponse: {answer}")


Start index: 0, End index: 32
Question: ما هو الفرق الرئيسي الرئيسي بين التعلم تحت الإشراف وغير المنظم للآلات؟
Réponse: ما هو الفرق الرئيسي الرئيسي بين التعلم تحت الإشراف وغير المنظم للآلات ؟ هذا نص عام كخلف
