<a href="https://colab.research.google.com/github/latifamankai-afk/500-AI-Machine-learning-Deep-learning-Computer-vision-NLP-Projects-with-code/blob/main/Pretraitement_autremodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import os
from pathlib import Path

# Directory that holds the SQuAD v2.0 JSON files. The default is the location
# this notebook was written against; set the SQUAD_DATA_DIR environment
# variable to run the notebook elsewhere without editing this cell.
DATA_DIR = Path(os.environ.get("SQUAD_DATA_DIR", "/home/projetFouille/data"))


def load_squad_json(path):
    """Read one UTF-8 encoded JSON file and return the parsed object.

    Args:
        path: Location of the JSON file (str or path-like).

    Returns:
        The object produced by ``json.load`` for that file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


squad_train = load_squad_json(DATA_DIR / "train-v2.0.json")
squad_dev = load_squad_json(DATA_DIR / "dev-v2.0.json")


In [None]:
def extract_information(squad_json):
    """Flatten a SQuAD-style JSON object into three parallel lists.

    The input is walked as ``data -> paragraphs -> qas``. Every question
    produces one entry in each returned list.

    Args:
        squad_json: Mapping with a ``"data"`` list of articles; each article
            has ``"paragraphs"``, each paragraph a ``"context"`` string and a
            ``"qas"`` list whose items carry ``"question"`` and ``"answers"``.

    Returns:
        tuple: ``(contexts, questions, answers)``. ``answers[i]`` is a dict
        with keys ``"text"`` and ``"answer_start"``. Only the first listed
        answer is kept; a question with an empty ``"answers"`` list gets
        ``{"text": "", "answer_start": 0}``.
    """
    rows = []

    for article in squad_json["data"]:
        for paragraph in article["paragraphs"]:
            passage = paragraph["context"]

            for qa in paragraph["qas"]:
                query = qa["question"]
                gold = qa["answers"]

                if len(gold) > 0:
                    # Answerable question: keep the first gold answer only.
                    first = gold[0]
                    label = {
                        "text": first["text"],
                        "answer_start": first["answer_start"],
                    }
                else:
                    # Unanswerable question: empty text, position 0.
                    label = {"text": "", "answer_start": 0}

                rows.append((passage, query, label))

    contexts = [passage for passage, _, _ in rows]
    questions = [query for _, query, _ in rows]
    answers = [label for _, _, label in rows]
    return contexts, questions, answers


In [None]:
# Flatten both loaded splits into parallel lists (contexts, questions, answers):
# first the training split, then the dev split with a `_dev` suffix.
contexts,questions,answers= extract_information(squad_train)
contexts_dev,questions_dev,answers_dev= extract_information(squad_dev)

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# The tokenizer and the model are loaded from the same checkpoint name, so it
# is spelled once here.
MODEL_CHECKPOINT = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# The loading message for this checkpoint reports the `qa_outputs` weights as
# newly initialized, i.e. the question-answering head still has to be trained.
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)



Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def _find_answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Locate the tokens covering the character range ``[start_char, end_char)``.

    Only tokens whose sequence id is 1 (the second sequence passed to the
    tokenizer, i.e. the context) are considered.

    Returns:
        ``(start_token, end_token)`` when both ends of the answer fall inside
        this window, otherwise ``(0, 0)``.
    """
    start_pos = None
    end_pos = None
    for idx, (start, end) in enumerate(offsets):
        if sequence_ids[idx] != 1:
            continue
        if start <= start_char < end:
            start_pos = idx
        if start < end_char <= end:
            end_pos = idx

    # A window that contains only one end of the answer (or neither) must not
    # yield a half-filled label such as (start, 0): treat it as "no answer in
    # this window".
    if start_pos is None or end_pos is None or end_pos < start_pos:
        return 0, 0
    return start_pos, end_pos


def prepare_qa_inputs(batch):
    """Tokenize question/context pairs and attach answer token positions.

    Contexts longer than ``max_length`` are split into overlapping windows, so
    one input example can produce several output rows. Each row is labelled
    with ``start_positions`` / ``end_positions``:

    * the token indices covering the answer when the whole answer lies inside
      that row's window;
    * ``(0, 0)`` when the question has no answer, or when the answer is not
      fully contained in the window.

    Uses the module-level ``tokenizer``.

    Args:
        batch: Mapping with ``"question"`` and ``"context"`` (lists of str,
            ``None`` entries are replaced by ``""``) and ``"answers"`` (list
            of dicts whose ``"text"`` and ``"answer_start"`` values are
            lists; only element 0 is used).

    Returns:
        The tokenizer output without ``overflow_to_sample_mapping`` and
        ``offset_mapping``, plus ``start_positions`` and ``end_positions``.
    """
    questions = [q if q is not None else "" for q in batch["question"]]
    contexts = [c if c is not None else "" for c in batch["context"]]
    answers = batch["answers"]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=256,           # kept small to speed things up
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    # Both are only needed to compute the labels; they are not model inputs.
    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # Map this window back to the example it was cut from.
        answer = answers[sample_mapping[i]]

        # Unanswerable question: label with position 0 for both ends.
        if len(answer["text"]) == 0 or answer["text"][0] == "":
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        start_pos, end_pos = _find_answer_token_span(
            offsets, inputs.sequence_ids(i), start_char, end_char
        )
        start_positions.append(start_pos)
        end_positions.append(end_pos)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs




In [None]:
from datasets import Dataset

def _as_list_valued_answer(answer):
    """Wrap one answer's ``text`` and ``answer_start`` in single-element lists."""
    return {
        "text": [answer["text"]],
        "answer_start": [answer["answer_start"]],
    }


# Same wrapping for both splits.
formatted_answers = list(map(_as_list_valued_answer, answers))
formatted__dev_answers = list(map(_as_list_valued_answer, answers_dev))

# One row per question; the three columns are parallel lists.
train_dataset = Dataset.from_dict(
    {
        "context": contexts,
        "question": questions,
        "answers": formatted_answers,
    }
)

test_dataset = Dataset.from_dict(
    {
        "context": contexts_dev,
        "question": questions_dev,
        "answers": formatted__dev_answers,
    }
)




In [None]:
# Tokenize the training split in batches with prepare_qa_inputs. The original
# columns are removed, so only the columns returned by prepare_qa_inputs remain.
tokenized_train = train_dataset.map(prepare_qa_inputs, batched=True, remove_columns=train_dataset.column_names)


Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

In [None]:
# Apply the same batched preprocessing to the dev split, dropping its original columns.
# NOTE(review): the row count can differ from the example count — the progress
# logs show 11873 examples mapped here and 13740 rows iterated later on.
tokenized_dev = test_dataset.map(prepare_qa_inputs, batched=True, remove_columns=test_dataset.column_names)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Checkpointing
    output_dir="./distilbert_qa",
    save_steps=500,
    save_total_limit=2,
    push_to_hub=False,
    # Evaluation and logging cadence (in steps)
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    # Optimization
    num_train_epochs=2,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,   # raise this when a GPU is available
    per_device_eval_batch_size=4,
)

# The Trainer is built from the model, the arguments above and the two
# tokenized splits.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_dev,
    train_dataset=tokenized_train,
)


In [None]:
import evaluate
# Load the metric registered under the name "squad" from the `evaluate` library.
# NOTE(review): the data files loaded earlier are the v2.0 files and include
# questions without answers — confirm "squad" (rather than a v2 variant) is intended.
metric = evaluate.load("squad")

def compute_metrics(p):
    """Score predictions against references with the module-level ``metric``.

    Args:
        p: A pair ``(predictions, references)``. Each prediction is passed to
            the metric as ``prediction_text`` and each reference as
            ``answers``; both are given the string form of their position as
            ``id`` so the two lists line up.

    Returns:
        Whatever ``metric.compute`` returns.

    NOTE(review): the Trainer built in this notebook is not given this
    function, so it is not called during ``trainer.evaluate()``.
    """
    predicted, gold = p

    squad_predictions = []
    for position, text in enumerate(predicted):
        squad_predictions.append({"id": str(position), "prediction_text": text})

    squad_references = []
    for position, reference in enumerate(gold):
        squad_references.append({"id": str(position), "answers": reference})

    return metric.compute(predictions=squad_predictions, references=squad_references)

# Run the Trainer's evaluation on its eval_dataset and print the returned dict.
# NOTE(review): no `trainer.train()` call is visible before this point, so this
# appears to evaluate the model as loaded. `compute_metrics` is also not passed
# to the Trainer, which matches the output containing only loss/runtime fields.
results = trainer.evaluate()
print(results)


  super().__init__(loader)


{'eval_loss': 5.727094650268555, 'eval_model_preparation_time': 0.0128, 'eval_runtime': 36895.1172, 'eval_samples_per_second': 0.372, 'eval_steps_per_second': 0.093}


In [None]:

from tqdm import tqdm
import torch
import numpy as np

def _single_row_tensor(values):
    """Build a tensor from one row's values, add a leading dimension, move it to `device`."""
    return torch.tensor(values).unsqueeze(0).to(device)


# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

all_start_logits, all_end_logits = [], []

# Gradients are not needed for inference, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in tqdm(tokenized_dev):  # tokenized_dev = the tokenized dev dataset
        outputs = model(
            input_ids=_single_row_tensor(batch["input_ids"]),
            attention_mask=_single_row_tensor(batch["attention_mask"]),
        )

        # One NumPy array per row is appended to each list.
        all_start_logits.append(outputs.start_logits.cpu().numpy())
        all_end_logits.append(outputs.end_logits.cpu().numpy())


  2%|▏         | 318/13740 [02:06<1:31:26,  2.45it/s]

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
from transformers import pipeline

# Text-to-text generation pipeline used as a quick question-answering demo.
# NOTE(review): "facebook/bart-large-cnn" looks like a summarization checkpoint
# rather than a question-answering model — confirm it is the intended choice.
qa_pipeline = pipeline(task="text2text-generation", model="facebook/bart-large-cnn")

# The question and its context are packed into a single prompt string.
demo_prompt = "Question: Qui est Hiba ? Context: Hiba est une data engineer en France."
qa_pipeline(demo_prompt)
