## BERT Fine-Tuning

### Setup

In [None]:
#!pip install evaluate datasets transformers accelerate==0.26.0

### Load Data and Preprocessing

In [1]:
from datasets import load_dataset
from datasets import DatasetDict, Dataset

# Load each JSON split file and gather the splits into a single DatasetDict.
# load_dataset("json", ...) is indexed with ["train"] for every file, including
# the test and validation files.
dataset = DatasetDict({
    split_name: load_dataset("json", data_files=split_path)["train"]
    for split_name, split_path in (
        ("train", "data/train.json"),
        ("test", "data/test.json"),
        ("valid", "data/valid.json"),
    )
})

# Variant whose train and test splits only keep rows with item_id <= 44.
# NOTE(review): presumably item_ids above 44 are the synthetic prompts — confirm against the data.
dataset_not_synthetic_in_train = DatasetDict({
    split_name: dataset[split_name].filter(lambda example: example["item_id"] <= 44)
    for split_name in ("train", "test")
})
dataset_not_synthetic_in_train["valid"] = dataset["valid"]  # validation split is kept unchanged

print(dataset)
print(dataset_not_synthetic_in_train)

DatasetDict({
    train: Dataset({
        features: ['text', 'level', 'item_id', 'synthetic', 'label', '__index_level_0__'],
        num_rows: 7059
    })
    test: Dataset({
        features: ['text', 'level', 'item_id', 'synthetic', 'label', '__index_level_0__'],
        num_rows: 882
    })
    valid: Dataset({
        features: ['text', 'level', 'item_id', 'synthetic', 'label', '__index_level_0__'],
        num_rows: 883
    })
})
DatasetDict({
    train: Dataset({
        features: ['text', 'level', 'item_id', 'synthetic', 'label', '__index_level_0__'],
        num_rows: 6179
    })
    test: Dataset({
        features: ['text', 'level', 'item_id', 'synthetic', 'label', '__index_level_0__'],
        num_rows: 782
    })
    valid: Dataset({
        features: ['text', 'level', 'item_id', 'synthetic', 'label', '__index_level_0__'],
        num_rows: 883
    })
})


In [2]:
dataset["train"][0]

{'text': "Prompt Level: 2 [SEP] Prompt: What activities do you do at school? [SEP] Response: I'm studying computer science and engineering, I'm learning programming languages like python, java, c++, and I'm doing project in android app development and web development, I'm also learning data science",
 'level': 2,
 'item_id': 80,
 'synthetic': True,
 'label': 2,
 '__index_level_0__': 8371}

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
tokenizer(dataset["train"][0]["text"])

In [3]:
def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the notebook-level `tokenizer`.

    Every sequence is truncated and padded to exactly 256 tokens. The tokenizer
    is looked up when the function is called, so the tokenizer bound to the name
    `tokenizer` at that moment is the one that gets used.

    Args:
        examples: batch (as passed by `Dataset.map(..., batched=True)`) with a "text" column.

    Returns:
        The tokenizer output for the batch.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(examples["text"], **encode_options)

In [None]:
tokenized_train = dataset["train"].map(tokenize_function, batched=True)

In [None]:
tokenized_test = dataset["test"].map(tokenize_function, batched=True)

In [None]:
tokenized_valid = dataset["valid"].map(tokenize_function, batched=True)

In [4]:
# Count the distinct label values present in the training split; the count is
# later passed as `num_labels` when the classification models are created.
# NOTE(review): assumes the labels are the contiguous integers 0..num_labels-1 — confirm.
unique_labels = set(dataset['train']['label'])
num_labels = len(unique_labels)
num_labels

5

### BERT model fine-tuning with the Trainer API

In [None]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

In [5]:
import numpy as np 
import evaluate

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, cohen_kappa_score, classification_report

from scipy.stats import pearsonr

metric = evaluate.load("accuracy")

In [6]:
def compute_metrics(eval_pred):
    """Compute classification metrics for one Trainer evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``. The predicted class is the argmax
            of the logits over the last axis.

    Returns:
        dict with accuracy, weighted precision / recall / F1, quadratic-weighted
        Cohen's kappa, the Pearson correlation coefficient between labels and
        predictions, and the full classification report as a nested dict.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        labels, predicted, average="weighted"
    )
    # pearsonr yields (coefficient, p-value); only the coefficient is reported.
    correlation, _ = pearsonr(labels, predicted)

    return {
        "accuracy": accuracy_score(labels, predicted),
        "precision": weighted_precision,
        "recall": weighted_recall,
        "f1": weighted_f1,
        "cohen_kappa": cohen_kappa_score(labels, predicted, weights="quadratic"),
        "pearson_corr": correlation,
        # Per-class breakdown, returned as a dictionary rather than a printed table.
        "classification_report": classification_report(labels, predicted, output_dict=True),
    }


In [None]:
# Training configuration for the BERT classifier.
# `eval_strategy` is the argument name already used by the RoBERTa and
# DistilRoBERTa cells of this notebook; `evaluation_strategy` is its deprecated
# spelling and is rejected by recent transformers releases.
args = TrainingArguments(
    output_dir="../../../model_saved/bert-ft-efcamdat-augmented",
    eval_strategy="steps",  # evaluate on a step schedule, at the same interval as checkpointing
    save_strategy="steps",  # checkpoint on a step schedule
    save_steps=250,  # write a checkpoint every 250 steps
    eval_steps=250,  # IMPORTANT: evaluate at the same steps as the checkpoints
    save_total_limit=4,  # keep at most 4 checkpoints on disk
    learning_rate=2e-5,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    load_best_model_at_end=True,  # reload the best checkpoint (by metric_for_best_model) after training
    metric_for_best_model="f1",
    logging_steps=100,
    fp16=True,
)

In [None]:
# Build the Trainer for the BERT classifier. Training uses the tokenized train
# split; the periodic evaluation runs on the tokenized *test* split, and
# compute_metrics supplies the reported metrics.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

### RoBERTa model fine-tuning

In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-large")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [8]:
tok_test = tokenizer(dataset_not_synthetic_in_train["train"][1]["text"], max_length=256, truncation=True)
tok_test

{'input_ids': [0, 35396, 3320, 12183, 35, 132, 646, 3388, 510, 742, 42944, 35, 27705, 21700, 103, 1964, 9, 5296, 47, 32, 2811, 2159, 804, 4, 646, 3388, 510, 742, 19121, 35, 939, 524, 12793, 5326, 154, 907, 118, 154, 10, 278, 9, 29784, 3119, 61, 16, 2933, 6, 44477, 2629, 1459, 6, 8, 182, 2721, 4, 635, 24, 16, 1341, 3214, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
# Re-tokenize all three splits. tokenize_function reads the notebook-level
# `tokenizer`, which was rebound to the RoBERTa tokenizer a few cells above, so
# these assignments replace the earlier tokenized_* variables.
tokenized_train = dataset["train"].map(tokenize_function, batched=True)
tokenized_test = dataset["test"].map(tokenize_function, batched=True)
tokenized_valid = dataset["valid"].map(tokenize_function, batched=True)

Map:   0%|          | 0/7059 [00:00<?, ? examples/s]

Map:   0%|          | 0/882 [00:00<?, ? examples/s]

Map:   0%|          | 0/883 [00:00<?, ? examples/s]

In [10]:
dataset["train"][0]

{'text': "Prompt Level: 2 [SEP] Prompt: What activities do you do at school? [SEP] Response: I'm studying computer science and engineering, I'm learning programming languages like python, java, c++, and I'm doing project in android app development and web development, I'm also learning data science",
 'level': 2,
 'item_id': 80,
 'synthetic': True,
 'label': 2,
 '__index_level_0__': 8371}

In [11]:
from transformers import RobertaForSequenceClassification, TrainingArguments, Trainer
model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-large", num_labels=num_labels)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Training configuration for the RoBERTa-large classifier.
args = TrainingArguments(
    output_dir="../../../model_saved/roberta-large-ft-efcamdat-augmented",
    eval_strategy="steps",  # evaluate on a step schedule, at the same interval as checkpointing
    save_strategy="steps",  # checkpoint on a step schedule (every save_steps = 200 steps)
    save_steps=200,
    eval_steps=200,  # IMPORTANT: evaluate at the same steps as the checkpoints
    save_total_limit=4,  # keep at most 4 checkpoints
    learning_rate=2e-5,
    warmup_ratio=0.1,
    lr_scheduler_type="linear", 
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    load_best_model_at_end=True,  
    metric_for_best_model="f1",
    logging_steps=100,
    fp16=True,
)

In [13]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [14]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=2652, training_loss=0.34042270513051953, metrics={'train_runtime': 946.4422, 'train_samples_per_second': 44.751, 'train_steps_per_second': 2.802, 'total_flos': 1.9735710527609856e+16, 'train_loss': 0.34042270513051953, 'epoch': 6.0})

In [15]:
trainer.evaluate()

{'eval_loss': 1.35331130027771,
 'eval_accuracy': 0.8129251700680272,
 'eval_precision': 0.816128975356663,
 'eval_recall': 0.8129251700680272,
 'eval_f1': 0.8130910869145741,
 'eval_cohen_kappa': 0.9117223933128025,
 'eval_pearson_corr': 0.9124799412910857,
 'eval_classification_report': {'0': {'precision': 0.8907103825136612,
   'recall': 0.8489583333333334,
   'f1-score': 0.8693333333333333,
   'support': 192.0},
  '1': {'precision': 0.7975206611570248,
   'recall': 0.7423076923076923,
   'f1-score': 0.7689243027888446,
   'support': 260.0},
  '2': {'precision': 0.7942122186495176,
   'recall': 0.8487972508591065,
   'f1-score': 0.8205980066445183,
   'support': 291.0},
  '3': {'precision': 0.7321428571428571,
   'recall': 0.8367346938775511,
   'f1-score': 0.780952380952381,
   'support': 98.0},
  '4': {'precision': 0.9411764705882353,
   'recall': 0.7804878048780488,
   'f1-score': 0.8533333333333334,
   'support': 41.0},
  'accuracy': 0.8129251700680272,
  'macro avg': {'precisio

In [29]:
# save tokenizer
#tokenizer.save_pretrained("../../../model_saved/roberta-large-ft-efcamdat-augmented/checkpoint-2600")  # Save tokenizer to the same path

('../../../model_saved/roberta-large-ft-efcamdat-augmented/checkpoint-2600/tokenizer_config.json',
 '../../../model_saved/roberta-large-ft-efcamdat-augmented/checkpoint-2600/special_tokens_map.json',
 '../../../model_saved/roberta-large-ft-efcamdat-augmented/checkpoint-2600/vocab.json',
 '../../../model_saved/roberta-large-ft-efcamdat-augmented/checkpoint-2600/merges.txt',
 '../../../model_saved/roberta-large-ft-efcamdat-augmented/checkpoint-2600/added_tokens.json',
 '../../../model_saved/roberta-large-ft-efcamdat-augmented/checkpoint-2600/tokenizer.json')

### DistilRoBERTa fine-tuning

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")

In [None]:
tok_test = tokenizer(dataset["train"][1]["text"], max_length=256, truncation=True)
tok_test

In [None]:
tokenized_train = dataset["train"].map(tokenize_function, batched=True)
tokenized_test = dataset["test"].map(tokenize_function, batched=True)
tokenized_valid = dataset["valid"].map(tokenize_function, batched=True)

In [None]:
from transformers import RobertaForSequenceClassification, TrainingArguments, Trainer

# distilroberta-base is a distilled RoBERTa checkpoint, so it has to be loaded
# with the RoBERTa classification class. Loading it through
# DistilBertForSequenceClassification targets a different architecture, so the
# pretrained encoder weights would not be mapped onto the model.
model = RobertaForSequenceClassification.from_pretrained("distilbert/distilroberta-base", num_labels=num_labels)

In [None]:
# Training configuration for the DistilRoBERTa classifier.
args = TrainingArguments(
    output_dir="../../../model_saved/distilroberta-base-ft-efcamdat-augmented",
    eval_strategy="steps",  # evaluate on a step schedule, at the same interval as checkpointing
    save_strategy="steps",  # checkpoint on a step schedule (every save_steps = 200 steps)
    save_steps=200,
    eval_steps=200,  # IMPORTANT: evaluate at the same steps as the checkpoints
    save_total_limit=4,  # keep at most 4 checkpoints
    learning_rate=2e-5,
    warmup_ratio=0.1,
    lr_scheduler_type="linear", 
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,  
    metric_for_best_model="f1",
    logging_steps=100,
    fp16=True,
)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

### Flan T5 Base

In [None]:
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd

# Load the FLAN-T5 base seq2seq model and its tokenizer from the same checkpoint id.
# Both assignments rebind the notebook-level names `model` and `tokenizer`.
model_id="google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Convert the integer labels to strings so they can serve as seq2seq target text.
# The splits are first copied into DataFrames, then `dataset` is emptied and
# refilled in place with the converted splits.
train_df, test_df, valid_df = (
    pd.DataFrame(dataset[split_name]) for split_name in ("train", "test", "valid")
)

dataset.clear()
for split_name, split_frame in (("train", train_df), ("test", test_df), ("valid", valid_df)):
    split_frame['label'] = split_frame['label'].astype(str)
    dataset[split_name] = Dataset.from_pandas(split_frame)

In [None]:
from datasets import concatenate_datasets

# Measure the longest tokenized sequence over train + test, for inputs and for targets.
train_and_test = concatenate_datasets([dataset["train"], dataset["test"]])

# Source side: the longest tokenized "text". preprocess_function uses this value
# as the length inputs are padded / truncated to.
tokenized_inputs = train_and_test.map(
    lambda x: tokenizer(x["text"], truncation=True),
    batched=True,
    remove_columns=['text', 'label'],
)
max_source_length = max(len(token_ids) for token_ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# Target side: the longest tokenized "label".
tokenized_targets = train_and_test.map(
    lambda x: tokenizer(x["label"], truncation=True),
    batched=True,
    remove_columns=['text', 'label'],
)
max_target_length = max(len(token_ids) for token_ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")

In [None]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch into seq2seq model inputs plus label ids.

    Args:
        sample: batch with a "text" column (source strings) and a "label"
            column (target strings).
        padding: padding strategy forwarded to the tokenizer. With
            "max_length", pad-token ids in the labels are replaced by -100 so
            that padding is ignored in the loss.

    Returns:
        The tokenizer output for the inputs, with the label ids stored under "labels".
    """
    # The raw text is used as-is; no task prefix is prepended.
    source_texts = list(sample["text"])
    model_inputs = tokenizer(source_texts, max_length=max_source_length, padding=padding, truncation=True)

    # Targets go through the `text_target` keyword and are capped at 5 tokens.
    target_encoding = tokenizer(text_target=sample["label"], max_length=5, padding=padding, truncation=True)
    label_ids = target_encoding["input_ids"]

    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        label_ids = [[-100 if token == pad_id else token for token in row] for row in label_ids]

    model_inputs["labels"] = label_ids
    return model_inputs

# Apply preprocess_function to every split in batches, dropping the raw "text"
# and "label" columns, then show which columns remain.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['text', 'label'])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

In [None]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
# Download the NLTK "punkt" and "punkt_tab" resources
# (presumably required by sent_tokenize, which postprocess_text calls — confirm).
nltk.download("punkt")
nltk.download('punkt_tab')

# Metric: F1, loaded through `evaluate`. This rebinds the notebook-level name `metric`.
metric = evaluate.load("f1")

# Helper that normalises decoded predictions and references before scoring.
def postprocess_text(preds, labels):
    """Strip each string and put every sentence on its own line.

    The one-sentence-per-line layout is the format ROUGE-Lsum scoring expects;
    it is kept here even though the metric loaded in this cell is F1.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` of the processed lists.
    """
    def _one_sentence_per_line(text):
        return "\n".join(sent_tokenize(text.strip()))

    processed_preds = [_one_sentence_per_line(pred) for pred in preds]
    processed_labels = [_one_sentence_per_line(label) for label in labels]
    return processed_preds, processed_labels

def compute_metrics(eval_preds):
    """Decode generated sequences and score them against the references.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. When ``preds``
            is a tuple, only its first element is used.

    Returns:
        dict with the macro-averaged metric value(s) scaled to percentages
        (rounded to 4 decimals) and "gen_len", the mean number of non-pad
        tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the "ignore" marker, not a real token id, so it cannot be decoded.
    # Labels carry it because of loss masking. Predictions can carry it as well,
    # when batches of generated sequences with different lengths are gathered
    # and padded, so both arrays are mapped back to the pad token first.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # NOTE(review): the metric receives decoded *strings*; confirm it accepts
    # generations that are not plain class digits (e.g. empty or textual output).
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, average='macro')
    result = {k: round(v * 100, 4) for k, v in result.items()}

    # Generated length = number of non-pad tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [None]:
from transformers import DataCollatorForSeq2Seq

# Label positions holding padding are filled with -100 so that they are ignored in the loss.
label_pad_token_id = -100
# Data collator for seq2seq batches, built from the current tokenizer and model.
# pad_to_multiple_of=8 pads batch lengths up to a multiple of 8
# (presumably for hardware-friendly shapes under fp16 — confirm).
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Define training args for FLAN-T5.
# `eval_strategy` is the argument name already used by the RoBERTa and
# DistilRoBERTa cells of this notebook; `evaluation_strategy` is its deprecated
# spelling and is rejected by recent transformers releases.
training_args = Seq2SeqTrainingArguments(
    output_dir="../../../model_saved/flan-t5-base-ft-efcamdat-augmented",
    predict_with_generate=True,  # evaluation metrics are computed on generated sequences
    eval_strategy="steps",  # evaluate on a step schedule, at the same interval as checkpointing
    save_strategy="steps",  # checkpoint on a step schedule
    save_steps=250,  # write a checkpoint every 250 steps
    eval_steps=250,  # IMPORTANT: evaluate at the same steps as the checkpoints
    save_total_limit=4,  # keep at most 4 checkpoints on disk
    learning_rate=2e-5,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,  # reload the best checkpoint (by metric_for_best_model) after training
    metric_for_best_model="f1",
    logging_steps=100,
    # NOTE(review): T5-family checkpoints are often reported to be numerically
    # unstable under fp16; if the loss becomes NaN, try bf16 — confirm on the target hardware.
    fp16=True
)

# Create the Seq2SeqTrainer instance. Training uses the tokenized train split;
# evaluation runs on the tokenized *test* split. This rebinds the notebook-level
# name `trainer`.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Start training 
trainer.train()

In [None]:
trainer.evaluate()

### Evaluation dataset analysis

In [16]:
# Distinct item ids in the validation split. A set has no guaranteed iteration
# order, so the values are sorted to keep the per-item report in a stable,
# ascending order from run to run.
list_topic = dataset["valid"]["item_id"]
list_t_set = set(list_topic)
unique_t = sorted(list_t_set)

# Distinct levels in the validation split, sorted for the same reason.
list_level = dataset["valid"]["level"]
list_l_set = set(list_level)
unique_l = sorted(list_l_set)


In [17]:
list_r = []

# Build one metrics row per item_id of the validation split, using whichever
# `trainer` and `tokenized_valid` are currently bound in the notebook.
for t in unique_t:
    sub_ds = tokenized_valid.filter(lambda example: example['item_id'] == t)

    prediction_output = trainer.predict(sub_ds)
    y_true = prediction_output.label_ids
    # `predictions` is treated as class logits; the predicted class is the argmax over the last axis.
    y_pred = np.argmax(prediction_output.predictions, axis=-1)

    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )
    # pearsonr yields (coefficient, p-value); only the coefficient is kept.
    pearson_coefficient, _ = pearsonr(y_true, y_pred)

    list_r.append({
        "item_id": t,
        # level / synthetic are read from the first row of the item's subset
        "level": sub_ds["level"][0],
        "synthetic": sub_ds["synthetic"][0],
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": weighted_precision,
        "recall": weighted_recall,
        "f1": weighted_f1,
        # quadratic-weighted kappa; the only value rounded (2 decimals)
        "ck": round(cohen_kappa_score(y_pred, y_true, weights="quadratic"), 2),
        "pearson": pearson_coefficient,
        "n_samples": len(sub_ds),
    })

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

In [18]:
list_r_level = []

# Build one metrics row per level of the validation split, using whichever
# `trainer` and `tokenized_valid` are currently bound in the notebook.
for l in unique_l:
    sub_ds = tokenized_valid.filter(lambda example: example['level'] == l)

    prediction_output = trainer.predict(sub_ds)
    y_true = prediction_output.label_ids
    # `predictions` is treated as class logits; the predicted class is the argmax over the last axis.
    y_pred = np.argmax(prediction_output.predictions, axis=-1)

    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )
    # pearsonr yields (coefficient, p-value); only the coefficient is kept.
    pearson_coefficient, _ = pearsonr(y_true, y_pred)

    list_r_level.append({
        # read from the first row of the level's subset
        "level": sub_ds["level"][0],
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": weighted_precision,
        "recall": weighted_recall,
        "f1": weighted_f1,
        # quadratic-weighted kappa; the only value rounded (2 decimals)
        "ck": round(cohen_kappa_score(y_pred, y_true, weights="quadratic"), 2),
        "pearson": pearson_coefficient,
        "n_samples": len(sub_ds),
    })


Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

In [19]:
import pandas as pd
# Per-item metrics table built from list_r; the explicit column list fixes the column order.
df_eval_results = pd.DataFrame(list_r, columns=["item_id", "level", "synthetic", "accuracy", "precision", "recall", "f1", "ck", "pearson", "n_samples"])
df_eval_results.head(n=10)

Unnamed: 0,item_id,level,synthetic,accuracy,precision,recall,f1,ck,pearson,n_samples
0,0,1,False,0.85,0.869643,0.85,0.84859,0.88,0.89794,20
1,1,1,False,1.0,1.0,1.0,1.0,1.0,1.0,16
2,2,1,False,0.666667,0.790123,0.666667,0.677124,0.61,0.690849,18
3,3,1,False,0.75,0.785301,0.75,0.748434,0.81,0.816765,24
4,4,1,False,0.777778,0.814815,0.777778,0.777778,0.8,0.812158,18
5,5,2,False,0.833333,0.905556,0.833333,0.846451,0.91,0.908812,18
6,6,2,False,0.8,0.815714,0.8,0.798611,0.82,0.825723,20
7,7,2,False,0.722222,0.736111,0.722222,0.70101,0.81,0.824911,18
8,8,2,False,0.625,0.617857,0.625,0.609674,0.75,0.786276,16
9,9,2,False,0.842105,0.894737,0.842105,0.844444,0.83,0.850475,19


In [20]:
df_eval_results.to_csv("result_eval_data_roberta_large_efcamdat.csv", index=False)

In [21]:
import pandas as pd
# Per-level metrics table built from list_r_level; the explicit column list fixes the column order.
df_eval_results_level = pd.DataFrame(list_r_level, columns=["level", "accuracy", "precision", "recall", "f1", "ck", "pearson", "n_samples"])
df_eval_results_level.head(n=20)

Unnamed: 0,level,accuracy,precision,recall,f1,ck,pearson,n_samples
0,1,0.831933,0.84054,0.831933,0.833918,0.92,0.918251,119
1,2,0.801587,0.810042,0.801587,0.803258,0.86,0.858346,126
2,3,0.855263,0.860232,0.855263,0.856651,0.93,0.932318,76
3,4,0.718447,0.725035,0.718447,0.72122,0.9,0.896242,103
4,5,0.776,0.780491,0.776,0.772855,0.88,0.883648,125
5,6,0.835821,0.853582,0.835821,0.839487,0.92,0.918837,67
6,7,0.797872,0.807851,0.797872,0.798048,0.88,0.885718,94
7,8,0.75,0.75,0.75,0.75,0.82,0.818182,16
8,10,0.741379,0.746489,0.741379,0.740119,0.83,0.840675,58
9,11,1.0,1.0,1.0,1.0,1.0,1.0,13


In [22]:
df_eval_results_level.to_csv("result_eval_data_roberta_large_efcamdat_by_level.csv", index=False)

### Hyperparameter optimization

In [None]:
#!pip install ray
#!pip install "ray[tune]"

In [None]:
import ray
from pprint import pprint

In [None]:
# Start Ray, passing a custom directory through `_temp_dir`.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
ray.init(_temp_dir="/home/ec2-user/model_saved/ray_tmp")

In [None]:
pprint(ray.cluster_resources())

In [None]:
use_gpu = True  # set this to False to run on CPUs
num_workers = 1  # set this to number of GPUs or CPUs you want to use

In [None]:
import ray.data

# Wrap the Hugging Face splits as Ray datasets.
# NOTE(review): `raw_dataset` is not defined in any cell visible in this notebook,
# and the DatasetDict loaded at the top uses the keys "train" / "test" / "valid"
# (there is no "eval" key) — confirm which object and split names are intended.
ray_datasets = {
    "train": ray.data.from_huggingface(raw_dataset["train"]),
    "validation": ray.data.from_huggingface(raw_dataset["eval"]),
    "test": ray.data.from_huggingface(raw_dataset["test"]),
}
ray_datasets

In [None]:
import numpy as np
from typing import Dict

# Tokenize input sentences
def collate_fn(examples: Dict[str, np.array]):
    """Turn a batch of raw examples into model-ready tensors.

    Args:
        examples: batch with a "text" entry (sentences) and a "label" entry
            (integer class ids).

    Returns:
        The tokenizer output (PyTorch tensors, padded to the longest sequence
        in the batch) with the class ids added under "labels". The tensors are
        moved to the GPU when CUDA is available and stay on the CPU otherwise.
    """
    # NOTE(review): this uses the notebook-level `tokenizer`; confirm it matches
    # the checkpoint of the model being trained.
    outputs = tokenizer(
        list(examples["text"]),
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )

    outputs["labels"] = torch.LongTensor(examples["label"])

    # Only move tensors to the GPU when one is actually usable, so CPU-only
    # runs (use_gpu = False) do not fail on .cuda().
    if torch.cuda.is_available():
        for key, value in outputs.items():
            outputs[key] = value.cuda()

    return outputs

In [None]:
import torch
import numpy as np

from datasets import load_metric
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import ray.train
from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback


# Settings for the Ray Train run.
model_checkpoint = "bert-base-uncased"  # checkpoint loaded inside train_func
task = "review"  # only used to build the run name below
batch_size = 16

num_labels = 5  # rebinds the notebook-level num_labels with a fixed value
# NOTE(review): metric_name is not referenced in the visible cells — confirm whether it is still needed.
metric_name = (
    "accuracy"
)
model_name = model_checkpoint.split("/")[-1]

# Run name; train_func passes it as the first positional TrainingArguments argument.
name = f"{model_name}-finetuned-{task}"

# Calculate the maximum steps per epoch based on the number of rows in the training dataset.
# Make sure to scale by the total number of training workers and the per device batch size.
max_steps_per_epoch = ray_datasets["train"].count() // (batch_size * num_workers)


def train_func(config):
    """Per-worker training function handed to Ray Train's TorchTrainer.

    Loads the tokenizer and sequence-classification model named by
    `model_checkpoint`, wraps this worker's "train" and "eval" dataset shards as
    batch iterables, and runs a Hugging Face `Trainer` with a
    `RayTrainReportCallback` attached.

    Args:
        config: dict of hyperparameters. The keys "learning_rate", "epochs" and
            "weight_decay" are read, defaulting to 2e-5, 2 and 0.01.
    """
    print(f"Is CUDA available: {torch.cuda.is_available()}")

    # NOTE(review): `load_metric` is imported from `datasets`, while the rest of
    # the notebook uses `evaluate.load` — confirm it exists in the installed version.
    metric = load_metric("glue", "cola")
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels
    )

    # Dataset shards assigned to this worker; the names match the `datasets`
    # mapping given to TorchTrainer.
    train_ds = ray.train.get_dataset_shard("train")
    eval_ds = ray.train.get_dataset_shard("eval")

    # NOTE(review): collate_fn tokenizes with the notebook-level `tokenizer`,
    # not the one created above — confirm both refer to the same checkpoint.
    train_ds_iterable = train_ds.iter_torch_batches(
        batch_size=batch_size, collate_fn=collate_fn
    )
    eval_ds_iterable = eval_ds.iter_torch_batches(
        batch_size=batch_size, collate_fn=collate_fn
    )

    print("max_steps_per_epoch: ", max_steps_per_epoch)

    args = TrainingArguments(
        name,
        # `eval_strategy` is the argument name used by the executed cells of this
        # notebook; `evaluation_strategy` is its deprecated spelling.
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=config.get("learning_rate", 2e-5),
        num_train_epochs=config.get("epochs", 2),
        weight_decay=config.get("weight_decay", 0.01),
        push_to_hub=False,
        # The run length is given explicitly in steps (presumably because the
        # batch iterables expose no length — confirm).
        max_steps=max_steps_per_epoch * config.get("epochs", 2),
        disable_tqdm=True,  # declutter the output a little
        no_cuda=not use_gpu,  # you need to explicitly set no_cuda if you want CPUs
        report_to="none",
    )

    def compute_metrics(eval_pred):
        """Score argmax predictions against the references with the loaded metric."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        model,
        args,
        train_dataset=train_ds_iterable,
        eval_dataset=eval_ds_iterable,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Report metrics and checkpoints back to Ray Train.
    trainer.add_callback(RayTrainReportCallback())

    trainer = prepare_trainer(trainer)

    print("Starting training")
    trainer.train()

In [None]:
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig, CheckpointConfig

# Request a GPU per worker only when GPU training is enabled: a ScalingConfig
# that lists "GPU" in resources_per_worker while use_gpu is False is rejected by
# Ray, which would make the `use_gpu = False` switch unusable.
worker_resources = {"CPU": 1}
if use_gpu:
    worker_resources["GPU"] = 1

# Ray Train driver for train_func. This rebinds the notebook-level name `trainer`.
trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(
        num_workers=num_workers,
        resources_per_worker=worker_resources,
        use_gpu=use_gpu),
    # The keys here are the shard names train_func asks for.
    datasets={
        "train": ray_datasets["train"],
        "eval": ray_datasets["validation"],
    },
    run_config=RunConfig(
        # Keep only the single checkpoint with the lowest eval_loss.
        checkpoint_config=CheckpointConfig(
            num_to_keep=1,
            checkpoint_score_attribute="eval_loss",
            checkpoint_score_order="min",
        ),
    ),
)

In [None]:
result = trainer.fit()

In [None]:
result

### ONNX export and quantization

In [None]:
#!pip install onnxruntime

In [24]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
import os
from onnxruntime.quantization import quantize_dynamic, QuantType

# This cell calls AutoTokenizer, which the import lines above do not provide;
# import it here so the cell does not depend on an earlier cell having done so.
from transformers import AutoTokenizer

# === CONFIGURATION ===
# Directory of the fine-tuned checkpoint (weights and config.json)
model_dir = "../../../model_saved/roberta-large-ft-efcamdat-augmented/checkpoint-2600"
onnx_model_path = "../../../model_saved/roberta-large-ft-efcamdat-augmented-2600.onnx"
quantized_model_path = "../../../model_saved/roberta-large-ft-efcamdat-augmented-2600-quantized.onnx"

# === STEP 1: Load the model and tokenizer ===
# The weights come from the local checkpoint; the tokenizer is loaded from the
# base "FacebookAI/roberta-large" hub entry rather than from model_dir.
model = RobertaForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-large")
model.eval()

# === STEP 2: Build a dummy input ===
# Only used to trace the graph; batch and sequence dimensions are declared
# dynamic below, so the 32-token padding does not fix the exported input size.
dummy_text = "Texte d'exemple pour conversion ONNX"
inputs = tokenizer(dummy_text, return_tensors="pt", padding="max_length", max_length=32)

# === STEP 3: Export to ONNX ===
# Trace without autograd bookkeeping: the export only needs a forward pass.
with torch.no_grad():
    torch.onnx.export(
        model,
        (inputs["input_ids"], inputs["attention_mask"]),
        onnx_model_path,
        input_names=["input_ids", "attention_mask"],
        output_names=["logits"],
        dynamic_axes={
            "input_ids": {0: "batch_size", 1: "sequence_length"},
            "attention_mask": {0: "batch_size", 1: "sequence_length"},
            "logits": {0: "batch_size"},
        },
        opset_version=14,
    )

print(f"✅ Modèle exporté en ONNX : {onnx_model_path}")

# === STEP 4: Dynamic quantization ===
# Writes a second ONNX file whose weights are stored as signed 8-bit integers.
quantize_dynamic(
    model_input=onnx_model_path,
    model_output=quantized_model_path,
    weight_type=QuantType.QInt8
)

print(f"✅ Modèle quantifié en ONNX : {quantized_model_path}")

✅ Modèle exporté en ONNX : ../../../model_saved/roberta-large-ft-efcamdat-augmented-2600.onnx




Ignore MatMul due to non constant B: /[/roberta/encoder/layer.0/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.0/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.1/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.1/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.2/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.2/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.3/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.3/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.4/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.4/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.5/attention/self/MatMul]
Ignore MatMul due to non constant

In [25]:
import onnxruntime

# Open one ONNX Runtime session per exported file so the original export and
# its dynamically quantized copy can be evaluated side by side.
onnx_session = onnxruntime.InferenceSession(onnx_model_path)
onnx_session_quant = onnxruntime.InferenceSession(quantized_model_path)

In [26]:
max_length = 256  # Truncation limit passed to the tokenizer; adjust to your model's maximum sequence length

# ONNX inference helper
def onnx_infer(input_texts, onnx_model):
    """Tokenize ``input_texts`` and run them through an ONNX session.

    Args:
        input_texts: Text (or list of texts) handed to the notebook-level
            ``tokenizer``; inputs are padded together and truncated to the
            notebook-level ``max_length``.
        onnx_model: Object exposing ``run(output_names, feeds)``; the notebook
            passes its ONNX Runtime sessions here.

    Returns:
        The first output of the session run (expected to be the logits).
    """
    encoded = tokenizer(
        input_texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    # The session is fed NumPy arrays, keyed by the graph's input names.
    feeds = {name: encoded[name].numpy() for name in ("input_ids", "attention_mask")}
    return onnx_model.run(None, feeds)[0]

def evaluate_with_metrics(dataset, onnx_model, batch_size=16):
    """Run ``onnx_model`` over ``dataset`` in batches and score the result.

    Args:
        dataset: Sliceable collection whose slices expose ``"text"`` and
            ``"label"`` entries.
        onnx_model: Session forwarded unchanged to ``onnx_infer``.
        batch_size: Number of examples sent through the model per call.

    Returns:
        Whatever the notebook-level ``compute_metrics`` returns for the
        ``(logits, labels)`` pair gathered over the whole dataset.
    """
    collected_logits, collected_labels = [], []

    for start in range(0, len(dataset), batch_size):
        chunk = dataset[start:start + batch_size]
        chunk_texts, chunk_labels = chunk["text"], chunk["label"]

        # One logits row per example is appended for this batch.
        collected_logits.extend(onnx_infer(chunk_texts, onnx_model))
        collected_labels.extend(chunk_labels)

    # Score everything at once with compute_metrics.
    return compute_metrics((np.array(collected_logits), np.array(collected_labels)))

In [27]:
valid_data = dataset["valid"]  # validation split
# === Run the evaluation with the non-quantized ONNX session ===
results = evaluate_with_metrics(valid_data, onnx_model=onnx_session)
print("🎯 Evaluation Results ONNX :")
for k, v in results.items():
    if k == "classification_report":
        print("\n📋 Classification Report :")
        # The per-class entry is not named `metrics`: at cell level that would
        # overwrite the notebook-wide `metrics` variable used by later cells.
        for label, label_scores in v.items():
            print(f"{label}: {label_scores}")
    else:
        print(f"{k}: {v}")

🎯 Evaluation Results ONNX :
accuracy: 0.796149490373726
precision: 0.8005448097065477
recall: 0.796149490373726
f1: 0.7972984625934728
cohen_kappa: 0.9008232408545239
pearson_corr: 0.9021923676059609

📋 Classification Report :
0: {'precision': 0.9298245614035088, 'recall': 0.8238341968911918, 'f1-score': 0.8736263736263736, 'support': 193.0}
1: {'precision': 0.748062015503876, 'recall': 0.7423076923076923, 'f1-score': 0.7451737451737451, 'support': 260.0}
2: {'precision': 0.7628205128205128, 'recall': 0.815068493150685, 'f1-score': 0.7880794701986755, 'support': 292.0}
3: {'precision': 0.7475728155339806, 'recall': 0.7857142857142857, 'f1-score': 0.7661691542288557, 'support': 98.0}
4: {'precision': 0.9230769230769231, 'recall': 0.9, 'f1-score': 0.9113924050632911, 'support': 40.0}
accuracy: 0.796149490373726
macro avg: {'precision': 0.8222713656677604, 'recall': 0.813384933612771, 'f1-score': 0.8168882296581883, 'support': 883.0}
weighted avg: {'precision': 0.8005448097065477, 'recall

In [28]:
valid_data = dataset["valid"]  # validation split
# === Run the evaluation with the quantized ONNX session ===
results = evaluate_with_metrics(valid_data, onnx_model=onnx_session_quant)
print("🎯 Evaluation Results ONNX :")
for k, v in results.items():
    if k == "classification_report":
        print("\n📋 Classification Report :")
        # The per-class entry is not named `metrics`: at cell level that would
        # overwrite the notebook-wide `metrics` variable used by later cells.
        for label, label_scores in v.items():
            print(f"{label}: {label_scores}")
    else:
        print(f"{k}: {v}")

🎯 Evaluation Results ONNX :
accuracy: 0.7950169875424689
precision: 0.8013297252177548
recall: 0.7950169875424689
f1: 0.7966965609003842
cohen_kappa: 0.8988009964168097
pearson_corr: 0.8998499346438857

📋 Classification Report :
0: {'precision': 0.9397590361445783, 'recall': 0.8082901554404145, 'f1-score': 0.8690807799442897, 'support': 193.0}
1: {'precision': 0.7282608695652174, 'recall': 0.7730769230769231, 'f1-score': 0.75, 'support': 260.0}
2: {'precision': 0.7687296416938111, 'recall': 0.8082191780821918, 'f1-score': 0.7879799666110183, 'support': 292.0}
3: {'precision': 0.7708333333333334, 'recall': 0.7551020408163265, 'f1-score': 0.7628865979381443, 'support': 98.0}
4: {'precision': 0.9210526315789473, 'recall': 0.875, 'f1-score': 0.8974358974358975, 'support': 40.0}
accuracy: 0.7950169875424689
macro avg: {'precision': 0.8257271024631775, 'recall': 0.8039376594831712, 'f1-score': 0.81347664838587, 'support': 883.0}
weighted avg: {'precision': 0.8013297252177548, 'recall': 0.795

In [None]:
def create_dynamic_text(prompt_level, prompt, response):
    """Assemble the single-string model input from its three parts.

    The prompt level, the prompt (question) and the response are each
    labelled and separated by the literal `` [SEP] `` marker.
    """
    template = "Prompt Level: {} [SEP] Prompt: {} [SEP] Response: {}"
    return template.format(prompt_level, prompt, response)

def run_test(prompt_level, prompt, response):
    """Score one prompt/response pair with the quantized session and print it.

    Prints the raw logits followed by the predicted class index; nothing is
    returned.
    """
    # Build the formatted model input and run inference on it.
    logits = onnx_infer(
        create_dynamic_text(prompt_level, prompt, response),
        onnx_model=onnx_session_quant,
    )

    # Show the raw logits.
    print(f"Logits pour le prompt : '{prompt}' avec niveau {prompt_level}")
    print(logits)

    # Predicted class = index of the largest logit along the last axis.
    predicted = np.argmax(logits, axis=-1)
    print(f"Prédiction (classe) : {predicted}")

# === Dynamic test ===
# Hand-written example passed to run_test as a manual check of the quantized model.
prompt_level = 2
prompt = "What activities do you do at school?"
response = "I'm studying computer science and engineering, I'm learning programming languages like python, java, c++, and I'm doing project in android app development and web development, I'm also learning data science"
run_test(prompt_level, prompt, response)

### Save Model to S3

In [30]:
import boto3

In [31]:
# Initialize the S3 client (no credentials are passed explicitly here)
s3 = boto3.client(
    "s3"
)

# Define your bucket name and desired path in S3
bucket_name = "sagemaker-eu-central-1-505049265445"
s3_key = "models/writing_text_scoring_model/model_2600_roberta_large.onnx"  # Change path as needed
# NOTE(review): the file uploaded below is the *quantized* export, while the
# S3 key above does not mention quantization - confirm the name is intended.
quantized_model_path = "../../../model_saved/roberta-large-ft-efcamdat-augmented-2600-quantized.onnx"
# Upload the ONNX file
s3.upload_file(quantized_model_path, bucket_name, s3_key)

print(f"✅ ONNX model uploaded to s3://{bucket_name}/{s3_key}")

✅ ONNX model uploaded to s3://sagemaker-eu-central-1-505049265445/models/writing_text_scoring_model/model_2600_roberta_large.onnx


### Evaluate on the Benchmark Dataset

In [32]:
import pandas as pd
# Load the benchmark examples from a CSV file, path relative to the notebook.
df_benchmark = pd.read_csv("data/benchmark_data.csv")


In [33]:
def _format_benchmark_row(row):
    """Render one benchmark row as the single-string model input."""
    return f"Prompt Level: {row['level']} [SEP] Prompt: {row['prompt']} [SEP] Response: {row['answer']}"


# Add the formatted model input as a new column, one string per row.
df_benchmark["input_text"] = df_benchmark.apply(_format_benchmark_row, axis=1)
df_benchmark.head()

Unnamed: 0,prompt,answer,label,synthetic,item_score,level,input_text
0,Describe your office including its equipment a...,Laptop and class and clinics,0,False,0.0,1,Prompt Level: 1 [SEP] Prompt: Describe your of...
1,Write a thoughtful and detailed review of a mo...,kalki a telugu movie which is a thriller combi...,2,False,54.0,11,Prompt Level: 11 [SEP] Prompt: Write a thought...
2,Explain your step by step plan to achieve your...,It is important to understand what do we want ...,3,False,73.0,7,Prompt Level: 7 [SEP] Prompt: Explain your ste...
3,Write a thoughtful and detailed review of a mo...,Last week I saw a beautiful and fantastic movi...,4,False,93.0,11,Prompt Level: 11 [SEP] Prompt: Write a thought...
4,Write an email to introduce yourself.,"Hello , my name is hajar i work as an accoutan...",2,False,45.0,1,Prompt Level: 1 [SEP] Prompt: Write an email t...


In [34]:
from datasets import Dataset

# Keep only the model input and the label, then expose the input under the
# "text" column name that evaluate_with_metrics reads.
hf_dataset = Dataset.from_pandas(
    df_benchmark[["input_text", "label"]]
).rename_column("input_text", "text")
hf_dataset[0]

{'text': 'Prompt Level: 1 [SEP] Prompt: Describe your office including its equipment and materials. [SEP] Response: Laptop and class and clinics',
 'label': 0}

In [35]:
# Score the benchmark set with the quantized ONNX session.
metrics = evaluate_with_metrics(hf_dataset, onnx_session_quant)
print(metrics)

{'accuracy': 0.656, 'precision': 0.6884781477272841, 'recall': 0.656, 'f1': 0.6545529141717424, 'cohen_kappa': 0.8131182309684821, 'pearson_corr': 0.8392447530686421, 'classification_report': {'0': {'precision': 0.7547169811320755, 'recall': 0.8602150537634409, 'f1-score': 0.8040201005025126, 'support': 93.0}, '1': {'precision': 0.5806451612903226, 'recall': 0.72, 'f1-score': 0.6428571428571429, 'support': 100.0}, '2': {'precision': 0.5857142857142857, 'recall': 0.6721311475409836, 'f1-score': 0.6259541984732825, 'support': 122.0}, '3': {'precision': 0.5416666666666666, 'recall': 0.5131578947368421, 'f1-score': 0.527027027027027, 'support': 76.0}, '4': {'precision': 0.9482758620689655, 'recall': 0.5045871559633027, 'f1-score': 0.6586826347305389, 'support': 109.0}, 'accuracy': 0.656, 'macro avg': {'precision': 0.6822037913744632, 'recall': 0.6540182504009138, 'f1-score': 0.6517082207181009, 'support': 500.0}, 'weighted avg': {'precision': 0.6884781477272841, 'recall': 0.656, 'f1-score'

In [36]:
print("🎯 Evaluation Benchmark dataset Results ONNX :")
for k, v in metrics.items():
    if k == "classification_report":
        print("\n📋 Classification Report :")
        # Use a distinct name for the per-class entry: reusing `metrics` here
        # would rebind the dict being reported, leaving `metrics` pointing at
        # a single report row once the cell finishes and breaking any re-run.
        for label, label_scores in v.items():
            print(f"{label}: {label_scores}")
    else:
        print(f"{k}: {v}")

🎯 Evaluation Benchmark dataset Results ONNX :
accuracy: 0.656
precision: 0.6884781477272841
recall: 0.656
f1: 0.6545529141717424
cohen_kappa: 0.8131182309684821
pearson_corr: 0.8392447530686421

📋 Classification Report :
0: {'precision': 0.7547169811320755, 'recall': 0.8602150537634409, 'f1-score': 0.8040201005025126, 'support': 93.0}
1: {'precision': 0.5806451612903226, 'recall': 0.72, 'f1-score': 0.6428571428571429, 'support': 100.0}
2: {'precision': 0.5857142857142857, 'recall': 0.6721311475409836, 'f1-score': 0.6259541984732825, 'support': 122.0}
3: {'precision': 0.5416666666666666, 'recall': 0.5131578947368421, 'f1-score': 0.527027027027027, 'support': 76.0}
4: {'precision': 0.9482758620689655, 'recall': 0.5045871559633027, 'f1-score': 0.6586826347305389, 'support': 109.0}
accuracy: 0.656
macro avg: {'precision': 0.6822037913744632, 'recall': 0.6540182504009138, 'f1-score': 0.6517082207181009, 'support': 500.0}
weighted avg: {'precision': 0.6884781477272841, 'recall': 0.656, 'f1-s

In [79]:
# Score the benchmark set with the non-quantized ONNX session.
# This overwrites the `metrics` value produced for the quantized session.
metrics = evaluate_with_metrics(hf_dataset, onnx_session)
print(metrics)

{'accuracy': 0.624, 'precision': 0.6594395434907852, 'recall': 0.624, 'f1': 0.6253174939064662, 'cohen_kappa': 0.8109329869597169, 'pearson_corr': 0.8316285942858317, 'classification_report': {'0': {'precision': 0.7027027027027027, 'recall': 0.8387096774193549, 'f1-score': 0.7647058823529411, 'support': 93.0}, '1': {'precision': 0.559322033898305, 'recall': 0.66, 'f1-score': 0.6055045871559633, 'support': 100.0}, '2': {'precision': 0.5634920634920635, 'recall': 0.5819672131147541, 'f1-score': 0.5725806451612904, 'support': 122.0}, '3': {'precision': 0.47674418604651164, 'recall': 0.5394736842105263, 'f1-score': 0.5061728395061729, 'support': 76.0}, '4': {'precision': 0.9491525423728814, 'recall': 0.5137614678899083, 'f1-score': 0.6666666666666666, 'support': 109.0}, 'accuracy': 0.624, 'macro avg': {'precision': 0.6502827057024929, 'recall': 0.6267824085269088, 'f1-score': 0.6231261241686068, 'support': 500.0}, 'weighted avg': {'precision': 0.6594395434907852, 'recall': 0.624, 'f1-score

In [80]:
print("🎯 Evaluation Benchmark dataset Results ONNX :")
for k, v in metrics.items():
    if k == "classification_report":
        print("\n📋 Classification Report :")
        # Use a distinct name for the per-class entry: reusing `metrics` here
        # would rebind the dict being reported, leaving `metrics` pointing at
        # a single report row once the cell finishes and breaking any re-run.
        for label, label_scores in v.items():
            print(f"{label}: {label_scores}")
    else:
        print(f"{k}: {v}")

🎯 Evaluation Benchmark dataset Results ONNX :
accuracy: 0.624
precision: 0.6594395434907852
recall: 0.624
f1: 0.6253174939064662
cohen_kappa: 0.8109329869597169
pearson_corr: 0.8316285942858317

📋 Classification Report :
0: {'precision': 0.7027027027027027, 'recall': 0.8387096774193549, 'f1-score': 0.7647058823529411, 'support': 93.0}
1: {'precision': 0.559322033898305, 'recall': 0.66, 'f1-score': 0.6055045871559633, 'support': 100.0}
2: {'precision': 0.5634920634920635, 'recall': 0.5819672131147541, 'f1-score': 0.5725806451612904, 'support': 122.0}
3: {'precision': 0.47674418604651164, 'recall': 0.5394736842105263, 'f1-score': 0.5061728395061729, 'support': 76.0}
4: {'precision': 0.9491525423728814, 'recall': 0.5137614678899083, 'f1-score': 0.6666666666666666, 'support': 109.0}
accuracy: 0.624
macro avg: {'precision': 0.6502827057024929, 'recall': 0.6267824085269088, 'f1-score': 0.6231261241686068, 'support': 500.0}
weighted avg: {'precision': 0.6594395434907852, 'recall': 0.624, 'f1-

### Gradio Demo

In [None]:
#!pip install gradio

In [None]:
import gradio as gr
import numpy as np
from scipy.special import softmax

In [None]:
def predict(level, prompt, response):
    """Return the two most likely classes for one prompt/response pair.

    Args:
        level: Prompt level, inserted verbatim into the model input.
        prompt: The question shown to the writer.
        response: The writer's answer to score.

    Returns:
        A list of two ``(label, probability)`` tuples, most likely first.
        Labels look like ``"Classe 3"``; probabilities are strings formatted
        with four decimals.
    """
    # Reuse the notebook's shared helpers so the demo builds and tokenizes its
    # input exactly like the evaluation cells (same template, same settings).
    text = create_dynamic_text(level, prompt, response)
    logits = onnx_infer(text, onnx_model=onnx_session_quant)

    # Softmax over the only row of the batch (a single text was submitted).
    probs = softmax(logits[0])

    # Indices of the two largest probabilities, highest first.
    top2_indices = np.argsort(probs)[::-1][:2]

    return [(f"Classe {idx}", f"{probs[idx]:.4f}") for idx in top2_indices]

In [83]:
# Wire `predict` to a three-field form; its result is shown as a table.
interface = gr.Interface(
    fn=predict,
    inputs=[
        # All three fields are free-text boxes, so `predict` receives strings.
        gr.Textbox(label="Prompt Level", placeholder="e.g. 2"),
        gr.Textbox(label="Prompt", placeholder="e.g. What activities do you do at school?"),
        gr.Textbox(label="Response", placeholder="e.g. I'm studying computer science...")
    ],
    outputs=gr.Dataframe(headers=["Class", "Probability Score"], label="Top 2 Predictions"),
    title="🧠 ONNX Prediction with RoBERTa",
    description="Enter a prompt level, a prompt, and a response. The model will predict the top 2 most likely classes along with their probability scores."
)

# === Launch the interface in the notebook ===
# NOTE(review): share=True asks Gradio for a shareable link in addition to the
# inline view - confirm that exposing the model this way is intended.
interface.launch(share=True, inline=True)