## Prepare data

In [1]:
import torch
training_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
training_device
import numpy as np

In [None]:
#!pip install evaluate datasets transformers accelerate==1.9.0 wandb safetensors==0.4.3

In [2]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict

median_map = {
    "A1": 2,
    "A2": 5,
    "B1": 8,
    "B2": 11,
    "C1": 14,
    "C2": 16
}

df = pd.read_csv("data/acc_data.csv")

df['ef_level'] = df.apply(lambda row: median_map[row['cefr_level']] if pd.isna(row['ef_level']) else row['ef_level'], axis=1)

df.head()

Unnamed: 0,recordId,gpt4o_judge_score,nova_judge_score,llama3_judge_score,majority_value,agreement_percentage,writing_id,task_id,level_title,cefr_level,ef_level,activity_instructions,student_submission
0,CALL0015056,0.0,0.0,0.0,0.0,100.0,3e46481f-4bef-4900-b990-169da9305089,af3c3b87-9a8a-449e-b4f0-737d76275fb9,Meetings,B1,8.0,"Your meeting is finishing, and your boss is re...","Your meeting is finishing, and your boss is re..."
1,CALL0000241,0.0,0.0,0.0,0.0,100.0,d3ca8e74-ec0f-41a1-8bc3-030edf392772,fa1e732b-7698-4b57-85da-907fc3660ec3,2-Beginner,A1,2.0,A new colleague is looking for a place to go f...,"From_\nToAll\nHi, there.\n\nCan you help me? I..."
2,CALL0022001,0.0,0.0,0.0,0.0,100.0,ad6bb8ac-07c2-415e-8195-19d3791f83d3,03251188-de56-444f-8971-e68e55719bc9,Research,B1,8.0,You work for a market research firm. You take ...,s
3,CALL0022165,0.0,0.0,0.0,0.0,100.0,85dc4eb7-4a04-4142-aeb4-740d466bbad1,3bdca4ab-e3ee-4d24-bab1-4de708580e2e,Research,B1,8.0,Write a summary of the report in no more than ...,G
4,CALL0000146,4.0,5.0,5.0,5.0,66.666667,57e620bf-efb2-49bf-9f36-ada83b46768d,b3285147-d9b6-42f7-9ffb-7482cd450db7,2-Beginner,A1,2.0,Write a paragraph about how your friend stays ...,My friend stays healthy and fit by exercising ...


In [3]:
df["task_id"].value_counts()

task_id
cfb893f1-bee1-4894-99f3-43af284493a5    205
5c44f26d-2f5c-4039-bcf5-378754d2fe4e    199
bdd2883c-b71b-4fb7-a44c-a53e7bbf26f4    196
4ce13bf1-4384-44a6-9ff7-5084e02f9e2d    195
7e3ee056-1866-488e-88f3-f56af8d22613    193
                                       ... 
e5780a8d-f6ef-4218-9d34-74e064c3e413     25
178cfadd-4162-4b5b-aaee-97a1bdcd4d98     25
9cf6c0c9-e87b-4e02-84c9-11266ea51d28     25
511ccd73-9db8-45e5-8f85-10e244841ce2     23
d2196935-4153-4f5c-a1d2-dae31174ae55     19
Name: count, Length: 161, dtype: int64

In [4]:
df['text'] = (
    "Prompt Level: " + df['ef_level'].astype(str) +
    " [SEP] Prompt: " + df['activity_instructions'] +
    " [SEP] Response: " + df['student_submission']
)

In [5]:
df.head()

Unnamed: 0,recordId,gpt4o_judge_score,nova_judge_score,llama3_judge_score,majority_value,agreement_percentage,writing_id,task_id,level_title,cefr_level,ef_level,activity_instructions,student_submission,text
0,CALL0015056,0.0,0.0,0.0,0.0,100.0,3e46481f-4bef-4900-b990-169da9305089,af3c3b87-9a8a-449e-b4f0-737d76275fb9,Meetings,B1,8.0,"Your meeting is finishing, and your boss is re...","Your meeting is finishing, and your boss is re...",Prompt Level: 8.0 [SEP] Prompt: Your meeting i...
1,CALL0000241,0.0,0.0,0.0,0.0,100.0,d3ca8e74-ec0f-41a1-8bc3-030edf392772,fa1e732b-7698-4b57-85da-907fc3660ec3,2-Beginner,A1,2.0,A new colleague is looking for a place to go f...,"From_\nToAll\nHi, there.\n\nCan you help me? I...",Prompt Level: 2.0 [SEP] Prompt: A new colleagu...
2,CALL0022001,0.0,0.0,0.0,0.0,100.0,ad6bb8ac-07c2-415e-8195-19d3791f83d3,03251188-de56-444f-8971-e68e55719bc9,Research,B1,8.0,You work for a market research firm. You take ...,s,Prompt Level: 8.0 [SEP] Prompt: You work for a...
3,CALL0022165,0.0,0.0,0.0,0.0,100.0,85dc4eb7-4a04-4142-aeb4-740d466bbad1,3bdca4ab-e3ee-4d24-bab1-4de708580e2e,Research,B1,8.0,Write a summary of the report in no more than ...,G,Prompt Level: 8.0 [SEP] Prompt: Write a summar...
4,CALL0000146,4.0,5.0,5.0,5.0,66.666667,57e620bf-efb2-49bf-9f36-ada83b46768d,b3285147-d9b6-42f7-9ffb-7482cd450db7,2-Beginner,A1,2.0,Write a paragraph about how your friend stays ...,My friend stays healthy and fit by exercising ...,Prompt Level: 2.0 [SEP] Prompt: Write a paragr...


In [6]:
df = df[["text", "task_id", "ef_level", "majority_value"]]
df = df.rename(columns={'majority_value': 'label'})
df.head()

Unnamed: 0,text,task_id,ef_level,label
0,Prompt Level: 8.0 [SEP] Prompt: Your meeting i...,af3c3b87-9a8a-449e-b4f0-737d76275fb9,8.0,0.0
1,Prompt Level: 2.0 [SEP] Prompt: A new colleagu...,fa1e732b-7698-4b57-85da-907fc3660ec3,2.0,0.0
2,Prompt Level: 8.0 [SEP] Prompt: You work for a...,03251188-de56-444f-8971-e68e55719bc9,8.0,0.0
3,Prompt Level: 8.0 [SEP] Prompt: Write a summar...,3bdca4ab-e3ee-4d24-bab1-4de708580e2e,8.0,0.0
4,Prompt Level: 2.0 [SEP] Prompt: Write a paragr...,b3285147-d9b6-42f7-9ffb-7482cd450db7,2.0,5.0


In [7]:
# Apply the mapping to the 'labels' column
#df['label'] = df['label'].map(label_mapping)
df.dropna(subset=['label', 'text'], inplace=True)
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,text,task_id,ef_level,label
0,Prompt Level: 8.0 [SEP] Prompt: Your meeting i...,af3c3b87-9a8a-449e-b4f0-737d76275fb9,8.0,0.0
1,Prompt Level: 2.0 [SEP] Prompt: A new colleagu...,fa1e732b-7698-4b57-85da-907fc3660ec3,2.0,0.0
2,Prompt Level: 8.0 [SEP] Prompt: You work for a...,03251188-de56-444f-8971-e68e55719bc9,8.0,0.0
3,Prompt Level: 8.0 [SEP] Prompt: Write a summar...,3bdca4ab-e3ee-4d24-bab1-4de708580e2e,8.0,0.0
4,Prompt Level: 2.0 [SEP] Prompt: Write a paragr...,b3285147-d9b6-42f7-9ffb-7482cd450db7,2.0,5.0


In [8]:
df["label"].value_counts()

label
5.0    5682
2.0    3726
0.0    3662
3.0    3018
4.0    2985
1.0     968
Name: count, dtype: int64

In [9]:
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['text', 'task_id', 'ef_level', 'label'],
    num_rows: 20041
})

In [10]:
from datasets import ClassLabel, Value, Sequence
new_features = ds.features.copy()
new_features["label"] = ClassLabel(names=[0, 1, 2, 3, 4, 5])
ds = ds.cast(new_features)

# Step 1: Initial train/test split with stratification
train_test_ds = ds.train_test_split(test_size=0.20, seed=20)

# Step 2: Split the test set into half test, half validation
test_valid_split = train_test_ds['test'].train_test_split(test_size=0.5, seed=20)

# Step 3: Combine everything into a single DatasetDict
ds = DatasetDict({
    'train': train_test_ds['train'],
    'test': test_valid_split['train'],    # This becomes the test set
    'validation': test_valid_split['test']  # This becomes the validation set
})
ds

Casting the dataset:   0%|          | 0/20041 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'task_id', 'ef_level', 'label'],
        num_rows: 16032
    })
    test: Dataset({
        features: ['text', 'task_id', 'ef_level', 'label'],
        num_rows: 2004
    })
    validation: Dataset({
        features: ['text', 'task_id', 'ef_level', 'label'],
        num_rows: 2005
    })
})

In [11]:
# Verify label distribution
from collections import Counter

print("Train label counts:", Counter(ds['train']['label']))
print("Test label counts:", Counter(ds['test']['label']))
print("Validation label counts:", Counter(ds['validation']['label']))

Train label counts: Counter({5: 4579, 2: 2970, 0: 2910, 3: 2424, 4: 2374, 1: 775})
Test label counts: Counter({5: 552, 0: 381, 2: 371, 3: 307, 4: 281, 1: 112})
Validation label counts: Counter({5: 551, 2: 385, 0: 371, 4: 330, 3: 287, 1: 81})


In [12]:
import json

# Fonction utilitaire pour sauvegarder un split en JSONL
def save_split_to_jsonl(dataset_split, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        for record in dataset_split:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Sauvegarde des trois splits
save_split_to_jsonl(ds['train'], 'data/train.jsonl')
save_split_to_jsonl(ds['test'], 'data/test.jsonl')
save_split_to_jsonl(ds['validation'], 'data/validation.jsonl')

## Roberta SFT

In [13]:
import numpy as np 
import evaluate

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, cohen_kappa_score, classification_report
from scipy.stats import pearsonr

metric = evaluate.load("accuracy")

In [14]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Convertir les logits en classes prédictes

    # 🎯 Exactitude (Accuracy)
    accuracy = accuracy_score(labels, predictions)

    # 🎯 Précision, Rappel et F1-score (pondérés)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")

    # 🎯 Score de Cohen's Kappa (pondéré)
    cohen_kappa = cohen_kappa_score(labels, predictions, weights="quadratic")

    # 🎯 Corrélation de Pearson
    pearson_corr, _ = pearsonr(labels, predictions)  # Retourne (coef, p-valeur), on garde seulement coef

     # 🎯 Classification Report
    class_report = classification_report(labels, predictions, output_dict=True)  # Get a dictionary of the report


    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "cohen_kappa": cohen_kappa,
        "pearson_corr": pearson_corr,
        "classification_report": class_report  # Add classification report to the return
    }


In [15]:
from datasets import load_dataset
from datasets import DatasetDict, Dataset

# Charger les fichiers JSONL en DatasetDict
dataset = DatasetDict({
    "train": load_dataset("json", data_files="data/train.jsonl")["train"],
    "test": load_dataset("json", data_files="data/test.jsonl")["train"],
    "valid": load_dataset("json", data_files="data/validation.jsonl")["train"]
})

print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'task_id', 'ef_level', 'label'],
        num_rows: 16032
    })
    test: Dataset({
        features: ['text', 'task_id', 'ef_level', 'label'],
        num_rows: 2004
    })
    valid: Dataset({
        features: ['text', 'task_id', 'ef_level', 'label'],
        num_rows: 2005
    })
})


In [16]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-large")

In [None]:
dataset["train"][0]

In [None]:
tok_test = tokenizer(dataset["train"][1]["text"], max_length=256, truncation=True)
tok_test

In [None]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

In [None]:
tokenized_train = dataset["train"].map(tokenize_function, batched=True)
tokenized_test = dataset["test"].map(tokenize_function, batched=True)
tokenized_valid = dataset["valid"].map(tokenize_function, batched=True)

In [None]:
unique_labels = set(dataset['train']['label'])
num_labels = len(unique_labels)
num_labels

In [None]:
from transformers import RobertaForSequenceClassification, TrainingArguments, Trainer
model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-large", num_labels=num_labels)

In [None]:
args = TrainingArguments(
    output_dir="../../../model_saved/roberta-large-ft-acc-writing-task-augmented",
    eval_strategy="steps",  # Évaluation aux mêmes intervalles que la sauvegarde
    save_strategy="steps",  # Sauvegarde tous les 500 steps
    save_steps=200,
    eval_steps=200,  # ⚠ IMPORTANT : Évaluation aux mêmes steps
    save_total_limit=4,  # Ne garde que 4 checkpoints max
    learning_rate=2e-5,
    warmup_ratio=0.1,
    lr_scheduler_type="linear", 
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    load_best_model_at_end=True,  
    metric_for_best_model="f1",
    logging_steps=100,
    fp16=True,
)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

## Detailed Evaluation

In [None]:
list_topic = dataset["valid"]["task_id"]
list_t_set = set(list_topic)
unique_t = (list(list_t_set))

list_level = dataset["valid"]["level_title"]
list_l_set = set(list_level)
unique_l = (list(list_l_set))


In [None]:
list_r = []

# Assuming 'unique_t' is a list of unique item_ids and 'trainer' is already defined
for t in unique_t:  # Iterate over the first item in unique_t
    sub_ds = tokenized_valid.filter(lambda example: example['task_id'] == t)
    # Get predictions using the trainer
    predictions = trainer.predict(sub_ds)
    # Raw output logits (size [batch_size, num_classes])
    outputs = predictions.predictions
    # Convert logits to predicted class labels (taking the argmax across the classes)
    predicted_labels = np.argmax(outputs, axis=-1)
    ref_label = predictions.label_ids
    # Print or save the predicted classes (this will be a numpy array with the predicted class indices)
    ck = round(cohen_kappa_score(predicted_labels, ref_label, weights="quadratic"), 2)  
    pearson_corr, _ = pearsonr(ref_label, predicted_labels)
    accuracy = accuracy_score(ref_label, predicted_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(ref_label, predicted_labels, average="weighted")

    r = {
        "task_id": t,
        "level_title": sub_ds["level_title"][0],
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "ck": ck,
        "pearson": pearson_corr,
        "n_samples": len(sub_ds)
    }
    list_r.append(r)

In [None]:
list_r_level = []

# Assuming 'unique_t' is a list of unique item_ids and 'trainer' is already defined
for l in unique_l:  # Iterate over the first item in unique_t
    sub_ds = tokenized_valid.filter(lambda example: example['level_title'] == l)
    # Get predictions using the trainer
    predictions = trainer.predict(sub_ds)
    # Raw output logits (size [batch_size, num_classes])
    outputs = predictions.predictions
    # Convert logits to predicted class labels (taking the argmax across the classes)
    predicted_labels = np.argmax(outputs, axis=-1)
    ref_label = predictions.label_ids
    # Print or save the predicted classes (this will be a numpy array with the predicted class indices)
    ck = round(cohen_kappa_score(predicted_labels, ref_label, weights="quadratic"), 2)  
    pearson_corr, _ = pearsonr(ref_label, predicted_labels)
    accuracy = accuracy_score(ref_label, predicted_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(ref_label, predicted_labels, average="weighted")

    r = {
        "level_title": sub_ds["level_title"][0],
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "ck": ck,
        "pearson": pearson_corr,
        "n_samples": len(sub_ds)
    }
    list_r_level.append(r)


In [None]:
import pandas as pd
df_eval_results = pd.DataFrame(list_r, columns=["task_id", "level_title", "accuracy", "precision", "recall", "f1", "ck", "pearson", "n_samples"])
df_eval_results.head(n=10)

In [None]:
df_eval_results.to_csv("result_eval_data_roberta_large_writing_task_acc.csv", index=False)

In [None]:
import pandas as pd
df_eval_results_level = pd.DataFrame(list_r_level, columns=["level_title", "accuracy", "precision", "recall", "f1", "ck", "pearson", "n_samples"])
df_eval_results_level.head(n=20)

In [None]:
df_eval_results_level.to_csv("result_eval_data_roberta_large_acc_by_level.csv", index=False)

## Onnx Export

In [None]:
#!pip install onnxruntime

In [2]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoTokenizer
import torch
import os
from onnxruntime.quantization import quantize_dynamic, QuantType

# === CONFIGURATION ===
# Chemin vers ton dossier contenant le .bin et le config.json
#model_dir = "model_saved/roberta-large-ft-acc-writing-task-augmented/checkpoint-1800"
model_dir="/tmp/tmpdxu3_htb"
onnx_model_path = "model_saved/roberta-large-ft-acc-writing-task-1800.onnx"
quantized_model_path = "model_saved/roberta-large-ft-acc-writing-task-1800-quantized.onnx"

# === ÉTAPE 1 : Charger le modèle et tokenizer ===
model = RobertaForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-large")
model.eval()

# === ÉTAPE 2 : Préparer un input fictif ===
dummy_text = "Texte d'exemple pour conversion ONNX"
inputs = tokenizer(dummy_text, return_tensors="pt", padding="max_length", max_length=256)

# === ÉTAPE 3 : Exporter vers ONNX ===
torch.onnx.export(
    model,
    (inputs["input_ids"], inputs["attention_mask"]),
    onnx_model_path,
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "logits": {0: "batch_size"},
    },
    opset_version=14  # ⬅️ change ici
)

print(f"✅ Modèle exporté en ONNX : {onnx_model_path}")

# === ÉTAPE 4 : Quantization dynamique ===
quantize_dynamic(
    model_input=onnx_model_path,
    model_output=quantized_model_path,
    weight_type=QuantType.QInt8
)

print(f"✅ Modèle quantifié en ONNX : {quantized_model_path}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

  inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask


✅ Modèle exporté en ONNX : model_saved/roberta-large-ft-acc-writing-task-1800.onnx




Ignore MatMul due to non constant B: /[/roberta/encoder/layer.0/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.0/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.1/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.1/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.2/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.2/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.3/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.3/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.4/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.4/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/roberta/encoder/layer.5/attention/self/MatMul]
Ignore MatMul due to non constant

In [3]:
import onnxruntime

onnx_session = onnxruntime.InferenceSession(onnx_model_path)
onnx_session_quant = onnxruntime.InferenceSession(quantized_model_path)

In [20]:
max_length = 256  # Ajuste selon la taille maximale de ton modèle

# Fonction d'inférence ONNX
def onnx_infer(input_texts, onnx_model):
    inputs = tokenizer(input_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    input_ids = inputs["input_ids"].numpy()
    attention_mask = inputs["attention_mask"].numpy()
    onnx_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
    onnx_outputs = onnx_model.run(None, onnx_inputs)
    return onnx_outputs[0]

from tqdm import tqdm

def evaluate_with_metrics(dataset, onnx_model, batch_size=16):
    all_logits = []
    all_labels = []

    # tqdm pour afficher la progression sur les batches
    for i in tqdm(range(0, len(dataset), batch_size), desc="Evaluation"):
        batch = dataset[i:i + batch_size]
        texts = batch["text"]
        labels = batch["label"]

        logits = onnx_infer(texts, onnx_model)
        all_logits.extend(logits)
        all_labels.extend(labels)

    all_logits = np.array(all_logits)
    all_labels = np.array(all_labels)

    # 🔥 Appliquer compute_metrics
    metrics = compute_metrics((all_logits, all_labels))
    return metrics

In [None]:
valid_data = ds["validation"]  # Ou "valid" selon ton dataset
# === Lancer l'évaluation ===
results = evaluate_with_metrics(valid_data, onnx_model=onnx_session)
print("🎯 Evaluation Results ONNX :")
for k, v in results.items():
    if k == "classification_report":
        print("\n📋 Classification Report :")
        for label, metrics in v.items():
            print(f"{label}: {metrics}")
    else:
        print(f"{k}: {v}")

In [None]:
valid_data = ds["validation"]  # Ou "valid" selon ton dataset
# === Lancer l'évaluation ===
results = evaluate_with_metrics(valid_data, onnx_model=onnx_session_quant)
print("🎯 Evaluation Results ONNX :")
for k, v in results.items():
    if k == "classification_report":
        print("\n📋 Classification Report :")
        for label, metrics in v.items():
            print(f"{label}: {metrics}")
    else:
        print(f"{k}: {v}")

In [6]:
import boto3

# Initialize the S3 client
s3 = boto3.client(
    "s3"
)

# Define your bucket name and desired path in S3
bucket_name = "sagemaker-studio-oxs6vznjds"

s3_key = "writing_task_models/accuracy/model_1800_quantized_roberta_large.onnx"  # Change path as needed
# Upload the ONNX file
bucket_path = "sagemaker-studio-oxs6vznjds"
quantized_model_path = "model_saved/roberta-large-ft-acc-writing-task-1800-quantized.onnx"

s3.upload_file(quantized_model_path, bucket_path, s3_key)

print(f"✅ ONNX model uploaded to s3://{bucket_name}/{s3_key}")

✅ ONNX model uploaded to s3://sagemaker-studio-oxs6vznjds/writing_task_models/accuracy/model_1800_quantized_roberta_large.onnx


## Load Onnx from s3

In [17]:
import boto3
import onnxruntime as ort

# Define S3 bucket and model key
bucket_name = 'sagemaker-studio-oxs6vznjds'
model_key = 'writing_task_models/accuracy/model_1800_quantized_roberta_large.onnx'
local_model_path = '/tmp/roberta-large-ft-acc-writing-task-1800-quant.onnx'  # or wherever you want to save temporarily

# Initialize boto3 S3 client
s3 = boto3.client('s3')

# Download the ONNX model from S3 to local path
s3.download_file(bucket_name, model_key, local_model_path)

# Load the ONNX model using onnxruntime
session = ort.InferenceSession(local_model_path)

print("ONNX model loaded successfully.")

ONNX model loaded successfully.


In [18]:
valid_data = ds["validation"]  # Ou "valid" selon ton dataset

In [21]:
results = evaluate_with_metrics(valid_data, onnx_model=session)
print("🎯 Evaluation Results ONNX :")
for k, v in results.items():
    if k == "classification_report":
        print("\n📋 Classification Report :")
        for label, metrics in v.items():
            print(f"{label}: {metrics}")
    else:
        print(f"{k}: {v}")


Evaluation: 100%|██████████| 126/126 [16:31<00:00,  7.87s/it]

🎯 Evaluation Results ONNX :
accuracy: 0.7581047381546134
precision: 0.7566740775830653
recall: 0.7581047381546134
f1: 0.756985630691875
cohen_kappa: 0.9184130088587737
pearson_corr: 0.9184389337067936

📋 Classification Report :
0: {'precision': 0.9400544959128065, 'recall': 0.9299191374663073, 'f1-score': 0.9349593495934959, 'support': 371.0}
1: {'precision': 0.5074626865671642, 'recall': 0.41975308641975306, 'f1-score': 0.4594594594594595, 'support': 81.0}
2: {'precision': 0.748780487804878, 'recall': 0.7974025974025974, 'f1-score': 0.7723270440251573, 'support': 385.0}
3: {'precision': 0.5939597315436241, 'recall': 0.6167247386759582, 'f1-score': 0.6051282051282051, 'support': 287.0}
4: {'precision': 0.5968253968253968, 'recall': 0.5696969696969697, 'f1-score': 0.5829457364341085, 'support': 330.0}
5: {'precision': 0.8558394160583942, 'recall': 0.8511796733212341, 'f1-score': 0.8535031847133758, 'support': 551.0}
accuracy: 0.7581047381546134
macro avg: {'precision': 0.707153702452044


