## Prepare data

In [None]:
import numpy as np
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
training_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Keep this bare expression last in the cell so the notebook displays the chosen device.
training_device

In [None]:
#!pip install evaluate datasets transformers accelerate==1.9.0 wandb safetensors==0.4.3

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict

# Load the annotated data from CSV and preview its first rows.
df = pd.read_csv("data/acc_data.csv")
df.head()

In [None]:
df["task_id"].value_counts()

In [None]:
# Assemble the model input from three labelled segments separated by " [SEP] ".
# Only the level title is cast to str before concatenation.
text_parts = [
    "Prompt Level: " + df['level_title'].astype(str),
    " [SEP] Prompt: " + df['activity_instructions'],
    " [SEP] Response: " + df['student_submission'],
]
df['text'] = text_parts[0] + text_parts[1] + text_parts[2]

In [None]:
df.head()

In [None]:
# Keep only the columns used downstream and expose `majority_value` under the name `label`.
df = (
    df.loc[:, ["text", "task_id", "level_title", "majority_value"]]
    .rename(columns={'majority_value': 'label'})
)
df.head()

In [None]:
# Discard rows that lack a label or a text, then renumber the index from zero.
df = df.dropna(subset=['label', 'text']).reset_index(drop=True)

df.head()

In [None]:
df["label"].value_counts()

In [None]:
# Convert the prepared DataFrame into a `datasets.Dataset`.
ds = Dataset.from_pandas(df)
ds

In [None]:
from datasets import ClassLabel, Value, Sequence

# Turn `label` into a ClassLabel feature: `stratify_by_column` only accepts ClassLabel columns.
# ClassLabel names are documented as strings, so the six class ids are spelled as strings.
new_features = ds.features.copy()
new_features["label"] = ClassLabel(names=["0", "1", "2", "3", "4", "5"])
ds = ds.cast(new_features)

# Step 1: Initial train/test split, stratified on the label so every split keeps
# the same label proportions.
train_test_ds = ds.train_test_split(test_size=0.20, seed=20, stratify_by_column="label")

# Step 2: Split the held-out 20% into half test, half validation (also stratified).
# NOTE: a stratified split needs at least two held-out examples per class.
test_valid_split = train_test_ds['test'].train_test_split(
    test_size=0.5, seed=20, stratify_by_column="label"
)

# Step 3: Combine everything into a single DatasetDict
ds = DatasetDict({
    'train': train_test_ds['train'],
    'test': test_valid_split['train'],    # This becomes the test set
    'validation': test_valid_split['test']  # This becomes the validation set
})
ds

In [None]:
# Check how the labels are distributed within each split
from collections import Counter

split_headers = {
    "train": "Train label counts:",
    "test": "Test label counts:",
    "validation": "Validation label counts:",
}
for split_key, header in split_headers.items():
    print(header, Counter(ds[split_key]['label']))

In [None]:
import json

# Utility that dumps one dataset split to a JSON Lines file
def save_split_to_jsonl(dataset_split, filename):
    """Write every record of ``dataset_split`` to ``filename``, one JSON object per line.

    Args:
        dataset_split: Iterable of JSON-serialisable records.
        filename: Destination path; opened in write mode with UTF-8 encoding.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        # ensure_ascii=False keeps non-ASCII characters as-is instead of \u escapes.
        out_file.writelines(
            json.dumps(row, ensure_ascii=False) + '\n' for row in dataset_split
        )

# Write the three splits to disk
for split_key, jsonl_path in (
    ('train', 'data/train.jsonl'),
    ('test', 'data/test.jsonl'),
    ('validation', 'data/validation.jsonl'),
):
    save_split_to_jsonl(ds[split_key], jsonl_path)

## RoBERTa SFT

In [None]:
import numpy as np 
import evaluate

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, cohen_kappa_score, classification_report

from scipy.stats import pearsonr

metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score predictions with classification and agreement metrics.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax of
            ``logits`` along the last axis.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` (the last three
        weighted-averaged), ``cohen_kappa`` (quadratic weights), ``pearson_corr`` and
        ``classification_report`` (the scikit-learn report as a dict).
    """
    logits, labels = eval_pred
    # Turn the logits into predicted class ids
    predictions = np.argmax(logits, axis=-1)

    scores = {"accuracy": accuracy_score(labels, predictions)}

    # Weighted precision / recall / F1; the fourth returned value is not used
    scores["precision"], scores["recall"], scores["f1"], _ = precision_recall_fscore_support(
        labels, predictions, average="weighted"
    )

    # Quadratic-weighted Cohen's kappa
    scores["cohen_kappa"] = cohen_kappa_score(labels, predictions, weights="quadratic")

    # pearsonr returns (coefficient, p-value); only the coefficient is kept
    pearson_corr, _ = pearsonr(labels, predictions)
    scores["pearson_corr"] = pearson_corr

    # Per-class report as a dictionary
    scores["classification_report"] = classification_report(
        labels, predictions, output_dict=True
    )
    return scores


In [None]:
from datasets import load_dataset
from datasets import DatasetDict, Dataset

# Reload the JSONL files into a DatasetDict (the validation file is exposed as "valid")
split_files = {
    "train": "data/train.jsonl",
    "test": "data/test.jsonl",
    "valid": "data/validation.jsonl",
}
dataset = DatasetDict({
    split_key: load_dataset("json", data_files=jsonl_file)["train"]
    for split_key, jsonl_file in split_files.items()
})

print(dataset)

In [None]:
from transformers import AutoTokenizer
# Tokenizer of the pretrained "FacebookAI/roberta-large" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-large")

In [None]:
dataset["train"][0]

In [None]:
# Sanity check: tokenize one training example with truncation enabled and max_length=256.
tok_test = tokenizer(dataset["train"][1]["text"], max_length=256, truncation=True)
tok_test

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook-level ``tokenizer``.

    Truncation is enabled and padding uses the ``"max_length"`` strategy with
    ``max_length=256``.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=256)
    return tokenizer(examples["text"], **encode_options)

In [None]:
# Tokenize the three splits in batched mode (train, then test, then valid)
tokenized_train, tokenized_test, tokenized_valid = (
    dataset[split_key].map(tokenize_function, batched=True)
    for split_key in ("train", "test", "valid")
)

In [None]:
# Size the classification head from the largest label id found in any split rather than
# from the number of distinct labels in train: label ids index the logits, so a class id
# absent from train (or a gap in the ids) would otherwise leave the head too small.
unique_labels = set(dataset['train']['label'])
largest_label_id = max(int(max(split['label'])) for split in dataset.values())
num_labels = max(len(unique_labels), largest_label_id + 1)
num_labels

In [None]:
from transformers import RobertaForSequenceClassification, TrainingArguments, Trainer
# Load the pretrained "FacebookAI/roberta-large" weights, configured for `num_labels` classes.
model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-large", num_labels=num_labels)

In [None]:
# Training configuration: checkpoints and evaluations share the same 200-step schedule,
# and the checkpoint with the best "f1" is reloaded at the end.
args = TrainingArguments(
    output_dir="../../../model_saved/roberta-large-ft-acc-writing-task-augmented",
    eval_strategy="steps",  # Evaluate at the same intervals as checkpointing
    save_strategy="steps",  # Save a checkpoint every `save_steps` steps
    save_steps=200,
    eval_steps=200,  # IMPORTANT: evaluate on the same steps as the saves
    save_total_limit=4,  # Keep at most 4 checkpoints
    learning_rate=2e-5,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=100,
    # Enable fp16 mixed precision only when a CUDA GPU is present, so the CPU
    # fallback selected at the top of the notebook can still build these arguments.
    fp16=torch.cuda.is_available(),
)

In [None]:
# NOTE: periodic evaluation during training runs on the tokenized "test" split;
# the tokenized "valid" split is not passed to the Trainer here.
trainer = Trainer(
    args=args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

## Detailed Evaluation

In [None]:
# Distinct task ids and level titles of the "valid" split, in order of first appearance.
# dict.fromkeys de-duplicates while keeping insertion order, so the per-task / per-level
# result rows come out in the same order on every run (the iteration order of a set of
# strings is not guaranteed to be the same from one kernel to the next).
list_topic = dataset["valid"]["task_id"]
list_t_set = set(list_topic)
unique_t = list(dict.fromkeys(list_topic))

list_level = dataset["valid"]["level_title"]
list_l_set = set(list_level)
unique_l = list(dict.fromkeys(list_level))


In [None]:
list_r = []

# Score the trained model separately on each task id of the "valid" split.
# `unique_t` holds the distinct task ids and `trainer` is the Trainer defined above.
for t in unique_t:
    sub_ds = tokenized_valid.filter(lambda example: example['task_id'] == t)
    # Get predictions using the trainer
    predictions = trainer.predict(sub_ds)
    # Raw output logits
    outputs = predictions.predictions
    # Convert logits to predicted class labels (argmax over the last axis)
    predicted_labels = np.argmax(outputs, axis=-1)
    ref_label = predictions.label_ids
    ck = round(cohen_kappa_score(predicted_labels, ref_label, weights="quadratic"), 2)
    # pearsonr rejects inputs with fewer than two observations; report NaN for such
    # subsets instead of aborting the whole loop.
    if len(ref_label) >= 2:
        pearson_corr, _ = pearsonr(ref_label, predicted_labels)
    else:
        pearson_corr = float("nan")
    accuracy = accuracy_score(ref_label, predicted_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(ref_label, predicted_labels, average="weighted")

    r = {
        "task_id": t,
        # Level title of the first example of this task's subset
        "level_title": sub_ds["level_title"][0],
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "ck": ck,
        "pearson": pearson_corr,
        "n_samples": len(sub_ds)
    }
    list_r.append(r)

In [None]:
list_r_level = []

# Score the trained model separately on each level title of the "valid" split.
# `unique_l` holds the distinct level titles and `trainer` is the Trainer defined above.
for l in unique_l:
    sub_ds = tokenized_valid.filter(lambda example: example['level_title'] == l)
    # Get predictions using the trainer
    predictions = trainer.predict(sub_ds)
    # Raw output logits
    outputs = predictions.predictions
    # Convert logits to predicted class labels (argmax over the last axis)
    predicted_labels = np.argmax(outputs, axis=-1)
    ref_label = predictions.label_ids
    ck = round(cohen_kappa_score(predicted_labels, ref_label, weights="quadratic"), 2)
    # pearsonr rejects inputs with fewer than two observations; report NaN for such
    # subsets instead of aborting the whole loop.
    if len(ref_label) >= 2:
        pearson_corr, _ = pearsonr(ref_label, predicted_labels)
    else:
        pearson_corr = float("nan")
    accuracy = accuracy_score(ref_label, predicted_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(ref_label, predicted_labels, average="weighted")

    r = {
        "level_title": sub_ds["level_title"][0],
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "ck": ck,
        "pearson": pearson_corr,
        "n_samples": len(sub_ds)
    }
    list_r_level.append(r)


In [None]:
import pandas as pd
# Tabulate the per-task metrics gathered in `list_r`; `columns` fixes the column order.
df_eval_results = pd.DataFrame(list_r, columns=["task_id", "level_title", "accuracy", "precision", "recall", "f1", "ck", "pearson", "n_samples"])
df_eval_results.head(n=10)

In [None]:
df_eval_results.to_csv("result_eval_data_roberta_large_writing_task_acc.csv", index=False)

In [None]:
import pandas as pd
# Tabulate the per-level metrics gathered in `list_r_level`; `columns` fixes the column order.
df_eval_results_level = pd.DataFrame(list_r_level, columns=["level_title", "accuracy", "precision", "recall", "f1", "ck", "pearson", "n_samples"])
df_eval_results_level.head(n=20)

In [None]:
df_eval_results_level.to_csv("result_eval_data_roberta_large_acc_by_level.csv", index=False)

## ONNX Export

In [None]:
#!pip install onnxruntime

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoTokenizer
import torch
import os
from onnxruntime.quantization import quantize_dynamic, QuantType

# === CONFIGURATION ===
# Folder holding the fine-tuned checkpoint (weights file and config.json)
model_dir = "model_saved/roberta-large-ft-acc-writing-task-augmented/checkpoint-1800"
onnx_model_path = "model_saved/roberta-large-ft-acc-writing-task-1800.onnx"
quantized_model_path = "model_saved/roberta-large-ft-acc-writing-task-1800-quantized.onnx"

# === STEP 1: Load the model and tokenizer ===
model = RobertaForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-large")
model.eval()

# === STEP 2: Build a dummy input ===
dummy_text = "Texte d'exemple pour conversion ONNX"
inputs = tokenizer(dummy_text, return_tensors="pt", padding="max_length", max_length=32)

# === STEP 3: Export to ONNX ===
# Both inputs declare dynamic batch and sequence axes; the logits only a dynamic batch axis.
export_input_names = ["input_ids", "attention_mask"]
export_dynamic_axes = {
    input_name: {0: "batch_size", 1: "sequence_length"}
    for input_name in export_input_names
}
export_dynamic_axes["logits"] = {0: "batch_size"}

torch.onnx.export(
    model,
    tuple(inputs[input_name] for input_name in export_input_names),
    onnx_model_path,
    input_names=export_input_names,
    output_names=["logits"],
    dynamic_axes=export_dynamic_axes,
    opset_version=14,  # change here if another opset is needed
)

print(f"✅ Modèle exporté en ONNX : {onnx_model_path}")

# === STEP 4: Dynamic quantization ===
quantize_dynamic(
    model_input=onnx_model_path,
    model_output=quantized_model_path,
    weight_type=QuantType.QInt8,
)

print(f"✅ Modèle quantifié en ONNX : {quantized_model_path}")

In [None]:
import onnxruntime

# One inference session per file: the ONNX export and its quantized counterpart.
onnx_session = onnxruntime.InferenceSession(onnx_model_path)
onnx_session_quant = onnxruntime.InferenceSession(quantized_model_path)

In [None]:
max_length = 256  # Adjust to the maximum input length of your model

# ONNX inference helper
def onnx_infer(input_texts, onnx_model):
    """Tokenize ``input_texts`` and return the first output of ``onnx_model.run``.

    Args:
        input_texts: Text(s) handed to the notebook-level ``tokenizer``.
        onnx_model: Object exposing ``run(output_names, input_feed)``.

    Returns:
        The first element of the outputs returned by ``onnx_model.run``.
    """
    encoded = tokenizer(
        input_texts,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
        padding=True,
    )
    # The session is fed NumPy arrays keyed by input name.
    feeds = {
        input_name: encoded[input_name].numpy()
        for input_name in ("input_ids", "attention_mask")
    }
    return onnx_model.run(None, feeds)[0]

def evaluate_with_metrics(dataset, onnx_model, batch_size=16):
    """Run ``onnx_model`` over ``dataset`` in batches and score it with ``compute_metrics``.

    Args:
        dataset: Sliceable dataset whose slices expose ``"text"`` and ``"label"`` fields.
        onnx_model: Model object passed through to ``onnx_infer``.
        batch_size: Number of examples per inference call.

    Returns:
        The value returned by ``compute_metrics`` for the gathered logits and labels.
    """
    collected_logits, collected_labels = [], []

    for start in range(0, len(dataset), batch_size):
        chunk = dataset[start:start + batch_size]
        chunk_texts, chunk_labels = chunk["text"], chunk["label"]
        collected_logits.extend(onnx_infer(chunk_texts, onnx_model))
        collected_labels.extend(chunk_labels)

    # compute_metrics receives a (logits, labels) pair of arrays
    return compute_metrics((np.array(collected_logits), np.array(collected_labels)))

In [None]:
# Evaluate the ONNX session on the "validation" split of `ds`
valid_data = ds["validation"]
results = evaluate_with_metrics(valid_data, onnx_model=onnx_session)
print("🎯 Evaluation Results ONNX :")
for k, v in results.items():
    if k != "classification_report":
        print(f"{k}: {v}")
        continue
    # The classification report is a dict: print one line per entry
    print("\n📋 Classification Report :")
    for label, metrics in v.items():
        print(f"{label}: {metrics}")

In [None]:
# Evaluate the quantized ONNX session on the "validation" split of `ds`
valid_data = ds["validation"]
results = evaluate_with_metrics(valid_data, onnx_model=onnx_session_quant)
print("🎯 Evaluation Results ONNX :")
for k, v in results.items():
    if k != "classification_report":
        print(f"{k}: {v}")
        continue
    # The classification report is a dict: print one line per entry
    print("\n📋 Classification Report :")
    for label, metrics in v.items():
        print(f"{label}: {metrics}")

In [27]:
import boto3

# Initialize the S3 client
s3 = boto3.client("s3")

# Destination bucket and object key in S3
bucket_name = "sagemaker-studio-oxs6vznjds"
s3_key = "writing_task_models/accuracy/model_1800_roberta_large.onnx"  # Change path as needed
bucket_path = bucket_name  # same bucket; name kept for cells that may still read it

# Local file to upload: the non-quantized ONNX export. It gets a dedicated name
# rather than `quantized_model_path`, which holds the quantized export's path
# elsewhere in this notebook.
upload_model_path = "model_saved/roberta-large-ft-acc-writing-task-1800.onnx"

s3.upload_file(upload_model_path, bucket_name, s3_key)

print(f"✅ ONNX model uploaded to s3://{bucket_name}/{s3_key}")

✅ ONNX model uploaded to s3://sagemaker-studio-oxs6vznjds/writing_task_models/accuracy/model_1800_roberta_large.onnx
