In [4]:
import boto3
import os
import tempfile
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch

def load_roberta_from_s3(bucket_path):
    """Download a fine-tuned RoBERTa checkpoint from S3 and load it.

    Every object stored under the given prefix is copied into a fresh
    temporary directory (created with ``tempfile.mkdtemp``; this function
    never removes it), then the tokenizer and the sequence-classification
    model are loaded from that directory.

    Args:
        bucket_path: Location in the form ``"bucket/prefix"``. A trailing
            ``/`` on the prefix is optional.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        ValueError: If ``bucket_path`` contains no ``/`` or has an empty
            bucket name.
        FileNotFoundError: If no downloadable object exists under the prefix.
    """
    # Split bucket and prefix
    if "/" not in bucket_path:
        raise ValueError("Le chemin doit être de la forme 'bucket/prefix'")
    bucket_name, prefix = bucket_path.split("/", 1)
    if not bucket_name:
        # e.g. "/some/prefix": fail here instead of deep inside the S3 client.
        raise ValueError("Le chemin doit être de la forme 'bucket/prefix'")
    prefix = prefix.rstrip("/")  # drop the trailing / if present

    # List with a trailing "/" so that "checkpoint-1800" cannot also match a
    # sibling such as "checkpoint-18000/..." (whose relative path would
    # otherwise point outside the temporary directory).
    list_prefix = f"{prefix}/" if prefix else ""

    s3 = boto3.client("s3")

    # A single list_objects_v2 call only returns one page of keys; the
    # paginator follows continuation tokens so nothing is silently dropped.
    keys = []
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket_name, Prefix=list_prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if key.endswith("/"):
                continue  # zero-byte "folder" placeholder
            keys.append(key)

    if not keys:
        raise FileNotFoundError(f"Aucun fichier trouvé dans s3://{bucket_name}/{prefix}")

    # Create the directory only once we know there is something to download.
    tmp_dir = tempfile.mkdtemp()
    print(f"Dossier temporaire: {tmp_dir}")

    # NOTE(review): everything under the prefix is downloaded, including
    # training-state files such as optimizer.pt; these are presumably not
    # needed by from_pretrained. Consider filtering them out.
    for key in keys:
        relative_parts = key[len(list_prefix):].split("/")
        local_path = os.path.join(tmp_dir, *relative_parts)
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        print(f"Téléchargement: s3://{bucket_name}/{key} -> {local_path}")
        s3.download_file(bucket_name, key, local_path)

    # Load model and tokenizer from the downloaded files
    tokenizer = RobertaTokenizer.from_pretrained(tmp_dir)
    model = RobertaForSequenceClassification.from_pretrained(tmp_dir)

    return tokenizer, model

# ===== Usage =====
bucket_path = "sagemaker-studio-oxs6vznjds/writing_task_models/accuracy/checkpoint-1800/"

# Binds the module-level `tokenizer` and `model` read by the inference cells.
tokenizer, model = load_roberta_from_s3(bucket_path)

Dossier temporaire: /tmp/tmpdxu3_htb
Téléchargement: s3://sagemaker-studio-oxs6vznjds/writing_task_models/accuracy/checkpoint-1800/config.json -> /tmp/tmpdxu3_htb/config.json
Téléchargement: s3://sagemaker-studio-oxs6vznjds/writing_task_models/accuracy/checkpoint-1800/merges.txt -> /tmp/tmpdxu3_htb/merges.txt
Téléchargement: s3://sagemaker-studio-oxs6vznjds/writing_task_models/accuracy/checkpoint-1800/model.safetensors -> /tmp/tmpdxu3_htb/model.safetensors
Téléchargement: s3://sagemaker-studio-oxs6vznjds/writing_task_models/accuracy/checkpoint-1800/optimizer.pt -> /tmp/tmpdxu3_htb/optimizer.pt
Téléchargement: s3://sagemaker-studio-oxs6vznjds/writing_task_models/accuracy/checkpoint-1800/rng_state.pth -> /tmp/tmpdxu3_htb/rng_state.pth
Téléchargement: s3://sagemaker-studio-oxs6vznjds/writing_task_models/accuracy/checkpoint-1800/scaler.pt -> /tmp/tmpdxu3_htb/scaler.pt
Téléchargement: s3://sagemaker-studio-oxs6vznjds/writing_task_models/accuracy/checkpoint-1800/scheduler.pt -> /tmp/tmpdxu3_

In [5]:
def format_text_inference(ef_level, activity_instructions, student_submission):
    """Build the single text string fed to the tokenizer.

    The level, the activity instructions and the student submission are each
    given a label and joined with a literal `` [SEP] `` marker.
    """
    segments = (
        f"Prompt Level: {ef_level}",
        f"Prompt: {activity_instructions}",
        f"Response: {student_submission}",
    )
    return " [SEP] ".join(segments)

In [6]:
def inference(input_json):
    """Score one submission with the PyTorch model loaded in this notebook.

    Reads the module-level ``tokenizer`` and ``model``.

    Args:
        input_json: Mapping with the keys ``"ef_level"`` (converted with
            ``int``), ``"activity_instructions"`` and
            ``"student_submission"``.

    Returns:
        A dict with ``"cefr_scoring"`` (index of the highest-probability
        class), ``"cefr_scoring_100"`` (that index passed through
        ``map_score_linear``) and a ``"scorer"`` metadata dict.
    """
    # Pull the three fields out of the request
    level = int(input_json["ef_level"])
    instructions = input_json["activity_instructions"]
    submission = input_json["student_submission"]

    # Format and tokenize in one go
    encoded = tokenizer(
        format_text_inference(level, instructions, submission),
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )

    # Forward pass without gradient tracking
    with torch.no_grad():
        class_logits = model(**encoded).logits

    class_probs = torch.softmax(class_logits, dim=1).squeeze()
    best_class = torch.argmax(class_probs).item()
    score_100 = map_score_linear(best_class)

    # NOTE(review): the version string says "onnx" although this path runs
    # the PyTorch model; confirm that is intended.
    return {
        "cefr_scoring": best_class,
        "cefr_scoring_100": score_100,
        "scorer": {
            "version": "roberta_large_onnx_scorer",
            "release": "0.3",
        },
    }


def map_score_linear(score):
    """Translate a class index (0-5) into its value on the 100-point scale.

    Returns ``None`` when ``score`` is not one of the six known indices.
    """
    points_by_class = dict(enumerate((17, 33, 50, 67, 83, 100)))
    return points_by_class.get(score)


In [7]:
# Sample request used to smoke-test `inference`.
# NOTE(review): "student_submission" already starts with "Response: " and
# format_text_inference prepends another "Response: " label. Confirm the
# doubled prefix matches the training data.
example_input = {
    "ef_level": 10,
    "activity_instructions": "Read the email from your manager. Then respond with an email that has several ideas to help her solve the budget problem. Type in the input box. Write between 80 and 100 words. Use your own words where possible. ",
    "student_submission": "Response: Hi Carla,\n\nThe financial report was shocking. We have a budget crisis and I have a list of options how to deal with this crisis on a long-team basis. \n\n-First I would recommend that we would cut down everyone’s working hours. The company would save about $10000 per worker each year. \n-Secondly we should think about offering older workers a large retirement bonus if they accept our resignation package. If we lay off senior workers we could save about $300 000 every year.\n-Thirdly I would also recommend updating our offices to present-day. We have many offices which are too huge and expensive and old-fashioned. If we move office space to another location we could save money in rent. By changing location we could possibly save about $10000"
}

result = inference(example_input)
print(result)

{'cefr_scoring': 4, 'cefr_scoring_100': 83, 'scorer': {'version': 'roberta_large_onnx_scorer', 'release': '0.3'}}


## ONNX inference

In [5]:
import boto3
import onnxruntime as ort

# Fetch the exported ONNX classifier from S3 and open an onnxruntime session on it.

# Define S3 bucket and model key
bucket_name = 'sagemaker-studio-oxs6vznjds'
model_key = 'writing_task_models/accuracy/model_1800_roberta_large.onnx'
local_model_path = '/tmp/roberta-large-ft-acc-writing-task-1800.onnx'  # or wherever you want to save temporarily

# Initialize boto3 S3 client
s3 = boto3.client('s3')

# Download the ONNX model from S3 to local path
# (this runs every time the cell is executed; there is no "already downloaded" check)
s3.download_file(bucket_name, model_key, local_model_path)

# Load the ONNX model using onnxruntime
# `session` is what the example cell below passes to `inference` as `onnx_model`.
# NOTE(review): no `providers=` argument is given, so onnxruntime chooses its
# default execution provider. Confirm that is the intended device.
session = ort.InferenceSession(local_model_path)

print("ONNX model loaded successfully.")

ONNX model loaded successfully.


In [14]:
# NOTE(review): the PyTorch `inference` earlier in the notebook truncates at 512
# tokens while this ONNX path uses 256. Confirm which one matches the exported model.
max_length = 256  # Adjust to the maximum sequence length your model supports
import torch.nn.functional as F  # pour softmax
import numpy as np

def inference(input_json, onnx_model):
    """Score one submission by running the ONNX model.

    This definition replaces the one-argument ``inference`` defined earlier
    in the notebook. It reads the module-level ``tokenizer`` and
    ``max_length``.

    Args:
        input_json: Mapping with the keys ``"ef_level"`` (converted with
            ``int``), ``"activity_instructions"`` and
            ``"student_submission"``.
        onnx_model: Object whose ``run(None, feed)`` returns a sequence with
            the logits as its first element (here the
            ``ort.InferenceSession`` created above).

    Returns:
        A dict with ``"cefr_scoring"`` (argmax class index),
        ``"cefr_scoring_100"`` (via ``map_score_linear``),
        ``"predicted_probability"`` (softmax probability of that class,
        rounded to 2 decimals) and a ``"scorer"`` metadata dict.
    """
    level = int(input_json["ef_level"])
    instructions = input_json["activity_instructions"]
    submission = input_json["student_submission"]

    encoded = tokenizer(
        format_text_inference(level, instructions, submission),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # The ONNX session is fed NumPy arrays, so convert the torch tensors.
    feed = {
        "input_ids": encoded["input_ids"].cpu().numpy(),
        "attention_mask": encoded["attention_mask"].cpu().numpy(),
    }
    logits = onnx_model.run(None, feed)[0]

    best_class = int(np.argmax(logits, axis=1)[0])

    class_probs = F.softmax(torch.tensor(logits), dim=1).numpy().squeeze()
    best_prob = float(class_probs[best_class])

    score_100 = map_score_linear(best_class)

    return {
        "cefr_scoring": best_class,
        "cefr_scoring_100": score_100,
        "predicted_probability": round(best_prob, 2),
        "scorer": {
            "version": "roberta_large_onnx_scorer",
            "release": "0.1",
        },
    }

def map_score_linear(score):
    """Look up the 100-point value for a class index in 0-5.

    Any other input yields ``None``.
    """
    scale_values = (17, 33, 50, 67, 83, 100)
    lookup = dict(zip(range(len(scale_values)), scale_values))
    return lookup.get(score)


In [15]:
# Same sample request as in the PyTorch section, scored through the ONNX session.
# NOTE(review): "student_submission" already starts with "Response: " and
# format_text_inference prepends another "Response: " label. Confirm the
# doubled prefix matches the training data.
example_input = {
    "ef_level": 10,
    "activity_instructions": "Read the email from your manager. Then respond with an email that has several ideas to help her solve the budget problem. Type in the input box. Write between 80 and 100 words. Use your own words where possible. ",
    "student_submission": "Response: Hi Carla,\n\nThe financial report was shocking. We have a budget crisis and I have a list of options how to deal with this crisis on a long-team basis. \n\n-First I would recommend that we would cut down everyone’s working hours. The company would save about $10000 per worker each year. \n-Secondly we should think about offering older workers a large retirement bonus if they accept our resignation package. If we lay off senior workers we could save about $300 000 every year.\n-Thirdly I would also recommend updating our offices to present-day. We have many offices which are too huge and expensive and old-fashioned. If we move office space to another location we could save money in rent. By changing location we could possibly save about $10000"
}

result = inference(example_input, onnx_model=session)
print(result)

{'cefr_scoring': 4, 'cefr_scoring_100': 83, 'predicted_probability': 0.88, 'scorer': {'version': 'roberta_large_onnx_scorer', 'release': '0.1'}}


## Example: inference with the multitask model

In [2]:
import torch
import torch.nn as nn
import numpy as np
from transformers import AutoTokenizer, RobertaModel
from transformers.modeling_outputs import SequenceClassifierOutput

# ── Constants (must match training) ───────────────────────────────────────────
# Number of classes predicted by each task head.
NUM_LABELS = 6
# Task order; `predict` slices the concatenated logits in this order.
TASK_NAMES = ["accuracy", "coherence", "range"]
# Display name for every class index: "Score 0" … "Score 5".
LABEL_NAMES = list(map("Score {}".format, range(NUM_LABELS)))

  import pynvml  # type: ignore[import]


In [3]:
# ── Model definition (copy from training script) ──────────────────────────────
class MultiTaskCrossEncoderRoberta(nn.Module):
    """RoBERTa encoder with one shared projection and three linear heads.

    The hidden state at sequence position 0 is passed through
    dropout → linear (to 512) → GELU, then through separate heads for
    accuracy, coherence and range. The three heads' outputs are concatenated
    along the last dimension, in that order.
    """

    def __init__(self, model_name: str, num_labels: int, dropout: float = 0.1):
        """Build the encoder, the shared projection and the three heads.

        Args:
            model_name: Name or path given to ``RobertaModel.from_pretrained``.
            num_labels: Output size of each head.
            dropout: Dropout probability applied before the shared linear layer.
        """
        super().__init__()
        projection_width = 512

        self.encoder = RobertaModel.from_pretrained(model_name)
        encoder_width = self.encoder.config.hidden_size
        self.num_labels = num_labels

        # Attribute names and the layer order inside the Sequential determine
        # the state_dict keys, so they must stay exactly as they are.
        self.shared_projection = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(encoder_width, projection_width),
            nn.GELU(),
        )

        self.head_accuracy = nn.Linear(projection_width, num_labels)
        self.head_coherence = nn.Linear(projection_width, num_labels)
        self.head_range = nn.Linear(projection_width, num_labels)

    def forward(self, input_ids, attention_mask, **kwargs):
        """Return a ``SequenceClassifierOutput`` holding the concatenated logits.

        Extra keyword arguments are accepted and ignored; ``loss`` is always
        ``None``.
        """
        encoder_out = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        shared = self.shared_projection(first_token_state)

        task_heads = (self.head_accuracy, self.head_coherence, self.head_range)
        logits = torch.cat([head(shared) for head in task_heads], dim=-1)

        return SequenceClassifierOutput(loss=None, logits=logits)


# ── Load model ────────────────────────────────────────────────────────────────
def load_model(checkpoint_path: str, device: str = "cpu"):
    """Load the multitask model and its tokenizer from a checkpoint directory.

    Args:
        checkpoint_path: Directory containing ``pytorch_model.bin`` and the
            tokenizer files.
        device: Device the weights are mapped to and the model is moved to.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is in eval mode.

    Raises:
        FileNotFoundError: If ``pytorch_model.bin`` is not in
            ``checkpoint_path``.
    """
    import os

    # Fail fast: check for the weights before instantiating the model, which
    # first loads the base "FacebookAI/roberta-large" encoder.
    weights_path = os.path.join(checkpoint_path, "pytorch_model.bin")
    if not os.path.isfile(weights_path):
        raise FileNotFoundError(f"Weights file not found: {weights_path}")

    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

    model = MultiTaskCrossEncoderRoberta(
        model_name="FacebookAI/roberta-large",
        num_labels=NUM_LABELS,
        dropout=0.1,
    )

    # weights_only=True restricts unpickling to tensors and plain containers,
    # so a tampered checkpoint file cannot execute arbitrary code on load.
    state_dict = torch.load(weights_path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    print(f"✅ Model loaded from {checkpoint_path}")
    return model, tokenizer


# ── Single inference ──────────────────────────────────────────────────────────
def predict(
    model,
    tokenizer,
    text_a: str,
    text_b: str,
    max_length: int = 512,
    device: str = "cpu",
) -> dict:
    """Run the multitask model on one (text_a, text_b) pair.

    Args:
        model: Callable returning an object with a ``logits`` attribute.
        tokenizer: Tokenizer called on the text pair.
        text_a: Formatted prompt (e.g. "Prompt Level: 8 Prompt: ... Response: ...").
        text_b: The correction.
        max_length: Length the pair is truncated and padded to.
        device: Device the input tensors are moved to.

    Returns:
        A dict keyed by task name. Each value holds ``"predicted_score"``,
        ``"label"``, ``"confidence"`` and ``"probabilities"`` (one entry per
        label), with probabilities rounded to 4 decimals.
    """
    batch = tokenizer(
        text_a,
        text_b,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        # One row of len(TASK_NAMES) * NUM_LABELS values for the single input.
        all_logits = model(input_ids=ids, attention_mask=mask).logits

    def summarize(task_index: int) -> dict:
        # Each task owns a contiguous NUM_LABELS-wide slice of the logits.
        start = task_index * NUM_LABELS
        task_slice = all_logits[0, start:start + NUM_LABELS]
        probs = torch.softmax(task_slice, dim=-1).cpu().numpy()
        best = int(np.argmax(probs))
        top_prob = float(probs.max())
        return {
            "predicted_score": best,
            "label": LABEL_NAMES[best],
            "confidence": round(top_prob, 4),
            "probabilities": {
                LABEL_NAMES[idx]: round(float(probs[idx]), 4)
                for idx in range(NUM_LABELS)
            },
        }

    return {task: summarize(position) for position, task in enumerate(TASK_NAMES)}


# ── Pretty print ──────────────────────────────────────────────────────────────
def print_results(results: dict):
    """Print a per-task report of the predictions to stdout.

    Args:
        results: Mapping of task name to a dict with the keys
            ``"predicted_score"``, ``"label"``, ``"confidence"`` and
            ``"probabilities"`` (label → probability).
    """
    rule = "=" * 60
    print("\n" + rule)
    print("INFERENCE RESULTS")
    print(rule)
    for task_name, summary in results.items():
        print("\n  [{}]".format(task_name.upper()))
        print("    Predicted score : {}  ({})".format(summary["predicted_score"], summary["label"]))
        print("    Confidence      : {:.2%}".format(summary["confidence"]))
        print("    Probabilities   :")
        for label, prob in summary["probabilities"].items():
            # Bar length scales with the probability: 30 blocks for 1.0.
            print("      {} : {:.4f}  {}".format(label, prob, "█" * int(prob * 30)))
    print(rule)


In [7]:
# ── Main ──────────────────────────────────────────────────────────────────────
# End-to-end demo: build one example pair, load the checkpoint, predict, print.
if __name__ == "__main__":

    # ── 1. Parameters ─────────────────────────────────────────────────────
    CHECKPOINT_PATH = "model_saved/roberta-large-multitask-multitask_cross_encoder_gte60/checkpoint-600"
    DEVICE          = "cuda" if torch.cuda.is_available() else "cpu"
    MAX_LENGTH      = 512   # use the same value as during training

    # ── 2. Test example ───────────────────────────────────────────────────
    ef_level              = 8   # corresponds to B1
    activity_instructions = "Describe your last holiday in detail."
    student_submission    = "Last summer I goes to Spain with my family. "
    correction            = "Last summer I went to Spain with my family. "

    # Same formatting as during training
    # (unlike format_text_inference, no " [SEP] " marker is placed between the parts)
    text_a = (
        f"Prompt Level: {ef_level} "
        f"Prompt: {activity_instructions} "
        f"Response: {student_submission}"
    )
    text_b = correction

    print(f"Device : {DEVICE}")
    print(f"\n[text_a] {text_a}")
    print(f"[text_b] {text_b}")

    # ── 3. Loading + inference ────────────────────────────────────────────
    # Rebinds the module-level `model` and `tokenizer` names.
    model, tokenizer = load_model(CHECKPOINT_PATH, device=DEVICE)
    results          = predict(model, tokenizer, text_a, text_b,
                               max_length=MAX_LENGTH, device=DEVICE)
    print_results(results)

Device : cuda

[text_a] Prompt Level: 8 Prompt: Describe your last holiday in detail. Response: Last summer I goes to Spain with my family. 
[text_b] Last summer I went to Spain with my family. 


Some weights of the model checkpoint at FacebookAI/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


✅ Model loaded from model_saved/roberta-large-multitask-multitask_cross_encoder_gte60/checkpoint-600

INFERENCE RESULTS

  [ACCURACY]
    Predicted score : 2  (Score 2)
    Confidence      : 31.35%
    Probabilities   :
      Score 0 : 0.0442  █
      Score 1 : 0.1986  █████
      Score 2 : 0.3135  █████████
      Score 3 : 0.3016  █████████
      Score 4 : 0.1136  ███
      Score 5 : 0.0286  

  [COHERENCE]
    Predicted score : 3  (Score 3)
    Confidence      : 26.82%
    Probabilities   :
      Score 0 : 0.0703  ██
      Score 1 : 0.1834  █████
      Score 2 : 0.2669  ████████
      Score 3 : 0.2682  ████████
      Score 4 : 0.1637  ████
      Score 5 : 0.0475  █

  [RANGE]
    Predicted score : 2  (Score 2)
    Confidence      : 30.18%
    Probabilities   :
      Score 0 : 0.0783  ██
      Score 1 : 0.2091  ██████
      Score 2 : 0.3018  █████████
      Score 3 : 0.2237  ██████
      Score 4 : 0.1424  ████
      Score 5 : 0.0446  █
