In [1]:
import pandas as pd
from datasets import Dataset
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments
import torch
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Load the data
df = pd.read_csv('../TP4-augmentation_donnés/dataset_horreur.csv')

# ----------- BINARY (horror or not) -----------
# Keep only the text and the binary target, exposing the target as 'label'.
df_bin = df[['text', 'label_binary']].rename(
    columns={'label_binary': 'label'},
)
# 80/20 train/test split with a fixed seed.
dataset_bin = Dataset.from_pandas(df_bin).train_test_split(
    test_size=0.2,
    seed=42,
)

tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

def preprocess_function(examples):
    """Tokenize the 'text' field of ``examples`` with the module-level tokenizer.

    The tokenizer is called with truncation enabled and with
    ``padding='max_length'`` / ``max_length=128``.

    Args:
        examples: Mapping that provides the texts under the 'text' key.

    Returns:
        The value returned by the tokenizer call.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
    }
    return tokenizer(examples['text'], **tokenizer_options)

# Tokenize the train and test splits of the binary dataset in batches.
encoded_bin = dataset_bin.map(preprocess_function, batched=True)

# CamemBERT sequence classifier configured with two labels.
model_bin = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=2)

# Accuracy metric loaded through `evaluate`; compute_metrics reads this global.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score a set of predictions with the module-level ``metric``.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        The result of ``metric.compute`` for the argmax predictions.
    """
    raw_scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = torch.tensor(raw_scores).argmax(dim=-1)
    return metric.compute(predictions=predicted, references=references)

# Training configuration for the binary classifier.
training_args_bin = TrainingArguments(
    output_dir='./results_bin',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs_bin',
    # Log the training loss once per epoch. With the default step-based
    # interval this short run never reaches a logging step, so no training
    # loss is ever reported.
    logging_strategy='epoch',
    save_total_limit=1,
)

# Train on the 'train' split; the 'test' split is used for evaluation.
trainer_bin = Trainer(
    model=model_bin,
    args=training_args_bin,
    train_dataset=encoded_bin['train'],
    eval_dataset=encoded_bin['test'],
    compute_metrics=compute_metrics,
)

trainer_bin.train()
print(trainer_bin.evaluate())

# ----------- SEVERITY (0, 1, 2) -----------
# Keep only the text and the severity target, exposing the target as 'label'.
df_sev = df[['text', 'label_severity']].rename(columns={'label_severity': 'label'})
dataset_sev = Dataset.from_pandas(df_sev)
dataset_sev = dataset_sev.train_test_split(test_size=0.2, seed=42)

encoded_sev = dataset_sev.map(preprocess_function, batched=True)

# CamemBERT sequence classifier configured with three labels.
model_sev = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=3)

training_args_sev = TrainingArguments(
    output_dir='./results_sev',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs_sev',
    # Log the training loss once per epoch. With the default step-based
    # interval this short run never reaches a logging step, so no training
    # loss is ever reported.
    logging_strategy='epoch',
    save_total_limit=1,
)

trainer_sev = Trainer(
    model=model_sev,
    args=training_args_sev,
    train_dataset=encoded_sev['train'],
    eval_dataset=encoded_sev['test'],
    compute_metrics=compute_metrics,
)

trainer_sev.train()
print(trainer_sev.evaluate())

# Save the binary model together with its tokenizer
model_bin.save_pretrained("./camembert_binary")
tokenizer.save_pretrained("./camembert_binary")

# Save the severity model together with its tokenizer
model_sev.save_pretrained("./camembert_severity")
tokenizer.save_pretrained("./camembert_severity")

print("Modèles sauvegardés avec succès.")
# ----------- EVALUATION -----------
def compute_all_metrics(trainer, dataset, average="weighted"):
    """Run ``trainer.predict`` on ``dataset`` and summarize the outcome.

    Args:
        trainer: Object exposing ``predict(dataset)``; the returned value must
            provide ``predictions``, ``label_ids`` and ``metrics``.
        dataset: Dataset forwarded to ``trainer.predict``.
        average: Averaging mode forwarded to the precision, recall and F1
            scoring functions.

    Returns:
        A dict with the keys "Accuracy", "Precision", "Recall", "F1-score",
        "Loss" and "Temps (s)".
    """
    output = trainer.predict(dataset)
    y_true = output.label_ids
    # Predicted class = index of the highest score along axis 1.
    y_pred = np.argmax(output.predictions, axis=1)
    run_stats = output.metrics

    # Options shared by the three averaged scores.
    score_options = {"average": average, "zero_division": 0}

    summary = {"Accuracy": accuracy_score(y_true, y_pred)}
    summary["Precision"] = precision_score(y_true, y_pred, **score_options)
    summary["Recall"] = recall_score(y_true, y_pred, **score_options)
    summary["F1-score"] = f1_score(y_true, y_pred, **score_options)
    # Prefer the "test_*" entries and fall back to the "eval_*" ones.
    summary["Loss"] = run_stats.get("test_loss", run_stats.get("eval_loss"))
    summary["Temps (s)"] = run_stats.get(
        "test_runtime", run_stats.get("eval_runtime")
    )
    return summary

metrics_bin = compute_all_metrics(trainer_bin, encoded_bin["test"])
metrics_sev = compute_all_metrics(trainer_sev, encoded_sev["test"])

# Build the results table: one row per model, model name first.
results_df = pd.DataFrame(
    [
        {"Modèle": model_name, **model_metrics}
        for model_name, model_metrics in (
            ("Classification binaire", metrics_bin),
            ("Classification sévérité", metrics_sev),
        )
    ]
)

# Display the table
print("\nÉvaluation complète des modèles :\n")
print(results_df.to_markdown(index=False))
# Save to a CSV file
results_df.to_csv(
    "resultats_evaluation.csv",
    index=False,
    encoding="utf-8",
)




Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


{'eval_loss': 0.24591509997844696, 'eval_accuracy': 0.95, 'eval_runtime': 2.9169, 'eval_samples_per_second': 6.857, 'eval_steps_per_second': 1.029, 'epoch': 2.0}


Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


{'eval_loss': 0.6642757654190063, 'eval_accuracy': 0.8, 'eval_runtime': 2.9151, 'eval_samples_per_second': 6.861, 'eval_steps_per_second': 1.029, 'epoch': 2.0}
Modèles sauvegardés avec succès.

Évaluation complète des modèles :

| Modèle                  |   Accuracy |   Precision |   Recall |   F1-score |     Loss |   Temps (s) |
|:------------------------|-----------:|------------:|---------:|-----------:|---------:|------------:|
| Classification binaire  |       0.95 |      0.9025 |     0.95 |   0.925641 | 0.245915 |      2.8112 |
| Classification sévérité |       0.8  |      0.64   |     0.8  |   0.711111 | 0.664276 |      3.0152 |
