In [2]:
import pandas as pd
from datasets import Dataset
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments
import torch
import evaluate

# Charger les données
df = pd.read_csv('../TP4-augmentation_données/dataset_horreur.csv')

# ----------- BINAIRE (horreur ou pas) -----------
df_bin = df[['text', 'label_binary']].rename(columns={'label_binary': 'label'})  # ✅ CHANGEMENT
dataset_bin = Dataset.from_pandas(df_bin)
dataset_bin = dataset_bin.train_test_split(test_size=0.2, seed=42)

tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

encoded_bin = dataset_bin.map(preprocess_function, batched=True)

model_bin = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=2)

metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=-1)
    return metric.compute(predictions=preds, references=labels)

training_args_bin = TrainingArguments(
    output_dir='./results_bin',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs_bin',
    save_total_limit=1,
)

trainer_bin = Trainer(
    model=model_bin,
    args=training_args_bin,
    train_dataset=encoded_bin['train'],
    eval_dataset=encoded_bin['test'],
    compute_metrics=compute_metrics
)

trainer_bin.train()
print(trainer_bin.evaluate())

# ----------- SEVERITY (0, 1, 2) -----------
df_sev = df[['text', 'label_severity']].rename(columns={'label_severity': 'label'})  # ✅ CHANGEMENT
dataset_sev = Dataset.from_pandas(df_sev)
dataset_sev = dataset_sev.train_test_split(test_size=0.2, seed=42)

encoded_sev = dataset_sev.map(preprocess_function, batched=True)

model_sev = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=3)

training_args_sev = TrainingArguments(
    output_dir='./results_sev',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs_sev',
    save_total_limit=1,
)

trainer_sev = Trainer(
    model=model_sev,
    args=training_args_sev,
    train_dataset=encoded_sev['train'],
    eval_dataset=encoded_sev['test'],
    compute_metrics=compute_metrics
)

trainer_sev.train()
print(trainer_sev.evaluate())
# Sauvegarde du modèle binaire
model_bin.save_pretrained("./camembert_binary")
tokenizer.save_pretrained("./camembert_binary")

# Sauvegarde du modèle de sévérité
model_sev.save_pretrained("./camembert_severity")
tokenizer.save_pretrained("./camembert_severity")

print("Modèles sauvegardés avec succès.")



Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


{'eval_loss': 0.23651166260242462, 'eval_accuracy': 0.95, 'eval_runtime': 2.9091, 'eval_samples_per_second': 6.875, 'eval_steps_per_second': 1.031, 'epoch': 2.0}


Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


{'eval_loss': 0.6642757654190063, 'eval_accuracy': 0.8, 'eval_runtime': 3.0112, 'eval_samples_per_second': 6.642, 'eval_steps_per_second': 0.996, 'epoch': 2.0}
Modèles sauvegardés avec succès.
