In [2]:
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from datasets import load_dataset

In [3]:
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset
import evaluate
import numpy as np

In [4]:
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Charger le modèle (3 labels : entailment, neutral, contradiction)
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=3)

print("Tokenizer et modèle chargés !")

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer et modèle chargés !


In [5]:
import spacy
import json

# 1. Charger le modèle spaCy pour le français
nlp = spacy.load("fr_core_news_md")

# 2. Définir la fonction d'annotation
def annotate_pos(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    pos_tags = [token.pos_ for token in doc]
    return {"tokens": tokens, "pos_tags": pos_tags}

# 3. Lire le fichier texte et annoter chaque ligne
def process_text_file(input_file, output_file):
    dataset = []
    with open(input_file, "r", encoding="utf-8") as f:
        print('fichier trouvé')
        for line in f:
            line = line.strip()
            if line:  # Ignorer les lignes vides
                annotated = annotate_pos(line)
                dataset.append(annotated)

    # 4. Sauvegarder le résultat au format JSON
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=2)

    print(f"Dataset sauvegardé dans {output_file} avec {len(dataset)} exemples.")

# 5. Exécuter le script
input_file = "../Youdas/test1.txt"  # Remplace par le chemin de ton fichier
output_file = "../Youdas/dataset_pos.json"  # Fichier de sortie
process_text_file(input_file, output_file)


fichier trouvé
Dataset sauvegardé dans ../Youdas/dataset_pos.json avec 192 exemples.


In [29]:
#import spacy.cli
#spacy.cli.download("fr_core_news_md")


Defaulting to user installation because normal site-packages is not writeable
Collecting fr-core-news-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.8.0/fr_core_news_md-3.8.0-py3-none-any.whl (45.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 MB[0m [31m50.5 MB/s[0m  [33m0:00:00[0mm0:00:01[0m00:01[0m
[?25hInstalling collected packages: fr-core-news-md
Successfully installed fr-core-news-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [30]:
#import spacy
#nlp = spacy.load("fr_core_news_md")
#print("Modèle chargé avec succès !")


Modèle chargé avec succès !


### preparer les données

In [6]:
from datasets import Dataset
import json

# Charger le JSON
with open("../Youdas/dataset_pos.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Créer un Dataset Hugging Face
dataset = Dataset.from_list(data)

# Afficher un exemple
print(dataset[0])

# Splitter en train/validation/test (80/10/10)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset['train']
eval_dataset = dataset['test'].train_test_split(test_size=0.5, seed=42)
eval_dataset = eval_dataset['train']
test_dataset = eval_dataset['test']

print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}, Test: {len(test_dataset)}")


{'tokens': ['Si', 'vous', 'avez', 'raté', 'ou', 'supprimé', 'une', 'newsletter', ',', 'il', 'faudra', 'patienter', 'un', 'peu', 'pour', 'attendre', 'la', 'prochaine', '^^'], 'pos_tags': ['SCONJ', 'PRON', 'AUX', 'VERB', 'CCONJ', 'VERB', 'DET', 'NOUN', 'PUNCT', 'PRON', 'VERB', 'VERB', 'DET', 'ADV', 'ADP', 'VERB', 'DET', 'ADJ', 'PUNCT']}


ValueError: Column 'test' doesn't exist.

## Encoder les tags

In [None]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Récupérer tous les tags uniques
all_tags = set(tag for example in train_dataset["pos_tags"] for tag in example)
tag2id = {tag: id for id, tag in enumerate(sorted(all_tags))}
id2tag = {id: tag for tag, id in tag2id.items()}
num_labels = len(tag2id)

print(f"Nombre de tags uniques : {num_labels}")
print("Exemple de mapping :", {k: tag2id[k] for k in list(tag2id)[:5]})


## Tokenisation et alignement des labels

In [None]:
from transformers import CamembertTokenizer

tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["pos_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:  # Token spécial ([CLS], [SEP], [PAD])
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Nouveau mot
                label_ids.append(tag2id[label[word_idx]])
            else:  # Sous-mot du même mot
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Appliquer à tout le dataset
tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_eval = eval_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test = test_dataset.map(tokenize_and_align_labels, batched=True)


## Fine-tuner

In [None]:
from transformers import CamembertForTokenClassification, TrainingArguments, Trainer

model = CamembertForTokenClassification.from_pretrained("camembert-base", num_labels=num_labels)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

trainer.train()


## evaluation

In [None]:
from seqeval.metrics import classification_report

predictions = trainer.predict(tokenized_test)
preds = np.argmax(predictions.predictions, axis=2)

# Convertir les IDs en tags
pred_labels = [[id2tag[p] for p in pred if p != -100] for pred in preds]
true_labels = [[id2tag[l] for l in label if l != -100] for label in tokenized_test["labels"]]

print(classification_report(true_labels, pred_labels))
