In [1]:
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from datasets import load_dataset

In [2]:
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset
import evaluate
import numpy as np
from sklearn.preprocessing import LabelEncoder
import spacy
import json

from datasets import load_dataset, Dataset
from itertools import islice
from transformers import CamembertTokenizerFast

In [3]:
"""import random

input_file = '../dataset_g5/fr_part_1.txt'
output_file = '../Youdas/echantillon_10_pourcent.txt'
percentage = 0.4 # 10%

with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
    for line in infile:
        if random.random() < percentage:
            outfile.write(line)
            """



input_file = '../dataset_g5/fr_part_1.txt'
output_file = '../Youdas/echantillon_20_pourcent.txt'
percentage = 0.2  

# Calculer le nombre de lignes à extraire
total_lines = sum(1 for _ in open(input_file, 'r'))
num_lines_to_extract = int(total_lines * percentage)

print(f"Nombre total de lignes : {total_lines}")
print(f"Nombre de lignes à extraire : {num_lines_to_extract}")


with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
    for i, line in enumerate(infile):
        if i < num_lines_to_extract:
            outfile.write(line)
        else:
            break 

print(f"Extraction terminée : {num_lines_to_extract} lignes ont été écrites dans {output_file}.")



Nombre total de lignes : 6689783
Nombre de lignes à extraire : 1337956
Extraction terminée : 1337956 lignes ont été écrites dans ../Youdas/echantillon_20_pourcent.txt.


In [None]:
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Charger le modèle (3 labels : entailment, neutral, contradiction)
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=3)

print("Tokenizer et modèle chargés !")

In [5]:


nlp = spacy.load("fr_core_news_md")

# 2. Définir la fonction d'annotation
def annotate_pos(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    pos_tags = [token.pos_ for token in doc]
    return {"tokens": tokens, "pos_tags": pos_tags}

In [6]:
def process_text_file(input_file, output_file):
    dataset = []

    with open(input_file, "r", encoding="utf-8") as f:
        lines = (line.strip() for line in f if line.strip())

        for doc in nlp.pipe(lines, batch_size=2000):
            
            dataset.append({
                "tokens": [token.text for token in doc],
                "pos_tags": [token.pos_ for token in doc]
            })

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=2)

    print(f"Dataset sauvegardé dans {output_file} avec {len(dataset)} exemples.")


In [7]:
#input_file = "../Youdas/echantillon_20_pourcent.txt"  
#output_file = "../Youdas/echantillon_20_pourcent.json"  # Fichier de sortie
#process_text_file(input_file, output_file)

Dataset sauvegardé dans ../Youdas/echantillon_20_pourcent.json avec 1337956 exemples.


In [8]:

JSON_PATH = "../Youdas/echantillon_20_pourcent.json"
OUT_DIR = "./postagging_dataset_20%"
SEED = 42
BUFFER_SIZE = 100_000

def get_stream():
    ds = load_dataset(
        "json",
        data_files=JSON_PATH,
        split="train",
        streaming=True
    )
    ds = ds.shuffle(seed=SEED, buffer_size=BUFFER_SIZE)
    return ds

def train_gen():
    return islice(get_stream(), 0, None, 10)

def eval_gen():
    return islice(get_stream(), 1, None, 10)

def test_gen():
    return islice(get_stream(), 2, None, 10)

print(" Conversion en Arrow...")

train_dataset = Dataset.from_generator(train_gen)
eval_dataset  = Dataset.from_generator(eval_gen)
test_dataset  = Dataset.from_generator(test_gen)

train_dataset.save_to_disk(f"{OUT_DIR}/train")
eval_dataset.save_to_disk(f"{OUT_DIR}/eval")
test_dataset.save_to_disk(f"{OUT_DIR}/test")

print(" Terminé")
print(f"Train: {len(train_dataset)}")
print(f"Eval : {len(eval_dataset)}")
print(f"Test : {len(test_dataset)}")


 Conversion en Arrow...


Saving the dataset (0/1 shards):   0%|          | 0/133796 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/133796 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/133796 [00:00<?, ? examples/s]

 Terminé
Train: 133796
Eval : 133796
Test : 133796


In [9]:



# Récupérer tous les tags uniques
all_tags = set(tag for example in train_dataset["pos_tags"] for tag in example)
tag2id = {tag: id for id, tag in enumerate(sorted(all_tags))}
id2tag = {id: tag for tag, id in tag2id.items()}
print("Mapping des labels :", id2tag)

num_labels = len(tag2id)

print(f"Nombre de tags uniques : {num_labels}")
print("Exemple de mapping :", {k: tag2id[k] for k in list(tag2id)[:5]})


Mapping des labels : {0: 'ADJ', 1: 'ADP', 2: 'ADV', 3: 'AUX', 4: 'CCONJ', 5: 'DET', 6: 'INTJ', 7: 'NOUN', 8: 'NUM', 9: 'PRON', 10: 'PROPN', 11: 'PUNCT', 12: 'SCONJ', 13: 'SPACE', 14: 'SYM', 15: 'VERB', 16: 'X'}
Nombre de tags uniques : 17
Exemple de mapping : {'ADJ': 0, 'ADP': 1, 'ADV': 2, 'AUX': 3, 'CCONJ': 4}


In [10]:

tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["pos_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:  # Token spécial ([CLS], [SEP], [PAD])
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Nouveau mot
                label_ids.append(tag2id[label[word_idx]])
            else:  # Sous-mot du même mot
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Appliquer à tout le dataset
tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_eval = eval_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test = test_dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/133796 [00:00<?, ? examples/s]

In [11]:
import torch
torch.cuda.empty_cache()

In [12]:
import torch
import numpy as np
from transformers import (
    CamembertTokenizerFast,
    CamembertForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from sklearn.metrics import accuracy_score

# ------------------------------------------------------------
# 0. Sélection automatique GPU / CPU
# ------------------------------------------------------------
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(" GPU détecté :", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print(" Aucun GPU détecté → utilisation du CPU")

# ------------------------------------------------------------
# 1. Charger le tokenizer
# ------------------------------------------------------------
tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")

# ------------------------------------------------------------
# 2. Charger le modèle
# ------------------------------------------------------------
model = CamembertForTokenClassification.from_pretrained(
    "camembert-base",
    num_labels=num_labels
).to(device)

print(f"✓ Modèle déplacé sur : {device}")

# ------------------------------------------------------------
# 3. Fonction compute_metrics pour accuracy
# ------------------------------------------------------------
def compute_metrics(pred):
    predictions, labels = pred
    preds = np.argmax(predictions, axis=-1)

    mask = labels != -100
    preds = preds[mask]
    labels = labels[mask]

    accuracy = accuracy_score(labels, preds)
    return {"accuracy": accuracy}

# ------------------------------------------------------------
# 4. Data collator
# ------------------------------------------------------------
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    return_tensors="pt"
)

# ------------------------------------------------------------
# 5. Arguments d'entraînement
# ------------------------------------------------------------
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",     
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,

    fp16=torch.cuda.is_available(),   
    report_to="none",

    load_best_model_at_end=True,      
    metric_for_best_model="accuracy", 
    greater_is_better=True,           
)

# ------------------------------------------------------------
# 6. Trainer
# ------------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ------------------------------------------------------------
# 7. Entraînement
# ------------------------------------------------------------
print(f"Début de l'entraînement sur : {device}")
trainer.train()


 GPU détecté : Quadro RTX 6000


Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Modèle déplacé sur : cuda
Début de l'entraînement sur : cuda


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1722,0.138877,0.954358
2,0.1265,0.124076,0.958344
3,0.1059,0.11975,0.959843


TrainOutput(global_step=25089, training_loss=0.17180974402050028, metrics={'train_runtime': 3408.6761, 'train_samples_per_second': 117.755, 'train_steps_per_second': 7.36, 'total_flos': 4.593428184984254e+16, 'train_loss': 0.17180974402050028, 'epoch': 3.0})

In [17]:
# Sauvegarder le modèle et le tokenizer
model_save_path = "./modele_20"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"✓ Modèle et tokenizer sauvegardés dans : {model_save_path}")


✓ Modèle et tokenizer sauvegardés dans : ./modele_20


In [None]:
from seqeval.metrics import classification_report

predictions = trainer.predict(tokenized_test)
preds = np.argmax(predictions.predictions, axis=2)

# Convertir les IDs en tags
pred_labels = [[id2tag[p] for p in pred if p != -100] for pred in preds]
true_labels = [[id2tag[l] for l in label if l != -100] for label in tokenized_test["labels"]]

print(classification_report(true_labels, pred_labels))


In [18]:
from transformers import CamembertForTokenClassification, AutoTokenizer

# Chemin vers le dossier contenant votre modèle
modele_path = "../Youdas/modele_20"

# Charger le modèle
model = CamembertForTokenClassification.from_pretrained(modele_path).to("cuda")

# Charger le tokenizer
tokenizer = AutoTokenizer.from_pretrained(modele_path)

print("✓ Modèle et tokenizer chargés avec succès !")


✓ Modèle et tokenizer chargés avec succès !


In [20]:
# Ensemble de phrases de test avec leurs étiquettes attendues
test_phrases = [
    ("Le chat dort sur le tapis.", [
        ("Le", "DET"), ("chat", "NOUN"), ("dort", "VERB"),
        ("sur", "ADP"), ("le", "DET"), ("tapis", "NOUN"), (".", "PUNCT")
    ]),
    ("Marie mange une pomme verte.", [
        ("Marie", "PROPN"), ("mange", "VERB"), ("une", "DET"),
        ("pomme", "NOUN"), ("verte", "ADJ"), (".", "PUNCT")
    ]),
    ("Nous allons au parc demain.", [
        ("Nous", "PRON"), ("allons", "VERB"), ("au", "ADP"),
        ("parc", "NOUN"), ("demain", "ADV"), (".", "PUNCT")
    ]),
    ("Paris est une ville magnifique.", [
        ("Paris", "PROPN"), ("est", "AUX"), ("une", "DET"),
        ("ville", "NOUN"), ("magnifique", "ADJ"), (".", "PUNCT")
    ]),
    ("Oh, comme ce gâteau est délicieux !", [
        ("Oh", "INTJ"), (",", "PUNCT"), ("comme", "ADV"), ("ce", "DET"),
        ("gâteau", "NOUN"), ("est", "AUX"), ("délicieux", "ADJ"),
        ("!", "PUNCT")
    ]),
    ("Apple a présenté l’iPhone 15 à New York.", [
        ("Apple", "PROPN"), ("a", "AUX"), ("présenté", "VERB"),
        ("l’", "DET"), ("iPhone", "PROPN"), ("15", "NUM"),
        ("à", "ADP"), ("New", "PROPN"), ("York", "PROPN"), (".", "PUNCT")
    ]),
]

# Dictionnaire id2tag (à adapter si nécessaire)
id2tag = {
    0: 'ADJ', 1: 'ADP', 2: 'ADV', 3: 'AUX', 4: 'CCONJ',
    5: 'DET', 6: 'INTJ', 7: 'NOUN', 8: 'NUM', 9: 'PRON',
    10: 'PROPN', 11: 'PUNCT', 12: 'SCONJ', 13: 'SPACE',
    14: 'SYM', 15: 'VERB', 16: 'X'
}

tag2id = {v: k for k, v in id2tag.items()}


In [46]:
import torch
from transformers import CamembertTokenizerFast, CamembertForTokenClassification

# Charger le modèle et le tokenizer
model_path = "../Youdas/modele_20"
tokenizer = CamembertTokenizerFast.from_pretrained(model_path)
model = CamembertForTokenClassification.from_pretrained(model_path).to("cuda")
"""
def predire_pos(texte, model, tokenizer, id2tag):
    inputs = tokenizer(texte.split(), return_tensors="pt", truncation=True, is_split_into_words=True).to("cuda")
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)
    predicted_tags = [id2tag[id.item()] for id in predictions[0]]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    return tokens, predicted_tags
    """


def predire_pos(texte, model, tokenizer, id2tag):
    # Découpage propre : "tapis." devient ["tapis", "."]
    doc = nlp(texte)
    mots = [token.text for token in doc]
    
    # Encodage avec is_split_into_words=True
    inputs = tokenizer(mots, return_tensors="pt", truncation=True, is_split_into_words=True).to("cuda")
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    predictions = torch.argmax(outputs.logits, dim=2)[0]
    word_ids = inputs.word_ids(batch_index=0)
    
    aligned_tags = []
    previous_word_idx = None
    
    for i, word_idx in enumerate(word_ids):
        # On garde uniquement le premier sous-token de chaque mot (ignore les tokens spéciaux)
        if word_idx is not None and word_idx != previous_word_idx:
            aligned_tags.append(id2tag[predictions[i].item()])
        previous_word_idx = word_idx
        
    return mots, aligned_tags


In [47]:
def regrouper_sous_tokens(tokens, tags):
    mots_et_tags = []
    current_token = None
    current_tag = None

    for token, tag in zip(tokens, tags):
        if token.startswith("##"):
            if current_token:
                current_token += token[2:]
        else:
            if current_token:
                mots_et_tags.append((current_token.replace("▁", ""), current_tag))
            current_token = token.replace("▁", "")
            current_tag = tag
    if current_token:
        mots_et_tags.append((current_token, current_tag))

    return mots_et_tags


In [48]:
from sklearn.metrics import accuracy_score

def calculer_precision(test_phrases, model, tokenizer, id2tag):
    y_true = []
    y_pred = []

    for phrase, expected_tags in test_phrases:
        mots, predicted_tags = predire_pos(phrase, model, tokenizer, id2tag)

        # Aligner les mots prédits avec les mots attendus
        for (mot_exp, tag_exp), tag_pred in zip(expected_tags, predicted_tags):
            y_true.append(tag_exp)
            y_pred.append(tag_pred)

    precision = accuracy_score(y_true, y_pred)
    return precision, y_true, y_pred


In [49]:
precision, y_true, y_pred = calculer_precision(test_phrases, model, tokenizer, id2tag)

print(f"Précision globale : {precision:.2%}")

# Afficher les résultats détaillés
for i, (phrase, expected_tags) in enumerate(test_phrases):
    print(f"\nPhrase {i+1} : {phrase}")
    tokens, predicted_tags = predire_pos(phrase, model, tokenizer, id2tag)
    predicted_grouped = regrouper_sous_tokens(tokens, predicted_tags)

    print("Attendu\t\tPrédit")
    for (mot_exp, tag_exp), (mot_pred, tag_pred) in zip(expected_tags, predicted_grouped):
        print(f"{mot_exp} ({tag_exp})\t\t{mot_pred} ({tag_pred})")


Précision globale : 95.35%

Phrase 1 : Le chat dort sur le tapis.
Attendu		Prédit
Le (DET)		Le (DET)
chat (NOUN)		chat (NOUN)
dort (VERB)		dort (VERB)
sur (ADP)		sur (ADP)
le (DET)		le (DET)
tapis (NOUN)		tapis (NOUN)
. (PUNCT)		. (PUNCT)

Phrase 2 : Marie mange une pomme verte.
Attendu		Prédit
Marie (PROPN)		Marie (PROPN)
mange (VERB)		mange (VERB)
une (DET)		une (DET)
pomme (NOUN)		pomme (NOUN)
verte (ADJ)		verte (ADJ)
. (PUNCT)		. (PUNCT)

Phrase 3 : Nous allons au parc demain.
Attendu		Prédit
Nous (PRON)		Nous (PRON)
allons (VERB)		allons (VERB)
au (ADP)		au (ADP)
parc (NOUN)		parc (NOUN)
demain (ADV)		demain (ADV)
. (PUNCT)		. (PUNCT)

Phrase 4 : Paris est une ville magnifique.
Attendu		Prédit
Paris (PROPN)		Paris (PROPN)
est (AUX)		est (AUX)
une (DET)		une (DET)
ville (NOUN)		ville (NOUN)
magnifique (ADJ)		magnifique (ADJ)
. (PUNCT)		. (PUNCT)

Phrase 5 : Oh, comme ce gâteau est délicieux !
Attendu		Prédit
Oh (INTJ)		Oh (ADV)
, (PUNCT)		, (PUNCT)
comme (ADV)		comme (SCONJ)
ce (DE