In [1]:
from datasets import load_dataset, Dataset

# Load the "konverner/fr-address" dataset via `datasets.load_dataset`
# (presumably fetched from the Hugging Face Hub — confirm it is not a local path).
ds = load_dataset("konverner/fr-address")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pull the "train" split into a pandas DataFrame for easy column-wise editing.
df = ds["train"].to_pandas()

# Binarize the per-token tags: every non-zero tag becomes 1, tag 0 stays 0.
df["labels"] = df["labels"].map(lambda tags: [int(tag != 0) for tag in tags])


In [3]:
# Convert the edited DataFrame back into a `Dataset`, going through a
# {column name: list of values} dict.
# NOTE(review): this rebinds `ds`, which until now held the object returned by
# `load_dataset`; the earlier cell that reads `ds["train"]` therefore cannot be
# re-run after this one without reloading the data first.
ds = Dataset.from_dict( df.to_dict(orient='list'))

In [4]:
# Hold out 20% of the rows as a test split; the fixed seed is passed so the
# split can be reproduced across runs.
ds = ds.train_test_split(test_size=0.2, seed=2023)

In [5]:
# Display the split dataset (split names, features and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 4400
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 1100
    })
})

In [6]:
from transformers import CamembertForTokenClassification, CamembertTokenizerFast

# Single source of truth for the pretrained checkpoint shared by the tokenizer
# and the model.
MODEL_CHECKPOINT = "camembert-base"

# Fast tokenizer: the label-alignment step relies on its `word_ids()` mapping.
tokenizer = CamembertTokenizerFast.from_pretrained(MODEL_CHECKPOINT)

# Token-classification head with two classes, matching the binarized 0/1 labels.
model = CamembertForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=2,
)

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and project word-level labels onto sub-tokens.

    Args:
        examples: A batch mapping with a 'tokens' entry (one list of words per
            example) and a 'labels' entry (one list of word-level labels per
            example, indexed by word position).

    Returns:
        The tokenizer output for the batch, with an added 'labels' entry
        holding one aligned label list per example.

    Labelling rule, per sub-token:
        * no originating word (``word_ids`` gives ``None``)  -> -100
        * first sub-token of a word                          -> the word's label
        * later sub-token of a word whose label is 0         -> 0
        * later sub-token of a word whose label is non-zero  -> -100

    -100 is presumably used so the loss skips those positions (it is the
    default ``ignore_index`` of PyTorch's cross-entropy loss) — confirm.
    """
    encoded = tokenizer(
        examples['tokens'],
        padding="max_length",
        truncation=True,
        is_split_into_words=True,
    )

    def _align(word_labels, word_ids):
        # Walk the sub-tokens once, remembering which word the previous one
        # came from so continuation pieces can be told apart from word starts.
        aligned = []
        last_word = None
        for word in word_ids:
            if word is None:
                aligned.append(-100)
            elif word == last_word and word_labels[word] != 0:
                # Continuation piece of a positively-labelled word: masked.
                aligned.append(-100)
            else:
                # Word start, or continuation piece of a 0-labelled word.
                aligned.append(word_labels[word])
            last_word = word
        return aligned

    encoded['labels'] = [
        _align(word_labels, encoded.word_ids(batch_index=row))
        for row, word_labels in enumerate(examples['labels'])
    ]
    return encoded

In [8]:
# Apply tokenization + label alignment to every split; `batched=True` hands
# the function a batch of examples per call rather than a single example.
tokenized_datasets = ds.map(tokenize_and_align_labels, batched=True)


Map: 100%|██████████| 4400/4400 [00:01<00:00, 3361.67 examples/s]
Map: 100%|██████████| 1100/1100 [00:00<00:00, 3803.57 examples/s]


In [9]:
from transformers import Trainer, TrainingArguments

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2023-12-07 19:43:54.242512: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# Define the training arguments.
# Checkpoints are saved at the end of each epoch. The run is only 1650
# optimizer steps long (4400 examples / batch size 8 * 3 epochs), so a
# step-based schedule with save_steps=10_000 would never write a checkpoint
# and a crash late in training would lose all progress.
training_args = TrainingArguments(
    output_dir='./ner_model',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_strategy="epoch",
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
)

In [11]:
# Display the tokenized splits to check the columns added by the mapping step.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 4400
    })
    test: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1100
    })
})

In [12]:
# Define the Trainer.
# NOTE(review): an eval dataset is supplied, but nothing visible here triggers
# an evaluation pass (no explicit evaluate() call) — confirm whether evaluation
# is expected to run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'], 
    eval_dataset=tokenized_datasets['test'],
)

# Train the model.
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  0%|          | 1/1650 [00:32<14:41:13, 32.06s/it]