## Token classification

The first application we’ll explore is token classification. This generic task encompasses any problem that can be formulated as “attributing a label to each token in a sentence”.

In [1]:
#@title import
%%capture
# Install the Hugging Face libraries used below, plus seqeval for the NER metric.
# NOTE(review): no versions are pinned, so a fresh run installs whatever is current.
!pip install transformers[torch] datasets seqeval evaluate

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    pipeline,
    set_seed,
)

In [2]:
#@title Load dataset from dataset library
# Fetch the "conll2003" dataset by name; the result holds train, validation and test splits.
raw_datasets = load_dataset("conll2003")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/283k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

# Checking the data

In [3]:
type(raw_datasets)

datasets.dataset_dict.DatasetDict

In [4]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [5]:
# Look at the first training example: its word list and the integer NER tag of each word.
example_0 = raw_datasets["train"][0]
example_0["tokens"], example_0["ner_tags"]

(['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 [3, 0, 7, 0, 0, 0, 7, 0, 0])

In [6]:
# The feature metadata of `ner_tags` describes how the integer tags map to label names.
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [7]:
# Label names in id order: position i holds the name of integer tag i.
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

# Choosing the model

In [8]:
# The checkpoint name is stored once because the model loader further down reuses it.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenization

In [9]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: integer label of each word, indexed by word position.
        word_ids: for each token, the index of the word it belongs to, or
            ``None`` for a token that belongs to no word (special token).

    Returns:
        A list with one entry per element of ``word_ids``: ``-100`` for
        ``None`` entries, the word's label for the first token of a word, and
        for every following token of the same word the label with odd ids
        bumped by one (B-XXX becomes the matching I-XXX).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        continues_word = word_id == previous_word
        previous_word = word_id

        if word_id is None:
            # Token that belongs to no word: marked with -100.
            aligned.append(-100)
            continue

        tag = labels[word_id]
        # Only the first token of a word may keep a B- tag (odd id);
        # later tokens of the same word get the I- tag (next even id).
        if continues_word and tag % 2 == 1:
            tag += 1
        aligned.append(tag)

    return aligned

In [10]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: batch mapping with a ``"tokens"`` entry (one word list per
            sentence) and a ``"ner_tags"`` entry (one label list per sentence).

    Returns:
        The tokenizer output for the batch, with an added ``"labels"`` entry
        holding, per sentence, the labels produced by
        ``align_labels_with_tokens`` for that sentence's word ids.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(index) gives the token-to-word mapping of sentence `index` in the batch.
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(word_labels, tokenized_inputs.word_ids(index))
        for index, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

In [11]:
# Tokenize every split in batches. The original columns are dropped so that each split
# keeps only the tokenizer outputs and the aligned `labels`.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [12]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [13]:
# Collator that assembles lists of tokenized examples into batches, built around this tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [14]:
# Collate the first two training examples; the shorter example's labels are padded with -100.
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

In [15]:
tokenized_datasets["train"][1]

{'input_ids': [101, 1943, 14428, 102],
 'token_type_ids': [0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1],
 'labels': [-100, 1, 2, -100]}

In [16]:
raw_datasets["train"][1]

{'id': '1',
 'tokens': ['Peter', 'Blackburn'],
 'pos_tags': [22, 22],
 'chunk_tags': [11, 12],
 'ner_tags': [1, 2]}

In [17]:
# Load the seqeval metric, then decode the first example's integer tags into tag names,
# since the metric is called with tag strings below.
metric = evaluate.load("seqeval")

labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [18]:
# Sanity check of the metric: copy the reference tags, change the tag at index 2 to "O",
# and score the modified copy against the reference.
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.8,
 'overall_accuracy': 0.8888888888888888}

In [19]:
def compute_metrics(eval_preds):
    """Turn raw evaluation outputs into overall seqeval scores.

    Args:
        eval_preds: pair ``(logits, labels)``; the predicted class of each
            position is the argmax of ``logits`` over its last axis, and
            positions whose label is ``-100`` are excluded from scoring.

    Returns:
        Dict with the metric's overall ``precision``, ``recall``, ``f1`` and
        ``accuracy``.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Reference tag names, skipping positions marked with the ignored index.
    reference_tags = []
    for label_row in labels:
        reference_tags.append(
            [label_names[tag_id] for tag_id in label_row if tag_id != -100]
        )

    # Predicted tag names, kept only where the reference label is not ignored.
    predicted_tags = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        kept = []
        for predicted_id, tag_id in zip(predicted_row, label_row):
            if tag_id != -100:
                kept.append(label_names[predicted_id])
        predicted_tags.append(kept)

    scores = metric.compute(predictions=predicted_tags, references=reference_tags)
    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

In [20]:
# Lookup tables between integer label ids and label names, in both directions.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [21]:
print(id2label)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}


In [22]:
print(label2id)

{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}


In [23]:
# Seed the random number generators before the model is built. The Trainer only
# seeds when it is constructed, which happens after this cell, so without this call
# any randomly initialised weights would differ between runs.
# NOTE(review): assumes the classification head is randomly initialised for this
# label set — confirm against the loading warnings.
set_seed(42)  # same value as the default `seed` of TrainingArguments

# Load the checkpoint with a token-classification head sized by the label mappings.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [24]:
model.config.num_labels

9

# Training

In [25]:
# NOTE(review): this configuration is replaced by the `args` assignment in the next cell
# before the Trainer is created, so these values (3 epochs, learning rate 2e-5,
# weight decay 0.01) do not affect the training run below.
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # push_to_hub=True,
)

In [26]:
# Training configuration passed to the Trainer: evaluate and save a checkpoint once per
# epoch, and when training ends reload the checkpoint with the highest "f1" score.
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=20,
    # weight_decay=0.01,
    adam_epsilon = 1e-8,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Best-checkpoint selection: higher "f1" (as returned by compute_metrics) wins.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # push_to_hub=True,
)

In [27]:
# Fine-tune on the train split; the validation split is scored with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.085901,0.855917,0.906765,0.880608,0.975614
2,0.258600,0.061248,0.902744,0.935712,0.918932,0.982855
3,0.061000,0.066051,0.910691,0.940424,0.925319,0.982634
4,0.039000,0.056622,0.924634,0.945641,0.93502,0.985386
5,0.025700,0.061473,0.926139,0.947492,0.936694,0.985577
6,0.017800,0.066397,0.932638,0.948334,0.940421,0.985857
7,0.013800,0.065689,0.93805,0.950522,0.944245,0.987034
8,0.010400,0.069013,0.936318,0.950185,0.943201,0.986666
9,0.010400,0.071072,0.92977,0.951363,0.940443,0.985989
10,0.007100,0.075018,0.933399,0.952878,0.943038,0.986254


TrainOutput(global_step=8780, training_loss=0.02624698114965391, metrics={'train_runtime': 3068.2551, 'train_samples_per_second': 91.524, 'train_steps_per_second': 2.862, 'total_flos': 7791024707068194.0, 'train_loss': 0.02624698114965391, 'epoch': 20.0})

# Inference

In [31]:
# Wrap the fine-tuned model in a token-classification pipeline. `model.to("cpu")` moves the
# model to the CPU before it is handed over; with aggregation_strategy="simple" the results
# are reported per entity group (see the outputs of the examples that follow).
token_classifier = pipeline(
    "token-classification", model=model.to("cpu"), tokenizer=tokenizer, aggregation_strategy="simple"
)

## A few examples of inference

In [32]:
token_classifier("I like this Apple laptop, it is very good, so fast! I bought it in Antananarivo ")

[{'entity_group': 'ORG',
  'score': 0.9980337,
  'word': 'Apple',
  'start': 12,
  'end': 17},
 {'entity_group': 'LOC',
  'score': 0.99982107,
  'word': 'Antananarivo',
  'start': 67,
  'end': 79}]

In [33]:
token_classifier("I like this apple, it is very good, it's sweet, from a farm company called Pomme de Cape Town")

[{'entity_group': 'ORG',
  'score': 0.89840376,
  'word': 'Pomme de Cape Town',
  'start': 75,
  'end': 93}]

In [34]:
token_classifier("Je pense que la bague que tu as acheté vient de Lagos")

[{'entity_group': 'PER',
  'score': 0.9869337,
  'word': 'Lagos',
  'start': 48,
  'end': 53}]

# Evaluate on the test data

In [35]:
# A second Trainer used only for evaluation: same model, arguments, collator and metric
# function, but `eval_dataset` is the test split and no training data is supplied.
trainer_2 = Trainer(
    model=model,
    args=args,
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [36]:
trainer_2.evaluate()

{'eval_loss': 0.22726663947105408,
 'eval_precision': 0.8881058783086971,
 'eval_recall': 0.9148371104815864,
 'eval_f1': 0.9012733298447584,
 'eval_accuracy': 0.9724240084461323,
 'eval_runtime': 15.1752,
 'eval_samples_per_second': 227.542,
 'eval_steps_per_second': 3.558}

---
**References**

- https://huggingface.co/learn/nlp-course/en/chapter7/2