In [1]:
# imports
import os
from itertools import batched

import numpy as np
from datasets import load_dataset, ClassLabel
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    set_seed
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# consts
set_seed(42)

MODEL_NAME = "distilbert-base-uncased"
OUTPUT_DIR = "out/distilbert-base-uncased"
MAX_LENGTH = 256            # safe for 8GB GPU
PER_DEVICE_TRAIN_BATCH_SIZE = 8   # tune downward if OOM; gradient_accumulation_steps simulates larger batch
GRADIENT_ACCUMULATION_STEPS = 2    # effective batch = 16
PER_DEVICE_EVAL_BATCH_SIZE = 16
NUM_EPOCHS = 4
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06

import os
os.environ["HF_HOME"] = "../hf_cache"
os.environ["HF_DATASETS_CACHE"] = "../hf_cache/datasets"

os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

In [3]:
# dataset
raw_datasets = load_dataset("wikiann", "en", cache_dir="../hf_cache/datasets")

In [4]:
# BIO tags
features = raw_datasets["train"].features
ner_feature = features["ner_tags"]
label_list = ner_feature.feature.names
num_labels = len(label_list)
print("Labels:", label_list)

Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']


In [5]:
# tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, cache_dir="../hf_cache")
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=num_labels, cache_dir="../hf_cache")

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# enable (or NOT) gradient checkpoint
# try:
#     model.gradient_checkpointing_enable()
#     print("Gradient checkpointing enabled")
# except Exception:
#     pass

In [7]:
# tokenize and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=MAX_LENGTH,
        return_overflowing_tokens=False,
        return_offsets_mapping=False,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # list with length = #input_ids
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # special tokens
            elif word_idx != previous_word_idx:
                # first token of the word -> use label
                label_ids.append(label[word_idx])
            else:
                # for subsequent subtokens:
                # option A: set -100 to ignore
                # option B: repeat label (I- vs B- preference)
                # We'll repeat the label (simple approach) to help span continuity
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [8]:
# tokenize datasets
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names
)

Map: 100%|██████████| 10000/10000 [00:00<00:00, 51127.04 examples/s]


In [9]:
# data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

In [10]:
# metrics
metric = evaluate.load("seqeval")

In [11]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [label_list[l] for (l, p) in zip(label, pred) if l != -100]
        for label, pred in zip(labels, predictions)
    ]
    true_predictions = [
        [label_list[p] for (l, p) in zip(label, pred) if l != -100]
        for label, pred in zip(labels, predictions)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [12]:
import accelerate

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=3,
    dataloader_num_workers=2,
    run_name="deberta-v3-base-conll",
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [14]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3236,0.287548,0.790071,0.801227,0.79561,0.911838
2,0.2398,0.273795,0.800587,0.824822,0.812524,0.918709
3,0.1736,0.282427,0.814164,0.823604,0.818857,0.919819
4,0.1279,0.29206,0.813012,0.82816,0.820516,0.920966


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=5000, training_loss=0.27821298751831053, metrics={'train_runtime': 210.2888, 'train_samples_per_second': 380.429, 'train_steps_per_second': 23.777, 'total_flos': 503319198891696.0, 'train_loss': 0.27821298751831053, 'epoch': 4.0})

In [15]:
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# 11) Quick eval on test set
print("Evaluating on test set...")
metrics = trainer.evaluate(tokenized_datasets["test"])
print(metrics)

Evaluating on test set...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'eval_loss': 0.2781869173049927, 'eval_precision': 0.8170975036580499, 'eval_recall': 0.8346014492753623, 'eval_f1': 0.8257567270853404, 'eval_accuracy': 0.9236028976936724, 'eval_runtime': 4.0357, 'eval_samples_per_second': 2477.913, 'eval_steps_per_second': 154.87, 'epoch': 4.0}


In [16]:
def predict_sentence(sentence_tokens):
    # sentence_tokens: list of tokens (words)
    enc = tokenizer(sentence_tokens, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    # send to same device as model
    device = trainer.model.device
    enc = {k: v.to(device) for k, v in enc.items()}
    outputs = trainer.model(**enc)
    logits = outputs.logits.detach().cpu().numpy()
    pred_ids = np.argmax(logits, axis=-1)[0]
    word_ids = tokenizer(sentence_tokens, is_split_into_words=True).word_ids()
    # align back
    pred_labels = []
    last_w = None
    for idx, wid in enumerate(word_ids):
        if wid is None:
            continue
        # take labels for first subtoken of each word (wid change)
        if wid != last_w:
            pred_labels.append(label_list[pred_ids[idx]])
        last_w = wid
    return list(zip(sentence_tokens, pred_labels))

In [17]:
example = ["EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", "."]
print(predict_sentence(example))

[('EU', 'B-ORG'), ('rejects', 'O'), ('German', 'B-ORG'), ('call', 'I-ORG'), ('to', 'I-ORG'), ('boycott', 'O'), ('British', 'B-ORG'), ('lamb', 'I-ORG'), ('.', 'O')]
