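This notebook fine-tunes the `distilbert-base-uncased` model on the CoNLL-2003 dataset for named-entity recognition (NER), evaluates the result on the validation split, and times batch inference.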

In [1]:
from datasets import load_dataset

# Load the CoNLL-2003 dataset by its Hub name.
raw_datasets = load_dataset("conll2003")
# Trailing expression: display the available splits and their columns.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [2]:
# Feature descriptor of the `ner_tags` column; its inner feature carries the tag names.
ner_feature = raw_datasets["train"].features["ner_tags"]
# Tag strings; their list position is used as the integer label id later in the notebook.
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [3]:
from transformers import AutoTokenizer

# TODO: try some other models, e.g. distilbert-base-cased, distilbert-base-multilingual-cased
# Hub id of the base checkpoint; reused below when the token-classification model is created.
model_checkpoint = "distilbert/distilbert-base-uncased"
# Tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Preprocessing

In [4]:
from functools import partial

from min_name_classifier.utils import tokenize_and_align_labels

# Bind the tokenizer up front so `map` only has to supply the batch of examples.
align_labels = partial(tokenize_and_align_labels, tokenizer)

# Every original column is dropped; only what the helper returns is kept.
original_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    align_labels,
    batched=True,
    remove_columns=original_columns,
)

In [5]:
from transformers import DataCollatorForTokenClassification

# Token-classification collator built around the notebook's tokenizer; it is passed
# to both the Trainer and the evaluation DataLoader below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [6]:
import evaluate

# seqeval metric; handed to `compute_metrics` (training) and `NER` (evaluation) below.
seqeval = evaluate.load("seqeval")

# Training

In [7]:
# Integer id -> tag string, following the order of `label_names`.
id2label = dict(enumerate(label_names))
# Reverse lookup: tag string -> integer id.
label2id = {tag: idx for idx, tag in id2label.items()}

In [8]:
from transformers import AutoModelForTokenClassification

# Load the base checkpoint with a token-classification head, passing the label maps
# built from `label_names`. The classifier weights are newly initialised (see the
# warning in the cell output), so the model must be fine-tuned before use.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Sanity check: should equal the number of entries in `label_names`.
model.config.num_labels

9

In [10]:
# Hugging Face Hub login -- uncomment when pushing the model (`push_to_hub=True` below).
# from huggingface_hub import notebook_login
# 
# notebook_login()

In [11]:
from transformers import Trainer, TrainingArguments

from min_name_classifier.utils import compute_metrics


# Local checkpoint directory for this fine-tuning run.
output_dir = "models/distilbert-uncased-finetuned-ner"

# Three epochs, evaluating and checkpointing at the end of each one.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version before upgrading.
# `push_to_hub=True` needs Hub credentials (see the commented-out login cell).
args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=True,
)

# The metric and label names are bound here so the Trainer can call
# `compute_metrics` with its own single argument.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=partial(compute_metrics, metric=seqeval, label_names=label_names),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [12]:
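# Training is disabled here; the fine-tuned weights are loaded from the Hub further down.
# Uncomment to retrain from the base checkpoint.
# trainer.train()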

In [13]:
# Uncomment after retraining to upload the final model to the Hub.
# trainer.push_to_hub(commit_message="Training complete")

In [14]:
# Load the previously fine-tuned model from the Hub (the training call above is
# commented out). The same label maps are passed as for the base model.
trained_model = AutoModelForTokenClassification.from_pretrained(
    "jackfriedson/distilbert-uncased-finetuned-ner",
    id2label=id2label,
    label2id=label2id,
)

# Evaluation

In [15]:
from torch.utils.data import DataLoader

# Validation examples in batches of 8, assembled by the token-classification collator.
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [16]:
from min_name_classifier.utils import NER

# Wrap the fine-tuned model, tokenizer, label names and metric in the project's NER
# helper, then score it on the validation batches. The value returned by
# `evaluate_model` is displayed as the cell output.
ner = NER(model=trained_model, tokenizer=tokenizer, label_names=label_names, metric=seqeval)
ner.evaluate_model(eval_dataloader)

  0%|          | 0/407 [00:00<?, ?it/s]

{'LOC': {'precision': 0.9511802575107297,
  'recall': 0.965160587915079,
  'f1': 0.9581194271818428,
  'number': 1837},
 'MISC': {'precision': 0.8522483940042827,
  'recall': 0.8633405639913232,
  'f1': 0.8577586206896551,
  'number': 922},
 'ORG': {'precision': 0.8823529411764706,
  'recall': 0.9060402684563759,
  'f1': 0.8940397350993378,
  'number': 1341},
 'PER': {'precision': 0.9740540540540541,
  'recall': 0.9782844733984799,
  'f1': 0.9761646803900326,
  'number': 1842},
 'overall_precision': 0.9271369294605809,
 'overall_recall': 0.9400875126220128,
 'overall_f1': 0.9335673101027826,
 'overall_accuracy': 0.9852256660365069}

# Inference

In [17]:
from min_name_classifier.utils import split_into_words

# Single-input smoke test: tag one lowercase full name.
ner.get_predictions(split_into_words("john smith"))

['B-PER', 'I-PER']

In [18]:
# Negative example: a common noun, expected to get no entity tag.
ner.get_predictions(split_into_words("building"))

['O']

In [19]:
# Hand-picked examples: person names, a common noun, a company, and a full sentence.
raw_inputs = [
    "john smith",
    "abraham lincoln",
    "building",
    "datadog",
    "smith, john",
    "sylvain went to the store with martha",
]
# One word list per raw string, as expected by the batch prediction call.
inputs = [split_into_words(text) for text in raw_inputs]
ner.get_predictions_batch(inputs)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[['B-PER', 'I-PER'],
 ['B-PER', 'I-PER'],
 ['O'],
 ['B-ORG'],
 ['B-PER', 'O', 'B-PER'],
 ['B-PER', 'O', 'O', 'O', 'O', 'O', 'B-PER']]

In [29]:
def time_inference(word_inputs: list[list[str]]):
    result = %timeit -r 15 -o ner.get_predictions_batch(word_inputs)
    print(f"Predicted {len(word_inputs)} words in {1000 * result.average:.2f} ms")
    print(f"{1000 * result.average / len(word_inputs):.2f} ms per word")

time_inference(inputs)

29.7 ms ± 1.58 ms per loop (mean ± std. dev. of 15 runs, 10 loops each)
Predicted 6 words in 29.67 ms
4.94 ms per word


In [31]:
# Medium batch: the example strings repeated 10 times.
inputs_med = [split_into_words(text) for text in raw_inputs * 10]
time_inference(inputs_med)

107 ms ± 1.8 ms per loop (mean ± std. dev. of 15 runs, 10 loops each)
Predicted 60 words in 106.74 ms
1.78 ms per word


In [32]:
# Large batch: the example strings repeated 100 times.
inputs_large = [split_into_words(text) for text in raw_inputs * 100]
time_inference(inputs_large)

821 ms ± 55 ms per loop (mean ± std. dev. of 15 runs, 1 loop each)
Predicted 600 words in 821.32 ms
1.37 ms per word
