In [2]:
from datasets import load_dataset

# Load the "conll2003" dataset and display its splits and columns.
raw_datasets = load_dataset("conll2003")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [3]:
# Read the class names of the `ner_tags` column; a tag id is used below as an
# index into `label_names`.
ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [4]:
from transformers import AutoTokenizer

# Candidate checkpoints to try:
# distilbert-base-uncased
# distilbert-base-cased
# distilbert-base-multilingual-cased

# The same checkpoint name is reused later to instantiate the model.
model_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Preprocessing

In [5]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per token.

    Args:
        labels: label ids, indexed by word position.
        word_ids: for each token, the index of the word it belongs to, or
            ``None`` for tokens that belong to no word (special tokens).

    Returns:
        A list with one entry per element of ``word_ids``: ``-100`` where the
        word id is ``None``, the word's label for the first token of a word,
        and for the following tokens of the same word the word's label with
        odd (B-XXX) ids bumped by one to the matching I-XXX id.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        continues_previous = word_id == previous_word
        previous_word = word_id

        if word_id is None:
            # Special token: marked with the ignore index.
            aligned.append(-100)
            continue

        tag = labels[word_id]
        if continues_previous and tag % 2 == 1:
            # Inside a word a B-XXX label becomes the corresponding I-XXX.
            tag += 1
        aligned.append(tag)

    return aligned

In [6]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Args:
        examples: a batch with a "tokens" column (one word list per example)
            and a "ner_tags" column (one label-id list per example).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry that
        holds, per example, the labels aligned to its tokens.
    """
    encoding = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(idx) maps every token of example idx back to its source word.
    encoding["labels"] = [
        align_labels_with_tokens(word_labels, encoding.word_ids(idx))
        for idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoding

In [7]:
# Tokenize every split in batches. The original columns are dropped, so only
# what `tokenize_and_align_labels` returns (tokenizer output plus "labels") remains.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [8]:
from transformers import DataCollatorForTokenClassification

# Batch collator built around the tokenizer; passed to the Trainer and to the
# evaluation DataLoaders further down.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [9]:
import evaluate
import numpy as np

# seqeval metric object shared by `compute_metrics` and `evaluate_model`.
metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    """Compute overall seqeval scores from raw logits and label ids.

    Args:
        eval_preds: a ``(logits, labels)`` pair; the predicted class of each
            position is the argmax over the last axis of ``logits``.

    Returns:
        A dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Positions labelled -100 are ignored; everything else becomes a tag name.
    reference_tags = []
    for label_row in labels:
        reference_tags.append([label_names[l] for l in label_row if l != -100])

    predicted_tags = []
    for prediction_row, label_row in zip(predicted_ids, labels):
        kept = []
        for p, l in zip(prediction_row, label_row):
            if l != -100:
                kept.append(label_names[p])
        predicted_tags.append(kept)

    scores = metric.compute(predictions=predicted_tags, references=reference_tags)
    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

# Training

In [10]:
# Two-way mapping between label ids and label names, passed to the model config.
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [11]:
from transformers import AutoModelForTokenClassification

# Token-classification model built from the base checkpoint. The warning printed
# below reports that the classifier weights are newly initialized.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Sanity check: the value should equal the number of entries in `label_names`.
model.config.num_labels

9

In [13]:
# from huggingface_hub import notebook_login
# 
# notebook_login()

In [14]:
from transformers import TrainingArguments

# Training configuration: evaluate and save once per epoch, for three epochs.
# NOTE(review): push_to_hub=True while the notebook_login cell above is commented
# out, so this presumably relies on previously stored Hub credentials; confirm.
output_dir = "models/distilbert-uncased-finetuned-ner"
args = TrainingArguments(
    output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [15]:
from transformers import Trainer

# Trainer wired to the tokenized CoNLL train/validation splits and the seqeval
# based `compute_metrics`.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
# Training is disabled in this run; uncomment the next line to fine-tune.
# trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [16]:
# trainer.push_to_hub(commit_message="Training complete")

In [17]:
# Load the already fine-tuned checkpoint by repository name instead of
# retraining, reusing the same label mappings.
trained_model = AutoModelForTokenClassification.from_pretrained(
    "jackfriedson/distilbert-uncased-finetuned-ner",
    id2label=id2label,
    label2id=label2id,
)

# Evaluation

In [67]:
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Batches of 8 over the tokenized CoNLL validation split, collated with `data_collator`.
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

def postprocess(predictions, labels):
    """Turn predicted and reference label-id tensors into lists of label names.

    Args:
        predictions: tensor of predicted label ids, one row per example.
        labels: tensor of reference label ids, with ``-100`` at ignored positions.

    Returns:
        ``(true_predictions, true_labels)``: per-example lists of label names
        with every position whose reference label is ``-100`` removed.
    """
    prediction_rows = predictions.detach().cpu().clone().numpy()
    label_rows = labels.detach().cpu().clone().numpy()

    # Drop ignored positions and map the remaining ids to label names.
    true_labels = []
    for label_row in label_rows:
        true_labels.append([label_names[l] for l in label_row if l != -100])

    true_predictions = []
    for prediction_row, label_row in zip(prediction_rows, label_rows):
        kept = []
        for p, l in zip(prediction_row, label_row):
            if l != -100:
                kept.append(label_names[p])
        true_predictions.append(kept)

    return true_predictions, true_labels


def evaluate_model(model, eval_dataloader, metric) -> dict:
    """Run the model over every batch and return the computed metric.

    Args:
        model: model called as ``model(**batch)`` whose output has ``.logits``.
        eval_dataloader: iterable of batches that contain a "labels" entry.
        metric: object providing ``add_batch(predictions=..., references=...)``
            and ``compute()``.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    model.eval()
    for batch in tqdm(eval_dataloader):
        # Gradients are not needed for the forward pass during evaluation.
        with torch.no_grad():
            logits = model(**batch).logits

        predicted_ids = logits.argmax(dim=-1)
        batch_predictions, batch_references = postprocess(predicted_ids, batch["labels"])
        metric.add_batch(predictions=batch_predictions, references=batch_references)

    return metric.compute()

In [19]:
# Score the fine-tuned model on the CoNLL validation batches.
evaluate_model(trained_model, eval_dataloader, metric)

  0%|          | 0/407 [00:00<?, ?it/s]

{'LOC': {'precision': 0.965160587915079,
  'recall': 0.9511802575107297,
  'f1': 0.9581194271818428,
  'number': 1864},
 'MISC': {'precision': 0.8633405639913232,
  'recall': 0.8522483940042827,
  'f1': 0.8577586206896551,
  'number': 934},
 'ORG': {'precision': 0.9060402684563759,
  'recall': 0.8823529411764706,
  'f1': 0.8940397350993378,
  'number': 1377},
 'PER': {'precision': 0.9782844733984799,
  'recall': 0.9740540540540541,
  'f1': 0.9761646803900326,
  'number': 1850},
 'overall_precision': 0.9400875126220128,
 'overall_recall': 0.9271369294605809,
 'overall_f1': 0.9335673101027826,
 'overall_accuracy': 0.9852256660365069}

# Inference

In [20]:
import re

# A "word" is either a run of word characters or one of the punctuation marks . , ! ? ;
word_split_regex = re.compile(r'\w+|[.,!?;]')

def split_into_words(input: str) -> list[str]:
    """Split raw text into words and the punctuation marks matched by `word_split_regex`.

    Any other character (whitespace, other symbols) acts as a separator and is dropped.
    """
    return [match.group(0) for match in word_split_regex.finditer(input)]

def label_ids_to_names(label_ids: list[int]) -> list[str]:
    """Map label ids to their names in `label_names`, skipping the ignore id -100."""
    names = []
    for label_id in label_ids:
        if label_id != -100:
            names.append(label_names[label_id])
    return names

def align_predictions_with_words(predictions: list[int] | torch.Tensor, word_ids: list[int | None]) -> list[int]:
    if len(predictions) != len(word_ids):
        raise ValueError(f"Predictions and word_ids should have the same length, got {len(predictions)} predictions and {len(word_ids)} word_ids")

    word_predictions = []
    current_word = None
    for prediction, word_id in zip(predictions, word_ids):
        if word_id is None:
            continue
        if word_id != current_word:
            # Start of a new word
            word_predictions.append(prediction)
        current_word = word_id
    return word_predictions


class NER:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def get_predictions(self, input: list[str]) -> list[str]:
        # Input must already be split into words
        tokenized_inputs = self.tokenizer(input, return_tensors="pt", is_split_into_words=True)
        with torch.no_grad():
            outputs = self.model(**tokenized_inputs)
        predictions = outputs.logits.argmax(dim=-1)
        word_predictions = align_predictions_with_words(predictions.tolist()[0], tokenized_inputs.word_ids())
        return label_ids_to_names(word_predictions)
    
    def get_predictions_batch(self, inputs: list[list[str]]) -> list[list[str]]:
        # Inputs must already be split into words
        tokenized_inputs = self.tokenizer(inputs, return_tensors="pt", padding=True, truncation=True, is_split_into_words=True)
        with torch.no_grad():
            outputs = self.model(**tokenized_inputs)
        predictions = outputs.logits.argmax(dim=-1)
        word_predictions = []
        for i in range(len(inputs)):
            word_prediction = align_predictions_with_words(predictions[i], tokenized_inputs.word_ids(batch_index=i))
            word_predictions.append(label_ids_to_names(word_prediction))
        return word_predictions
    
    def time_inference(self, word_inputs: list[list[str]]) -> None:
        result = %timeit -r 15 -o self.get_predictions_batch(word_inputs)
        print(f"Predicted {len(word_inputs)} words in {1000 * result.average:.2f} ms")
        print(f"{1000 * result.average / len(word_inputs):.2f} ms per word")

In [21]:
# Tagger backed by the fine-tuned CoNLL model.
ner = NER(trained_model, tokenizer)

In [22]:
# Smoke test on a lowercase two-word person name.
ner.get_predictions(split_into_words("john smith"))

['B-PER', 'I-PER']

In [23]:
# Smoke test on a single common noun.
ner.get_predictions(split_into_words("building"))

['O']

In [24]:
# Six sample inputs of varying length, word-split and tagged in a single batch.
raw_inputs = ["john smith", "abraham lincoln", "building", "datadog", "smith, john", "sylvain went to the store with martha"]
inputs = list(map(split_into_words, raw_inputs))
ner.get_predictions_batch(inputs)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[['B-PER', 'I-PER'],
 ['B-PER', 'I-PER'],
 ['O'],
 ['B-ORG'],
 ['B-PER', 'O', 'B-PER'],
 ['B-PER', 'O', 'O', 'O', 'O', 'O', 'B-PER']]

In [25]:
# Time batched prediction on the six sample inputs.
ner.time_inference(inputs)

28.4 ms ± 3.17 ms per loop (mean ± std. dev. of 15 runs, 10 loops each)
Predicted 6 words in 28.41 ms
4.73 ms per word


In [111]:
# Same sample inputs repeated 10 times to time a larger batch.
inputs_med = list(map(split_into_words, raw_inputs * 10))
ner.time_inference(inputs_med)

115 ms ± 6.69 ms per loop (mean ± std. dev. of 15 runs, 10 loops each)
Predicted 60 words in 114.66 ms
1.91 ms per word


In [112]:
# Same sample inputs repeated 100 times to time an even larger batch.
inputs_large = list(map(split_into_words, raw_inputs * 100))
ner.time_inference(inputs_large)

857 ms ± 58.7 ms per loop (mean ± std. dev. of 15 runs, 1 loop each)
Predicted 600 words in 856.61 ms
1.43 ms per word


# Eval on names dataset

In [109]:
import ast

# Load the first 20% of the names CSV and split it, without shuffling, into
# train and test with 10% held out for test.
names_datasets = load_dataset("csv", data_files="data/names_dataset.csv", split="train[:20%]").train_test_split(test_size=0.1, shuffle=False)
def parse_lists(example):
    """Parse the "tokens" and "ner_tags" string fields into Python literals.

    The example is updated in place and returned.
    """
    for column in ("tokens", "ner_tags"):
        example[column] = ast.literal_eval(example[column])
    return example

# Convert the string-encoded "tokens" and "ner_tags" columns into real lists.
names_datasets = names_datasets.map(parse_lists)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/131491 [00:00<?, ? examples/s]

Map:   0%|          | 0/14611 [00:00<?, ? examples/s]

In [110]:
# Display the splits and columns of the names dataset.
names_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'is_name', 'name_type'],
        num_rows: 131491
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'is_name', 'name_type'],
        num_rows: 14611
    })
})

In [78]:
# Same preprocessing as for CoNLL: tokenize in batches, align labels, and drop
# the original columns.
tokenized_names_datasets = names_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=names_datasets["train"].column_names,
)

Map:   0%|          | 0/131491 [00:00<?, ? examples/s]

Map:   0%|          | 0/14611 [00:00<?, ? examples/s]

In [88]:
# Rebinds `data_collator` and `eval_dataloader`: from here on evaluation runs on
# the names test split in batches of `batch_size`.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
batch_size = 64
eval_dataloader = DataLoader(
    tokenized_names_datasets["test"], collate_fn=data_collator, batch_size=batch_size
)

In [80]:
# Baseline: score the CoNLL fine-tuned model on the names test split.
evaluate_model(trained_model, eval_dataloader, metric)

  0%|          | 0/1827 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0},
 'MISC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0},
 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0},
 'PER': {'precision': 0.7306962025316456,
  'recall': 0.3418714835652946,
  'f1': 0.46580593100665724,
  'number': 6754},
 'overall_precision': 0.20313187296560217,
 'overall_recall': 0.3418714835652946,
 'overall_f1': 0.25484244798852157,
 'overall_accuracy': 0.5106166091916629}

In [52]:
# i = 9
# example = list(DataLoader(tokenized_names_datasets["test"], collate_fn=data_collator, batch_size=1))[i]
# 
# with torch.no_grad():
#     outputs = trained_model(**example)
# 
# predictions = outputs.logits.argmax(dim=-1)
# labels = example["labels"]
# 
# true_predictions, true_labels = postprocess(predictions, labels)
# print(names_datasets["test"][i]["tokens"])
# print(true_predictions)
# print(true_labels)

In [89]:
from accelerate import Accelerator
from huggingface_hub import Repository, get_full_repo_name
from torch.optim import AdamW
from transformers import Trainer, TrainingArguments, get_scheduler

# Shuffled training batches over the tokenized names training split.
train_dataloader = DataLoader(
    tokenized_names_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)

# NOTE(review): `model` is the object created from `model_checkpoint` earlier in
# the notebook, not `trained_model`; confirm that this is the intended starting point.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Hand the model, optimizer and both dataloaders to Accelerate.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

# Total number of scheduler steps = epochs * batches per epoch.
num_train_epochs = 2
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warmup steps, spanning all training steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Local output directory, created as a clone of the Hub repository `repo_name`.
model_name = "distilbert-uncased-names-accelerate"
output_dir = f"models/{model_name}"
repo_name = get_full_repo_name(model_name)
repo = Repository(output_dir, clone_from=repo_name)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got fo

In [114]:
# Training

# from tqdm.auto import tqdm
# import torch
# 
# for epoch in range(num_train_epochs):
#     # Training
#     model.train()
#     for batch in tqdm(train_dataloader):
#         outputs = model(**batch)
#         loss = outputs.loss
#         accelerator.backward(loss)
# 
#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
# 
#     # Evaluation
#     model.eval()
#     for batch in tqdm(eval_dataloader):
#         with torch.no_grad():
#             outputs = model(**batch)
# 
#         predictions = outputs.logits.argmax(dim=-1)
#         labels = batch["labels"]
# 
#         # Necessary to pad predictions and labels for being gathered
#         predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
#         labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
# 
#         predictions_gathered = accelerator.gather(predictions)
#         labels_gathered = accelerator.gather(labels)
# 
#         true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
#         metric.add_batch(predictions=true_predictions, references=true_labels)
# 
#     results = metric.compute()
#     print(
#         f"epoch {epoch}:",
#         {
#             key: results[f"overall_{key}"]
#             for key in ["precision", "recall", "f1", "accuracy"]
#         },
#     )
# 
#     # Save and upload
#     accelerator.wait_for_everyone()
#     unwrapped_model = accelerator.unwrap_model(model)
#     unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
#     if accelerator.is_main_process:
#         tokenizer.save_pretrained(output_dir)
#         repo.push_to_hub(
#             commit_message=f"Training in progress epoch {epoch}", blocking=False
#         )

In [92]:
# Load the names-trained checkpoint from the repository `repo_name`, reusing
# the same label mappings.
trained_names_model = AutoModelForTokenClassification.from_pretrained(
    repo_name,
    id2label=id2label,
    label2id=label2id,
)

config.json:   0%|          | 0.00/916 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

In [94]:
# Rebuild a plain DataLoader over the names test split and score the
# names-trained model on it.
eval_dataloader = DataLoader(
    tokenized_names_datasets["test"], collate_fn=data_collator, batch_size=batch_size
)
evaluate_model(trained_names_model, eval_dataloader, metric)

  0%|          | 0/229 [00:00<?, ?it/s]

{'PER': {'precision': 0.8917657045840407,
  'recall': 0.9332247557003257,
  'f1': 0.9120243090724931,
  'number': 6754},
 'overall_precision': 0.8917657045840407,
 'overall_recall': 0.9332247557003257,
 'overall_f1': 0.9120243090724931,
 'overall_accuracy': 0.9430672132214197}