This notebook fine-tunes the `distilbert-base-uncased` model for token classification on a custom dataset of names and non-name English words.

In [2]:
import ast

from datasets import load_dataset

# Load only the first 20% of the CSV, then hold out 10% of that slice as the
# test split (no shuffling before the split).
names_subset = load_dataset(
    "csv",
    data_files="../data/names_dataset.csv",
    split="train[:20%]",
)
raw_datasets = names_subset.train_test_split(test_size=0.1, shuffle=False)
def parse_lists(example):
    """Turn the stringified ``tokens`` and ``ner_tags`` fields into Python lists.

    Each field is expected to hold the string form of a list literal (for
    example ``"['john', 'smith']"``). Values that are not strings are left
    untouched, so applying this function to an already-parsed example is a
    no-op instead of an error, which keeps the cell safe to re-run.

    Args:
        example: A single dataset row (dict-like) containing the keys
            ``"tokens"`` and ``"ner_tags"``.

    Returns:
        The same ``example`` object, updated in place, with both fields
        parsed into Python objects.

    Raises:
        KeyError: If either field is missing from ``example``.
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            string field is not a valid Python literal.
    """
    for column in ("tokens", "ner_tags"):
        value = example[column]
        # ast.literal_eval only accepts strings (or AST nodes); skip values
        # that have already been parsed.
        if isinstance(value, str):
            example[column] = ast.literal_eval(value)
    return example

# Apply parse_lists to every example in both the train and test splits.
# NOTE: this rebinds `raw_datasets`, so from here on the name refers to the
# parsed data rather than the freshly loaded CSV rows.
raw_datasets = raw_datasets.map(parse_lists)
# Display the resulting DatasetDict (splits, features and row counts).
raw_datasets

Map:   0%|          | 0/131491 [00:00<?, ? examples/s]

Map:   0%|          | 0/14611 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'is_name', 'name_type'],
        num_rows: 131491
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'is_name', 'name_type'],
        num_rows: 14611
    })
})

In [13]:
# Reuse the NER tag vocabulary of the conll2003 dataset as the label set.
conll2003_train = load_dataset("conll2003")["train"]
ner_tag_feature = conll2003_train.features["ner_tags"].feature
label_names = ner_tag_feature.names

In [4]:
from functools import partial

from transformers import AutoTokenizer

from min_name_classifier.utils import tokenize_and_align_labels


# Base checkpoint; the tokenizer is loaded from it here and the model later.
model_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Bind the tokenizer as the first argument so `map` only has to supply the batch.
align_with_tokenizer = partial(tokenize_and_align_labels, tokenizer)
# Drop every original column so only the outputs of the mapped function remain.
source_columns = raw_datasets["train"].column_names

tokenized_datasets = raw_datasets.map(
    align_with_tokenizer,
    batched=True,
    remove_columns=source_columns,
)

Map:   0%|          | 0/131491 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/14611 [00:00<?, ? examples/s]

In [5]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForTokenClassification

# Batch size shared by the evaluation and training loaders.
batch_size = 64
# Collator that assembles tokenized examples into token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Evaluation loader over the held-out test split (no shuffling requested).
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["test"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

As a baseline, first see how a model fine-tuned on CoNLL-2003 performs on the names dataset.

In [8]:
from min_name_classifier.utils import NER

import evaluate

# Metric object handed to the NER wrapper for scoring predictions.
seqeval = evaluate.load("seqeval")

# Baseline: a DistilBERT checkpoint already fine-tuned for NER, evaluated
# on the names test split without any further training.
conll2003_checkpoint = "jackfriedson/distilbert-uncased-finetuned-ner"
conll2003_ner = NER.from_pretrained(
    conll2003_checkpoint,
    metric=seqeval,
    label_names=label_names,
)

conll2003_ner.evaluate_model(eval_dataloader)

  0%|          | 0/229 [00:00<?, ?it/s]

{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0},
 'MISC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0},
 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0},
 'PER': {'precision': 0.7104615384615385,
  'recall': 0.3418714835652946,
  'f1': 0.46161535385845653,
  'number': 6754},
 'overall_precision': 0.2034899092271085,
 'overall_recall': 0.3418714835652946,
 'overall_f1': 0.2551240262968896,
 'overall_accuracy': 0.5134514577827839}

Not great: overall F1 is only about 0.26, and PER F1 about 0.46. Let's fine-tune the base model on the names dataset instead.

In [14]:
from torch.optim import AdamW
from transformers import AutoModelForTokenClassification, get_scheduler

# Seed Python, NumPy and PyTorch before anything stochastic happens: the
# classifier head is newly initialised and the training loader shuffles, so
# without a fixed seed a Restart & Run All would not reproduce the same run.
from transformers import set_seed

SEED = 42
set_seed(SEED)

# Index <-> label lookups stored in the model config.
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

# Base checkpoint with a fresh token-classification head sized to the label set.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Training batches are reshuffled each epoch.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)

optimizer = AdamW(model.parameters(), lr=2e-5)

# One scheduler step per batch: total steps = epochs * batches per epoch.
num_train_epochs = 2
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule with no warmup over the full training run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Name and local output directory passed to the (currently disabled) training call.
model_name = "distilbert-uncased-names-accelerate"
output_dir = f"models/{model_name}"

# Wrap the untrained model together with its tokenizer, labels and metric.
ner = NER(model, tokenizer, label_names, seqeval)


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
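# Training is left disabled here; the next cell loads an already fine-tuned checkpoint instead.
# Uncomment to retrain:
# ner.train(model_name, output_dir, num_train_epochs, train_dataloader, eval_dataloader, optimizer, lr_scheduler)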

In [16]:
from huggingface_hub import get_full_repo_name

# Full Hub repo id for the fine-tuned model, as returned by get_full_repo_name.
repo_name = get_full_repo_name(model_name)

# Evaluation loader over the test split, built again with the same settings
# as the earlier one.
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["test"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

# NOTE: rebinds `ner` from the untrained wrapper to the checkpoint loaded from the Hub.
ner = NER.from_pretrained(repo_name, label_names=label_names, metric=seqeval)

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [17]:
# Score the fine-tuned names model on the held-out test split; the returned
# metrics dict is displayed as the cell output.
ner.evaluate_model(eval_dataloader)

  0%|          | 0/229 [00:00<?, ?it/s]

{'PER': {'precision': 0.8872686483454851,
  'recall': 0.936926265916494,
  'f1': 0.9114215756877431,
  'number': 6754},
 'overall_precision': 0.8872686483454851,
 'overall_recall': 0.936926265916494,
 'overall_f1': 0.9114215756877431,
 'overall_accuracy': 0.9408005891781831}