In [None]:
!pip install datasets transformers seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
Collecting respo

In [None]:
# Log in to the Hugging Face Hub so the fine-tuned model and its evaluation metrics
# can be pushed to the Hub by the Trainer cells further down (push_to_hub=True).

from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [None]:
# Record the installed transformers version next to the results for reproducibility.
import transformers
print(transformers.__version__)

4.28.1


In [None]:
# Send usage telemetry identifying this example notebook and the framework used.
# NOTE(review): nothing below reads a result from this call, so it looks optional.
from transformers.utils import send_example_telemetry
send_example_telemetry("token_classification_notebook", framework="pytorch")

Fine-tuning a model on a token classification task

In [None]:
# Run configuration read by the cells below.
task = "ner"  # selects the f"{task}_tags" label column of the dataset
model_checkpoint = "distilbert-base-uncased"  # pretrained checkpoint to fine-tune
batch_size = 16   # per-device train and eval batch size (value edited by the author)

Loading the dataset

In [None]:
from datasets import load_dataset, load_metric

In [None]:
# Load the train/validation/test splits of CoNLL-2003 (downloaded once, then cached).
# NOTE(review): the variable shares its name with the `datasets` library; this works
# because only `from datasets import ...` is used, but a later `import datasets`
# would rebind the name and break the cells that index `datasets["train"]`.
datasets = load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Show the available splits with their columns and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
# Look at the first training example: words plus integer-encoded tag columns.
datasets["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
# Inspect the feature definition of the label column chosen by `task`; it shows the
# tag names that the integer ids refer to.
datasets["train"].features[f"{task}_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [None]:
# Tag names for the selected task; the position in the list is the integer id
# stored in the dataset's f"{task}_tags" column.
label_list = datasets["train"].features[f"{task}_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
  """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

  Integer class ids are replaced by their names in every column whose feature is a
  `ClassLabel` or a `Sequence` of `ClassLabel`; other columns are shown unchanged.

  Args:
    dataset: object supporting `len()`, indexing with a list of row indices, and a
      `features` mapping of column name -> feature type.
    num_examples: number of rows to show; must not exceed `len(dataset)`.

  Raises:
    AssertionError: if `num_examples` is larger than the dataset.
  """
  assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset"
  # random.sample draws without replacement, so no manual retry loop is needed.
  picks = random.sample(range(len(dataset)), num_examples)

  df = pd.DataFrame(dataset[picks])
  for column, typ in dataset.features.items():
    if isinstance(typ, ClassLabel):
      names = typ.names
      df[column] = df[column].transform(lambda i: names[i])
    elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
      names = typ.feature.names
      df[column] = df[column].transform(lambda ids: [names[i] for i in ids])
  display(HTML(df.to_html()))

In [None]:
# Preview ten random training rows (the function's default) with tag ids decoded to names.
show_random_elements(datasets["train"])

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,1918,"[S., Campbell, 69, ;, G., Rose, 7-73, ), .]","[NNP, NNP, CD, :, NNP, NNP, CD, ), .]","[B-NP, I-NP, I-NP, O, B-NP, I-NP, I-NP, O, O]","[B-PER, I-PER, O, O, B-PER, I-PER, O, O, O]"
1,13153,"[They, were, quite, surprised, ,, "", he, told, the, Miami, Herald, .]","[PRP, VBD, RB, VBN, ,, "", PRP, VBD, DT, NNP, NNP, .]","[B-NP, B-VP, B-ADJP, I-ADJP, O, O, B-NP, B-VP, B-NP, I-NP, I-NP, O]","[O, O, O, O, O, O, O, O, O, B-ORG, I-ORG, O]"
2,5268,"[lost, ,, points, for, ,, against, ,, total, points, ), :]","[VBD, ,, NNS, IN, ,, RB, ,, JJ, NNS, ), :]","[B-VP, O, B-NP, B-PP, O, B-ADVP, O, B-NP, I-NP, O, O]","[O, O, O, O, O, O, O, O, O, O, O]"
3,11165,"[She, died, in, hospital, .]","[PRP, VBD, IN, NN, .]","[B-NP, B-VP, B-PP, B-NP, O]","[O, O, O, O, O]"
4,7967,"[SepOct, 733.75, 743.50, unq, unq]","[JJ, CD, CD, NN, JJ]","[B-NP, I-NP, I-NP, I-NP, B-ADJP]","[O, O, O, O, O]"
5,11328,"[Williams, ', hometown, was, not, immediately, available, .]","[NNP, POS, NN, VBD, RB, RB, JJ, .]","[B-NP, B-NP, I-NP, B-VP, I-VP, I-VP, B-ADJP, O]","[B-PER, O, O, O, O, O, O, O]"
6,13701,"[Tsang, said, three, sets, of, meetings, with, Chinese, authorities, on, Hong, Kong, 's, 1997-98, budget, ,, which, will, span, the, transition, period, ,, had, gone, smoothly, ., ""]","[NNP, VBD, CD, NNS, IN, NNS, IN, JJ, NNS, IN, NNP, NNP, POS, CD, NN, ,, WDT, MD, VB, DT, NN, NN, ,, VBD, VBN, RB, ., ""]","[B-NP, B-VP, B-NP, I-NP, B-PP, B-NP, B-PP, B-NP, I-NP, B-PP, B-NP, I-NP, B-NP, I-NP, I-NP, O, B-NP, B-VP, I-VP, B-NP, I-NP, I-NP, O, B-VP, I-VP, B-ADVP, O, O]","[B-PER, O, O, O, O, O, O, B-MISC, O, O, B-LOC, I-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
7,6621,"[Shakhtar, 6, 3, 2, 1, 10, 3, 11]","[JJR, CD, CD, CD, CD, CD, CD, CD]","[B-NP, I-NP, I-NP, I-NP, I-NP, I-NP, I-NP, I-NP]","[B-ORG, O, O, O, O, O, O, O]"
8,1084,"[Garlic, pills, do, n't, lower, cholesterol, ,, study, finds, .]","[JJ, NNS, VBP, RB, VB, NN, ,, NN, VBZ, .]","[B-NP, I-NP, B-VP, I-VP, I-VP, B-NP, O, B-NP, B-VP, O]","[O, O, O, O, O, O, O, O, O, O]"
9,3857,"[ATLANTA, 80, 47, .630, -]","[NNP, CD, CD, CD, :]","[B-NP, I-NP, I-NP, I-NP, I-NP]","[B-ORG, O, O, O, O]"


Preprocessing the data

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
import transformers
# The label alignment further down calls `word_ids()` on the tokenizer output, which
# is only available with fast tokenizers, so fail early if a slow one was loaded.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [None]:
# Tokenize a raw sentence string and show the resulting ids and attention mask.
tokenizer("Hello, this is one sentence!")

{'input_ids': [101, 7592, 1010, 2023, 2003, 2028, 6251, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Tokenize input that is already split into words, the way the dataset's "tokens"
# column is stored.
tokenizer(["Hello", ",", "this", "is", "one", "sentence", "split", "into", "words", "."], is_split_into_words=True)

{'input_ids': [101, 7592, 1010, 2023, 2003, 2028, 6251, 3975, 2046, 2616, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Pick one training example to illustrate how tokenization affects label alignment.
example = datasets["train"][4]
print(example["tokens"])

['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.']


In [None]:
# Tokenize the pre-split words and print the resulting tokens: special tokens are
# added and some words are broken into several sub-word pieces.
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', 'germany', "'", 's', 'representative', 'to', 'the', 'european', 'union', "'", 's', 'veterinary', 'committee', 'werner', 'z', '##wing', '##mann', 'said', 'on', 'wednesday', 'consumers', 'should', 'buy', 'sheep', '##me', '##at', 'from', 'countries', 'other', 'than', 'britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.', '[SEP]']


In [None]:
# Number of word-level tags vs. number of tokens; they differ, so the tags cannot be
# used as token labels without alignment.
len(example[f"{task}_tags"]), len(tokenized_input["input_ids"])

(31, 39)

In [None]:
# For every token, the index of the word it came from (None for special tokens).
print(tokenized_input.word_ids())

[None, 0, 1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 11, 11, 12, 13, 14, 15, 16, 17, 18, 18, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, None]


In [None]:
# Expand the word-level tags to token level: special tokens (word id None) get -100,
# every other token inherits the tag of the word it belongs to.
word_ids = tokenized_input.word_ids()
aligned_labels = [
    example[f"{task}_tags"][word_id] if word_id is not None else -100
    for word_id in word_ids
]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

39 39


In [None]:
# True: every sub-token of a word receives a label. False: only the first sub-token
# does and the remaining ones are set to -100 (read by tokenize_and_align_labels).
label_all_tokens = True

In [None]:
def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split sentences and align word-level tags to sub-word tokens.

  Reads the module-level `tokenizer`, `task`, `label_all_tokens` and `label_list`.

  Args:
    examples: batch mapping with a "tokens" column (one list of words per sentence)
      and an f"{task}_tags" column (one list of integer tag ids per sentence).

  Returns:
    The tokenizer output with an added "labels" entry holding one list per sentence,
    aligned with that sentence's tokens:
      * special tokens (word id None) get -100;
      * the first sub-token of a word gets the word's tag;
      * later sub-tokens of the same word get -100, or, when `label_all_tokens` is
        true, the word's tag with a leading "B-" turned into the matching "I-".
  """
  tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

  # Tag id to use for the 2nd, 3rd, ... piece of a word. Repeating a "B-X" tag on
  # every piece would make entity-level scoring (seqeval) count each piece as the
  # start of a new entity, so "B-X" is mapped to "I-X" when that tag exists.
  label_to_id = {name: idx for idx, name in enumerate(label_list)}
  continuation_id = [
      label_to_id.get("I-" + name[2:], idx) if name.startswith("B-") else idx
      for idx, name in enumerate(label_list)
  ]

  labels = []
  for i, label in enumerate(examples[f"{task}_tags"]):
    word_ids = tokenized_inputs.word_ids(batch_index=i)
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
      if word_idx is None:
        # Special token: -100 marks the position as "ignore" (the metric code in
        # this notebook filters on it, and it is PyTorch's default ignored index).
        label_ids.append(-100)
      elif word_idx != previous_word_idx:
        # First piece of a new word keeps the word's own tag.
        label_ids.append(label[word_idx])
      else:
        # Continuation piece of the current word.
        label_ids.append(continuation_id[label[word_idx]] if label_all_tokens else -100)
      previous_word_idx = word_idx

    labels.append(label_ids)

  tokenized_inputs["labels"] = labels
  return tokenized_inputs

In [None]:
# Smoke-test the preprocessing function on the first five training examples.
tokenize_and_align_labels(datasets['train'][:5])

{'input_ids': [[101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], [101, 2848, 13934, 102], [101, 9371, 2727, 1011, 5511, 1011, 2570, 102], [101, 1996, 2647, 3222, 2056, 2006, 9432, 2009, 18335, 2007, 2446, 6040, 2000, 10390, 2000, 18454, 2078, 2329, 12559, 2127, 6529, 5646, 3251, 5506, 11190, 4295, 2064, 2022, 11860, 2000, 8351, 1012, 102], [101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100], [-100, 1, 2, -100], [-100, 5, 0, 

In [None]:
# Apply the preprocessing to every split, passing the examples to the function in batches.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

Fine-tuning the model

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
# Store the tag names in the model config so the checkpoint pushed to the Hub reports
# "B-PER", "I-LOC", ... rather than generic "LABEL_0", "LABEL_1", ... at inference time.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: idx for idx, label in enumerate(label_list)},
)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

In [None]:
# The output directory (which also names the Hub repository) is derived from the
# last path component of the base checkpoint and the task.
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-{task}",
    evaluation_strategy="epoch",  # evaluate on the validation split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,  # number of epochs, edited by the author
    weight_decay=0.01,
    push_to_hub=True,  # requires the Hub login performed at the top of the notebook
)

In [None]:
from transformers import DataCollatorForTokenClassification
# Collator handed to the Trainer to assemble batches.
# NOTE(review): presumably pads inputs and labels to a common length per batch —
# confirm against the transformers documentation.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Load the seqeval metric, which reports per-entity-type and overall scores (see the
# outputs of the cells below).
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in favour of
# the separate `evaluate` package — confirm before upgrading `datasets`.
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
# Sanity check: scoring the example's gold tags against themselves should give
# perfect precision, recall and F1 for every entity type.
labels = [label_list[i] for i in example[f"{task}_tags"]]
metric.compute(predictions=[labels], references=[labels])

{'LOC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [None]:
import numpy as np

def compute_metrics(p):
  """Convert raw Trainer predictions into overall seqeval scores.

  Reads the module-level `label_list` and `metric`.

  Args:
    p: pair `(predictions, labels)`. `predictions` holds per-token class scores and
      is arg-maxed over axis 2; `labels` holds the gold tag ids, with -100 at the
      positions that must be ignored.

  Returns:
    dict with the keys "precision", "recall", "f1" and "accuracy", taken from the
    metric's "overall_*" entries.
  """
  scores, gold_ids = p
  predicted_ids = np.argmax(scores, axis=2)

  true_predictions = []
  true_labels = []
  for pred_row, gold_row in zip(predicted_ids, gold_ids):
    # Keep only the positions that carry a real label, then decode ids to tag names.
    kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
    true_predictions.append([label_list[pred] for pred, _ in kept])
    true_labels.append([label_list[gold] for _, gold in kept])

  results = metric.compute(predictions=true_predictions, references=true_labels)
  overall = {
      "precision": results["overall_precision"],
      "recall": results["overall_recall"],
      "f1": results["overall_f1"],
      "accuracy": results["overall_accuracy"],
  }
  return overall

In [None]:
# Wire the model, training arguments, tokenized train/validation splits, collator,
# tokenizer and metric function into a Trainer.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Cloning https://huggingface.co/hannahbillo/distilbert-base-uncased-finetuned-ner into local empty directory.


In [None]:
# Fine-tune the model; validation metrics are reported once per epoch
# (evaluation_strategy="epoch" in the training arguments).
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0899,0.062996,0.913653,0.932767,0.923111,0.981842
2,0.0404,0.060806,0.927763,0.93534,0.931536,0.983589
3,0.0281,0.061501,0.924291,0.936906,0.930556,0.983669


TrainOutput(global_step=2634, training_loss=0.04949700153498552, metrics={'train_runtime': 10631.2951, 'train_samples_per_second': 3.962, 'train_steps_per_second': 0.248, 'total_flos': 577226994576624.0, 'train_loss': 0.04949700153498552, 'epoch': 3.0})

In [None]:
# Score the final model on the validation split that was passed as eval_dataset.
trainer.evaluate()

{'eval_loss': 0.06150057166814804,
 'eval_precision': 0.9242909171173159,
 'eval_recall': 0.9369056941492337,
 'eval_f1': 0.9305555555555555,
 'eval_accuracy': 0.983668800737128,
 'eval_runtime': 231.0882,
 'eval_samples_per_second': 14.064,
 'eval_steps_per_second': 0.883,
 'epoch': 3.0}

In [None]:
# Run inference on the validation split again to obtain per-entity-type scores;
# compute_metrics only returns the overall numbers.
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions, axis=2)

# Drop the positions labelled -100 and decode the remaining ids into tag names.
true_predictions = [
    [label_list[pred_id] for (pred_id, gold_id) in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [label_list[gold_id] for (pred_id, gold_id) in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'LOC': {'precision': 0.946260804208944,
  'recall': 0.9618029029793735,
  'f1': 0.953968554650502,
  'number': 2618},
 'MISC': {'precision': 0.7992248062015503,
  'recall': 0.8375304630381804,
  'f1': 0.817929393097977,
  'number': 1231},
 'ORG': {'precision': 0.8982064953950557,
  'recall': 0.9012645914396887,
  'f1': 0.8997329448895363,
  'number': 2056},
 'PER': {'precision': 0.9757138168690516,
  'recall': 0.9798945286750165,
  'f1': 0.9777997039960533,
  'number': 3034},
 'overall_precision': 0.9242909171173159,
 'overall_recall': 0.9369056941492337,
 'overall_f1': 0.9305555555555555,
 'overall_accuracy': 0.983668800737128}

In [None]:
# Upload the fine-tuned model and training logs to the Hub repository for this run.
trainer.push_to_hub()

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file runs/Apr21_11-58-50_01a911572c8e/events.out.tfevents.1682078347.01a911572c8e.214.0:   0%|         …

Upload file runs/Apr21_11-58-50_01a911572c8e/events.out.tfevents.1682090543.01a911572c8e.214.3:   0%|         …

To https://huggingface.co/hannahbillo/distilbert-base-uncased-finetuned-ner
   da983a8..71b56eb  main -> main

   da983a8..71b56eb  main -> main

To https://huggingface.co/hannahbillo/distilbert-base-uncased-finetuned-ner
   71b56eb..7789998  main -> main

   71b56eb..7789998  main -> main



'https://huggingface.co/hannahbillo/distilbert-base-uncased-finetuned-ner/commit/71b56eb626acaea04e45ff31a7816f7ca00801e8'