#[01] Data & Tokenizer

In [None]:
%%capture
!pip install datasets
!pip install transformers[torch]

In [None]:
from datasets import load_dataset

In [None]:
data = load_dataset("conll2003")
data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
data['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
data['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [None]:
data['train'].features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [None]:
# save for later
label_names = data['train'].features['ner_tags'].feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
from transformers import AutoTokenizer

# also try using bert
# we'll discuss why bert-like models are appropriate for this task later
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
idx = 0
t = tokenizer(data['train'][idx]['tokens'], is_split_into_words=True)
t

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
type(t)

In [None]:
t.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

#[02] Target Alignment

In [None]:
# value of i indicatees it is the i'th word
# in the input sentence (counting from 0)
t.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [None]:
# ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
begin2inside = {
    1:2,
    3:4,
    5:6,
    7:8,
}

In [None]:
def align_targets(labels, word_ids):
    aligned_labels = []
    last_word = None
    for word in word_ids:
        if word is None:
            # it's a token like [CLS]
            label = -100
        elif word != last_word:
            # it's a new word!
            label = labels[word]
        else:
            # it's the same word as before
            label = labels[word]

            # change B-<tag> to I-<tag> if necessary
            if label in begin2inside:
                label = begin2inside[label]

        # add the label
        aligned_labels.append(label)

        # update last word
        last_word = word

    return aligned_labels

In [None]:
# try our function
labels = data['train'][idx]['ner_tags']
word_ids = t.word_ids()
aligned_targets = align_targets(labels, word_ids)
aligned_targets

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

In [None]:
aligned_labels = [label_names[t] if t >= 0 else None for t in aligned_targets]
for x, y in zip(t.tokens(), aligned_labels):
    print(f"{x}\t\t{y}")

[CLS]		None
EU		B-ORG
rejects		O
German		B-MISC
call		O
to		O
boycott		O
British		B-MISC
la		O
##mb		O
.		O
[SEP]		None


In [None]:
# make up a fake input just to test it
words = [
    '[CLS]', 'Ger', '##man', 'call', 'to', 'boycott', 'Micro', '##soft', '[SEP]'
]
word_ids = [None, 0, 0, 1, 2, 3, 4, 4, None]
labels = [7, 0, 0, 0, 3]
aligned_targets = align_targets(labels, word_ids)
aligned_labels = [label_names[t] if t >= 0 else None for t in aligned_targets]
for x, y in zip(words, aligned_labels):
    print(f"{x}\t\t{y}")

[CLS]		None
Ger		B-MISC
##man		I-MISC
call		O
to		O
boycott		O
Micro		B-ORG
##soft		I-ORG
[SEP]		None


In [None]:
# tokenize both inputs and targets
def tokenize_fn(batch):
    # tokenize the input sequence first
    # this populates input_ids, attention_mask, etc.
    tokenized_inputs = tokenizer(
        batch['tokens'], truncation=True, is_split_into_words=True
    )

    labels_batch = batch['ner_tags'] # original targets
    aligned_labels_batch = []
    for i, labels in enumerate(labels_batch):
        word_ids = tokenized_inputs.word_ids(i)
        aligned_labels_batch.append(align_targets(labels, word_ids))

    # recall: the 'target' must be stored in key called 'labels'
    tokenized_inputs['labels'] = aligned_labels_batch

    return tokenized_inputs

In [None]:
# want to remove these from model inputs - they are neither inputs nor targets
data['train'].column_names

['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']

In [None]:
tokenized_datasets = data.map(
    tokenize_fn,
    batched=True,
    remove_columns=data['train'].column_names,
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

#[03] Data Collator

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
tokenized_datasets['train'][0:2]

{'input_ids': [[101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  [101, 1943, 14428, 102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]],
 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100], [-100, 1, 2, -100]]}

In [None]:
[tokenized_datasets['train'][i] for i in range(2)]

[{'input_ids': [101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]},
 {'input_ids': [101, 1943, 14428, 102],
  'attention_mask': [1, 1, 1, 1],
  'labels': [-100, 1, 2, -100]}]

In [None]:
# example
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch["labels"]

tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

In [None]:
!pip install seqeval

In [None]:
from datasets import load_metric

metric = load_metric('seqeval')

  metric = load_metric('seqeval')


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

The repository for seqeval contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/seqeval.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [None]:
# test it out
metric.compute(
    predictions=[[0,0,0]],
    references=[[0,0,1]]
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.6666666666666666}

In [None]:
# test it out
metric.compute(
    predictions=[['A', 'A', 'A']],
    references=[['A', 'B', 'A']]
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.6666666666666666}

In [None]:
# test it out
metric.compute(
    predictions=[['O', 'O', 'I-ORG', 'B-MISC']],
    references=[['O', 'B-ORG', 'I-ORG', 'B-MISC']]
)

{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'overall_precision': 0.5,
 'overall_recall': 0.5,
 'overall_f1': 0.5,
 'overall_accuracy': 0.75}

In [None]:
import numpy as np

In [None]:
def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels
    preds = np.argmax(logits, axis=-1)

    # remove -100 from labels and predictions
    # and convert the label_ids to label names
    str_labels = [
        [label_names[t] for t in label if t != -100] for label in labels
    ]

    # do the same for predictions whenever true label is -100
    str_preds = [
        [label_names[p] for p, t in zip(pred, targ) if t != -100] \
        for pred, targ in zip(preds, labels)
    ]

    the_metrics = metric.compute(predictions=str_preds, references=str_labels)
    return {
        'precision' : the_metrics['overall_precision'],
        'recall' : the_metrics['overall_recall'],
        'f1' : the_metrics['overall_f1'],
        'accuracy' : the_metrics['overall_accuracy'],
    }

In [None]:
id2label = {k : v for k, v in enumerate(label_names)}
label2id = {v : k for k, v in id2label.items()}

In [None]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    'distilbert-finetuned-ner',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
from transformers import Trainer

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0922,0.087441,0.873643,0.907607,0.890301,0.97582
2,0.0457,0.069945,0.913273,0.932178,0.922628,0.982236
3,0.0273,0.072498,0.911224,0.934534,0.922732,0.982928


TrainOutput(global_step=5268, training_loss=0.07924055419551122, metrics={'train_runtime': 328.8723, 'train_samples_per_second': 128.083, 'train_steps_per_second': 16.018, 'total_flos': 460431563935266.0, 'train_loss': 0.07924055419551122, 'epoch': 3.0})

In [None]:
trainer.save_model('my_saved_model')

In [None]:
from transformers import pipeline

ner = pipeline(
    'token-classification',
    model='my_saved_model',
    aggregation_strategy='simple',
    device=0
)

s = "Bill Gates was the CEO of Microsoft in Seattle. Washington."

ner(s)

[{'entity_group': 'PER',
  'score': 0.9990595,
  'word': 'Bill Gates',
  'start': 0,
  'end': 10},
 {'entity_group': 'ORG',
  'score': 0.9986338,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity_group': 'LOC',
  'score': 0.8911382,
  'word': 'Seattle.',
  'start': 39,
  'end': 47},
 {'entity_group': 'LOC',
  'score': 0.5508798,
  'word': 'Washington',
  'start': 48,
  'end': 58}]

#[04] POS Tagging & Custom Datasets

In [None]:
import nltk
from nltk.corpus import brown

nltk.download('brown')
nltk.download('universal_tagset')

corpus = brown.tagged_sents(tagset='universal')
corpus

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

In [None]:
inputs = []
targets = []

for sentence_tag_pairs in corpus:
    tokens = []
    target = []
    for token, tag in sentence_tag_pairs:
        tokens.append(token)
        target.append(tag)
    inputs.append(tokens)
    targets.append(target)

In [None]:
# save data to json format
import json

with open('data.json', 'w') as f:
    for x, y in zip(inputs, targets):
        j = {'inputs':x, 'targets':y}
        s = json.dumps(j)
        f.write(f"{s}\n")

In [None]:
data = load_dataset('json', data_files='data.json')
data

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 57340
    })
})

In [None]:
small = data['train'].shuffle(seed=42).select(range(20_000))
small

Dataset({
    features: ['inputs', 'targets'],
    num_rows: 20000
})

In [None]:
data = small.train_test_split(seed=42)

print(data['train'][0])

{'inputs': ['Ulyate', 'and', 'Kearton', 'climbed', 'on', 'toward', 'the', 'sound', 'of', 'the', 'barking', 'of', 'the', 'dogs', 'and', 'the', 'sporadic', 'roaring', 'of', 'the', 'lion', ',', 'till', 'they', 'came', ',', 'out', 'of', 'breath', ',', 'to', 'the', 'crest', ',', 'and', 'peering', 'through', 'the', 'branches', 'of', 'a', 'bush', ',', 'this', 'is', 'what', 'Ulyate', 'saw', ':', 'Jones', 'who', 'had', 'apparently', '(', 'and', 'actually', 'had', ')', 'ridden', 'up', 'the', 'nearly', 'impassable', 'hillside', ',', 'sitting', 'calmly', 'on', 'his', 'horse', 'within', 'forty', 'feet', 'of', 'a', 'full-grown', 'young', 'lioness', ',', 'who', 'was', 'crouched', 'on', 'a', 'flat', 'rock', 'and', 'seemed', 'just', 'about', 'to', 'charge', 'him', ',', 'while', 'the', 'dogs', 'whirled', 'around', 'her', '.'], 'targets': ['NOUN', 'CONJ', 'NOUN', 'VERB', 'PRT', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'CONJ', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', '.', 'ADP'

In [None]:
data['train'].features

{'inputs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'targets': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [None]:
# map targets to ints
target_set = set()
for target in targets:
    target_set = target_set.union(target)
print(target_set)

{'NUM', 'VERB', 'ADV', 'NOUN', 'X', '.', 'DET', 'ADJ', 'ADP', 'PRT', 'PRON', 'CONJ'}


In [None]:
target_list = list(target_set)
id2label = {k : v for k, v in enumerate(target_list)}
label2id = {v : k for k, v in id2label.items()}

In [None]:
from transformers import AutoTokenizer

checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
idx = 0
t = tokenizer(data['train'][idx]['inputs'], is_split_into_words=True)
t

{'input_ids': [101, 158, 25928, 1566, 1105, 26835, 9349, 1320, 5998, 1113, 1755, 1103, 1839, 1104, 1103, 26635, 1104, 1103, 6363, 1105, 1103, 188, 27695, 23041, 1104, 1103, 11160, 117, 6174, 1152, 1338, 117, 1149, 1104, 2184, 117, 1106, 1103, 13468, 117, 1105, 19205, 1194, 1103, 5020, 1104, 170, 13771, 117, 1142, 1110, 1184, 158, 25928, 1566, 1486, 131, 2690, 1150, 1125, 4547, 113, 1105, 2140, 1125, 114, 17698, 1146, 1103, 2212, 24034, 11192, 1895, 25068, 117, 2807, 13285, 1113, 1117, 3241, 1439, 5808, 1623, 1104, 170, 1554, 118, 4215, 1685, 11160, 5800, 117, 1150, 1108, 15062, 1113, 170, 3596, 2067, 1105, 1882, 1198, 1164, 1106, 2965, 1140, 117, 1229, 1103, 6363, 18370, 1213, 1123, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
type(t)

In [None]:
print(t.tokens())

['[CLS]', 'U', '##lya', '##te', 'and', 'Ke', '##art', '##on', 'climbed', 'on', 'toward', 'the', 'sound', 'of', 'the', 'barking', 'of', 'the', 'dogs', 'and', 'the', 's', '##poradic', 'roaring', 'of', 'the', 'lion', ',', 'till', 'they', 'came', ',', 'out', 'of', 'breath', ',', 'to', 'the', 'crest', ',', 'and', 'peering', 'through', 'the', 'branches', 'of', 'a', 'bush', ',', 'this', 'is', 'what', 'U', '##lya', '##te', 'saw', ':', 'Jones', 'who', 'had', 'apparently', '(', 'and', 'actually', 'had', ')', 'ridden', 'up', 'the', 'nearly', 'imp', '##ass', '##able', 'hillside', ',', 'sitting', 'calmly', 'on', 'his', 'horse', 'within', 'forty', 'feet', 'of', 'a', 'full', '-', 'grown', 'young', 'lion', '##ess', ',', 'who', 'was', 'crouched', 'on', 'a', 'flat', 'rock', 'and', 'seemed', 'just', 'about', 'to', 'charge', 'him', ',', 'while', 'the', 'dogs', 'whirled', 'around', 'her', '.', '[SEP]']


In [None]:
# value of i indicates it is the i'th word
# in the input sentence (counting from 0)
print(t.word_ids())

[None, 0, 0, 0, 1, 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 75, 76, 77, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, None]


In [None]:
def align_targets(labels, word_ids):
    aligned_labels = []
    for word in word_ids:
        if word is None:
            # it's a token like [CLS]
            label = -100
        else:
            # it's a real word
            label = label2id[labels[word]]

        # add the label
        aligned_labels.append(label)

    return aligned_labels

In [None]:
# try our function
labels = data['train'][idx]['targets']
word_ids = t.word_ids()
aligned_targets = align_targets(labels, word_ids)
print(aligned_targets)

[-100, 3, 3, 3, 11, 3, 3, 3, 1, 9, 8, 6, 3, 8, 6, 3, 8, 6, 3, 11, 6, 7, 7, 3, 8, 6, 3, 5, 8, 10, 1, 5, 9, 8, 3, 5, 8, 6, 3, 5, 11, 1, 8, 6, 3, 8, 6, 3, 5, 6, 1, 6, 3, 3, 3, 1, 5, 3, 10, 1, 2, 5, 11, 2, 1, 5, 1, 8, 6, 2, 7, 7, 7, 3, 5, 1, 2, 8, 6, 3, 8, 0, 3, 8, 6, 7, 7, 7, 7, 3, 3, 5, 10, 1, 1, 8, 6, 7, 3, 11, 1, 2, 2, 9, 1, 10, 5, 8, 6, 3, 1, 8, 10, 5, -100]


In [None]:
aligned_labels = [id2label[i] if i >= 0 else None for i in aligned_targets]
for x, y in zip(t.tokens(), aligned_labels):
    print(f"{x}\t\t{y}")

[CLS]		None
U		NOUN
##lya		NOUN
##te		NOUN
and		CONJ
Ke		NOUN
##art		NOUN
##on		NOUN
climbed		VERB
on		PRT
toward		ADP
the		DET
sound		NOUN
of		ADP
the		DET
barking		NOUN
of		ADP
the		DET
dogs		NOUN
and		CONJ
the		DET
s		ADJ
##poradic		ADJ
roaring		NOUN
of		ADP
the		DET
lion		NOUN
,		.
till		ADP
they		PRON
came		VERB
,		.
out		PRT
of		ADP
breath		NOUN
,		.
to		ADP
the		DET
crest		NOUN
,		.
and		CONJ
peering		VERB
through		ADP
the		DET
branches		NOUN
of		ADP
a		DET
bush		NOUN
,		.
this		DET
is		VERB
what		DET
U		NOUN
##lya		NOUN
##te		NOUN
saw		VERB
:		.
Jones		NOUN
who		PRON
had		VERB
apparently		ADV
(		.
and		CONJ
actually		ADV
had		VERB
)		.
ridden		VERB
up		ADP
the		DET
nearly		ADV
imp		ADJ
##ass		ADJ
##able		ADJ
hillside		NOUN
,		.
sitting		VERB
calmly		ADV
on		ADP
his		DET
horse		NOUN
within		ADP
forty		NUM
feet		NOUN
of		ADP
a		DET
full		ADJ
-		ADJ
grown		ADJ
young		ADJ
lion		NOUN
##ess		NOUN
,		.
who		PRON
was		VERB
crouched		VERB
on		ADP
a		DET
flat		ADJ
rock		NOUN
and		CONJ
se

In [None]:
# tokenize both inputs and targets
def tokenize_fn(batch):
    # tokenize the input sequence first
    # this populates input_ids, attention_mask, etc.
    tokenized_inputs = tokenizer(
        batch['inputs'], truncation=True, is_split_into_words=True
    )

    labels_batch = batch['targets'] # original targets
    aligned_labels_batch = []
    for i, labels in enumerate(labels_batch):
        word_ids = tokenized_inputs.word_ids(i)
        aligned_labels_batch.append(align_targets(labels, word_ids))

    # recall: the 'target' must be stored in key called 'labels'
    tokenized_inputs['labels'] = aligned_labels_batch

    return tokenized_inputs

In [None]:
# want to remove these from model inputs - they are neither inputs nor targets
data['train'].column_names

['inputs', 'targets']

In [None]:
tokenized_datdasets = data.map(
    tokenize_fn,
    batched=True,
    remove_columns=data['train'].column_names,
)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
tokenized_datdasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 15000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
def flatten(list_of_lists):
    flattened = [val for sublist in list_of_lists for val in sublist]
    return flattened

In [None]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels
    preds = np.argmax(logits, axis=-1)

    # remove -100 from labels and predictions
    labels_jagged = [[t for t in label if t != -100] for label in labels]

    # do the same for predictions whenever true label is -100
    preds_jagged = [[p for p, t in zip(ps, ts) if t != -100] \
                    for ps, ts in zip(preds, labels)]

    # flatten labels and preds
    labels_flat = flatten(labels_jagged)
    preds_flat = flatten(preds_jagged)

    acc = accuracy_score(labels_flat, preds_flat)
    f1 = f1_score(labels_flat, preds_flat, average='macro')

    return {
        'f1' : f1,
        'accuracy' : acc
    }

In [None]:
labels = [[-100, 0, 0, 1, 2, 1, -100]]
logits = np.array([[
    [0.8, 0.1, 0.1],
    [0.8, 0.1, 0.1],
    [0.8, 0.1, 0.1],
    [0.8, 0.1, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.8, 0.1],
]])
compute_metrics((logits, labels))

{'f1': 0.43333333333333335, 'accuracy': 0.6}

In [None]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    'distilbert-finetuned-ner',
    eval_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=2,
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0868,0.176371,0.864965,0.960101
2,0.0315,0.177177,0.887562,0.9685


TrainOutput(global_step=3512, training_loss=0.08935407180856735, metrics={'train_runtime': 209.0022, 'train_samples_per_second': 134.362, 'train_steps_per_second': 16.804, 'total_flos': 306516293005176.0, 'train_loss': 0.08935407180856735, 'epoch': 2.0})

In [None]:
trainer.save_model('my_saved_model')

In [None]:
from transformers import pipeline

pipe = pipeline(
    'token-classification',
    model='my_saved_model',
    device=0
)

s = 'Bill Gates was the CEO of Microsoft in Seattle, Washington.'
pipe(s)

[{'entity': 'VERB',
  'score': 0.9987937,
  'index': 1,
  'word': 'Bill',
  'start': 0,
  'end': 4},
 {'entity': 'ADV',
  'score': 0.99902904,
  'index': 2,
  'word': 'Gates',
  'start': 5,
  'end': 10},
 {'entity': 'NUM',
  'score': 0.9998404,
  'index': 3,
  'word': 'was',
  'start': 11,
  'end': 14},
 {'entity': 'NUM',
  'score': 0.9998776,
  'index': 4,
  'word': 'the',
  'start': 15,
  'end': 18},
 {'entity': 'NUM',
  'score': 0.9996457,
  'index': 5,
  'word': 'CEO',
  'start': 19,
  'end': 22},
 {'entity': 'NUM',
  'score': 0.9997403,
  'index': 6,
  'word': 'of',
  'start': 23,
  'end': 25},
 {'entity': 'NOUN',
  'score': 0.9978194,
  'index': 7,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity': 'NUM',
  'score': 0.99976104,
  'index': 8,
  'word': 'in',
  'start': 36,
  'end': 38},
 {'entity': '.',
  'score': 0.9987637,
  'index': 9,
  'word': 'Seattle',
  'start': 39,
  'end': 46},
 {'entity': 'NUM',
  'score': 0.9989598,
  'index': 10,
  'word': ',',
  'start':