<a href="https://colab.research.google.com/github/linqus/nlp-huggingface/blob/main/notebooks/unit7/nlp_7_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install datasets



In [7]:
from datasets import load_dataset

# Load the "conll2003" dataset; the result is a DatasetDict with
# train / validation / test splits (displayed below).
dataset = load_dataset("conll2003")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [8]:
dataset["train"][0]['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [9]:
dataset['train'][0]['ner_tags']

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [10]:
# NOTE: despite its name, `label_names` holds the `ner_tags` feature object itself;
# the actual list of tag strings is `label_names.feature.names`, which is how
# every later cell accesses it.
label_names = dataset['train'].features['ner_tags']
label_names.feature.names


['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [11]:
# Print training example 0 with each NER tag aligned underneath its word.
words = dataset["train"][0]["tokens"]
labels = dataset["train"][0]["ner_tags"]
tag_names = label_names.feature.names

word_cells, tag_cells = [], []
for word, label in zip(words, labels):
    tag = tag_names[label]
    # Each column is as wide as the longer of (word, tag) plus one separating space.
    width = max(len(word), len(tag)) + 1
    word_cells.append(word.ljust(width))
    tag_cells.append(tag.ljust(width))

line1 = "".join(word_cells)
line2 = "".join(tag_cells)

print(line1)
print(line2)

EU    rejects German call to boycott British lamb . 
B-ORG O       B-MISC O    O  O       B-MISC  O    O 


In [12]:
# Same aligned word/tag display as above, this time for training example 4,
# which contains multi-word entities (B-/I- tag pairs).
words = dataset["train"][4]["tokens"]
labels = dataset["train"][4]["ner_tags"]

line1, line2 = "", ""
for word, label in zip(words, labels):
    full_label = label_names.feature.names[label]
    # Left-align both strings in a column one character wider than the longer one.
    column_width = 1 + max(len(word), len(full_label))
    line1 = line1 + f"{word:<{column_width}}"
    line2 = line2 + f"{full_label:<{column_width}}"

print(line1)
print(line2)

Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer . 
B-LOC   O  O              O  O   B-ORG    I-ORG O  O          O         B-PER  I-PER     O    O  O         O         O      O   O         O    O         O     O    B-LOC   O     O   O          O      O   O       O 


In [13]:
labels

[5,
 0,
 0,
 0,
 0,
 3,
 4,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [14]:
# Aligned display of training example 0 again, but with part-of-speech tags.
# Note: this rebinds `labels` to the POS tag ids of example 0.
words = dataset["train"][0]["tokens"]
labels = dataset["train"][0]["pos_tags"]
pos_names = dataset['train'].features['pos_tags'].feature.names

padded_pairs = []
for word, label in zip(words, labels):
    full_label = pos_names[label]
    max_length = max(len(word), len(full_label))
    # +1 leaves a single space between neighbouring columns.
    padded_pairs.append((word.ljust(max_length + 1), full_label.ljust(max_length + 1)))

line1 = "".join(padded_word for padded_word, _ in padded_pairs)
line2 = "".join(padded_tag for _, padded_tag in padded_pairs)

print(line1)
print(line2)

EU  rejects German call to boycott British lamb . 
NNP VBZ     JJ     NN   TO VB      JJ      NN   . 


In [15]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model loaded further down.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# The example is already a list of words, hence is_split_into_words=True.
# The tokenizer adds [CLS]/[SEP] and may split a word into several sub-tokens
# (see 'la', '##mb' in the output), so tokens no longer line up 1:1 with ner_tags.
inputs = tokenizer(dataset['train'][0]['tokens'], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [16]:
inputs.word_ids(), len(inputs.word_ids())

([None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None], 12)

In [17]:
inputs

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [18]:
inputs.input_ids, len(inputs.input_ids)

([101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 12)

In [19]:
dataset['train']['tokens'][0], len(dataset['train'][0]['tokens'])

(['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 9)

In [20]:
def align_labels_with_words(labels, word_ids):
    """Expand word-level label ids to one label id per tokenizer token.

    Args:
        labels: sequence of integer label ids, one per original word.
        word_ids: sequence with one entry per token; each entry is the index
            of the word the token came from, or ``None`` for tokens that do
            not belong to any word (e.g. ``[CLS]`` / ``[SEP]``).

    Returns:
        A new list with the same length as ``word_ids``:

        * ``-100`` for every ``None`` entry (the "ignore" marker that
          ``compute_metrics`` filters out),
        * the word's own label for the first token of each word,
        * for every following token of the same word, the word's label with
          odd ids bumped by one.

    The "bump odd ids" rule relies on the label layout used in this notebook
    (``O, B-PER, I-PER, B-ORG, I-ORG, ...``): every ``B-XXX`` id is odd and the
    matching ``I-XXX`` id is the next integer, so a continuation token of a
    ``B-XXX`` word becomes ``I-XXX``. It is not valid for other label layouts.
    """
    ignore_index = -100
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            # Token outside any word (special token): never scored.
            new_labels.append(ignore_index)
        elif word_id != current_word:
            # First token of a new word keeps the word's label unchanged.
            new_labels.append(labels[word_id])
        else:
            # Continuation token of the current word: B-XXX (odd) -> I-XXX (odd + 1).
            label = labels[word_id]
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
        current_word = word_id

    return new_labels




In [21]:
dataset['train'][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [22]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [23]:
align_labels_with_words(dataset['train'][0]["ner_tags"], inputs.word_ids())

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

In [24]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: a batch with a "tokens" entry (list of word lists) and an
            "ner_tags" entry (list of word-level label id lists).

    Returns:
        The tokenizer output for the batch, with an extra 'labels' entry
        holding one aligned label list per example.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )
    # word_ids(row) gives the token -> word mapping for the row-th example of the batch.
    tokenized_inputs['labels'] = [
        align_labels_with_words(word_labels, tokenized_inputs.word_ids(row))
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

In [25]:
# Quick check of the batch function on the first 10 training examples
# before mapping it over the whole dataset.
tokenized = tokenize_and_align_labels(dataset['train'][:10])
tokenized

{'input_ids': [[101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], [101, 1943, 14428, 102], [101, 26660, 13329, 12649, 15928, 1820, 118, 4775, 118, 1659, 102], [101, 1109, 1735, 2827, 1163, 1113, 9170, 1122, 19786, 1114, 1528, 5566, 1106, 11060, 1106, 188, 17315, 1418, 2495, 12913, 1235, 6479, 4959, 2480, 6340, 13991, 3653, 1169, 1129, 12086, 1106, 8892, 119, 102], [101, 1860, 112, 188, 4702, 1106, 1103, 1735, 1913, 112, 188, 27431, 3914, 14651, 163, 7635, 4119, 1163, 1113, 9031, 11060, 1431, 4417, 8892, 3263, 2980, 1121, 2182, 1168, 1190, 2855, 1235, 1103, 3812, 5566, 1108, 27830, 119, 102], [101, 107, 1284, 1202, 183, 112, 189, 1619, 1251, 1216, 13710, 1272, 1195, 1202, 183, 112, 189, 1267, 1251, 4745, 1111, 1122, 117, 107, 1103, 2827, 112, 188, 2705, 15465, 28010, 1361, 3498, 4167, 19585, 1116, 1500, 170, 2371, 4094, 1158, 119, 102], [101, 1124, 1163, 1748, 3812, 2025, 1108, 2320, 1105, 1191, 1122, 1108, 1276, 1115, 2168, 1108, 1834, 1122, 1431, 1129, 1678, 111

In [26]:
dataset.column_names['train']

['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']

In [27]:
# Apply tokenization + label alignment to every split.
# batched=True: the function receives a batch of examples per call (it indexes
# word_ids(i) per example). remove_columns drops the original columns, leaving
# only the tokenizer outputs and 'labels' (see column_names in the next cell).
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset['train'].column_names
)



In [28]:
tokenized_dataset['train'].column_names

['input_ids', 'token_type_ids', 'attention_mask', 'labels']

In [29]:
from transformers import DataCollatorForTokenClassification
# Collator used to assemble batches for the Trainer; the demo in the next cell
# shows it padding shorter examples up to the longest one in the batch.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [30]:
# Collate the first four tokenized training examples into one padded batch of tensors.
data_collator( [ tokenized_dataset['train'][i] for i in range(4) ] )

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
           119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  101, 26660, 13329, 12649, 15928,  1820,   118,  4775,   118,  1659,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  101,  1109,  1735,  2827,  1163,  1113,  9170,  1122, 19786,  1114,
          1528,  5566,  1106, 11060,  1106,   188, 17315,  1418,  2495, 12913,
   

In [31]:
!pip install  evaluate seqeval



In [32]:
import evaluate

# seqeval metric object; used via metric.compute(predictions=..., references=...) below.
metric = evaluate.load('seqeval')

In [33]:
label_names

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [34]:
label_names.feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [35]:
# NOTE: at this point `labels` is leftover state from the POS-tag display cell
# (the pos_tags ids of example 0), not NER tags; the next cell rebinds it.
labels

[22, 42, 16, 21, 35, 37, 16, 21, 7]

In [36]:
# Rebind `labels` to the NER tag ids of example 0, then map each id to its tag string
# (the metric call below is fed tag strings, not ids).
labels = dataset["train"][0]["ner_tags"]
labels = [label_names.feature.names[i] for i in labels]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [37]:
# Fake "predictions": a copy of the reference tags with one deliberate mistake
# (index 2, 'B-MISC' in the reference, replaced by 'O').
predictions=labels.copy()
predictions[2]='O'

In [38]:
# Both arguments are lists of sequences, so the single example is wrapped in a list.
metric.compute(predictions=[predictions], references=[labels])

{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.8,
 'overall_accuracy': 0.8888888888888888}

In [51]:
import numpy as np


def compute_metrics(eval_preds):
    """Convert evaluation output into overall seqeval scores.

    Args:
        eval_preds: a pair ``(logits, labels)``. The arg-max over the last
            axis of ``logits`` is taken as the predicted label id; ``labels``
            holds the reference ids, with -100 marking positions to ignore.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the metric's "overall_*" entries.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    names = label_names.feature.names

    # Reference tags: drop every position carrying the ignore index (-100).
    true_labels = [[names[l] for l in row if l != -100] for row in labels]

    # Predicted tags, kept only at positions whose reference label is not -100.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept = []
        for p, l in zip(pred_row, label_row):
            if l != -100:
                kept.append(names[p])
        true_predictions.append(kept)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {short_name: all_metrics[metric_key] for short_name, metric_key in reported}

In [40]:
# Two-way mapping between integer label ids and tag strings, passed to the model config below.
id2label = dict(enumerate(label_names.feature.names))
label2id = {name: idx for idx, name in id2label.items()}

In [41]:
id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [42]:
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

In [43]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint with a token-classification head.
# Passing id2label / label2id gives the model one output class per NER tag
# (model.config.num_labels is 9 in the next cell); the loading message below
# reports the classifier weights as newly initialized.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
model.config.num_labels


9

In [45]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [46]:
!pip install transformers[sentencepiece]



In [47]:
!pip install accelerate -U



In [5]:
from transformers import TrainingArguments

# Training configuration for the fine-tuning run.
args = TrainingArguments(
    # First positional argument; it matches the name of the Hub repo the model is
    # pushed to at the end of the notebook.
    "bert-finetuned-ner",
    # Evaluate and save once per epoch.
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # Presumably requires being logged in to the Hub (see the notebook_login() cell).
    push_to_hub=True,
)

In [52]:
from transformers import Trainer

# Wire together the model, training arguments, tokenized splits, collator and
# metric function, then run the fine-tuning loop (3 epochs per `args`).
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.025,0.067722,0.926877,0.947156,0.936907,0.984753
2,0.0227,0.068088,0.930163,0.948166,0.939078,0.98568
3,0.015,0.067045,0.932872,0.951868,0.942274,0.986652


TrainOutput(global_step=5268, training_loss=0.025378221621908014, metrics={'train_runtime': 605.6077, 'train_samples_per_second': 69.555, 'train_steps_per_second': 8.699, 'total_flos': 920580703084350.0, 'train_loss': 0.025378221621908014, 'epoch': 3.0})

In [53]:
# Upload the trained model to the Hub with this commit message; the call returns the repo URL.
trainer.push_to_hub(commit_message="Training complete")

'https://huggingface.co/linqus/bert-finetuned-ner/tree/main/'