# Load Wikiann 


In [1]:
from datasets import load_dataset, load_metric, concatenate_datasets

datasets_es = load_dataset("tner/wikiann", "es")
datasets_fr = load_dataset("tner/wikiann", "fr")
datasets_en = load_dataset("tner/wikiann", "en")

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [2]:
tags = {
    "B-LOC": 0,
    "B-ORG": 1,
    "B-PER": 2,
    "I-LOC": 3,
    "I-ORG": 4,
    "I-PER": 5,
    "O": 6
}
int_to_tag = {v: k for k, v in tags.items()}

In [3]:
train_dataset = concatenate_datasets([datasets_en["train"], datasets_es["train"], datasets_fr["train"]])
val_dataset = concatenate_datasets([datasets_en["validation"], datasets_es["validation"], datasets_fr["validation"]])
test_dataset = concatenate_datasets([datasets_en["test"], datasets_es["test"], datasets_fr["test"]])



# Load tokenizer

In [4]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased") 

# Assert that tokenizer is fast tokenizer 

In [5]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

# demo of tokenizer

This tokenizer can take in both a raw string or a pre tokenized input 

In [6]:
example  = datasets_en["train"][0]

In [7]:
example

{'tokens': ['R.H.',
  'Saunders',
  '(',
  'St.',
  'Lawrence',
  'River',
  ')',
  '(',
  '968',
  'MW',
  ')'],
 'tags': [1, 4, 6, 1, 4, 4, 6, 6, 6, 6, 6]}

In [8]:
example["tokens"]

['R.H.',
 'Saunders',
 '(',
 'St.',
 'Lawrence',
 'River',
 ')',
 '(',
 '968',
 'MW',
 ')']

In [9]:
tokenizer("Hello, I am Javin and I am testing this tokenizer.")

{'input_ids': [101, 31178, 117, 146, 10392, 28248, 15478, 10111, 146, 10392, 38306, 10531, 18436, 18687, 14210, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
tokenized_sentence = tokenizer(example["tokens"], is_split_into_words=True)
print(tokenized_sentence)

{'input_ids': [101, 155, 119, 145, 119, 49046, 113, 10838, 119, 16198, 11575, 114, 113, 62449, 27042, 114, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


There is even a convenient function to convert the ids back to the tokens

In [11]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence['input_ids'] )
print(tokens)

['[CLS]', 'R', '.', 'H', '.', 'Saunders', '(', 'St', '.', 'Lawrence', 'River', ')', '(', '968', 'MW', ')', '[SEP]']


# Word Ids is a means to map these tokenized words to their original tokens splits. 

In [12]:
print(tokenized_sentence.word_ids())

[None, 0, 0, 0, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, None]


# This function was used in the wikineural sample code to align the tokens back to their original labels

In [13]:
label_all_tokens = False

In [14]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    #for wikineural this would be called ner_tags but for wikiann it is tags
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [15]:
tokenize_and_align_labels(datasets_en['train'][:1])

{'input_ids': [[101, 155, 119, 145, 119, 49046, 113, 10838, 119, 16198, 11575, 114, 113, 62449, 27042, 114, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 1, -100, -100, -100, 4, 6, 1, -100, 4, 4, 6, 6, 6, 6, 6, -100]]}

# Turn Back into text

In [16]:
text_ids = tokenize_and_align_labels(datasets_en['train'][:1])['input_ids'][0]

In [17]:
text = tokenizer.decode(text_ids, skip_special_tokens=False)
text

'[CLS] R. H. Saunders ( St. Lawrence River ) ( 968 MW ) [SEP]'

# compare with original

In [18]:
datasets_en['train'][:1]['tokens'][0]

['R.H.',
 'Saunders',
 '(',
 'St.',
 'Lawrence',
 'River',
 ')',
 '(',
 '968',
 'MW',
 ')']

In [19]:
train_tokenized = train_dataset.map(tokenize_and_align_labels, batched=True)
val_tokenized = val_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized = test_dataset.map(tokenize_and_align_labels, batched=True)

# Fine tune the model on wikiann


The following labels_vocab and labels_vocab_reverse needs to be changed when switching between wikineural and wikiann. 

In [20]:
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

labels_vocab = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
labels_vocab_reverse = {v:k for k,v in labels_vocab.items()}

In [21]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(label_list), label2id=labels_vocab, id2label=labels_vocab_reverse)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
model_name = "bert-base-multilingual-cased"
args = TrainingArguments(
    "wikineural-multilingual-ner",
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=True,
    eval_steps=10000,
    save_steps=10000,
)

In [24]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [25]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


# Evaluation script from wikineural paper. 

In [26]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## Define trainer object 

In [27]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [28]:
trainer.train()

  7%|▋         | 503/7500 [00:29<06:48, 17.11it/s]

{'loss': 0.4633, 'grad_norm': 9.536806106567383, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.07}


 13%|█▎        | 1002/7500 [00:58<06:26, 16.81it/s]

{'loss': 0.3171, 'grad_norm': 5.04863977432251, 'learning_rate': 1.7333333333333336e-05, 'epoch': 0.13}


 20%|██        | 1503/7500 [01:28<05:54, 16.91it/s]

{'loss': 0.2954, 'grad_norm': 14.447076797485352, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.2}


 27%|██▋       | 2003/7500 [01:57<05:34, 16.42it/s]

{'loss': 0.2746, 'grad_norm': 10.987454414367676, 'learning_rate': 1.4666666666666666e-05, 'epoch': 0.27}


 33%|███▎      | 2502/7500 [02:26<05:21, 15.53it/s]

{'loss': 0.2423, 'grad_norm': 8.759507179260254, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.33}


 40%|████      | 3002/7500 [02:57<04:28, 16.73it/s]

{'loss': 0.2665, 'grad_norm': 21.534631729125977, 'learning_rate': 1.2e-05, 'epoch': 0.4}


 47%|████▋     | 3503/7500 [03:26<03:59, 16.71it/s]

{'loss': 0.2527, 'grad_norm': 8.798910140991211, 'learning_rate': 1.0666666666666667e-05, 'epoch': 0.47}


 53%|█████▎    | 4003/7500 [03:54<03:29, 16.66it/s]

{'loss': 0.2528, 'grad_norm': 0.8640161752700806, 'learning_rate': 9.333333333333334e-06, 'epoch': 0.53}


 60%|██████    | 4502/7500 [04:23<02:58, 16.78it/s]

{'loss': 0.2241, 'grad_norm': 6.091860294342041, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.6}


 67%|██████▋   | 5003/7500 [04:53<02:27, 16.94it/s]

{'loss': 0.2243, 'grad_norm': 4.608696937561035, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.67}


 73%|███████▎  | 5503/7500 [05:22<01:58, 16.79it/s]

{'loss': 0.2127, 'grad_norm': 1.068033218383789, 'learning_rate': 5.333333333333334e-06, 'epoch': 0.73}


 80%|████████  | 6003/7500 [05:51<01:30, 16.63it/s]

{'loss': 0.2168, 'grad_norm': 10.226728439331055, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.8}


 87%|████████▋ | 6503/7500 [06:21<00:57, 17.36it/s]

{'loss': 0.2181, 'grad_norm': 16.722795486450195, 'learning_rate': 2.666666666666667e-06, 'epoch': 0.87}


 93%|█████████▎| 7003/7500 [06:50<00:29, 16.71it/s]

{'loss': 0.1931, 'grad_norm': 15.870783805847168, 'learning_rate': 1.3333333333333334e-06, 'epoch': 0.93}


100%|██████████| 7500/7500 [07:20<00:00, 17.03it/s]

{'loss': 0.1993, 'grad_norm': 6.276727676391602, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 440.2859, 'train_samples_per_second': 136.275, 'train_steps_per_second': 17.034, 'train_loss': 0.2568553863525391, 'epoch': 1.0}





TrainOutput(global_step=7500, training_loss=0.2568553863525391, metrics={'train_runtime': 440.2859, 'train_samples_per_second': 136.275, 'train_steps_per_second': 17.034, 'train_loss': 0.2568553863525391, 'epoch': 1.0})

In [29]:
trainer.evaluate()

  0%|          | 0/3750 [00:00<?, ?it/s]

100%|██████████| 3750/3750 [00:36<00:00, 101.98it/s]


{'eval_loss': 0.206732839345932,
 'eval_precision': 0.9299254385551481,
 'eval_recall': 0.9196271265439292,
 'eval_f1': 0.9247476120396321,
 'eval_accuracy': 0.9459044839504787,
 'eval_runtime': 36.7834,
 'eval_samples_per_second': 815.585,
 'eval_steps_per_second': 101.948,
 'epoch': 1.0}

In [30]:
predictions, labels, _ = trainer.predict(test_tokenized)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

100%|██████████| 3750/3750 [00:36<00:00, 101.65it/s]


{'LOC': {'precision': 0.9425409456118665,
  'recall': 0.9320994728397891,
  'f1': 0.9372911304882265,
  'number': 52356},
 'ORG': {'precision': 0.8907841662678608,
  'recall': 0.8770825620140689,
  'f1': 0.883880267955567,
  'number': 29711},
 'PER': {'precision': 0.9495550145667877,
  'recall': 0.9438670263408442,
  'f1': 0.9467024768725754,
  'number': 25208},
 'overall_precision': 0.9299254385551481,
 'overall_recall': 0.9196271265439292,
 'overall_f1': 0.9247476120396321,
 'overall_accuracy': 0.9459044839504787}