In [1]:
from datasets import load_dataset

raw_dataset = load_dataset("wikiann", 'en')
dataset_feature = raw_dataset["train"].features
ner_labels = dataset_feature["ner_tags"].feature.names
id2label = {'0': 'O', '1': 'B-MIS', '2': 'I-MIS', '3': 'B-PER', '4': 'I-PER', '5': 'B-ORG', '6': 'I-ORG', '7': 'B-LOC', '8': 'I-LOC'}
label2id = {value: key for key, value in id2label.items()}
print(id2label)

Found cached dataset wikiann (C:/Users/zimin/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)


  0%|          | 0/3 [00:00<?, ?it/s]

{'0': 'O', '1': 'B-MIS', '2': 'I-MIS', '3': 'B-PER', '4': 'I-PER', '5': 'B-ORG', '6': 'I-ORG', '7': 'B-LOC', '8': 'I-LOC'}


In [2]:
def align_labels_and_tokens(word_ids, labels):
    updated_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            updated_labels.append(-100 if word_id is None else labels[word_id])   
        elif word_id is None:
            updated_labels.append(-100)
        else:
            label = labels[word_id]
            if label % 2 == 1:
                label+=1
            updated_labels.append(label)
    return updated_labels


In [4]:
def tokenize_and_align_labels(dataset):
    tokenized_data = tokenizer(dataset["tokens"], truncation=True, is_split_into_words=True)
    all_labels = dataset["ner_tags"]
    updated_labels = []
    for i, labels in enumerate(all_labels):
        updated_labels.append(align_labels_and_tokens(tokenized_data.word_ids(i), labels))
    tokenized_data["labels"] = updated_labels
    return tokenized_data

from transformers import AutoTokenizer
model_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
batch_size = 16

tokenized_dataset = raw_dataset.map(
    tokenize_and_align_labels,
    batched = True,
    remove_columns = raw_dataset["train"].column_names
)



Loading cached processed dataset at C:/Users/zimin/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e\cache-eab2f4ebde86760d.arrow
Loading cached processed dataset at C:/Users/zimin/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e\cache-cf2c59578e7dae45.arrow
Loading cached processed dataset at C:/Users/zimin/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e\cache-0ff7e2b26b644afd.arrow


In [5]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import load_metric

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, 
    num_labels= 9,
    id2label = id2label,
    label2id = label2id
)


args = TrainingArguments(
    model_checkpoint,
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01
)

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")




  metric = load_metric("seqeval")


In [6]:
import numpy as np
def compute_metrics(p):
    
    predictions, labels = p 
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [ner_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [ner_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    metrics = metric.compute(predictions=true_predictions, references=true_labels)
    
    return {
        "precision": metrics["overall_precision"],
        "recall": metrics["overall_recall"],
        "f1": metrics["overall_f1"],
        "accuracy": metrics["overall_accuracy"]
    }

In [7]:
trainer = Trainer(
    model,
    args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset =  tokenized_dataset["validation"],
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

trainer.train() 
trainer.save_model('fine-tune-1.model')

***** Running training *****
  Num examples = 20000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3750
  Number of trainable parameters = 107726601


  0%|          | 0/3750 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Saving model checkpoint to dslim/bert-base-NER\checkpoint-500
Configuration saved in dslim/bert-base-NER\checkpoint-500\config.json


{'loss': 0.442, 'learning_rate': 8.666666666666667e-05, 'epoch': 0.4}


Model weights saved in dslim/bert-base-NER\checkpoint-500\pytorch_model.bin
tokenizer config file saved in dslim/bert-base-NER\checkpoint-500\tokenizer_config.json
Special tokens file saved in dslim/bert-base-NER\checkpoint-500\special_tokens_map.json
Saving model checkpoint to dslim/bert-base-NER\checkpoint-1000
Configuration saved in dslim/bert-base-NER\checkpoint-1000\config.json


{'loss': 0.3397, 'learning_rate': 7.333333333333333e-05, 'epoch': 0.8}


Model weights saved in dslim/bert-base-NER\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in dslim/bert-base-NER\checkpoint-1000\tokenizer_config.json
Special tokens file saved in dslim/bert-base-NER\checkpoint-1000\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 16


  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.30720797181129456, 'eval_precision': 0.7701528722472893, 'eval_recall': 0.8084264102926623, 'eval_f1': 0.7888256595964821, 'eval_accuracy': 0.9118443776244619, 'eval_runtime': 201.8974, 'eval_samples_per_second': 49.53, 'eval_steps_per_second': 3.096, 'epoch': 1.0}


Saving model checkpoint to dslim/bert-base-NER\checkpoint-1500
Configuration saved in dslim/bert-base-NER\checkpoint-1500\config.json


{'loss': 0.2575, 'learning_rate': 6e-05, 'epoch': 1.2}


Model weights saved in dslim/bert-base-NER\checkpoint-1500\pytorch_model.bin
tokenizer config file saved in dslim/bert-base-NER\checkpoint-1500\tokenizer_config.json
Special tokens file saved in dslim/bert-base-NER\checkpoint-1500\special_tokens_map.json
Saving model checkpoint to dslim/bert-base-NER\checkpoint-2000
Configuration saved in dslim/bert-base-NER\checkpoint-2000\config.json


{'loss': 0.209, 'learning_rate': 4.666666666666667e-05, 'epoch': 1.6}


Model weights saved in dslim/bert-base-NER\checkpoint-2000\pytorch_model.bin
tokenizer config file saved in dslim/bert-base-NER\checkpoint-2000\tokenizer_config.json
Special tokens file saved in dslim/bert-base-NER\checkpoint-2000\special_tokens_map.json
Saving model checkpoint to dslim/bert-base-NER\checkpoint-2500
Configuration saved in dslim/bert-base-NER\checkpoint-2500\config.json


{'loss': 0.1957, 'learning_rate': 3.3333333333333335e-05, 'epoch': 2.0}


Model weights saved in dslim/bert-base-NER\checkpoint-2500\pytorch_model.bin
tokenizer config file saved in dslim/bert-base-NER\checkpoint-2500\tokenizer_config.json
Special tokens file saved in dslim/bert-base-NER\checkpoint-2500\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 16


  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.2730152904987335, 'eval_precision': 0.7843137254901961, 'eval_recall': 0.8228474480418493, 'eval_f1': 0.8031186393900716, 'eval_accuracy': 0.9174549184279966, 'eval_runtime': 200.2464, 'eval_samples_per_second': 49.938, 'eval_steps_per_second': 3.121, 'epoch': 2.0}


Saving model checkpoint to dslim/bert-base-NER\checkpoint-3000
Configuration saved in dslim/bert-base-NER\checkpoint-3000\config.json


{'loss': 0.105, 'learning_rate': 2e-05, 'epoch': 2.4}


Model weights saved in dslim/bert-base-NER\checkpoint-3000\pytorch_model.bin
tokenizer config file saved in dslim/bert-base-NER\checkpoint-3000\tokenizer_config.json
Special tokens file saved in dslim/bert-base-NER\checkpoint-3000\special_tokens_map.json
Saving model checkpoint to dslim/bert-base-NER\checkpoint-3500
Configuration saved in dslim/bert-base-NER\checkpoint-3500\config.json


{'loss': 0.1075, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.8}


Model weights saved in dslim/bert-base-NER\checkpoint-3500\pytorch_model.bin
tokenizer config file saved in dslim/bert-base-NER\checkpoint-3500\tokenizer_config.json
Special tokens file saved in dslim/bert-base-NER\checkpoint-3500\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 16


  0%|          | 0/625 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to fine-tune-1.model
Configuration saved in fine-tune-1.model\config.json


{'eval_loss': 0.31454959511756897, 'eval_precision': 0.8040067911714771, 'eval_recall': 0.8369150289834583, 'eval_f1': 0.8201309272280143, 'eval_accuracy': 0.921987183420852, 'eval_runtime': 209.838, 'eval_samples_per_second': 47.656, 'eval_steps_per_second': 2.978, 'epoch': 3.0}
{'train_runtime': 6498.3994, 'train_samples_per_second': 9.233, 'train_steps_per_second': 0.577, 'train_loss': 0.2275054712931315, 'epoch': 3.0}


Model weights saved in fine-tune-1.model\pytorch_model.bin
tokenizer config file saved in fine-tune-1.model\tokenizer_config.json
Special tokens file saved in fine-tune-1.model\special_tokens_map.json
