In [1]:
import os

from datasets import load_dataset, load_metric
import numpy as np
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, Trainer, TrainingArguments

from dataprocessor.conll import DatasetCombiner


In [2]:
def tokenize_and_align_labels(examples, tokenizer, task="ner", label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [3]:
def build_trainer(dataset):
    label_list = dataset["train"].features[f"ner_tags"].feature.names

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True, fn_kwargs={"tokenizer": tokenizer})

    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_list))

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )
    return label_list, trainer

In [8]:
dataset = load_dataset("conll2003")

label_list, trainer = build_trainer(dataset)
trainer.train()

Reusing dataset conll2003 (C:\Users\jesse\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee)


  0%|          | 0/3 [00:00<?, ?it/s]

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\jesse/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.18.0",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at C:\Users\jesse/.cache\huggingface\transformers\0e1bbfda7f6

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at C:\Users\jesse\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee\cache-b5637782ee4e68d3.arrow
loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\jesse/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2441,0.11466,0.864528,0.873249,0.868867,0.971066
2,0.0487,0.12536,0.875015,0.889865,0.882377,0.972723
3,0.0285,0.130604,0.893117,0.888559,0.890832,0.974829
4,0.0162,0.142743,0.889505,0.895205,0.892346,0.974812
5,0.0102,0.154291,0.890271,0.891645,0.890958,0.975036
6,0.0076,0.158897,0.895976,0.898528,0.897251,0.97602
7,0.0045,0.168889,0.888221,0.894968,0.891582,0.974967
8,0.0038,0.173031,0.891148,0.899715,0.895411,0.975606
9,0.0034,0.174154,0.88967,0.901495,0.895544,0.975589
10,0.0026,0.175273,0.893539,0.899478,0.896499,0.975796


Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-500\special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id, chunk_tags, pos_tags. If ner_tags, tokens, id, chunk_tags, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3454
  Batch size = 16
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./results\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-1000\tokenizer_config

tokenizer config file saved in ./results\checkpoint-7500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-7500\special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id, chunk_tags, pos_tags. If ner_tags, tokens, id, chunk_tags, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3454
  Batch size = 16
Saving model checkpoint to ./results\checkpoint-8000
Configuration saved in ./results\checkpoint-8000\config.json
Model weights saved in ./results\checkpoint-8000\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-8000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-8000\special_tokens_map.json
Saving model checkpoint to ./results\checkpoint-8500
Configuration saved in ./results\check

TrainOutput(global_step=8780, training_loss=0.03044775820131454, metrics={'train_runtime': 553.6488, 'train_samples_per_second': 253.627, 'train_steps_per_second': 15.858, 'total_flos': 1701127258035660.0, 'train_loss': 0.03044775820131454, 'epoch': 10.0})

In [9]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id, chunk_tags, pos_tags. If ner_tags, tokens, id, chunk_tags, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3454
  Batch size = 16


{'eval_loss': 0.17527250945568085,
 'eval_precision': 0.8935392596085829,
 'eval_recall': 0.8994778067885117,
 'eval_f1': 0.8964986988407855,
 'eval_accuracy': 0.9757958704509357,
 'eval_runtime': 4.5329,
 'eval_samples_per_second': 761.989,
 'eval_steps_per_second': 47.652,
 'epoch': 10.0}

In [4]:
dataset_combiner = DatasetCombiner(os.path.join("datasets", "ATIS"))
dataset = dataset_combiner.dataset
label_list, trainer = build_trainer(dataset)
trainer.train()

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,2.218185,0.26549,0.252288,0.25872,0.701515
2,0.523600,2.444638,0.266964,0.255644,0.261181,0.701515
3,0.523600,2.615385,0.270626,0.257169,0.263726,0.702356
4,0.091800,2.720783,0.270418,0.256559,0.263306,0.702146
5,0.091800,2.802723,0.270679,0.256559,0.26343,0.701935
6,0.048200,2.866261,0.271001,0.256864,0.263743,0.701935
7,0.048200,2.908258,0.270592,0.256559,0.263389,0.70183
8,0.031300,2.938429,0.270331,0.256559,0.263265,0.701725
9,0.023500,2.95907,0.269923,0.256254,0.262911,0.701515
10,0.023500,2.966643,0.270331,0.256559,0.263265,0.70183


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, id, ner_tags. If tokens, id, ner_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 893
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-500\special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, id, ner_tags. If tokens, id, ne

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, id, ner_tags. If tokens, id, ner_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 893
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2800, training_loss=0.13044223700250898, metrics={'train_runtime': 141.4856, 'train_samples_per_second': 316.499, 'train_steps_per_second': 19.79, 'total_flos': 271311782348448.0, 'train_loss': 0.13044223700250898, 'epoch': 10.0})

In [5]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, id, ner_tags. If tokens, id, ner_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 893
  Batch size = 16


{'eval_loss': 2.9666426181793213,
 'eval_precision': 0.27033108325297334,
 'eval_recall': 0.2565588773642465,
 'eval_f1': 0.26326498669588355,
 'eval_accuracy': 0.7018300378628524,
 'eval_runtime': 0.8378,
 'eval_samples_per_second': 1065.94,
 'eval_steps_per_second': 66.845,
 'epoch': 10.0}