In [1]:
# imports
import numpy as np
from datasets import load_dataset, ClassLabel, load_from_disk, DatasetDict
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    set_seed
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# consts
set_seed(42)

MODEL_NAME = "microsoft/deberta-v3-base"
OUTPUT_DIR = "out/pos+ner_deberta-base"
MAX_LENGTH = 256
PER_DEVICE_TRAIN_BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 2
PER_DEVICE_EVAL_BATCH_SIZE = 4
NUM_EPOCHS = 6
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06

import os
os.environ["HF_HOME"] = "../hf_cache"
os.environ["HF_DATASETS_CACHE"] = "../data"

os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

In [3]:
blob_data = load_from_disk("./data/deberta_fewnerd_aug")
spilt_data = blob_data.train_test_split(test_size=0.1)
raw_datasets = DatasetDict({
    'train': spilt_data['train'],
    'validation': spilt_data['test'],
})
# BIO tags
features = raw_datasets["train"].features
ner_feature = features["ner_tags"]
label_list = ner_feature.feature.names
num_labels = len(label_list)
print("Labels:", label_list)

Labels: ['O', 'B-ART', 'I-ART', 'B-BUILDING', 'I-BUILDING', 'B-EVENT', 'I-EVENT', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-OTHER', 'I-OTHER', 'B-PER', 'I-PER', 'B-PROD', 'I-PROD', 'B-NOUN', 'I-NOUN', 'B-PRON', 'I-PRON']


In [4]:
# tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, cache_dir="../hf_cache")
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=num_labels, cache_dir="../hf_cache")

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# tokenize and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=MAX_LENGTH,
        return_overflowing_tokens=False,
        return_offsets_mapping=False,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # list with length = #input_ids
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [6]:
# tokenize datasets
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names
)

In [7]:
# data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

In [8]:
# metrics
metric = evaluate.load("seqeval")

In [9]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [label_list[l] for (l, p) in zip(label, pred) if l != -100]
        for label, pred in zip(labels, predictions)
    ]
    true_predictions = [
        [label_list[p] for (l, p) in zip(label, pred) if l != -100]
        for label, pred in zip(labels, predictions)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [10]:
import accelerate

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=3,
    dataloader_num_workers=2,
    run_name="deberta-v3-base-conll",
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0215,0.020715,0.980826,0.972575,0.976683,0.991963
2,0.017,0.019426,0.98127,0.976995,0.979128,0.992775
3,0.0107,0.021416,0.982121,0.97736,0.979734,0.99301
4,0.0069,0.025733,0.981467,0.97831,0.979886,0.993018
5,0.0055,0.029266,0.983236,0.977581,0.980401,0.993239
6,0.0035,0.034403,0.983448,0.978041,0.980737,0.993344


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=44472, training_loss=0.024335840544397826, metrics={'train_runtime': 6601.6963, 'train_samples_per_second': 107.781, 'train_steps_per_second': 6.736, 'total_flos': 1.956102880358718e+16, 'train_loss': 0.024335840544397826, 'epoch': 6.0})

In [14]:
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# 11) Quick eval on test set
print("Evaluating on test set...")
metrics = trainer.evaluate(tokenized_datasets["validation"])
print(metrics)

Evaluating on test set...


{'eval_loss': 0.03440329432487488, 'eval_precision': 0.9834475067707503, 'eval_recall': 0.9780407814059604, 'eval_f1': 0.9807366924305132, 'eval_accuracy': 0.9933442742523705, 'eval_runtime': 35.425, 'eval_samples_per_second': 371.969, 'eval_steps_per_second': 93.013, 'epoch': 6.0}


In [15]:
def predict_sentence(sentence_tokens):
    # sentence_tokens: list of tokens (words)
    enc = tokenizer(sentence_tokens, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    # send to same device as model
    device = trainer.model.device
    enc = {k: v.to(device) for k, v in enc.items()}
    outputs = trainer.model(**enc)
    logits = outputs.logits.detach().cpu().numpy()
    pred_ids = np.argmax(logits, axis=-1)[0]
    word_ids = tokenizer(sentence_tokens, is_split_into_words=True).word_ids()
    # align back
    pred_labels = []
    last_w = None
    for idx, wid in enumerate(word_ids):
        if wid is None:
            continue
        # take labels for first subtoken of each word (wid change)
        if wid != last_w:
            pred_labels.append(label_list[pred_ids[idx]])
        last_w = wid
    return list(zip(sentence_tokens, pred_labels))

In [16]:
example = ["EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", "."]
print(predict_sentence(example))

[('EU', 'O'), ('rejects', 'O'), ('German', 'O'), ('call', 'B-NOUN'), ('to', 'O'), ('boycott', 'O'), ('British', 'O'), ('lamb', 'B-NOUN'), ('.', 'O')]
