In [1]:
from datasets import load_dataset
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
import numpy as np
import evaluate

  from pandas.core import (
2024-03-07 14:02:03.736735: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-07 14:02:03.736770: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-07 14:02:03.736807: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-07 14:02:03.743973: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the "boapps/kmdb_entities" dataset through the `datasets` library.
dataset = load_dataset("boapps/kmdb_entities")

In [3]:
# Label vocabulary for token classification. A label's position in this list is
# its integer class id, so the order must stay fixed: 'O' is id 0, followed, for
# each entity type in turn, by B-NEG, B-POS, I-NEG, I-POS.
label_list = ['O'] + [
    f'{iob}-{status}-{ent_type}'
    for ent_type in ('ORG', 'PER', 'LOC')
    for iob in ('B', 'I')
    for status in ('NEG', 'POS')
]

# Maps an entity type to the row column that getnertags checks when deciding
# whether an entity of that type is tagged POS or NEG.
entcolumns = dict(ORG='institutions', LOC='places', PER='people')

In [4]:
# Grab the first training row for ad-hoc inspection.
# NOTE(review): `row` is not read by any later cell visible here — confirm before removing.
row = dataset['train'][0]

In [5]:
def anyin(l1, l2):
    """Return True if at least one element of ``l1`` is contained in ``l2``.

    Containment is tested with the ``in`` operator, so ``l2`` may be any
    container supporting it. An empty ``l1`` yields False.
    """
    return any(candidate in l2 for candidate in l1)

In [6]:
def getsame(e, row):
    """Collect the distinct entity lemmas of ``row`` that contain ``e``.

    Every entry of ``row['ent_tokens']`` whose ``'lemma'`` satisfies
    ``e in lemma`` contributes that lemma once (for string lemmas this is a
    substring check).

    Returns:
        A list of the matching lemmas without duplicates; the order is that
        of a set and therefore not guaranteed.
    """
    return list({
        entity['lemma']
        for entity in row['ent_tokens']
        if e in entity['lemma']
    })

In [7]:
def getnertags(row):
    """Build the per-word NER tag ids for one dataset row.

    Every entity in ``row['ent_tokens']`` whose first token has an ``ent_type``
    listed in ``entcolumns`` receives a ``'status'`` key, written onto the
    entity dict itself: ``'POS'`` when any lemma returned by ``getsame`` for
    the entity's lemma is found (via ``in``) in the row column named by
    ``entcolumns`` for that type, ``'NEG'`` otherwise. Each token of such an
    entity is labelled ``<iob>-<status>-<ent_type>`` at position ``token['i']``.

    Args:
        row: mapping providing ``'words'``, ``'ent_tokens'`` and the columns
            named in ``entcolumns``.

    Returns:
        ``{'nertags': [...]}`` holding one ``label_list`` index per entry of
        ``row['words']``; positions without a labelled token get 0.

    Raises:
        ValueError: if a composed label does not occur in ``label_list``.
    """
    label_by_position = {}
    for entity in row['ent_tokens']:
        ent_type = entity['tokens'][0]['ent_type']
        if ent_type not in entcolumns:
            # Entity types without a reference column are left untagged.
            continue

        related_lemmas = getsame(entity['lemma'], row)
        is_listed = anyin(related_lemmas, row[entcolumns[ent_type]])
        entity['status'] = 'POS' if is_listed else 'NEG'

        for token in entity['tokens']:
            label_by_position[token['i']] = '-'.join(
                (token['iob'], entity['status'], token['ent_type'])
            )

    nertags = [
        label_list.index(label_by_position[position]) if position in label_by_position else 0
        for position in range(len(row['words']))
    ]
    return {'nertags': nertags}

In [8]:
# Add the 'nertags' column by running getnertags on every row.
dataset = dataset.map(getnertags)

In [9]:
# Keep the first training row as a sample for the exploratory cells below.
example = dataset['train'][0]

In [10]:
# Tokenizer of the same checkpoint ('NYTK/PULI-BERT-Large') that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('NYTK/PULI-BERT-Large')

In [11]:
# Tokenize the sample row; its text is passed as a list of words, hence is_split_into_words=True.
tokenized_input = tokenizer(example["words"], is_split_into_words=True)


In [12]:
# Turn the input ids back into token strings to see how the words were split into sub-tokens.
# NOTE(review): the bare `tokens` expression displays the whole list; consider slicing it
# (e.g. the first few dozen entries) to keep the saved output short.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'a',
 'x',
 '##i',
 '.',
 'ker',
 '##ule',
 '##ti',
 'onk',
 '##orm',
 '##any',
 '##zat',
 'tobb',
 'mint',
 'mas',
 '##felm',
 '##ill',
 '##ia',
 '##r',
 '##d',
 'forint',
 '##ert',
 'elad',
 '##ta',
 'utols',
 '##o',
 ',',
 '25',
 'sza',
 '##zal',
 '##ek',
 '##os',
 'u',
 '##z',
 '##let',
 '##resz',
 '##e',
 '##t',
 'az',
 'ob',
 '##ol',
 'x',
 '##i',
 '.',
 'kft',
 '.',
 '-',
 'ben',
 ',',
 'amely',
 'a',
 'la',
 '##g',
 '##y',
 '##man',
 '##yos',
 '##i',
 'kopasz',
 '##i',
 '-',
 'g',
 '##at',
 'hasznos',
 '##itas',
 '##ara',
 'alakult',
 '.',
 'ezzel',
 'a',
 'ker',
 '##ule',
 '##t',
 'gyakorlatilag',
 'kisz',
 '##all',
 '##t',
 'abb',
 '##ol',
 'a',
 'ceg',
 '##bol',
 ',',
 'amelyet',
 'le',
 '##iszt',
 '##inger',
 'tam',
 '##as',
 'nagy',
 '##vall',
 '##alk',
 '##ozo',
 '##val',
 'koz',
 '##ose',
 '##n',
 'alap',
 '##ito',
 '##tt',
 ',',
 'es',
 'amely',
 'a',
 'g',
 '##aton',
 'szor',
 '##ak',
 '##ozo',
 '##negyed',
 '##e',
 '##t',
 'es',
 'u',
 '##dul',
 '##opa',
 '

In [13]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to tokens.

    Args:
        examples: batch mapping with ``"words"`` (one word list per sentence)
            and ``"nertags"`` (one tag-id list per sentence).

    Returns:
        The tokenizer output for the batch (truncated to at most 512 tokens
        per sentence) with an added ``"labels"`` entry. In each label list the
        first token of every word carries that word's tag id; special tokens
        and the remaining tokens of a word carry -100.
    """
    tokenized_inputs = tokenizer(
        examples["words"],
        is_split_into_words=True,
        truncation=True,
        max_length=512,
    )

    def _align(word_ids, word_labels):
        # Walk the tokens of one sentence, labelling only the token that opens a word.
        aligned = []
        previous_word_idx = None
        for word_idx in word_ids:
            opens_word = word_idx is not None and word_idx != previous_word_idx
            aligned.append(word_labels[word_idx] if opens_word else -100)
            previous_word_idx = word_idx
        return aligned

    tokenized_inputs["labels"] = [
        _align(tokenized_inputs.word_ids(batch_index=batch_idx), word_labels)
        for batch_idx, word_labels in enumerate(examples["nertags"])
    ]
    return tokenized_inputs

In [14]:
# Tokenize the dataset in batches and attach the aligned "labels" column.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

In [15]:
# Collator handed to the Trainer below to assemble token-classification batches.
# Presumably pads inputs and labels per batch — see the DataCollatorForTokenClassification docs.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [16]:
# Human-readable tags of the sample row, looked up by tag id.
labels = [label_list[tag_id] for tag_id in example["nertags"]]

# Metric object used by compute_metrics.
seqeval = evaluate.load("seqeval")


def compute_metrics(p):
    """Compute overall seqeval scores for one evaluation pass.

    Args:
        p: pair ``(predictions, labels)``. The predicted class is taken as the
            argmax of ``predictions`` along axis 2; ``labels`` holds the
            reference ids, with -100 marking positions to ignore.

    Returns:
        Dict with the ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"`` values that seqeval reports as overall scores.
    """
    scores, reference_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        # Drop every position whose reference id is the ignore marker.
        kept = [
            (predicted, reference)
            for predicted, reference in zip(predicted_row, reference_row)
            if reference != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[reference] for _, reference in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [17]:
# Two-way mapping between class ids and label names, derived from label_list order.
id2label = dict(enumerate(label_list))
label2id = {name: class_id for class_id, name in id2label.items()}

In [18]:
# Load the pretrained checkpoint with a token-classification head sized to label_list.
# id2label/label2id are passed along so the label names travel with the model.
# The classifier weights are newly initialised (see the warning in the cell output),
# so the model has to be fine-tuned before use.
model = AutoModelForTokenClassification.from_pretrained(
    "NYTK/PULI-BERT-Large", num_labels=len(label_list), id2label=id2label, label2id=label2id
)

  return self.fget.__get__(instance, owner)()
Some weights of MegatronBertForTokenClassification were not initialized from the model checkpoint at NYTK/PULI-BERT-Large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Fixed seed so the shuffled splits, and therefore the reported metrics, are the
# same on every Restart & Run All. Without it each run draws a different split.
SPLIT_SEED = 42

# First split: hold out 25% of the examples (the library default, made explicit).
train_testvalid = tokenized_dataset['train'].train_test_split(test_size=0.25, seed=SPLIT_SEED)

# Second split: divide the held-out part again. Its "train" part is the
# evaluation set passed to the Trainer; its "test" part is not used by the
# cells visible here.
test_valid = train_testvalid['test'].train_test_split(test_size=0.25, seed=SPLIT_SEED)

In [20]:
# Fine-tuning configuration; checkpoints and logs go to the "kmdb_ner_model" directory.
training_args = TrainingArguments(
    output_dir="kmdb_ner_model",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate every 1000 steps, log every 10 steps, save a checkpoint once per epoch.
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_steps=10,
    save_strategy="epoch",
    # push_to_hub is not enabled here; trainer.push_to_hub() is called explicitly after training.
)

trainer = Trainer(
    model=model,
    args=training_args,
    # Training uses the "train" part of the first split; evaluation uses the
    # "train" part of the re-split hold-out.
    train_dataset=train_testvalid["train"],
    eval_dataset=test_valid["train"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mboapps[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0613,0.05746,0.710177,0.735915,0.722817,0.977626
2000,0.0559,0.052501,0.710769,0.761121,0.735084,0.978752
3000,0.0569,0.051159,0.737493,0.785646,0.760808,0.979839
4000,0.0481,0.049622,0.74018,0.797492,0.767768,0.980325
5000,0.041,0.047443,0.756677,0.798133,0.776852,0.980924
6000,0.0434,0.046345,0.7588,0.785335,0.77184,0.981286
7000,0.0577,0.046221,0.759245,0.789551,0.774101,0.981238
8000,0.0422,0.042805,0.781952,0.805734,0.793665,0.98283
9000,0.0346,0.044339,0.775792,0.815328,0.795068,0.982533
10000,0.0301,0.043091,0.783064,0.812354,0.79744,0.983005


TrainOutput(global_step=17594, training_loss=0.04469800686529987, metrics={'train_runtime': 24185.6688, 'train_samples_per_second': 2.91, 'train_steps_per_second': 0.727, 'total_flos': 6.525623837551385e+16, 'train_loss': 0.04469800686529987, 'epoch': 2.0})

In [21]:
# Upload the training artefacts to the Hugging Face Hub (per the output below:
# model weights, training args and event logs).
trainer.push_to_hub()

events.out.tfevents.1709805648.archlinux.2530787.0:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

events.out.tfevents.1709806152.archlinux.2534847.0:   0%|          | 0.00/6.17k [00:00<?, ?B/s]

events.out.tfevents.1709809548.archlinux.2534847.1:   0%|          | 0.00/5.95k [00:00<?, ?B/s]

events.out.tfevents.1709809990.archlinux.2534847.2:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Upload 10 LFS files:   0%|          | 0/10 [00:00<?, ?it/s]

events.out.tfevents.1709811263.archlinux.2569236.0:   0%|          | 0.00/6.00k [00:00<?, ?B/s]

events.out.tfevents.1709811778.archlinux.2572753.0:   0%|          | 0.00/48.5k [00:00<?, ?B/s]

events.out.tfevents.1709816408.archlinux.2602258.0:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

events.out.tfevents.1709816538.archlinux.2604731.0:   0%|          | 0.00/385k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/boapps/kmdb_ner_model/commit/cbe054a29f18fb1e28bfd420578fca077b86fb99', commit_message='End of training', commit_description='', oid='cbe054a29f18fb1e28bfd420578fca077b86fb99', pr_url=None, pr_revision=None, pr_num=None)