In [1]:
from datasets import load_dataset
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
import numpy as np
import evaluate

  from pandas.core import (
2024-04-07 15:03:56.053662: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-07 15:03:56.053718: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-07 15:03:56.053763: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-07 15:03:56.061005: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
dataset = load_dataset("boapps/kmdb_entities")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'people', 'institutions', 'places', 'text', 'ent_lemmas', 'ent_tokens', 'words'],
        num_rows: 46914
    })
})

In [4]:
label_list = ['O',
              'B-NEG-ORG', 'B-POS-ORG',
              'I-NEG-ORG', 'I-POS-ORG',
              'B-NEG-PER', 'B-POS-PER',
              'I-NEG-PER', 'I-POS-PER',
              'B-NEG-LOC', 'B-POS-LOC',
              'I-NEG-LOC', 'I-POS-LOC',
              ]

entcolumns = {'ORG': 'institutions', 'LOC': 'places', 'PER': 'people'}

In [5]:
row = dataset['train'][0]

In [6]:
def anyin(l1, l2):
    for e1 in l1:
        if e1 in l2:
            return e1
    return False

In [7]:
def getsame(e, row):
    same = set()
    for entity in row['ent_tokens']:
        if e in entity['lemma']:
            same.add(entity['lemma'])
    return list(same)

In [8]:
def getnertags(row):
    evalues = {'institutions': [], 'places': [], 'people': []}
    for e in entcolumns.values():
        evalues[e] = row[e].copy()
    for entity in row['ent_tokens']:
        if entity['tokens'][0]['ent_type'] in entcolumns:
            e1 = anyin(getsame(entity['lemma'], row), row[entcolumns[entity['tokens'][0]['ent_type']]])
            if e1:
                entity['status'] = 'POS'
                if e1 in evalues[entcolumns[entity['tokens'][0]['ent_type']]]:
                    evalues[entcolumns[entity['tokens'][0]['ent_type']]].remove(e1)
            else:
                entity['status'] = 'NEG'
    labelbyid = {}
    for entity in row['ent_tokens']:
        if entity['tokens'][0]['ent_type'] in entcolumns:
            for token in entity['tokens']:
                labelbyid[token['i']] = token['iob']+'-'+entity['status']+'-'+token['ent_type']
    nertags = []
    for i, word in enumerate(row['words']):
        if i in labelbyid:
            nertags.append(label_list.index(labelbyid[i]))
        else:
            nertags.append(0)
    for e in entcolumns.values():
        if len(evalues[e]) > 1:
            return {'nertags': None}
    return {'nertags': nertags}

In [9]:
dataset = dataset.map(getnertags)

In [10]:
dataset = dataset.filter(lambda r: r['nertags'] is not None)

In [11]:
example=dataset['train'][0]

In [12]:
tokenizer = AutoTokenizer.from_pretrained('SZTAKI-HLT/hubert-base-cc')

In [13]:
tokenized_input = tokenizer(example["words"], is_split_into_words=True, truncation=True, max_length=510, return_overflowing_tokens=True)

In [14]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"][0])
len(tokens)

510

In [15]:
from itertools import islice


def flatten_extend(matrix):
    flat_list = []
    for row in matrix:
        flat_list.extend(row)
    return flat_list


def divide_chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]


def tokenize_and_align_labels(examples):
    result = {'input_ids': [], 'attention_mask': [], 'token_type_ids': [], 'labels': []}

    tokenized_inputs = tokenizer(examples["words"], truncation=False, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"nertags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    for l in tokenized_inputs['input_ids']:
        result['input_ids'] += divide_chunks(l, 510)
    for l in tokenized_inputs['attention_mask']:
        result['attention_mask'] += divide_chunks(l, 510)
    for l in tokenized_inputs['token_type_ids']:
        result['token_type_ids'] += divide_chunks(l, 510)
    for l in tokenized_inputs['labels']:
        result['labels'] += divide_chunks(l, 510)

    return result

In [16]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=['id', 'people', 'institutions', 'places', 'text', 'ent_lemmas', 'ent_tokens', 'words', 'nertags'])

Map:   0%|          | 0/30100 [00:00<?, ? examples/s]

In [17]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, max_length=512)

In [18]:
labels = [label_list[i] for i in example[f"nertags"]]

seqeval = evaluate.load("seqeval")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [19]:
id2label = {i: l for i, l in enumerate(label_list)}
label2id = {l: i for i, l in enumerate(label_list)}

In [20]:
model = AutoModelForTokenClassification.from_pretrained(
    "SZTAKI-HLT/hubert-base-cc", num_labels=len(label_list), id2label=id2label, label2id=label2id
)

  return self.fget.__get__(instance, owner)()
Some weights of BertForTokenClassification were not initialized from the model checkpoint at SZTAKI-HLT/hubert-base-cc and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    task_type=TaskType. TOKEN_CLS, inference_mode=True, r=256, lora_alpha=256, lora_dropout=0.1, bias="none", use_rslora = True, target_modules=["query", "value"]
)

In [22]:
from peft import get_peft_model
#model = get_peft_model(model, lora_config)

In [23]:
len(tokenized_dataset['train'][14]['labels'])

510

In [24]:
train_testvalid = tokenized_dataset['train'].train_test_split()
test_valid = train_testvalid['test'].train_test_split()

In [None]:
training_args = TrainingArguments(
    output_dir="kmdb_ner_model",
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=4,
    weight_decay=0.01,
    save_strategy="epoch",
    #    push_to_hub=True,
    eval_steps=100,
    logging_steps=10,
    evaluation_strategy='steps',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_testvalid["train"],
    eval_dataset=test_valid["train"].select(range(100)),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Currently logged in as: [33mboapps[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,0.1602,0.182899,0.324324,0.46882,0.38341,0.939442
200,0.0987,0.094952,0.473512,0.543952,0.506294,0.96453
300,0.0798,0.079659,0.544437,0.621337,0.580351,0.970033
400,0.0621,0.072081,0.639711,0.665665,0.65243,0.970206
500,0.0755,0.075397,0.397626,0.503381,0.444297,0.96851
600,0.0652,0.07564,0.47171,0.557476,0.511019,0.966607
700,0.0469,0.075343,0.627365,0.647633,0.637338,0.970482
800,0.0611,0.064975,0.642282,0.659654,0.650852,0.971451
900,0.0683,0.066413,0.583685,0.623591,0.602979,0.971244
1000,0.0465,0.065446,0.640969,0.655898,0.648348,0.97114


  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
trainer.push_to_hub()

events.out.tfevents.1712435180.archlinux.7751.0:   0%|          | 0.00/965k [00:00<?, ?B/s]

events.out.tfevents.1712471459.archlinux.98970.0:   0%|          | 0.00/49.0k [00:00<?, ?B/s]

Upload 26 LFS files:   0%|          | 0/26 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

events.out.tfevents.1712470709.archlinux.96749.0:   0%|          | 0.00/29.2k [00:00<?, ?B/s]

events.out.tfevents.1712472429.archlinux.103553.0:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

events.out.tfevents.1712472502.archlinux.104021.0:   0%|          | 0.00/32.3k [00:00<?, ?B/s]

events.out.tfevents.1712473118.archlinux.106728.0:   0%|          | 0.00/75.2k [00:00<?, ?B/s]

events.out.tfevents.1712475098.archlinux.115178.0:   0%|          | 0.00/40.3k [00:00<?, ?B/s]

events.out.tfevents.1712475643.archlinux.117598.0:   0%|          | 0.00/7.18k [00:00<?, ?B/s]

events.out.tfevents.1712475776.archlinux.118270.0:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

events.out.tfevents.1712475793.archlinux.118475.0:   0%|          | 0.00/25.2k [00:00<?, ?B/s]

events.out.tfevents.1712476746.archlinux.118475.1:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

events.out.tfevents.1712477108.archlinux.124470.0:   0%|          | 0.00/74.4k [00:00<?, ?B/s]

events.out.tfevents.1712478064.archlinux.124470.1:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

events.out.tfevents.1712478481.archlinux.130518.0:   0%|          | 0.00/11.8k [00:00<?, ?B/s]

events.out.tfevents.1712478596.archlinux.131121.0:   0%|          | 0.00/242k [00:00<?, ?B/s]

events.out.tfevents.1712482596.archlinux.148007.0:   0%|          | 0.00/358k [00:00<?, ?B/s]

events.out.tfevents.1712488370.archlinux.172308.0:   0%|          | 0.00/94.1k [00:00<?, ?B/s]

events.out.tfevents.1712492906.archlinux.191287.0:   0%|          | 0.00/9.74k [00:00<?, ?B/s]

events.out.tfevents.1712493145.archlinux.192437.0:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

events.out.tfevents.1712493892.archlinux.195717.0:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

events.out.tfevents.1712494396.archlinux.195717.1:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

events.out.tfevents.1712494443.archlinux.198747.0:   0%|          | 0.00/7.64k [00:00<?, ?B/s]

events.out.tfevents.1712494555.archlinux.199122.0:   0%|          | 0.00/8.90k [00:00<?, ?B/s]

events.out.tfevents.1712495086.archlinux.201679.0:   0%|          | 0.00/1.15M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/boapps/kmdb_ner_model/commit/5a27a9e3862744fa444dd19071b4efde0127bdd7', commit_message='End of training', commit_description='', oid='5a27a9e3862744fa444dd19071b4efde0127bdd7', pr_url=None, pr_revision=None, pr_num=None)

### 