In [None]:
!pip install datasets
!pip install transformers==4.28.0
!pip install evaluate
!pip install seqeval

## NER-Finetuning mit validen Berufsbezeichnungen
Mithilfe von Sätzen, in denen valide Berufsbezeichnungen enthalten sind, wird ein Modell zur Berufserkennung trainiert.
Testweise werden dabei nur bestimmte Berufsbezeichnungen ausgewählt, um zu überprüfen, ob andere Berufe weiterhin identifiziert werden.

In [1]:
from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict

from evaluate import load
from ast import literal_eval
from sklearn import metrics as sk_metrics

import numpy as np
import pandas as pd
import re
from collections import Counter

import torch

from IPython.display import display, Markdown

In [2]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# Load split data

In [3]:
# Integer class id -> tag name, and the inverse lookup derived from it.
id2label = {0: 'O', 1: 'B-PROF', 2: 'I-PROF'}
label2id = {tag: idx for idx, tag in id2label.items()}

# Tab-separated file whose first column becomes the index. The two list columns
# arrive as strings, so they are parsed with literal_eval; the tag names in
# `annotations` are then encoded as class ids.
train_test_df = (
    pd.read_csv('../data/trainval.csv', sep='\t', encoding='utf-8', index_col=0)
    .assign(
        tokens=lambda frame: frame['tokens'].apply(literal_eval),
        annotations=lambda frame: frame['annotations']
        .apply(literal_eval)
        .apply(lambda tags: [label2id[tag] for tag in tags]),
    )
)

print(len(train_test_df))
train_test_df.head()

654


Unnamed: 0_level_0,tokens,annotations
id,Unnamed: 1_level_1,Unnamed: 2_level_1
406,"[Für, die, Bauern, ,, die, Bäuerinnen, und, di...","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
617,"[Den, Bauern, ist, da, nur, begrenzt, ein, Vor...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
689,"[Warum, kamen, Verkäufer, ,, Gutachter, und, K...","[0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
127,"[Das, fängt, bei, der, Bundeskanzlerin, an, ,,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
376,"[Anders, als, die, im, Handel, oder, im, Gewer...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


In [4]:
# Name of the pretrained checkpoint; it is reused further down when the
# token-classification model is loaded.
model_checkpoint = 'bert-base-german-cased'
# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)enizer_config.json";:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)"config.json";:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading (…)"vocab.txt";:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading (…)"tokenizer.json";:   0%|          | 0.00/485k [00:00<?, ?B/s]

In [5]:
# Shuffle the rows before building the Dataset. The fixed random_state keeps the
# row order identical between runs, so the seeded train/test split performed
# later in the notebook yields the same partition every time.
raw_dataset = Dataset.from_pandas(
    train_test_df[['tokens', 'annotations']].sample(frac=1.0, random_state=0)
)
print(raw_dataset)

Dataset({
    features: ['tokens', 'annotations', 'id'],
    num_rows: 654
})


In [6]:
# The tokenizer already loaded for `model_checkpoint` in the earlier cell is reused;
# no second load is needed here.
# When True, every sub-token of a word receives a class label; when False, only the
# first sub-token is labelled and the remaining ones are set to -100.
label_all_tokens = True

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align the word-level labels to sub-tokens.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per sentence)
            and ``"annotations"`` (one list of integer label ids per sentence,
            indexed by word position).

    Returns:
        The tokenizer output for the batch, extended by a ``"labels"`` entry that
        holds one label id per sub-token:

        * special tokens (word id ``None``) get ``-100`` so that they are ignored
          in the loss function;
        * the first sub-token of a word gets the word's label;
        * further sub-tokens of the same word get ``-100`` when the module-level
          ``label_all_tokens`` flag is false; otherwise they get the word's label,
          with B-PROF replaced by I-PROF.

    Relies on the module-level ``tokenizer`` and ``label_all_tokens``.
    """
    # In the IOB2 scheme a B- tag opens a new entity, so a continuation piece of a
    # B-PROF word has to be tagged I-PROF; repeating B-PROF would split a single
    # profession into several entities. Ids as in label2id: 1 = B-PROF, 2 = I-PROF.
    begin_to_inside = {1: 2}

    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, word_labels in enumerate(examples["annotations"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special token: excluded from the loss.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a word keeps the word's own label.
                label_ids.append(word_labels[word_idx])
            elif label_all_tokens:
                # Continuation sub-token: same label, but never a second "begin".
                word_label = word_labels[word_idx]
                label_ids.append(begin_to_inside.get(word_label, word_label))
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Run the tokenize-and-align step over the whole dataset, one batch at a time.
tokenized_datasets = raw_dataset.map(tokenize_and_align_labels, batched=True)
print(tokenized_datasets)

In [None]:
# Tag names in id order: position i holds the name of label id i.
label_names = ["O", "B-PROF", "I-PROF"]

# seqeval metric, loaded through `evaluate`.
metric = load("seqeval")

# Collator built from the tokenizer; handed to the Trainer as `data_collator`.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

def compute_metrics(p):
    """Turn raw model scores into the overall scores reported by ``metric``.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token class
            scores (argmax is taken over axis 2); ``labels`` holds the reference
            label ids, where ``-100`` marks positions to skip.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from
        the ``overall_*`` entries of the metric result.

    Relies on the module-level ``metric`` and ``label_names``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 are left out of the evaluation.
            if gold_id == -100:
                continue
            kept_predictions.append(label_names[predicted_id])
            kept_labels.append(label_names[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [None]:
# Token-classification model initialised from the pretrained checkpoint, with one
# output class per entry of id2label.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    num_labels=len(id2label),
)

In [None]:
# Drop the raw columns so that only the tokenizer output and the labels remain.
# Only columns that are still present are removed: the variable is rebound to its
# own result, so an unconditional remove_columns fails when this cell is run again.
columns_to_drop = [
    name for name in ['tokens', 'annotations', 'id']
    if name in tokenized_datasets.column_names
]
if columns_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)

# Seeded split into a training and a held-out part.
train_test = tokenized_datasets.train_test_split(seed=0)
train_test_dataset = DatasetDict({
    'train': train_test['train'],
    'test': train_test['test']})
print("Gesplittetes Dataset (train/test):")
print(train_test_dataset)

In [None]:
!pip install --upgrade accelerate

In [None]:
training_args = TrainingArguments(
    output_dir="german_tc_professions_debates",
    # Optimisation settings.
    optim="adamw_torch",
    learning_rate=5e-05,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and checkpointing both happen once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Hub upload is enabled for this run.
    push_to_hub=True,
)

# Trainer wired to the split dataset, the collator and the metric function.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_test_dataset["train"],
    eval_dataset=train_test_dataset["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Push the Trainer's output to the Hub.
trainer.push_to_hub()