In [1]:
# install huggingface and datasets
!pip install -q datasets transformers
!pip install torch
!pip install pandas
!pip install pyarrow

[K     |████████████████████████████████| 346 kB 5.3 MB/s 
[K     |████████████████████████████████| 4.2 MB 65.8 MB/s 
[K     |████████████████████████████████| 86 kB 6.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 57.5 MB/s 
[K     |████████████████████████████████| 140 kB 62.5 MB/s 
[K     |████████████████████████████████| 212 kB 74.1 MB/s 
[K     |████████████████████████████████| 86 kB 6.8 MB/s 
[K     |████████████████████████████████| 596 kB 58.0 MB/s 
[K     |████████████████████████████████| 127 kB 75.8 MB/s 
[K     |████████████████████████████████| 6.6 MB 54.0 MB/s 
[K     |████████████████████████████████| 144 kB 71.4 MB/s 
[K     |████████████████████████████████| 271 kB 27.2 MB/s 
[K     |████████████████████████████████| 94 kB 3.6 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium

In [2]:
from datasets import load_dataset, ClassLabel
from transformers import BertForTokenClassification, BertTokenizer, TrainingArguments, Trainer
import numpy as np
import torch
from sklearn import metrics


import pandas as pd
import datasets
from datasets.features import ClassLabel

In [3]:
torch.cuda.is_available()
device = torch.device("cuda")

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
datapath = '/content/drive/MyDrive/Universitat/ML4NLP/v2.1/'

In [6]:
# There is no train split 
train_path = datapath + 'hipe2020/de/HIPE-2022-v2.1-hipe2020-train-de.tsv'
val_path = datapath + 'hipe2020/de/HIPE-2022-v2.1-hipe2020-dev-de.tsv'
test_path = datapath + 'hipe2020/de/HIPE-2022-v2.1-hipe2020-test_allmasked-de.tsv'

model = 'bert-base-german-cased'
shuffle = True

In [7]:
# import dataset from cloned git repo
def load_dataset(path):
    df = pd.read_csv(path, sep='\t', skip_blank_lines=False, engine='python', quoting=3)
    # error_bad_lines=False, 
    return df


tsv_train = load_dataset(train_path)
tsv_val = load_dataset(val_path)
tsv_test = load_dataset(test_path)

In [8]:
def simple_preprocess(dataframe):
    # Add end_of_document token in df
    dataframe = dataframe.dropna(subset=['TOKEN'])

    # Filter out metadata rows beginning with #
    dataframe = dataframe[~dataframe['TOKEN'].astype(str).str.startswith('#')]
    dataframe = dataframe[~dataframe['TOKEN'].astype(str).str.startswith('\t')]

    #transforming nan var from Float to string to use in (***)
    dataframe.MISC = dataframe.MISC.fillna('')

    return dataframe

In [9]:
tsv_train = simple_preprocess(tsv_train)
tsv_val = simple_preprocess(tsv_val)
tsv_test = simple_preprocess(tsv_test)

In [10]:
label_set = tsv_train['NE-COARSE-LIT'].unique()
# define the label mapping for NER
label_list = label_set.tolist()
label_list.append('_')
label_num = len(label_list)

labels = ClassLabel(num_classes=label_num, names=label_list)

In [11]:
def create_huggingface_file(dataframe):
    import pyarrow as pa
    import re
    from datasets import Dataset

    #creating dataset in json
    hug_out = []
    idx = 0 
    items = {'id': idx,'words':[ ], 'ner': [ ]}
    hug_out.append(items)
    for index, row in dataframe.iterrows():
        if  not re.search(r'EndOfSentence', row['MISC']):
            items['words'].append(row['TOKEN'])
            items['ner'].append(labels.str2int(row['NE-COARSE-LIT']))
        else:
            items['words'].append(row['TOKEN'])
            items['ner'].append(labels.str2int(row['NE-COARSE-LIT']))
            idx += 1
            items = {'id': idx,'words':[ ], 'ner': [ ]}
            hug_out.append(items)
    #filter hug_out out, delete items which has len(words) > 380
    #hug_out = filter(lambda x: len(x['words']) < 380, hug_out)
    #json to df
    hug_out = pd.DataFrame(hug_out)

    # delete all sentences that are too long
    #hug_out = hug_out[hug_out['words'].map(len) < 512] #why does not work? QA

    ### convert to Huggingface dataset
    hug_out = Dataset(pa.Table.from_pandas(hug_out))

    return hug_out

In [12]:
train = create_huggingface_file(tsv_train)
val = create_huggingface_file(tsv_val)
test = create_huggingface_file(tsv_test)

#look at training data
for i in range(10):
  print(train[i])

print(len(train))
print(len(val))
print(len(test))

{'id': 0, 'words': ['Frankreich', '.'], 'ner': [0, 1]}
{'id': 1, 'words': ['Gesetzgeber', '.'], 'ner': [1, 1]}
{'id': 2, 'words': ['Den', '19', '.', 'Niv', '.'], 'ner': [1, 1, 1, 1, 1]}
{'id': 3, 'words': ['(', '8', '.', 'Jän', '.', ')', 'ward', 'die', 'Staatskleidung', 'der', 'Sekretär', '-', 'Redacteurs', ',', 'der', 'Staatsbothen', 'und', 'Thorwächter', 'für', 'beyde', 'Räthe', 'bestimmt', '.'], 'ner': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'id': 4, 'words': ['Auf', 'Talot', "'", 's', 'Vorschlag', 'beschloß', 'man', 'über', 'den', 'constitutionellen', 'Umkreis', ',', 'welchen', 'das', 'gesetzgebende', 'Corps', 'in', 'Zukunft', 'inne', 'haben', 'soll', ',', 'fol', '¬', 'gendes', ':', 'Vom', 'Tage', 'an', ',', 'da', 'der', 'Rath', 'der', '500', 'in', 'sei', '¬', 'nen', 'neuen', 'Pallast', 'installirt', 'seyn', 'wird', ',', 'sind', 'die', 'äußerlichen', 'Bezirke', 'für', 'beyde', 'Räthe', 'folgendermas', '¬', 'sen', 'firir', ':', 'Rarh', 'der', 'Alten',

In [14]:
tokenizer = BertTokenizer.from_pretrained(model)

Downloading:   0%|          | 0.00/249k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

In [15]:
train = train.filter(lambda example, idx: idx != len(train)-1, with_indices=True)
val = val.filter(lambda example, idx: idx != len(val)-1, with_indices=True)
test = test.filter(lambda example, idx: idx != len(test)-1, with_indices=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [16]:
def tokenize_func(batch):
    tokenized = tokenizer(batch["words"], is_split_into_words=True, padding="max_length", max_length=100, truncation=True)
    tokenized_words = [
        [
            tokenizer(word, add_special_tokens=False)["input_ids"]
            for word in sent
        ]
        for sent in batch["words"]
    ]
    tokenized_ner = []

    for sent_words, sent_nes in zip(tokenized_words, batch["ner"]):
        tokenized_ner.append(
            [labels.str2int("O")]  # BOS symbol
            + [
                ne
                for subwords, ne in zip(sent_words, sent_nes)
                for _ in range(len(subwords))
            ]
            + [labels.str2int("O")]  # EOS symbol
        )

    # Padding with "O"
    tokenized["labels"] = [(ner + [labels.str2int("O")] * (100 - len(ner)))[:100] for ner in tokenized_ner]
    tokenized["subwords"] = tokenized_words
    return tokenized

def tokenize_nolabel_func(batch):
    tokenized = tokenizer(batch["words"], is_split_into_words=True, padding="max_length", max_length=100, truncation=True)
    tokenized_words = [
        [
            tokenizer(word, add_special_tokens=False)["input_ids"]
            for word in sent
        ]
        for sent in batch["words"]
    ]

    tokenized["subwords"] = tokenized_words
    return tokenized

#tiny_train_tokenized = tiny_train.map(tokenize_func, batched=True, batch_size=50)
#tiny_test_tokenized = tiny_test.map(tokenize_nolabel_func, batched=True, batch_size=50)

train_tokenized = train.map(tokenize_func, batched=True, batch_size=50)
val_tokenized = val.map(tokenize_func, batched=True, batch_size=50)
test_tokenized = test.map(tokenize_nolabel_func, batched=True, batch_size=50)


  0%|          | 0/70 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

In [17]:
training_args = TrainingArguments(
    output_dir = "/content/drive/MyDrive/Universitat/ML4NLP/bert_europeana_en"
)

In [18]:
if shuffle:
  train_tokenized.shuffle()

In [19]:
model = BertForTokenClassification.from_pretrained(model, num_labels=labels.num_classes)
# trainer = Trainer(model=model, args=training_args, train_dataset=train_tokenized)
trainer = Trainer(model=model, train_dataset=train_tokenized)
trainer.train()

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-b

Step,Training Loss
500,0.0699
1000,0.0178


Saving model checkpoint to tmp_trainer/checkpoint-500
Configuration saved in tmp_trainer/checkpoint-500/config.json
Model weights saved in tmp_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to tmp_trainer/checkpoint-1000
Configuration saved in tmp_trainer/checkpoint-1000/config.json
Model weights saved in tmp_trainer/checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1302, training_loss=0.035532787465096985, metrics={'train_runtime': 237.0401, 'train_samples_per_second': 43.904, 'train_steps_per_second': 5.493, 'total_flos': 531164296677600.0, 'train_loss': 0.035532787465096985, 'epoch': 3.0})

In [20]:
predictions = trainer.predict(test_tokenized)

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner, words, subwords. If id, ner, words, subwords are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1214
  Batch size = 8


In [21]:
outfilepath = f"/content/drive/MyDrive/e_ML4NLP2/Outputs/hipe2020_de/bert_base_cased{'_shuffled' if shuffle else ''}.tsv"
outfilepath

'/content/drive/MyDrive/e_ML4NLP2/Outputs/hipe2020_de/bert_base_cased_shuffled.tsv'

In [22]:
outfile = open(outfilepath, "w")
outfile.write("TOKEN\tNE-COARSE-LIT\tNE-COARSE-METO\tNE-FINE-LIT\tNE-FINE-METO\tNE-FINE-COMP\tNE-NESTED\tNEL-LIT\tNEL-METO\tMISC")

for pred, tinput in zip(predictions.predictions, test_tokenized):
    pred_word_labels = []

    try:
        sent_len = tinput["input_ids"].index(0)
    except ValueError:
        sent_len = len(tinput["input_ids"])
    # Start at 1 to skip pred for cls token
    i = 1

    for word in tinput['subwords']:
        
        word_score = np.zeros_like(pred[0])
        eos_reached = False
        try:
            for subword in word:
                word_score += pred[i]
                i += 1
            label = labels.int2str(int(np.argmax(word_score)))
            pred_word_labels.append(label)
        except IndexError:
            pred_word_labels.append("O")

    for token, label in zip(tinput['words'], pred_word_labels):
        outfile.write("\n" + f"{token}\t{label}\t" + ('\t'.join(['_'] * 8)))

outfile.close()

In [None]:
accuracy = datasets.load_metric("accuracy")
f1_metric = datasets.load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    refs = labels.flatten()
    predictions = np.argmax(logits, axis=-1)
    print(logits.shape)
    print(predictions.shape)
    pred = predictions.flatten()
    return {
        "accuracy": accuracy.compute(predictions=pred, references=refs)["accuracy"],
        "f1_micro": f1_metric.compute(predictions=pred, references=refs, average="micro")["f1"],
        "f1_macro": f1_metric.compute(predictions=pred, references=refs, average="macro")["f1"],
    }

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

In [None]:
Trainer(
    model=model,
    eval_dataset=val_tokenized,
    compute_metrics=compute_metrics,
).evaluate()

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: subwords, words, ner, id. If subwords, words, ner, id are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1202
  Batch size = 8


(1202, 100, 12)
(1202, 100)


{'eval_accuracy': 0.9908319467554076,
 'eval_f1_macro': 0.759910415468632,
 'eval_f1_micro': 0.9908319467554076,
 'eval_loss': 0.03373601287603378,
 'eval_runtime': 24.7053,
 'eval_samples_per_second': 48.654,
 'eval_steps_per_second': 6.112}