In [None]:
# Installs transformers from the GitHub repository (the logged output below shows version 4.0.0.dev0).
# NOTE(review): no version or commit is pinned, so a fresh run may install a different version — consider pinning.
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-dwpv002_
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-dwpv002_
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (PEP 517) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.0.0.dev0-cp36-none-any.whl size=1345022 sha256=ed86f882d440a8107c37c238ef7c537a4b60f8d4675bb0a0851b2ce581bf96e1
  Stored in directory: /tmp/pip-ephem-wheel-cache-opt_9zr1/wheels/70/d3/52/b3fa4f8b8ef04167ac62e5bb2accb62ae764db2a378247490e
Successfully built transformers


In [None]:
!pip install datasets sklearn



In [None]:
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments, AutoConfig
import numpy as np
import pandas as pd

# Getting to know the Dataset

In [None]:
import datasets

# Load the first 8000 examples of the 'de' configuration of polyglot_ner (train split).
dataset = datasets.load_dataset(
    path='polyglot_ner',
    name='de',
    split='train[:8000]',
)

Reusing dataset polyglot_ner (/root/.cache/huggingface/datasets/polyglot_ner/de/1.0.0/c929318589a30ee3f0dc5d53f1f99bf25a7ec16d3f319b3b671765c5ea464c99)


checking the lengths

In [None]:
# Sanity check: the 'train[:8000]' slice should contain 8000 examples.
print(len(dataset))

8000


example

In [None]:
# Show one raw example: parallel lists of words ('words') and per-word tags ('ner').
dataset[456]

{'id': '456',
 'lang': 'de',
 'ner': ['O', 'PER', 'PER', 'O', 'O', 'O', 'O', 'O', 'O'],
 'words': ['Mitautor',
  'Ben',
  'Bernie',
  'spielte',
  'den',
  'Song',
  'am',
  '19',
  '.']}

loading the BERT tokenizer

In [None]:
# Tokenizer for the same checkpoint ('bert-base-german-cased') that the model is loaded from later.
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')

# Encoding the Dataset

> first the words...



In [None]:
# Tokenize every sentence (given as a list of words), padding/truncating each one to 512 positions.
encoded_dataset = [
    tokenizer(
        example['words'],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors="pt",
    )
    for example in dataset
]

In [None]:
from sklearn.preprocessing import LabelEncoder


> ...then the labels



In [None]:
# Encoder that maps the string tags to integer ids; it is fitted in a later cell.
le = LabelEncoder()

As can be seen below, the labels in this dataset are


*   ``LOC, O, ORG, PER``

I decided to keep those labels as they are, because I think it makes sense that the classifier learns to classify those fine-grained labels, even though they are not in the classical IOB format as explained in the lecture and tutorial.






In [None]:
# Setting the labels manually because there are so few of them; I previously extracted them from the dataset.
# I added an <UNK> label in case there is another label in the test set. Additionally, I added a <PAD> label
# because I want to exclude it in the end for the eval.
# NOTE: the order of this list is not the id order — the fitted encoder prints its classes as
# ['<PAD>' '<UNK>' 'LOC' 'O' 'ORG' 'PER'], so <PAD> ends up with id 0.
labels_correct = ['<UNK>', '<PAD>', 'LOC', 'O', 'ORG', 'PER']

In [None]:
# Fit the encoder on the fixed label inventory, then turn each sentence's tags into integer ids.
# Any tag the encoder does not know is replaced by '<UNK>' before encoding.
le.fit(labels_correct)

y_encoded = [
    le.transform([tag if tag in le.classes_ else '<UNK>' for tag in sentence_tags])
    for sentence_tags in dataset['ner']
]

In [None]:
# The position of a label in this array is the integer id the encoder assigns to it.
print(le.classes_)

['<PAD>' '<UNK>' 'LOC' 'O' 'ORG' 'PER']


> checking the encoded labels

In [None]:
# Number of encoded sentences, followed by the integer tags of one example sentence.
print(len(y_encoded))
print(y_encoded[2]) # note that the 3 refers to 'O' and 4 to 'ORG' given the order of the labels above

8000
[3 3 3 3 3 3 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]




> Zipping the words and the labels together again \\
> Padding the labels to the same length as the words



In [None]:
import torch

MAX_LENGTH = 512  # must equal the max_length the tokenizer padded/truncated to
# Look the padding id up instead of hard-coding 0, so it always follows the encoder's class order.
pad_label_id = int(le.transform(['<PAD>'])[0])

# NOTE(review): the labels are per word, while the tokenizer output also contains special tokens
# and possibly several word pieces per word, so label i may not line up with token i — TODO confirm.
for enc_item, word_labels in zip(encoded_dataset, y_encoded):
    # Truncate exactly like the tokenizer does; otherwise a sentence with more than MAX_LENGTH
    # words would produce a label tensor that is longer than its input_ids.
    word_labels = np.asarray(word_labels[:MAX_LENGTH], dtype=np.int64)

    # Fill a fixed-size buffer in one step instead of growing the array one element at a time.
    padded = np.full(MAX_LENGTH, pad_label_id, dtype=np.int64)
    padded[:word_labels.size] = word_labels

    # Keep the leading axis of size 1, i.e. the same (1, MAX_LENGTH) shape as before.
    enc_item['labels'] = torch.from_numpy(padded).unsqueeze(0)

> Shuffling the dataset

In [None]:
import random

# Shuffle in place with a fixed seed, so that the train/test slices taken from this list
# contain the same sentences on every Restart & Run All.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(encoded_dataset)

# Getting the model and the dataset ready

In [None]:
# Token-classification head on top of the pretrained German BERT checkpoint.
# The number of output classes is taken from the fitted label encoder rather than hard-coded,
# so it cannot drift out of sync with the label list.
model = BertForTokenClassification.from_pretrained(
    'bert-base-german-cased',
    num_labels=len(le.classes_),
)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-b

### Optional: Freeze the embeddings


> The following code block was only executed for those models that needed to be trained with frozen embeddings



In [None]:
# Disable gradients for every parameter of model.base_model (all of it, not only an embedding layer).
# NOTE: on a top-to-bottom run this cell executes before every training cell further down.
for base_param in model.base_model.parameters():
    base_param.requires_grad_(False)

### Preparing the dataset

> This next cell squeezes the tensors in the dataset such that they are basically just one list with the numbers in it. This can be seen a few cells below. 

In [None]:
# Remove all size-1 dimensions from every tensor of every example.
for example in encoded_dataset:
    for field in list(example.keys()):
        example[field] = example[field].squeeze()

# Fixed slices of the shuffled list: 1000 and 5000 training examples, 2000 test examples.
train_set_small, train_set_big, test_set = (
    encoded_dataset[:1000],
    encoded_dataset[1000:6000],
    encoded_dataset[6000:8000],
)

> Checking the dimensions

In [None]:
# Every field of an example should now be a flat tensor of the same length.
for field_name, tensor in test_set[3].items():
    print(f'key: {field_name}, dimensions: {tensor.size()}')

key: input_ids, dimensions: torch.Size([512])
key: token_type_ids, dimensions: torch.Size([512])
key: attention_mask, dimensions: torch.Size([512])
key: labels, dimensions: torch.Size([512])


In [None]:
# Sizes of the three splits, printed in the order: big training set, small training set, test set.
for split in (train_set_big, train_set_small, test_set):
    print(len(split))

5000
1000
2000


In [None]:
# Checking that everything is correct: display one fully prepared training example
# (token ids, masks and labels).
train_set_big[0]

{'input_ids': tensor([    3,  1718,  1195,    21,   417,    21,   255,    81,   813,  1427,
          765, 17376,    65,  3554, 24370, 23324,  3698, 26901,   523,   140,
          144,  2572, 26897,    91, 15736,  7508, 26902, 26914,     4,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

# Model with 5000 sentences


> I decided to use a rather small number of epochs and a small batch size. Using a small batch size was recommended in the tutorial. I actually tried a batch size of 16, but it did not fit into memory. \\
In previous exercises I noticed that more epochs mostly did not improve the model, so I decided to keep the number low this time. I tried once with a few more epochs, but the model did not significantly improve. Also, I think that it might eventually overfit when choosing too many epochs.
> 





In [None]:
# Experiment: fine-tune all weights on the 5000-sentence training set.
# Build a fresh model for this experiment instead of reusing the global `model`: that object may
# already have been frozen or fine-tuned by whichever cells ran before, which would make the
# result depend on the execution order.
model = BertForTokenClassification.from_pretrained(
    'bert-base-german-cased',
    num_labels=len(le.classes_),
)

training_args = TrainingArguments(
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='results',
    logging_dir='logs',
    no_cuda=False,  # defaults to false anyway, just to be explicit
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_set_big
)

In [None]:
# Run the training loop configured above; the logged training loss is shown in the output.
trainer.train()

Step,Training Loss
500,0.020114
1000,0.009731
1500,0.007949
2000,0.005894
2500,0.004481
3000,0.003188


TrainOutput(global_step=3125, training_loss=0.008339030199050903)

In [None]:
# Predict on the held-out test set; the result holds the raw scores, the gold label ids and metrics.
preds = trainer.predict(test_set)

### I will quickly explain the following print statement's output (because this helped me a lot to understand the model's working):

1.   This shows the model's predictions for the first two sentences in the test set: for each token, it outputs a list that contains the prediction score for each class. \\

2.   Calling argmax gives the index of the highest value in each list of the first output. E.g. the first three lists (i.e. the first three tokens of the first sentence) have their highest value at index 3 (all with a value around 9), 
which is reflected in the first list of the second print statement: its first three elements are 3. Note that because argmax is taken over the last axis (-1), each prediction becomes a 1D list (i.e. each sentence is a list with one label per token). \\

3. These are simply the true labels for each sentence \\

4. This is the loss on the test set



In [None]:
# Inspect the first two test sentences: raw class scores, predicted ids, gold ids, then the metrics.
sample_scores = preds.predictions[:2]
print(sample_scores)
print(sample_scores.argmax(-1))
print(preds.label_ids[:2])
print(preds.metrics)

[[[-1.7349178  -2.5869465  -2.5082388   9.576312   -2.16313
   -2.4085698 ]
  [-1.6795355  -2.3828106  -2.6598024   9.288394   -1.8000873
   -2.6051702 ]
  [-1.6196618  -2.1783748  -2.808496    9.147063   -1.9905958
   -2.7224247 ]
  ...
  [13.005723   -1.9613012  -2.6587849  -2.4666796  -2.4906647
   -2.0427177 ]
  [12.991143   -1.9389175  -2.673071   -2.508739   -2.4844792
   -2.1005576 ]
  [13.010515   -1.9479333  -2.6457367  -2.5132499  -2.4810648
   -2.0734043 ]]

 [[-0.6155376  -2.8471537  -3.0123286   9.659774   -2.2078788
   -2.3735435 ]
  [-0.90619814 -3.0712223  -2.9500248   9.447913   -2.3497012
   -1.8572366 ]
  [-0.7095085  -2.6548617  -3.323987    9.27624    -2.2581627
   -2.3166654 ]
  ...
  [13.002865   -1.8609704  -2.6197937  -2.6060476  -2.4698765
   -2.0456367 ]
  [12.975988   -1.838455   -2.6327024  -2.6447833  -2.4639058
   -2.0878456 ]
  [12.99367    -1.8370837  -2.6119642  -2.645088   -2.4545825
   -2.0522325 ]]]
[[3 3 3 ... 0 0 0]
 [3 3 3 ... 0 0 0]]
[[3 3 3 ...

### Calculation of f1-score


> In the next two cells I calculate the f1-micro and the f1-macro score. For each true-label - prediction pair, I excluded the padding labels at the end as those are not relevant for the evaluation. I then concatenate all the lists and calculate the score over the entire list of predictions. 




In [None]:
from sklearn.metrics import f1_score

# Score only real tokens. A boolean mask keeps every gold label paired with the prediction at the
# same position, whereas cutting the predictions to a prefix silently misaligns the pairs as soon
# as a padding id appears anywhere but at the tail.
pad_label_id = le.transform(['<PAD>'])[0]
is_real_token = preds.label_ids != pad_label_id

# Both selections flatten in the same sentence-by-sentence order.
all_y_true = preds.label_ids[is_real_token]
all_y_pred = preds.predictions.argmax(-1)[is_real_token]

f1_score(all_y_true, all_y_pred, average='micro')

0.931673743512891

In [None]:
# Average only over the classes that occur in the gold labels. Otherwise a class that is merely
# predicted (e.g. the padding id on a real token) enters the macro average with an F1 of 0.
f1_score(all_y_true, all_y_pred, average='macro', labels=np.unique(all_y_true))

0.49008843221416837

# Model with 1000 sentences


> This model and the following ones work exactly like the one above, so I won't comment on everything again



In [None]:
# Experiment: fine-tune all weights on the 1000-sentence training set.
# Start again from the pretrained checkpoint; reusing the global `model` would continue from
# whatever state the previously executed cells left it in.
model = BertForTokenClassification.from_pretrained(
    'bert-base-german-cased',
    num_labels=len(le.classes_),
)

training_args = TrainingArguments(
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='results',
    logging_dir='logs',
    no_cuda=False,  # defaults to false anyway, just to be explicit
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_set_small
)

In [None]:
# Train on the 1000-sentence set configured in the previous cell.
trainer.train()

Step,Training Loss
500,0.018792


TrainOutput(global_step=625, training_loss=0.016381402587890625)

In [None]:
# Predict on the same test set as before; this overwrites the previous `preds`.
preds = trainer.predict(test_set)

In [None]:
# First two test sentences: raw class scores, predicted ids, gold ids, then the metrics.
sample_scores = preds.predictions[:2]
print(sample_scores)
print(sample_scores.argmax(-1))
print(preds.label_ids[:2])
print(preds.metrics)

[[[-2.9156985  -3.836385   -0.5930642   5.9930077  -1.6499581
    1.2419959 ]
  [-2.769393   -3.5535655  -1.5530106   5.5897694  -1.6562132
    2.0578024 ]
  [-2.2806468  -3.4086447  -1.4520355   6.4426684  -1.4975652
    1.234459  ]
  ...
  [11.272462   -1.944296   -2.6601825  -2.4194908  -2.7369988
   -2.3305857 ]
  [11.270475   -2.0024924  -2.679599   -2.465958   -2.7375848
   -2.2849898 ]
  [11.244193   -2.025214   -2.7094889  -2.3955238  -2.7369878
   -2.329712  ]]

 [[-2.0221992  -3.8438432  -0.83458495  7.592824   -1.8784277
   -0.68975884]
  [-1.2488247  -3.4709244  -0.66187155  7.548788   -1.2896341
   -1.2947161 ]
  [-0.9994811  -3.41322    -0.5937271   7.5105724  -1.4148058
   -1.4303062 ]
  ...
  [11.279121   -1.9371833  -2.6390607  -2.4033275  -2.7177348
   -2.3511748 ]
  [11.277723   -1.9979805  -2.6647825  -2.4496853  -2.721288
   -2.300159  ]
  [11.259235   -2.0222461  -2.6967072  -2.3749454  -2.7070818
   -2.3590255 ]]]
[[3 3 3 ... 0 0 0]
 [3 3 3 ... 0 0 0]]
[[3 3 3 ..

In [None]:
from sklearn.metrics import f1_score

# Score only real tokens. The boolean mask keeps gold labels and predictions aligned position
# by position; prefix slicing would misalign them if a padding id appeared before the tail.
pad_label_id = le.transform(['<PAD>'])[0]
is_real_token = preds.label_ids != pad_label_id

# Both selections flatten in the same sentence-by-sentence order.
all_y_true = preds.label_ids[is_real_token]
all_y_pred = preds.predictions.argmax(-1)[is_real_token]

f1_score(all_y_true, all_y_pred, average='micro')

0.9158834844737566

In [None]:
# Average only over the classes that occur in the gold labels, so that classes which are merely
# predicted (e.g. the padding id on a real token) do not enter the macro average with an F1 of 0.
f1_score(all_y_true, all_y_pred, average='macro', labels=np.unique(all_y_true))

0.347656092377417

# Model with 1000 sentences frozen embeddings

In [None]:
# Experiment: 1000-sentence training set with a frozen base model.
# Create and freeze the model here, so this experiment does not depend on which cells were
# executed by hand beforehand.
model = BertForTokenClassification.from_pretrained(
    'bert-base-german-cased',
    num_labels=len(le.classes_),
)
# Disable gradients for every parameter of the base model; only the remaining parameters are trained.
for param in model.base_model.parameters():
    param.requires_grad = False

training_args = TrainingArguments(
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='results',
    logging_dir='logs',
    no_cuda=False,  # defaults to false anyway, just to be explicit
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_set_small
)

In [None]:
# Train the frozen-base-model variant on the 1000-sentence set.
trainer.train()

Step,Training Loss
500,0.25759


TrainOutput(global_step=625, training_loss=0.219908056640625)

In [None]:
# Predict on the same test set as before; this overwrites the previous `preds`.
preds = trainer.predict(test_set)

In [None]:
# First two test sentences: raw class scores, predicted ids, gold ids, then the metrics.
sample_scores = preds.predictions[:2]
print(sample_scores)
print(sample_scores.argmax(-1))
print(preds.label_ids[:2])
print(preds.metrics)

[[[ 3.0805361  -4.4334545  -3.6101274  -0.6713129  -4.4284472
   -4.4712625 ]
  [ 1.4856422  -2.8480468  -2.8110518   1.1655972  -3.157579
   -2.9233978 ]
  [ 0.74164206 -1.7661538  -1.2426201   1.7834777  -1.3608127
   -1.3447394 ]
  ...
  [ 3.6326692  -4.2259693  -3.6661148  -1.7152568  -4.8849883
   -4.721519  ]
  [ 3.520103   -4.4289217  -3.5748076  -1.5725886  -4.659845
   -4.9389744 ]
  [ 3.5478072  -4.293312   -3.6680615  -1.5539088  -4.579565
   -4.7203746 ]]

 [[ 3.0844004  -4.432086   -3.6001828  -0.6720069  -4.4204917
   -4.4626284 ]
  [ 2.0117276  -3.1768694  -2.4977648   0.57732165 -3.83344
   -3.5368087 ]
  [ 1.2618809  -2.6064522  -1.9052707   0.93889    -2.9567604
   -2.7056599 ]
  ...
  [ 3.6280358  -4.223399   -3.6556547  -1.7111757  -4.877194
   -4.7172346 ]
  [ 3.5156293  -4.420115   -3.557813   -1.5602709  -4.6515093
   -4.9252996 ]
  [ 3.542303   -4.290576   -3.6565883  -1.5457845  -4.5712023
   -4.7143784 ]]]
[[0 0 3 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[3 3 3 ... 0 0

In [None]:
from sklearn.metrics import f1_score

# Score only real tokens. The boolean mask keeps gold labels and predictions aligned position
# by position; prefix slicing would misalign them if a padding id appeared before the tail.
pad_label_id = le.transform(['<PAD>'])[0]
is_real_token = preds.label_ids != pad_label_id

# Both selections flatten in the same sentence-by-sentence order.
all_y_true = preds.label_ids[is_real_token]
all_y_pred = preds.predictions.argmax(-1)[is_real_token]

f1_score(all_y_true, all_y_pred, average='micro')

0.511267217630854

In [None]:
# Average only over the classes that occur in the gold labels, so that classes which are merely
# predicted (e.g. the padding id on a real token) do not enter the macro average with an F1 of 0.
f1_score(all_y_true, all_y_pred, average='macro', labels=np.unique(all_y_true))

0.138321936313328

# Model with 5000 sentences frozen embeddings

In [None]:
# Experiment: 5000-sentence training set with a frozen base model.
# Create and freeze the model here, so this experiment does not depend on which cells were
# executed by hand beforehand.
model = BertForTokenClassification.from_pretrained(
    'bert-base-german-cased',
    num_labels=len(le.classes_),
)
# Disable gradients for every parameter of the base model; only the remaining parameters are trained.
for param in model.base_model.parameters():
    param.requires_grad = False

training_args = TrainingArguments(
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='results',
    logging_dir='logs',
    no_cuda=False,  # defaults to false anyway, just to be explicit
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_set_big
)

In [None]:
# Train the frozen-base-model variant on the 5000-sentence set.
trainer.train()

Step,Training Loss
500,0.196692
1000,0.0494
1500,0.043822
2000,0.040632
2500,0.039951
3000,0.038977


TrainOutput(global_step=3125, training_loss=0.06701560241699218)

In [None]:
# Predict on the same test set as before; this overwrites the previous `preds`.
preds = trainer.predict(test_set)

In [None]:
# First two test sentences: raw class scores, predicted ids, gold ids, then the metrics.
sample_scores = preds.predictions[:2]
print(sample_scores)
print(sample_scores.argmax(-1))
print(preds.label_ids[:2])
print(preds.metrics)

[[[ 2.220413   -6.1790056  -3.818466    1.7472477  -4.5413694
   -4.0529046 ]
  [ 2.4006584  -5.785126   -3.1284258   1.7519944  -3.72584
   -3.0948648 ]
  [ 0.9290033  -5.4676     -1.9664247   3.5016837  -2.6962779
   -2.6081924 ]
  ...
  [ 6.139125   -6.680579   -4.4689126  -3.0176024  -4.8494444
   -4.753057  ]
  [ 5.8732257  -6.5412946  -4.514598   -2.6618178  -4.767673
   -4.825128  ]
  [ 5.522051   -6.419426   -4.4964004  -2.6009846  -4.713314
   -4.7750864 ]]

 [[ 2.2352834  -6.221768   -3.8681667   1.8091202  -4.66357
   -4.00101   ]
  [ 0.41250107 -3.079933   -1.3278934   2.7463598  -1.3408433
   -2.3782635 ]
  [-0.35789034 -3.191218   -1.3136376   3.300728   -1.5793434
   -2.3327353 ]
  ...
  [ 6.220709   -6.7253857  -4.474293   -3.0850656  -4.9264107
   -4.7803063 ]
  [ 5.969802   -6.597586   -4.5245657  -2.737121   -4.8510923
   -4.838787  ]
  [ 5.5878115  -6.4642253  -4.509656   -2.6166134  -4.795164
   -4.7647114 ]]]
[[0 0 3 ... 0 0 0]
 [0 3 3 ... 0 0 0]]
[[3 3 2 ... 0 0 

In [None]:
from sklearn.metrics import f1_score

# Score only real tokens. The boolean mask keeps gold labels and predictions aligned position
# by position; prefix slicing would misalign them if a padding id appeared before the tail.
pad_label_id = le.transform(['<PAD>'])[0]
is_real_token = preds.label_ids != pad_label_id

# Both selections flatten in the same sentence-by-sentence order.
all_y_true = preds.label_ids[is_real_token]
all_y_pred = preds.predictions.argmax(-1)[is_real_token]

f1_score(all_y_true, all_y_pred, average='micro')

0.8146182999458581

In [None]:
# Average only over the classes that occur in the gold labels, so that classes which are merely
# predicted (e.g. the padding id on a real token) do not enter the macro average with an F1 of 0.
f1_score(all_y_true, all_y_pred, average='macro', labels=np.unique(all_y_true))

0.18042119463389045