# Configs

In [1]:
MAX_TOKEN_LEN = 128
LEARNING_RATE = 2e-5 
NUM_EPOCHS = 8
BATCH_SIZE = 32
MODEL_NAME = f"models/kategoribert-lr-{LEARNING_RATE}-maxtoken-{MAX_TOKEN_LEN}-epochs-{NUM_EPOCHS}-bs-{BATCH_SIZE}"
MODEL_NAME

'models/kategoribert-lr-2e-05-maxtoken-128-epochs-8-bs-32'

# Check GPU

In [2]:
import torch

In [3]:
# If there's a GPU available...
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


# Load data

In [4]:
from datasets import load_dataset

In [5]:
wikicat = load_dataset("Johannesemme/wiki_kategori", download_mode='force_redownload')

Downloading builder script:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset wiki_kategori/default (download: 24.93 MiB, generated: 25.82 MiB, post-processed: Unknown size, total: 50.75 MiB) to /home/jupyter/.cache/huggingface/datasets/Johannesemme___wiki_kategori/default/1.1.0/d1af97ac4d53f3037dfa4071a662c627516dcf691eed74e25f3b9a523c425c9b...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/7.88M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/965k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/8522 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/947 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1052 [00:00<?, ? examples/s]

Dataset wiki_kategori downloaded and prepared to /home/jupyter/.cache/huggingface/datasets/Johannesemme___wiki_kategori/default/1.1.0/d1af97ac4d53f3037dfa4071a662c627516dcf691eed74e25f3b9a523c425c9b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
wikicat

DatasetDict({
    train: Dataset({
        features: ['Text', 'Title', 'Labels'],
        num_rows: 8522
    })
    validation: Dataset({
        features: ['Text', 'Title', 'Labels'],
        num_rows: 947
    })
    test: Dataset({
        features: ['Text', 'Title', 'Labels'],
        num_rows: 1052
    })
})

In [7]:
# Convert dataset to pandas dataframe
df = wikicat['train'].to_pandas()

In [8]:
# Overview of data
df.head()

Unnamed: 0,Text,Title,Labels
0,Forbøn er at bede for andre end sig selv. En p...,Forbøn,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
1,"Mad er en essentiel energikilde for mennesker,...",Mad,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Latinskolerne blev i de fleste købstæder opret...,Latinskole,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,"Galgenfrist betød oprindelig et kort tidsrum, ...",Galgenfrist,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Briller er betegnelsen for et synsforbedrende ...,Briller,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [9]:
label_names = wikicat["train"].features["Labels"].feature.names
id2label = {idx:label for idx, label in enumerate(label_names)}
label2id = {label:idx for idx, label in enumerate(label_names)}
id2label

{0: 'Uddannelse',
 1: 'Samfund',
 2: 'Videnskab',
 3: 'Natur',
 4: 'Teknologi',
 5: 'Kultur',
 6: 'Historie',
 7: 'Sundhed',
 8: 'Geografi',
 9: 'Økonomi',
 10: 'Sport',
 11: 'Religion',
 12: 'Politik',
 13: 'Erhvervsliv'}

In [10]:
import numpy as np
from collections import Counter,OrderedDict

In [11]:
total_label_list = []
for ix, row in df.iterrows():
    list_ = np.array(row["Labels"])
    vals = np.where(list_ == 1)[0]
    total_label_list.extend(vals)

In [12]:
d = dict(OrderedDict(sorted(Counter(total_label_list).items())))
w = np.array(list(d.values())).sum() / (14*np.array(list(d.values())))
class_weights = torch.tensor(w).to(device)
class_weights

tensor([1.6988, 0.3708, 1.0630, 1.4064, 3.0649, 0.4606, 0.9406, 1.8765, 2.7549,
        0.9081, 1.8298, 0.8246, 0.8523, 1.7472], device='cuda:0',
       dtype=torch.float64)

# Tokenize and encode

In [13]:
model_ckpt = "Maltehb/danish-bert-botxo"

In [14]:
from transformers import AutoTokenizer

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [16]:
def preprocess_data(batch):
    text = batch["Text"]
    encoding = tokenizer(text, padding=True, truncation=True, max_length=MAX_TOKEN_LEN) 
    # Create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(text), len(label_names)))
    # Fill labels_matrix with indices of the text labels
    for idx, row in enumerate(batch["Labels"]):
        labels_matrix[idx,:] = np.array([float(x) for x in row])
    encoding["labels"] = labels_matrix.tolist()
    return encoding

In [17]:
encoded_dataset = wikicat.map(preprocess_data, batched=True, batch_size=None, remove_columns=wikicat['train'].column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [18]:
# Convert all to torch
encoded_dataset.set_format("torch")

In [19]:
# Check dataset columns
print(encoded_dataset["train"].column_names)

['input_ids', 'token_type_ids', 'attention_mask', 'labels']


In [20]:
# Retrieve an example
example = encoded_dataset["train"][0]

In [21]:
# Get token ids + the result of applying id2token
print(example["input_ids"][:20])
print(tokenizer.convert_ids_to_tokens(example["input_ids"])[:20])

tensor([   2,  438,  222,   33,   39, 4742,   30,  394,  292,  176,  250,  771,
          38,  493,   59, 5407,   30,   38,  617,  151])
['[CLS]', 'forb', '##øn', 'er', 'at', 'bede', 'for', 'andre', 'end', 'sig', 'selv', '.', 'en', 'person', 'der', 'beder', 'for', 'en', 'anden', 'eller']


In [22]:
# Use decoder to check how much text we actually process
tokenizer.decode(example["input_ids"])

'[CLS] forbøn er at bede for andre end sig selv. en person der beder for en anden eller noget andet end sig selv betegnes som en forbeder. forbøn har en stor rolle i mange af verdens religioner, herunder også kristendommen. forbønnen findes både i de protestantiske kirker, den katolske kirke og den ortodokse kirke ; i den ortodokse kirke kaldes forbøn for ekteni. tanken med bønnen er et forsøg på at ændre noget i den usynlige verden, som derefter vil vise sig i den synlige. trods forbønnens store udbredelse blandt mange forskellige religioner og trosretninger, er der intet empirisk bevis for [SEP]'

In [23]:
# Check that labels are float - import because of the way BCE loss is defined
example["labels"]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.])

# Load model

In [24]:
from transformers import AutoModelForSequenceClassification

In [25]:
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(label_names),
                                                           id2label=id2label,
                                                           label2id=label2id).to(device)

Some weights of the model checkpoint at Maltehb/danish-bert-botxo were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [26]:
model.config

BertConfig {
  "_name_or_path": "Maltehb/danish-bert-botxo",
  "architectures": [
    "BertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Uddannelse",
    "1": "Samfund",
    "2": "Videnskab",
    "3": "Natur",
    "4": "Teknologi",
    "5": "Kultur",
    "6": "Historie",
    "7": "Sundhed",
    "8": "Geografi",
    "9": "\u00d8konomi",
    "10": "Sport",
    "11": "Religion",
    "12": "Politik",
    "13": "Erhvervsliv"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Erhvervsliv": 13,
    "Geografi": 8,
    "Historie": 6,
    "Kultur": 5,
    "Natur": 3,
    "Politik": 12,
    "Religion": 11,
    "Samfund": 1,
    "Sport": 10,
    "Sundhed": 7,
    "Teknologi": 4,
    "Uddannelse": 0,
    "Videnskab": 2,
    "\u00d8konomi": 9
  },
  "layer_norm_

# Metrics

In [27]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

In [28]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

In [29]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=labels)
    return result

# Training setup

In [30]:
# configure logging so we see training loss
logging_steps = len(encoded_dataset["train"]) // BATCH_SIZE
logging_steps

266

In [31]:
from transformers import TrainingArguments, Trainer

In [32]:
training_args = TrainingArguments(
    output_dir=MODEL_NAME,
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    logging_steps=logging_steps,
    metric_for_best_model="f1",
    load_best_model_at_end=True
)

In [33]:
from torch import nn
from transformers import Trainer

class MyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        labels = labels.float()
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # compute custom loss
        loss_fct = nn.BCEWithLogitsLoss(weight = class_weights)
        loss = loss_fct(logits, labels.float())
        return (loss, outputs) if return_outputs else loss

In [34]:
trainer = MyTrainer(
    model,
    training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [35]:
# Sanity check
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 947
  Batch size = 32
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 1.0801347494125366,
 'eval_f1': 0.1434878587196468,
 'eval_roc_auc': 0.4805361102718602,
 'eval_accuracy': 0.0,
 'eval_runtime': 6.505,
 'eval_samples_per_second': 145.58,
 'eval_steps_per_second': 4.612}

# Train

In [36]:
trainer.train()

***** Running training *****
  Num examples = 8522
  Num Epochs = 8
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2136


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.3045,0.196714,0.349076,0.608013,0.210137
2,0.1724,0.157608,0.570334,0.718535,0.419219
3,0.1307,0.141314,0.624211,0.749528,0.488912
4,0.1045,0.141226,0.675073,0.791486,0.536431
5,0.087,0.139501,0.669612,0.785906,0.537487
6,0.0748,0.140659,0.67897,0.793027,0.553326
7,0.0674,0.141447,0.684008,0.797521,0.550158
8,0.0622,0.141775,0.682555,0.799446,0.550158


***** Running Evaluation *****
  Num examples = 947
  Batch size = 32
Saving model checkpoint to models/kategoribert-lr-2e-05-maxtoken-128-epochs-8-bs-32/checkpoint-267
Configuration saved in models/kategoribert-lr-2e-05-maxtoken-128-epochs-8-bs-32/checkpoint-267/config.json
Model weights saved in models/kategoribert-lr-2e-05-maxtoken-128-epochs-8-bs-32/checkpoint-267/pytorch_model.bin
tokenizer config file saved in models/kategoribert-lr-2e-05-maxtoken-128-epochs-8-bs-32/checkpoint-267/tokenizer_config.json
Special tokens file saved in models/kategoribert-lr-2e-05-maxtoken-128-epochs-8-bs-32/checkpoint-267/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 947
  Batch size = 32
Saving model checkpoint to models/kategoribert-lr-2e-05-maxtoken-128-epochs-8-bs-32/checkpoint-534
Configuration saved in models/kategoribert-lr-2e-05-maxtoken-128-epochs-8-bs-32/checkpoint-534/config.json
Model weights saved in models/kategoribert-lr-2e-05-maxtoken-128-epochs-8-bs-32/check

TrainOutput(global_step=2136, training_loss=0.12519889475589388, metrics={'train_runtime': 1603.6435, 'train_samples_per_second': 42.513, 'train_steps_per_second': 1.332, 'total_flos': 4484947997958144.0, 'train_loss': 0.12519889475589388, 'epoch': 8.0})

# Evaluate model on test set

In [37]:
preds_output = trainer.predict(encoded_dataset["test"])
preds_output.metrics

***** Running Prediction *****
  Num examples = 1052
  Batch size = 32


{'test_loss': 0.14116135239601135,
 'test_f1': 0.6775907883082373,
 'test_roc_auc': 0.7947170417516027,
 'test_accuracy': 0.5484790874524715,
 'test_runtime': 8.2314,
 'test_samples_per_second': 127.803,
 'test_steps_per_second': 4.009}

# Inference

In [43]:
def predicted_labels(test_sentence):
    encoding = encoding = tokenizer(test_sentence, padding=True, truncation=True, max_length=MAX_TOKEN_LEN, return_tensors="pt") 
    encoding = {k:v.to(device) for k,v in encoding.items() if k in ["input_ids", "attention_mask"]}

    with torch.no_grad():
        outputs = trainer.model(**encoding)
    logits = outputs.logits

    # apply sigmoid + threshold
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    predictions = np.zeros(probs.shape)

    predictions[np.where(probs >= 0.5)] = 1
    # turn predicted id's into actual label names
    predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
    return predicted_labels, probs

In [54]:
test_sentence = "Visse betalingskort, mobilbanker, netbanker og pengeautomater er nede"
test_sentence = "Det er godt at spise mange gulerødder, da disse er høje på c vitaminer"
test_sentence = "I weekenden er VM i ridesport kommet rigtig godt i gang, og de danske ryttere er kommet utroligt godt fra start. Danmark sikrede sig en guldmedalje"
test_sentence = "På søndag er der gudstjeneste i Allerød"
test_sentence = "På kunstmuseet Arken kan man opleve mange spændende udstillinger"
test_sentence = "Flere uddannelser mangler studerende"

predicted_labels(test_sentence)

(['Uddannelse'],
 tensor([0.7858, 0.1520, 0.0479, 0.0289, 0.0154, 0.0378, 0.0259, 0.0158, 0.0114,
         0.0748, 0.0266, 0.0204, 0.0498, 0.0134]))

In [40]:
label_names

['Uddannelse',
 'Samfund',
 'Videnskab',
 'Natur',
 'Teknologi',
 'Kultur',
 'Historie',
 'Sundhed',
 'Geografi',
 'Økonomi',
 'Sport',
 'Religion',
 'Politik',
 'Erhvervsliv']

# Save best model

In [49]:
# Save best model 
trainer.save_model("../HF/kategoriBERT")

Saving model checkpoint to ../HF/kategoriBERT
Configuration saved in ../HF/kategoriBERT/config.json
Model weights saved in ../HF/kategoriBERT/pytorch_model.bin
tokenizer config file saved in ../HF/kategoriBERT/tokenizer_config.json
Special tokens file saved in ../HF/kategoriBERT/special_tokens_map.json


# Remove checkpoints

Since the checkpoints use storage and we do not need these we delete them

In [53]:
!rm -rf models

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
