# Multilingual Named Entity Recognition

In [1]:
%env TOKENIZERS_PARALLELISM=False

env: TOKENIZERS_PARALLELISM=False


In [2]:
import pandas as pd
import numpy as np
import torch

from transformers import AutoConfig
from transformers import AutoTokenizer

from seqeval.metrics import f1_score

import torch.nn as nn
from transformers import DistilBertConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers import DistilBertModel
from transformers.models.bert.modeling_bert import BertPreTrainedModel

import NERutils as NU


  from .autonotebook import tqdm as notebook_tqdm


## Loading the Dataset

In [3]:

bert_model_name = "distilbert-base-multilingual-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

trainNER = NU.NERdataset("data/train.parquet", bert_tokenizer)


## Loading pretrained Bert

In [4]:
import model
from transformers import DistilBertConfig
# bert_config = AutoConfig.from_pretrained(
#     bert_model_name, num_labels=len(trainNER.tags), id2label=trainNER.index2tag, label2id=trainNER.tag2index
# )

bert_config = DistilBertConfig()
print(bert_config)

if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


model.BertForTokenClassification.from_pretrained(
    bert_model_name, config=bert_config, tags=trainNER.tags, ignore_mismatched_sizes=True
).to(device)



DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "transformers_version": "4.39.0",
  "vocab_size": 30522
}



Some weights of BertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.transformer.layer.0.attention.k_lin.bias', 'bert.transformer.layer.0.attention.k_lin.weight', 'bert.transformer.layer.0.attention.out_lin.bias', 'bert.transformer.layer.0.attention.out_lin.weight', 'bert.transformer.layer.0.attention.q_lin.bias', 'bert.transformer.layer.0.attention.q_lin.weight', 'bert.transformer.layer.0.attention.v_lin.bias', 'bert.transformer.layer.0.attention.v_lin.weight', 'bert.transformer.layer.0.ffn.lin1.bias', 'bert.transformer.layer.0.ffn.lin1.weight', 'bert.transformer.layer.0.ffn.lin2.bias', 'bert.transformer.layer.0.ffn.lin2.weight', 'bert.transformer.layer.0.output_layer_norm.bias', 'bert.transformer.layer.0.output_layer_norm.weight', 'bert.

BertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
          

In [5]:
def tag_text(text, tags, model, tokenizer):
    # Get tokens with special characters
    tokens = tokenizer(text).tokens()
    # Encode the sequence into IDs
    input_ids = bert_tokenizer(text, return_tensors="pt").input_ids.to(device)
    # Get predictions as distribution over 7 possible classes
    outputs = model(input_ids)[0]
    # Take argmax to get most likely class per token
    predictions = torch.argmax(outputs, dim=2)
    # Convert to DataFrame
    preds = [tags[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

## Tokenizing Texts for NER

In [6]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = bert_tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    labels = []
    for idx, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

## Encode Dataset as tokens

## Performance Measures

In [7]:
def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []

    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            # Ignore label IDs = -100
            if label_ids[batch_idx, seq_idx] != -100:
                example_labels.append(trainNER.index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(trainNER.index2tag[preds[batch_idx][seq_idx]])

        labels_list.append(example_labels)
        preds_list.append(example_preds)

    return preds_list, labels_list

## Training

In [8]:
train_params = {'batch_size': 8,
                'shuffle': True,
                'num_workers': 6,
                'pin_memory': True
                }

training_loader = torch.utils.data.DataLoader(trainNER, **train_params)

In [9]:
def compute_metrics(eval_pred):
    y_pred, y_true = align_predictions(eval_pred.predictions, eval_pred.label_ids)
    return {"f1": f1_score(y_true, y_pred)}

In [10]:
def model_init():
    return model.BertForTokenClassification.from_pretrained(
        bert_model_name, config=bert_config, tags=trainNER.tags, ignore_mismatched_sizes=True
    ).to(device)

In [11]:
# trainer.is_model_parallel
model = model_init()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.transformer.layer.0.attention.k_lin.bias', 'bert.transformer.layer.0.attention.k_lin.weight', 'bert.transformer.layer.0.attention.out_lin.bias', 'bert.transformer.layer.0.attention.out_lin.weight', 'bert.transformer.layer.0.attention.q_lin.bias', 'bert.transformer.layer.0.attention.q_lin.weight', 'bert.transformer.layer.0.attention.v_lin.bias', 'bert.transformer.layer.0.attention.v_lin.weight', 'bert.transformer.layer.0.ffn.lin1.bias', 'bert.transformer.layer.0.ffn.lin1.weight', 'bert.transformer.layer.0.ffn.lin2.bias', 'bert.transformer.layer.0.ffn.lin2.weight', 'bert.transformer.layer.0.output_layer_norm.bias', 'bert.transformer.layer.0.output_layer_norm.weight', 'bert.

In [12]:
from tqdm import tqdm
N_EPOCHS = 1
LEARNING_RATE = 1e-05
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)


def run_epoch(epoch):
    model.train()

    tr_loss, tr_accuracy = 0, 0

    for idx, batch in enumerate(tqdm(training_loader)):
        ids = batch["input_ids"].to(device, dtype=torch.long)
        mask = batch["attention_mask"].to(device, dtype=torch.long)
        targets = batch["labels"].to(device, dtype=torch.long)
        
        outputs = model(input_ids = ids,
                        attention_mask = mask,
                        labels = targets)
        
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


for epoch in range(N_EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    run_epoch(epoch)

Training epoch: 1


  4%|▍         | 66/1471 [00:50<14:24,  1.63it/s] 

In [None]:
# hide_output
text_de = (
    "Jeff Dean ist ein Informatiker bei Google in Kalifornien, syntes jeg næste år"
)
tag_text(text_de, trainNER.tags, model, bert_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Tokens,[CLS],Jeff,Dean,ist,ein,Info,##rmat,##iker,bei,Google,in,Kalifornien,",",syn,##tes,jeg,n,##æste,år,[SEP]
Tags,O,O,O,O,O,PERCENT,QUANTITY,O,LOCATION,O,WORK OF ART,O,O,O,ORDINAL,O,O,O,O,O
