# Multilingual Named Entity Recognition

In [5]:
# Set TOKENIZERS_PARALLELISM explicitly; the huggingface/tokenizers fork warning asks for this variable to be set.
%env TOKENIZERS_PARALLELISM=False

env: TOKENIZERS_PARALLELISM=False


In [6]:
import pandas as pd
import numpy as np
import torch

from transformers import AutoConfig
from transformers import AutoTokenizer

from seqeval.metrics import f1_score

import torch.nn as nn
from transformers import BertConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.bert.modeling_bert import BertModel
from transformers.models.bert.modeling_bert import BertPreTrainedModel

import utils.NERutils as NU


## Loading the Dataset

In [7]:

# Checkpoint name shared by the tokenizer, the config and the model cells below.
bert_model_name = "bert-base-multilingual-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

# Training split wrapped by the project-local NERdataset helper; later cells
# read its `tags`, `index2tag` and `tag2index` attributes.
trainNER = NU.NERdataset("data/train.parquet", bert_tokenizer)


## Loading pretrained BERT

In [8]:
import model

# Token-classification config: the label count and both label maps are taken
# from the training dataset so the classifier head matches its tag set.
bert_config = AutoConfig.from_pretrained(
    bert_model_name,
    num_labels=len(trainNER.tags),
    id2label=trainNER.index2tag,
    label2id=trainNER.tag2index,
)

# Backend preference order: Apple MPS, then CUDA, then plain CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)


# Load the checkpoint into the project's token-classification model and move it to `device`.
# The result is not assigned: as the cell's last expression it is only displayed.
# NOTE(review): IPython may keep this instance alive through its output cache (`Out[...]`);
# consider assigning it or suppressing the output if device memory is tight — confirm.
model.BertForTokenClassification.from_pretrained(
    bert_model_name, config=bert_config, tags=trainNER.tags
).to(device)



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [None]:
def tag_text(text, tags, model, tokenizer):
    """Tag every sub-word token of ``text`` with the model's most likely label.

    Args:
        text: Raw input string.
        tags: Sequence (or mapping) from class index to tag name.
        model: Token-classification ``torch.nn.Module`` whose first output
            holds the logits with one score per token and class.
        tokenizer: Tokenizer used for both the displayed tokens and the
            input IDs fed to the model.

    Returns:
        A two-row ``pd.DataFrame`` (index ``"Tokens"`` and ``"Tags"``) with one
        column per token, special tokens included.
    """
    # Encode once with the tokenizer that was passed in, so the displayed
    # tokens and the IDs fed to the model always come from the same tokenizer.
    encoding = tokenizer(text, return_tensors="pt")
    tokens = encoding.tokens()
    input_ids = encoding.input_ids

    # Run on whatever device the model lives on instead of a notebook global.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)

    # Inference only: switch off dropout and gradient tracking, then restore
    # the mode the caller had (the model may be in the middle of training).
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Scores over all tag classes for every token
            logits = model(input_ids)[0]
    finally:
        model.train(was_training)

    # Take argmax to get the most likely class per token
    predictions = torch.argmax(logits, dim=2)
    # Convert to DataFrame
    preds = [tags[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

## Tokenizing Texts for NER

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and attach sub-word aligned label IDs.

    ``examples["tokens"]`` is passed to ``bert_tokenizer`` as pre-split words
    and ``examples["ner_tags"]`` supplies one label per word. Only the first
    sub-word of each word keeps the word's label; special tokens and
    continuation sub-words receive ``-100``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoded = bert_tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    def _labels_for(word_ids, word_labels):
        # Walk the sub-word -> word mapping, remembering the previous word index
        # so repeated indices (continuation pieces) can be masked out.
        aligned = []
        last_seen = None
        for current in word_ids:
            starts_new_word = current is not None and current != last_seen
            aligned.append(word_labels[current] if starts_new_word else -100)
            last_seen = current
        return aligned

    encoded["labels"] = [
        _labels_for(encoded.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

## Encode Dataset as tokens

## Performance Measures

In [None]:
def align_predictions(predictions, label_ids, index2tag=None):
    """Convert raw logits and label IDs into per-example tag-name lists.

    Args:
        predictions: Array indexed as (example, position, class); the argmax
            over the last axis is taken as the predicted class.
        label_ids: Array-like indexed as (example, position) with gold label
            IDs; positions equal to ``-100`` are skipped.
        index2tag: Optional mapping (or sequence) from label ID to tag name.
            Defaults to ``trainNER.index2tag``.

    Returns:
        ``(preds_list, labels_list)``: two lists with one inner list of tag
        names per example, restricted to positions whose label is not ``-100``.
    """
    if index2tag is None:
        # Fall back to the training set's label map when none is supplied.
        index2tag = trainNER.index2tag

    preds = np.argmax(predictions, axis=2)
    label_ids = np.asarray(label_ids)
    labels_list, preds_list = [], []

    for gold_row, pred_row in zip(label_ids, preds):
        # Ignore label IDs = -100
        keep = gold_row != -100
        labels_list.append([index2tag[i] for i in gold_row[keep]])
        preds_list.append([index2tag[i] for i in pred_row[keep]])

    return preds_list, labels_list

## Training

In [None]:
# DataLoader settings: shuffled batches of 8, with num_workers=0.
train_params = dict(batch_size=8, shuffle=True, num_workers=0)

training_loader = torch.utils.data.DataLoader(trainNER, **train_params)

In [None]:
def compute_metrics(eval_pred):
    """Return the seqeval F1 score for one evaluation pass.

    ``eval_pred`` must expose ``predictions`` and ``label_ids``; both are
    converted to tag-name lists by ``align_predictions`` before scoring.
    """
    predicted_tags, gold_tags = align_predictions(
        eval_pred.predictions, eval_pred.label_ids
    )
    score = f1_score(gold_tags, predicted_tags)
    return {"f1": score}

In [None]:
def model_init():
    """Build a fresh token-classification model from the pretrained checkpoint.

    Returns:
        A ``BertForTokenClassification`` instance (from the project's ``model``
        module) loaded from ``bert_model_name`` with ``bert_config`` and the
        training tag set, moved to ``device``.
    """
    # Import locally under an alias: a later cell rebinds the global name
    # `model` to the network instance, after which the global lookup
    # `model.BertForTokenClassification` no longer reaches the project module
    # and a second call to this function would fail.
    import model as ner_model_module

    return ner_model_module.BertForTokenClassification.from_pretrained(
        bert_model_name, config=bert_config, tags=trainNER.tags
    ).to(device)

In [None]:
# NOTE: this rebinds the global name `model` from the imported project module
# to the network instance; the training and tagging cells below use `model` as the network.
model = model_init()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from tqdm import tqdm
N_EPOCHS = 1
LEARNING_RATE = 1e-05
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)


def run_epoch(epoch):
    """Train ``model`` for one pass over ``training_loader``.

    Uses the notebook globals ``model``, ``optimizer``, ``training_loader``
    and ``device``.

    Args:
        epoch: Zero-based epoch index, used only in the printed summary.

    Returns:
        Tuple ``(mean_loss, token_accuracy)``: the loss averaged over batches
        and the fraction of scored tokens predicted correctly. Both are ``0.0``
        when the loader yields nothing.
    """
    model.train()

    tr_loss = 0.0
    n_batches = 0
    n_correct, n_scored = 0, 0

    for batch in tqdm(training_loader):
        ids = batch["input_ids"].to(device, dtype=torch.long)
        mask = batch["attention_mask"].to(device, dtype=torch.long)
        targets = batch["labels"].to(device, dtype=torch.long)

        outputs = model(input_ids=ids,
                        attention_mask=mask,
                        labels=targets)

        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()
        n_batches += 1

        # Running accuracy; no gradients are needed for the bookkeeping.
        with torch.no_grad():
            predictions = torch.argmax(tr_logits, dim=-1)  # (batch_size, seq_len)
            # Score only attended positions that carry a real label. -100 is the
            # ignore value used by this notebook's alignment helpers; if the
            # dataset never emits it, this reduces to the attention mask alone.
            scored = (mask == 1) & (targets != -100)
            n_correct += (predictions[scored] == targets[scored]).sum().item()
            n_scored += scored.sum().item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    mean_loss = tr_loss / n_batches if n_batches else 0.0
    token_accuracy = n_correct / n_scored if n_scored else 0.0
    print(f"Epoch {epoch + 1}: loss={mean_loss:.4f}, accuracy={token_accuracy:.4f}")
    return mean_loss, token_accuracy


for epoch in range(N_EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    run_epoch(epoch)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Training epoch: 1


  0%|          | 2/1471 [00:19<3:53:16,  9.53s/it]


KeyboardInterrupt: 

In [None]:
# hide_output
# Smoke-test sentence for the tagger.
# NOTE(review): despite the `_de` suffix the text is not purely German — the
# final clause looks Scandinavian (Danish?); confirm this mix is intentional.
text_de = (
    "Jeff Dean ist ein Informatiker bei Google in Kalifornien, syntes jeg næste år"
)
tag_text(text_de, trainNER.tags, model, bert_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Tokens,[CLS],Jeff,Dean,ist,ein,Info,##rmat,##iker,bei,Google,in,Kalifornien,",",syn,##tes,jeg,n,##æste,år,[SEP]
Tags,O,O,O,O,O,PERCENT,QUANTITY,O,LOCATION,O,WORK OF ART,O,O,O,ORDINAL,O,O,O,O,O
