# Multilingual Named Entity Recognition

## Loading the Dataset

In [69]:
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset

train = pd.read_parquet("data/train-00000-of-00001.parquet")
dev = pd.read_parquet("data/dev-00000-of-00001.parquet")
test = pd.read_parquet("data/test-00000-of-00001.parquet")


### convert to Huggingface dataset
train_dataset = Dataset(pa.Table.from_pandas(train))
dev_dataset = Dataset(pa.Table.from_pandas(dev))
test_dataset = Dataset(pa.Table.from_pandas(test))

In [75]:
tags = ["O"] + list(set([x[0]["label"] for x in train_dataset["ents"] if x]))
index2tag = {idx: tag for idx, tag in enumerate(tags)}
tag2index = {tag: idx for idx, tag in enumerate(tags)}

['O',
 'EVENT',
 'TIME',
 'CARDINAL',
 'QUANTITY',
 'GPE',
 'NORP',
 'DATE',
 'PRODUCT',
 'FACILITY',
 'LOCATION',
 'MONEY',
 'LANGUAGE',
 'PERSON',
 'WORK OF ART',
 'PERCENT',
 'LAW',
 'ORDINAL',
 'ORGANIZATION']

## Multilingual Transformers

In [79]:
# hide_output
from transformers import AutoTokenizer

bert_model_name = "bert-base-multilingual-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

### Model Definition

In [83]:
import torch.nn as nn
from transformers import BertConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.bert.modeling_bert import BertModel
from transformers.models.bert.modeling_bert import BertPreTrainedModel


class BertForTokenClassification(BertPreTrainedModel):
    config_class = BertConfig

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = len(tags)
        # Load model body
        self.bert = BertModel(config, add_pooling_layer=False)
        # Set up token classification head
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        # Load and initialize weights
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, 
                labels=None, **kwargs):
        # Use model body to get encoder representations
        outputs = self.bert(input_ids, attention_mask=attention_mask,
                               token_type_ids=token_type_ids, **kwargs)
        # Apply classifier to encoder representation
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)
        # Calculate losses
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        # Return model output object
        return TokenClassifierOutput(loss=loss, logits=logits, 
                                     hidden_states=outputs.hidden_states, 
                                     attentions=outputs.attentions)

### Loading pretrained Bert

In [85]:
# hide_output
from transformers import AutoConfig

bert_config = AutoConfig.from_pretrained(bert_model_name, 
                                         num_labels=len(tags),
                                         id2label=index2tag, label2id=tag2index)

In [86]:
import torch

device = torch.device("cpu")
# device = torch.device("mps")

bert_model = (BertForTokenClassification
              .from_pretrained(bert_model_name, config=bert_config)
              .to(device))

In [90]:
def tag_text(text, tags, model, tokenizer):
    # Get tokens with special characters
    tokens = tokenizer(text).tokens()
    # Encode the sequence into IDs
    input_ids = bert_tokenizer(text, return_tensors="pt").input_ids.to(device)
    # Get predictions as distribution over 7 possible classes
    outputs = model(input_ids)[0]
    # Take argmax to get most likely class per token
    predictions = torch.argmax(outputs, dim=2)
    # Convert to DataFrame
    preds = [tags[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])
    

## Tokenizing Texts for NER

In [96]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = bert_tokenizer(examples["tokens"], truncation=True, 
                                      is_split_into_words=True)
    labels = []
    for idx, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

## Encode Dataset as tokens

In [98]:
# hide_output
def encode_line(line):
    ents = line["ents"]
    tokenized = bert_tokenizer(line["text"])
    labels = ["O"]

    word_start = 0
    for word in bert_tokenizer.convert_ids_to_tokens(tokenized["input_ids"]):
        if word in ("[CLS]", "[SEP]"):
            continue
        if word.startswith("##"):
            word = word[2:]
        
        word_start += line["text"][word_start:].find(word)

        if ents:
            if word_start >= ents[0]["start"] and word_start <= ents[0]["end"]:
                labels.append(ents[0]["label"])
            else:
                labels.append("O")
            if ents[0]["end"] <= word_start + len(word):
                ents = ents[1:]
        else:
            labels.append("O")

        # print(line["text"][word_start: word_start+len(word)])
        word_start += len(word)

    labels = [tag2index[x] for x in labels + ["O"]]
    tokenized["labels"] = labels
    # print(pd.DataFrame([i for i in zip(bert_tokenizer.convert_ids_to_tokens(tokenized["input_ids"]),labels)]))
    return tokenized

#  encode_line(train_dataset[2])

train_dataset = pd.DataFrame([encode_line(l) for l in train_dataset])
dev_dataset = pd.DataFrame([encode_line(l) for l in dev_dataset])
test_dataset = pd.DataFrame([encode_line(l) for l in test_dataset])
                      
train_dataset = Dataset(pa.Table.from_pandas(train_dataset))
dev_dataset = Dataset(pa.Table.from_pandas(dev_dataset))
test_dataset = Dataset(pa.Table.from_pandas(test_dataset))

## Performance Measures

In [100]:
import numpy as np

def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []

    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            # Ignore label IDs = -100
            if label_ids[batch_idx, seq_idx] != -100:
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])

        labels_list.append(example_labels)
        preds_list.append(example_preds)

    return preds_list, labels_list

## Training

In [101]:
# hide_output
from transformers import TrainingArguments

num_epochs = 0.1
batch_size = 30
logging_steps = len(train_dataset) // batch_size
model_name = f"{bert_model_name}-finetuned-panx-de"
training_args = TrainingArguments(
    output_dir=model_name, log_level="error", num_train_epochs=num_epochs, 
    per_device_train_batch_size=batch_size, 
    per_device_eval_batch_size=batch_size, evaluation_strategy="epoch", 
    save_steps=1e6, weight_decay=0.01, disable_tqdm=False, 
    logging_steps=logging_steps, push_to_hub=False)

In [103]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    y_pred, y_true = align_predictions(eval_pred.predictions, 
                                       eval_pred.label_ids)
    return {"f1": f1_score(y_true, y_pred)}

In [104]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(bert_tokenizer)

In [105]:
def model_init():
    return (BertForTokenClassification
            .from_pretrained(bert_model_name, config=bert_config)
            .to(device))

In [106]:
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [108]:
# hide_output
from transformers import Trainer

trainer = Trainer(model_init=model_init, args=training_args, 
                  data_collator=data_collator, compute_metrics=compute_metrics,
                  train_dataset=train_dataset,
                  eval_dataset=dev_dataset, 
                  tokenizer=bert_tokenizer)

In [109]:
#hide_input
trainer.train()



  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

{'eval_loss': 0.45122504234313965, 'eval_f1': 0.16574585635359115, 'eval_runtime': 24.5047, 'eval_samples_per_second': 59.621, 'eval_steps_per_second': 2.0, 'epoch': 0.1}
{'train_runtime': 90.2683, 'train_samples_per_second': 13.03, 'train_steps_per_second': 0.443, 'train_loss': 0.7313706398010253, 'epoch': 0.1}




TrainOutput(global_step=40, training_loss=0.7313706398010253, metrics={'train_runtime': 90.2683, 'train_samples_per_second': 13.03, 'train_steps_per_second': 0.443, 'train_loss': 0.7313706398010253, 'epoch': 0.1})

In [110]:
# trainer.model.to(device)

In [111]:
trainer.state.log_history

[{'eval_loss': 0.45122504234313965,
  'eval_f1': 0.16574585635359115,
  'eval_runtime': 24.5047,
  'eval_samples_per_second': 59.621,
  'eval_steps_per_second': 2.0,
  'epoch': 0.1,
  'step': 40},
 {'train_runtime': 90.2683,
  'train_samples_per_second': 13.03,
  'train_steps_per_second': 0.443,
  'total_flos': 40103987437620.0,
  'train_loss': 0.7313706398010253,
  'epoch': 0.1,
  'step': 40}]

In [113]:
# hide_output
text_de = "Jeff Dean ist ein Informatiker bei Google in Kalifornien, syntes jeg næste år"
tag_text(text_de, tags, trainer.model, bert_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Tokens,[CLS],Jeff,Dean,ist,ein,Info,##rmat,##iker,bei,Google,in,Kalifornien,",",syn,##tes,jeg,n,##æste,år,[SEP]
Tags,O,PERSON,PERSON,O,O,O,O,O,O,ORGANIZATION,O,O,O,O,O,O,O,O,O,O
