In [1]:
from transformers import AutoTokenizer

# Hub checkpoint identifiers for the three tokenizers compared below.
bert_model_name, xlmr_model_name, roberta_model_name = (
    "bert-base-cased",
    "xlm-roberta-base",
    "klue/roberta-base",
)

# Load one tokenizer per checkpoint, in the same order as the names above.
bert_tokenizer, xlmr_tokenizer, roberta_tokenizer = (
    AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in (bert_model_name, xlmr_model_name, roberta_model_name)
)

  from .autonotebook import tqdm as notebook_tqdm
tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 8.72kB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 185kB/s]
vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 533kB/s]
tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 84.1MB/s]
config.json: 100%|██████████| 615/615 [00:00<00:00, 287kB/s]
sentencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 12.2MB/s]
tokenizer.json: 100%|██████████| 9.10M/9.10M [00:01<00:00, 6.48MB/s]
tokenizer_config.json: 100%|██████████| 375/375 [00:00<00:00, 187kB/s]
vocab.txt: 100%|██████████| 248k/248k [00:00<00:00, 630kB/s]
tokenizer.json: 100%|██████████| 752k/752k [00:00<00:00, 40.5MB/s]
special_tokens_map.json: 100%|██████████| 173/173 [00:00<00:00, 88.8kB/s]


In [32]:
# Alternative English sample, kept for quick switching:
#text = "Jack Sparrow loves New York !"
text = "quiero comer carne y amo corea"

# Tokenize the same sentence with each tokenizer so the splits can be compared.
bert_tokens, xlmr_tokens, roberta_tokens = (
    tokenize(text).tokens()
    for tokenize in (bert_tokenizer, xlmr_tokenizer, roberta_tokenizer)
)

# One token list per line: BERT, XLM-R, KLUE RoBERTa.
print(bert_tokens, xlmr_tokens, roberta_tokens, sep="\n")

['[CLS]', 'q', '##ui', '##ero', 'come', '##r', 'car', '##ne', 'y', 'am', '##o', 'core', '##a', '[SEP]']
['<s>', '▁quiero', '▁comer', '▁carne', '▁y', '▁amo', '▁core', 'a', '</s>']
['[CLS]', 'qu', '##ier', '##o', 'com', '##er', 'car', '##ne', 'y', 'am', '##o', 'co', '##re', '##a', '[SEP]']


In [33]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel

In [34]:
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa encoder followed by dropout and a per-token linear classifier.

    The encoder is built without a pooling layer; every token's hidden state is
    projected to ``config.num_labels`` logits.
    """

    # Configuration class associated with this model.
    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the encoder and classification head from ``config``.

        Reads ``num_labels``, ``hidden_dropout_prob`` and ``hidden_size`` from
        ``config``.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        # Encoder body; the pooled output is not needed for per-token predictions.
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Classification head: dropout, then hidden_size -> num_labels.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """Run the encoder and head; compute a loss when ``labels`` is given.

        Args:
            input_ids: Passed positionally to the encoder.
            attention_mask: Forwarded to the encoder.
            token_type_ids: Forwarded to the encoder.
            labels: Optional targets. When present, cross-entropy is computed
                over logits flattened to ``(-1, num_labels)`` and labels
                flattened to ``(-1)``.
            **kwargs: Extra keyword arguments forwarded to the encoder.

        Returns:
            TokenClassifierOutput carrying ``loss`` (``None`` without labels),
            ``logits``, and the encoder's ``hidden_states`` / ``attentions``.
        """
        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        # First element of the encoder output is fed through dropout and the head.
        token_states = self.dropout(encoder_outputs[0])
        logits = self.classifier(token_states)

        if labels is None:
            loss = None
        else:
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = nn.CrossEntropyLoss()(flat_logits, flat_labels)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

In [35]:
from collections import defaultdict
from datasets import load_dataset, DatasetDict

# Languages to load and the fraction of each split to keep, paired by position.
langs = ["es", "ko", "en", "ja"]
fracs = [0.629, 0.229, 0.084, 0.059]
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")

    for split in ds:
        # Shuffle with a fixed seed, then keep the first `frac` share of the rows.
        shuffled_split = ds[split].shuffle(seed=0)
        rows_to_keep = int(frac * ds[split].num_rows)
        panx_ch[lang][split] = shuffled_split.select(range(rows_to_keep))

In [36]:
# Label definition of the per-token `ner_tags` column in the Spanish training split.
es_train_features = panx_ch["es"]["train"].features
tags = es_train_features["ner_tags"].feature
print(tags)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [37]:
# Forward (id -> tag name) and inverse (tag name -> id) lookups built from `tags.names`.
index2tag = dict(enumerate(tags.names))
tag2index = {name: position for position, name in index2tag.items()}

In [38]:
from transformers import AutoConfig

# Start from the checkpoint's configuration and override the label-related fields.
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name,
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index,
)

loading configuration file config.json from cache at /Users/parkhyerin/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/77de1f7a7e5e737aead1cd880979d4f1b3af6668/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_

In [39]:
import sys

# Extend the import path with the parent directory before importing the local `utils` package.
sys.path.append("../")

import torch
from utils import utils

# Device comes from the project-local helper.
device = utils.get_device()

# Instantiate the custom token-classification model from the checkpoint with the
# label-aware config, then move it to `device`.
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(
    xlmr_model_name,
    config=xlmr_config,
).to(device)

loading weights file model.safetensors from cache at /Users/parkhyerin/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/77de1f7a7e5e737aead1cd880979d4f1b3af6668/model.safetensors
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

In [40]:
import pandas as pd

# Encode the sample sentence as a PyTorch tensor (indexed with [0] below to get the single sequence).
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")

# Show each token above its vocabulary id.
pd.DataFrame(
    [xlmr_tokens, input_ids[0].numpy()],
    index=["Tokens", "Input IDs"],
)

Unnamed: 0,0,1,2,3,4,5,6,7,8
Tokens,<s>,▁quiero,▁comer,▁carne,▁y,▁amo,▁core,a,</s>
Input IDs,0,71122,41885,20366,113,48947,56458,11,2


In [41]:
# Forward pass on the model's device; keep only the logits.
model_output = xlmr_model(input_ids.to(device))
outputs = model_output.logits
# Highest-scoring tag id per token (argmax over the last axis).
predictions = outputs.argmax(dim=-1)

print(f"token count : {len(xlmr_tokens)}")
print(f"output shape : {outputs.shape}")    # [batch_size, num_tokens, num_tags]

token count : 9
output shape : torch.Size([1, 9, 7])


In [42]:
# Convert the first sequence's predicted ids to tag names.
predicted_ids = predictions[0].cpu().numpy()
preds = [tags.names[tag_id] for tag_id in predicted_ids]

# Show each token above its predicted tag.
pd.DataFrame(
    [xlmr_tokens, preds],
    index=["Tokens", "Tags"],
)

Unnamed: 0,0,1,2,3,4,5,6,7,8
Tokens,<s>,▁quiero,▁comer,▁carne,▁y,▁amo,▁core,a,</s>
Tags,I-LOC,I-LOC,I-LOC,I-LOC,I-LOC,I-LOC,I-LOC,I-LOC,I-LOC


In [43]:
def tag_text(text, tags, model, tokenizer):
    """Tag every token of ``text`` with the model's highest-scoring label.

    Args:
        text: Input string to tokenize and tag.
        tags: Object exposing ``names``, a sequence mapping label ids to tag names.
        model: Callable taking the input-id tensor; element ``[0]`` of its
            result is used as the logits.
        tokenizer: Tokenizer used for BOTH the displayed tokens and the model
            input ids, so the two rows of the result always line up.

    Returns:
        pandas.DataFrame with two rows, "Tokens" and "Tags", one column per token.

    Note:
        Inputs are moved to the notebook-level ``device``.
    """
    # Tokenize once with the tokenizer that was passed in. The earlier version
    # built the ids with the global `xlmr_tokenizer`, ignoring the `tokenizer`
    # argument for the model input.
    encoding = tokenizer(text, return_tensors="pt")
    tokens = encoding.tokens()
    input_ids = encoding.input_ids.to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids)[0]

    predictions = torch.argmax(outputs, dim=2)
    preds = [tags.names[p] for p in predictions[0].tolist()]

    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

In [44]:
def create_tag_names(batch):
    """Return a ``ner_tags_str`` column holding the tag name of each id in ``batch["ner_tags"]``.

    Ids are converted one by one with the notebook-level ``tags.int2str``.
    """
    tag_names = list(map(tags.int2str, batch["ner_tags"]))
    return {"ner_tags_str": tag_names}

# Add the human-readable tag column to every Spanish split.
panx_es = panx_ch["es"].map(create_tag_names)

# Use the first training example as the running example for the alignment cells.
es_example = panx_es["train"][0]
words = es_example["tokens"]
labels = es_example["ner_tags"]

In [45]:
# The example is already split into words, so tell the tokenizer not to re-split it.
tokenized_input = xlmr_tokenizer(
    es_example["tokens"],
    is_split_into_words=True,
)
# Map the ids back to token strings for display.
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

pd.DataFrame([tokens], index=["Tokens"])

Unnamed: 0,0,1,2,3,4,5,6
Tokens,<s>,▁Liga,▁de,▁la,▁Justicia,▁Europa,</s>


In [46]:
# Word index for each token; the displayed output shows blanks (no word) for <s> and </s>.
word_ids = tokenized_input.word_ids()

pd.DataFrame(
    [tokens, word_ids],
    index=["Tokens", "Word IDs"],
)

Unnamed: 0,0,1,2,3,4,5,6
Tokens,<s>,▁Liga,▁de,▁la,▁Justicia,▁Europa,</s>
Word IDs,,0,1,2,3,4,


In [47]:
# Align word-level tag ids with the tokens: the first token of each word gets
# the word's tag id, every other position gets -100.
#
# The integer tags are read straight from the example rather than from `labels`,
# because `labels` is rebound to tag strings below. Reading from `labels` made
# this cell fail with a KeyError when run a second time.
word_tag_ids = es_example["ner_tags"]

previous_word_idx = None
label_ids = []

for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        # No word for this token, or a continuation of the previous word.
        label_ids.append(-100)
    else:
        label_ids.append(word_tag_ids[word_idx])
    previous_word_idx = word_idx

# -100 is the default value of torch.nn.CrossEntropyLoss's `ignore_index`,
# so these positions are skipped by the loss; they are shown as "IGN".
labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]

pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)

Unnamed: 0,0,1,2,3,4,5,6
Tokens,<s>,▁Liga,▁de,▁la,▁Justicia,▁Europa,</s>
Word IDs,,0,1,2,3,4,
Label IDs,-100,3,4,4,4,4,-100
Labels,IGN,B-ORG,I-ORG,I-ORG,I-ORG,I-ORG,IGN
