In [None]:
# ドイツ語の例
## 固有表現のデータセット用意
from collections import defaultdict
from datasets import load_dataset, DatasetDict

# Languages to load, and the fraction of every split to keep for each language
# (langs[i] is paired with fracs[i]).
langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]
# Maps language code -> DatasetDict holding the sub-sampled splits.
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
  ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
  for split, full_split in ds.items():
    # Number of rows to keep, truncated towards zero.
    keep_rows = int(frac * full_split.num_rows)
    # Shuffle with a fixed seed so the retained subset is reproducible.
    shuffled_split = full_split.shuffle(seed=0)
    panx_ch[lang][split] = shuffled_split.select(range(keep_rows))

## German example
# Feature object of the "ner_tags" column; it is used for int2str() here and
# for .names later on (presumably a datasets ClassLabel - confirm).
tags = panx_ch["de"]["train"].features["ner_tags"].feature


def create_tag_names(batch):
  """Return a new "ner_tags_str" column holding the tag name of each integer tag.

  `batch` must provide "ner_tags" as an iterable of integer tag ids. The caller
  below uses `.map` without `batched=True`, so presumably this receives one
  example at a time despite the parameter name.
  """
  tag_names = list(map(tags.int2str, batch["ner_tags"]))
  return {"ner_tags_str": tag_names}
# Add the string tag column to the German corpus.
panx_de = panx_ch["de"].map(create_tag_names)
# First training example; reused by the following cells as the running example.
de_train = panx_de["train"]
de_example = de_train[0]

words = de_example["tokens"]
labels = de_example["ner_tags"]
print(f"Tokens: {words}")  # content of each token
print(f"Labels: {labels}")  # label of each token

In [None]:
# Tokenization
from transformers import AutoTokenizer
import pandas as pd

## Tokenizer
xlmr_model_name = "xlm-roberta-base"
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

## Tokenize the example
# The example is already a list of words, hence is_split_into_words=True.
tokenized_input = xlmr_tokenizer(de_example["tokens"], is_split_into_words=True)
# Convert the ids back into their token strings so they can be displayed.
example_input_ids = tokenized_input["input_ids"]
tokens = xlmr_tokenizer.convert_ids_to_tokens(example_input_ids)
pd.DataFrame([tokens], index=["Tokens"])

In [None]:
# Goal: mask every sub-word piece after the first one of each word.
# word_ids() yields one entry per token identifying the word it came from;
# the next cell treats None entries as special tokens and repeated indices
# as continuation pieces of the same word.
word_ids = tokenized_input.word_ids()
pd.DataFrame([tokens, word_ids], index=["Tokens", "Word IDs"])

In [None]:
# Give the label -100 to every token that should not be trained on
# (presumably the ignore index of the loss function - confirm).
#
# The integer tags are read from de_example directly rather than from the
# global `labels`: this cell rebinds `labels` to tag-name strings further down,
# so reading `labels` here would make the cell fail when it is run a second time.
word_tag_ids = de_example["ner_tags"]
previous_word_idx = None
label_ids = []

for word_idx in word_ids:
  if word_idx is None or word_idx == previous_word_idx:
    # Special token, or a continuation piece of the previous word.
    label_ids.append(-100)
  else:
    # First piece of a new word: it carries that word's tag.
    label_ids.append(word_tag_ids[word_idx])
  previous_word_idx = word_idx

# Integer tag id -> tag name.
index2tag = dict(enumerate(tags.names))
# Human-readable labels; ignored positions are shown as "IGN".
labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]

pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)

In [None]:
# Extend the alignment to a whole batch of examples
def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split sentences and align word tags to tokens.

  `examples` must provide "tokens" (one list of words per sentence) and
  "ner_tags" (one list of per-word tags per sentence). The tokenizer output is
  returned with an added "labels" entry: for every sentence, one label per
  token, where the first token of each word carries the word's tag and all
  other tokens (entries whose word id is None, and repeated word ids) get -100.
  """
  encoded = xlmr_tokenizer(examples["tokens"], truncation=True,
                           is_split_into_words=True)
  aligned = []
  for row, word_tags in enumerate(examples["ner_tags"]):
    current_ids = encoded.word_ids(batch_index=row)
    # Word id of the preceding token; None in front of the very first token.
    preceding_ids = [None] + current_ids[:-1]
    aligned.append([
        -100 if cur is None or cur == prev else word_tags[cur]
        for cur, prev in zip(current_ids, preceding_ids)
    ])
  encoded["labels"] = aligned
  return encoded

In [None]:
# Apply the tokenization to every example of a corpus
def encode_panx_dataset(corpus):
  """Map `tokenize_and_align_labels` over `corpus` in batches.

  The 'langs', 'ner_tags' and 'tokens' columns are removed from the result.
  Returns whatever `corpus.map` returns.
  """
  dropped_columns = ['langs', 'ner_tags', 'tokens']
  return corpus.map(
      tokenize_and_align_labels,
      batched=True,
      remove_columns=dropped_columns,
  )

In [None]:
# Encode the German corpus
# Note: this encodes panx_ch["de"], not panx_de (the copy with the added
# "ner_tags_str" column).
panx_de_encoded = encode_panx_dataset(panx_ch["de"])
print(panx_de_encoded["train"][0]) # first encoded training example: lists for input_ids, attention_mask, labels