In [None]:
from datasets import get_dataset_config_names

# Collect the configuration (subset) names available for the "xtreme" dataset
# and report how many there are.
xtreme_subsets = get_dataset_config_names("xtreme")
print(f"XTREME 서브셋 개수: {len(xtreme_subsets)}")

In [None]:
# Keep only the configurations whose name begins with "PAN", then preview
# the first three of them as the cell output.
panx_subsets = [name for name in xtreme_subsets if name.startswith("PAN")]
panx_subsets[:3]

In [None]:
from datasets import load_dataset

# Load the "PAN-X.de" configuration of XTREME; as the last expression of the
# cell, the returned object is displayed as the cell output.
load_dataset("xtreme", name="PAN-X.de")

In [None]:
from collections import defaultdict
from datasets import DatasetDict

# Language codes and the down-sampling fraction applied to each one; the two
# lists are paired positionally by the zip() below.
langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]
# Looking up a missing language key creates an empty DatasetDict for it.
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    # Load the PAN-X corpus for this language.
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    # Shuffle each split with a fixed seed and keep only the first
    # `frac` share of its rows.
    for split, split_ds in ds.items():
        n_keep = int(frac * split_ds.num_rows)
        panx_ch[lang][split] = split_ds.shuffle(seed=0).select(range(n_keep))

In [None]:
import pandas as pd
# One column per language, holding the row count of its "train" split.
train_counts = {lang: [panx_ch[lang]["train"].num_rows] for lang in langs}
pd.DataFrame(train_counts, index=["Number of training examples"])

In [None]:
# Take the first German training example and print each of its fields
# as "name:value".
element = panx_ch["de"]["train"][0]
for key, value in element.items():
    print(f"{key}:{value}")

In [None]:
# Print the declared feature (column) definitions of the German training split.
de_train_features = panx_ch["de"]["train"].features
for key, value in de_train_features.items():
    print(f"{key}: {value}")

In [None]:
# `tags` is the inner feature of the "ner_tags" column; later cells call its
# int2str() to turn integer tag ids into names.
tags = panx_ch["de"]["train"].features["ner_tags"].feature
print(tags)

In [None]:
def create_tag_names(batch):
    """Add a human-readable tag column derived from the integer tag ids.

    Each id in ``batch["ner_tags"]`` is converted with the module-level
    ``tags.int2str``.

    Args:
        batch: Mapping with a ``"ner_tags"`` entry that iterates over integer
            tag ids. The visible ``.map`` call does not pass ``batched=True``,
            so this is presumably a single example -- confirm if reused.

    Returns:
        A dict with one key, ``"ner_tags_str"``, holding the list of tag names
        in the same order as the input ids.
    """
    tag_names = []
    for tag_id in batch["ner_tags"]:
        tag_names.append(tags.int2str(tag_id))
    return {"ner_tags_str": tag_names}

# Apply create_tag_names across the German splits so that each example also
# carries the "ner_tags_str" column.
panx_de = panx_ch["de"].map(create_tag_names)

In [None]:
# Show the first German training example as a two-row table:
# its tokens on the first row and the matching tag names on the second.
de_example = panx_de["train"][0]
pd.DataFrame(
    [de_example["tokens"], de_example["ner_tags_str"]],
    index=["Tokens", "Tags"],
)

In [None]:
from collections import Counter

# Per split, count how many tags start with "B", grouped by the part of the
# tag that follows the first "-".
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    for row in dataset["ner_tags_str"]:
        for tag in row:
            # Only tags beginning with "B" are counted.
            if not tag.startswith("B"):
                continue
            tag_type = tag.split("-")[1]
            split2freqs[split][tag_type] += 1
# One table row per split, one column per tag type.
pd.DataFrame.from_dict(split2freqs, orient="index")

In [17]:
from transformers import AutoTokenizer

# Checkpoint names of the two tokenizers loaded in this cell.
bert_model_name = "bert-base-cased"
xlmr_model_name = "xlm-roberta-base"

# Load the tokenizer that belongs to each checkpoint.
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [19]:
# Tokenize one sample sentence with both tokenizers and keep the token lists
# for later cells.
text = "Jack Sparrow loves New York!"
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

# Print the two tokenizer objects themselves, one per line.
print(bert_tokenizer, xlmr_tokenizer, sep="\n")

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
XLMRobertaTokenizerFast(name_or_path='xlm-roberta-base', vocab_size=250002, model_max_length=512, is_fast=True, padding_side='rig

In [20]:
# Concatenate the XLM-R tokens and replace every U+2581 marker character with
# a plain space to get readable text back.
"".join(xlmr_tokens).replace("\u2581", " ")

'<s> Jack Sparrow loves New York!</s>'

In [21]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """Token-classification model made of a RoBERTa body and a linear head.

    The body is created without a pooling layer. The first element of its
    output is passed through dropout and then through a linear layer that
    produces ``config.num_labels`` logits.
    """

    # Configuration class associated with this model.
    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the encoder body and the classification head.

        Args:
            config: Configuration object; ``num_labels``,
                ``hidden_dropout_prob`` and ``hidden_size`` are read from it.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        # Model body (no pooling layer).
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Token-classification head: dropout followed by a linear projection.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialise the weights through the base-class hook.
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """Run the body and the head, optionally computing a loss.

        Args:
            input_ids: Forwarded to the encoder body as its first argument.
            attention_mask: Forwarded to the encoder body.
            token_type_ids: Forwarded to the encoder body.
            labels: Optional targets. When given, a cross-entropy loss is
                computed on the logits reshaped to ``(-1, num_labels)`` and
                the labels flattened to one dimension.
            **kwargs: Extra keyword arguments forwarded to the encoder body.

        Returns:
            A ``TokenClassifierOutput`` carrying ``loss`` (``None`` when no
            labels were passed), ``logits``, and the encoder output's
            ``hidden_states`` and ``attentions``.
        """
        # Encoder representations from the model body.
        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        # Apply dropout, then the classifier, to the first output element.
        logits = self.classifier(self.dropout(encoder_outputs[0]))

        # The loss is only computed when labels are supplied.
        if labels is None:
            loss = None
        else:
            criterion = nn.CrossEntropyLoss()
            flat_logits = logits.view(-1, self.num_labels)
            loss = criterion(flat_logits, labels.view(-1))

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )