In [None]:
# v5.0.0 では Jupyter でカスタムモデルをロードした際に AttributeError: module '__main__' has no attribute '__file__' が発生する
# 関連 Issue: https://github.com/huggingface/transformers/issues/43645
# 2026/02/05 時点では修正が反映されていないため、原因となったコードが入る前のバージョンに戻す
!pip install transformers==4.52.4

In [None]:
# トークン分類用カスタムモデル
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
  """RoBERTa encoder body with a dropout + linear head applied to every token."""

  config_class = XLMRobertaConfig

  def __init__(self, config):
    """Create the encoder body and the classification head described by `config`."""
    super().__init__(config)
    self.num_labels = config.num_labels
    # Encoder body; add_pooling_layer=False turns off the [CLS]-based pooling layer
    self.roberta = RobertaModel(config, add_pooling_layer=False)
    # Token classification head
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    # Load / initialize the weights
    self.init_weights()

  def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
              labels=None, **kwargs):
    """Run the encoder and the head; also compute the loss when `labels` is given.

    Returns a TokenClassifierOutput carrying the loss (or None), the logits and
    the encoder's hidden states and attentions.
    """
    # Encoder representations from the body
    encoder_outputs = self.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        **kwargs,
    )
    # First element is the last hidden state; classify it after dropout
    last_hidden_state = encoder_outputs[0]
    logits = self.classifier(self.dropout(last_hidden_state))

    # Cross-entropy over flattened (token, class) logits, only when labels exist
    if labels is None:
      loss = None
    else:
      flat_logits = logits.view(-1, self.num_labels)
      flat_labels = labels.view(-1)
      loss = nn.CrossEntropyLoss()(flat_logits, flat_labels)

    # Wrap everything in the model output object
    return TokenClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
    )

In [None]:
# 各固有表現のタグ
## 固有表現のデータセット用意
from collections import defaultdict
from datasets import load_dataset, DatasetDict

# Languages to load and, for each one, the fraction of every split that is kept
langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]
# One DatasetDict per language, created on first access
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
  ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
  for split, split_ds in ds.items():
    # Shuffle with a fixed seed, then keep the leading `frac` share of the rows
    n_keep = int(frac * split_ds.num_rows)
    panx_ch[lang][split] = split_ds.shuffle(seed=0).select(range(n_keep))

# Feature object describing the NER tags, read from the German training split
tags = panx_ch["de"]["train"].features["ner_tags"].feature

## Lookup tables between tag indices and tag names
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}

In [None]:
# 設定
from transformers import AutoConfig
# Checkpoint name shared by the config, the model and the tokenizer
xlmr_model_name = "xlm-roberta-base"

# Load the checkpoint's configuration, overriding the label count and the
# index <-> tag mappings with the values taken from the dataset
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name,
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index,
)

In [None]:
# モデルロード
import torch

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Instantiate the custom model from the checkpoint, then move it to `device`
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(
    xlmr_model_name, config=xlmr_config
)
xlmr_model = xlmr_model.to(device)

In [None]:
# 簡単な系列のトークン化
from transformers import AutoTokenizer
# Tokenizer that belongs to the same checkpoint as the model
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

# Short example sentence and its tokens as produced by the tokenizer
text = "Jack Sparrow loves New York!"
example_encoding = xlmr_tokenizer(text)
xlmr_tokens = example_encoding.tokens()

import pandas as pd

# Encode the example as a PyTorch tensor of input IDs
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")

# Show each token above its ID; the first row of `input_ids` is the example
id_row = input_ids[0].numpy()
pd.DataFrame([xlmr_tokens, id_row], index=["Tokens", "Input IDs"])

In [None]:
# Per-token prediction scores (logits) for the example sentence
model_inputs = input_ids.to(device)
outputs = xlmr_model(model_inputs).logits

print(f"Number of tokens in sequence: {len(xlmr_tokens)}")
# Expected to print [1, 10, 7] for this example, per the original note
print(f"Shape of outputs: {outputs.shape}")

In [None]:
# Predicted label for every token: the class with the highest score on the last axis
predictions = outputs.argmax(dim=-1)
predicted_ids = predictions[0].cpu().numpy()
preds = [tags.names[label_id] for label_id in predicted_ids]

# Tokens on the first row, their predicted tag names on the second
pd.DataFrame([xlmr_tokens, preds], index=["Tokens", "Tags"])

In [None]:
# ヘルパー関数
def tag_text(text, tags, model, tokenizer):
  """Predict a tag for every token of `text` and return tokens and tags side by side.

  Args:
    text: Input string to tag.
    tags: Object whose `names` sequence maps a class index to its tag name.
    model: Token classifier (a torch module with parameters); `model(input_ids)[0]`
      must hold the scores with the classes on the third axis.
    tokenizer: Tokenizer whose call result exposes `tokens()` and `input_ids`.

  Returns:
    A pandas DataFrame with a "Tokens" row and a "Tags" row, one column per token.
  """
  # Tokenize once with the tokenizer that was passed in, so the displayed tokens
  # (special tokens included) and the IDs fed to the model always agree
  encoding = tokenizer(text, return_tensors="pt")
  tokens = encoding.tokens()
  # Put the inputs on the device the model's weights live on, instead of relying
  # on a notebook-level global
  model_device = next(model.parameters()).device
  input_ids = encoding.input_ids.to(model_device)
  # Inference only: no gradients are needed for the class scores
  with torch.no_grad():
    outputs = model(input_ids)[0]
  # The highest-scoring class becomes the predicted label
  predictions = torch.argmax(outputs, dim=2)
  # Convert to a DataFrame of tokens and tag names
  preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
  return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])