In [2]:
from transformers import pipeline
import torch

print(f"PyTorch版本: {torch.__version__}")

classifier = pipeline(
    task="sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    framework="pt"
)

print(classifier("I love using Hugging Face models!"))


PyTorch版本: 2.6.0+cpu


Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9992625117301941}]


测试

In [3]:
from transformers import pipeline

# 显式禁用权重初始化检查
classifier = pipeline(
    task="sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    framework="pt",
    model_kwargs={"low_cpu_mem_usage": True}  # 替代方案
)

text = "hello my name is abc,yesterday i was in india,today i am in usa,thank you for your help"
print(classifier(text))

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9994016885757446}]


# 4.多语言命名体识别
## 4.1数据集
使用xtreme 数据集的子集
首先查看xtreme数据集的配置

In [28]:
from datasets import get_dataset_config_names
from datasets import load_dataset

xtreme_subsets = get_dataset_config_names("xtreme")
print(len(xtreme_subsets))

183


In [29]:
panx_subsets = [s for s in xtreme_subsets if s.startswith("PAN")]
panx_subsets[:5]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg', 'PAN-X.bn', 'PAN-X.de']

加载德语语料库的示例代码

In [30]:
data = load_dataset("xtreme", name="PAN-X.de")
german_test = data["test"][10]

In [31]:
print(german_test)
print(data)
print(data["train"].features)   # Dataset对象中的features属性中指定了与每列相关的底层数据类型

{'tokens': ['God', 'Forgives', ',', 'I', 'Don’t', "''"], 'ner_tags': [3, 4, 4, 4, 4, 0], 'langs': ['de', 'de', 'de', 'de', 'de', 'de']}
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})
{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None), 'langs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}


为了追踪每种语言，我们创建了一个Python defaultdict，将语言代码作为key，并将类型为DatesetDict的PAN-X语料库作为value。

In [32]:
from collections import defaultdict
from datasets import  DatasetDict

langs = ["de","fr","it","en"]   #代表语言
fracs = [0.629,0.229,0.084,0.059]   #代表数据集的比例
panx_ch = defaultdict(DatasetDict)  #创建一个字典，键为langs中的每个语言，值为一个DatasetDict对象

for lang, frac in zip(langs, fracs):
    # 加载数据集
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    # 根据口语比例对每个分割进行打乱和下采样
    for split in ds:
        panx_ch[lang][split] = (
            ds[split]
            .shuffle(seed=42)   # seed用来保证每次生成的数据集都是相同的 
            .select(range(int(frac * ds[split].num_rows)))
        )

In [33]:
import pandas as pd

pd.DataFrame({lang: [panx_ch[lang]["train"].num_rows] for lang in langs},
             index=["Number of training examples"])

Unnamed: 0,de,fr,it,en
Number of training examples,12580,4580,1680,1180


从德语语料库中抽取一个样本

In [34]:
element = panx_ch["de"]["train"][0]
for key, value in element.items():
    print(f"{key}: {value}")

tokens: ['Olympique', 'Nîmes', ',', 'Auxerres', 'seinerzeitiger', 'drittklassiger', 'Endspielgegner', ',', 'hatte', 'sich', 'erst', 'gar', 'nicht', 'für', 'die', 'Hauptrunde', 'qualifizieren', 'können', '.']
ner_tags: [3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


In [35]:
print(panx_ch["fr"]["test"][23])

{'tokens': ['Pour', 'la', 'saison', '2002-2003', ',', 'il', 'rejoint', 'les', 'Hawks', "d'Atlanta", '.'], 'ner_tags': [0, 0, 3, 4, 0, 0, 0, 0, 3, 4, 0], 'langs': ['fr', 'fr', 'fr', 'fr', 'fr', 'fr', 'fr', 'fr', 'fr', 'fr', 'fr']}


转换ner_tags的数字为人类能看懂的标记

In [36]:
for key, value in panx_ch["de"]["train"].features.items():
    print(f"{key}: {value}")

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


从训练集中提取特征列表

In [37]:
tags = panx_ch["de"]["train"].features["ner_tags"].feature
print(tags)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [38]:
def create_tag_names(batch):
    return{"ner_tags_str": [tags.int2str(tag) for tag in batch["ner_tags"]]}

panx_de = panx_ch["de"].map(create_tag_names)


In [39]:
de_example = panx_de["train"][0]
pd.DataFrame([de_example["tokens"], de_example["ner_tags_str"]],
             index=["tokens", "tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
tokens,Olympique,Nîmes,",",Auxerres,seinerzeitiger,drittklassiger,Endspielgegner,",",hatte,sich,erst,gar,nicht,für,die,Hauptrunde,qualifizieren,können,.
tags,B-ORG,I-ORG,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


创建用于词元分类的自定义模型：
使用RoBERTa作为基本模型，然后在此之上增加适用于XLM-R的设置。

In [40]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    config_class = XLMRobertaConfig
    
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # 加载roberta模型
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # 设置词元分类头
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # 初始化权重
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        # 使用模型本体得到编码器表示
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs
        )
        # 将分类器应用于编码器输出
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        # 计算losses
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        # 返回模型输出的对象
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )



这里config_class确保在初始化新模型时使用标准的XLM-R设置。如果你想更改默认的参数，则可以通过覆盖配置中的默认设置来实现。使用super()方法调用RobertaPreTrainedModel类的初始化函数。这个抽象类处理预训练权重的初始化或加载。然后，我们加载我们的模型主体，即RobertaModel，并使用自己的分类头进行扩展，其中包括一个dropout和一个标准前馈层。请注意，我们设置add_pooling_layer=False，以确保所有隐藏状态都被返回，而不仅仅是与[CLS]词元相关联的状态。最后，我们通过调用从RobertaPreTrainedModel继承的init weights()方法来初始化所有权重，该方法将为模型主体加载预训练权重，并随机初始化我们的词元分类头的权重。

我们唯一需要做的是定义模型如何在forward()方法中进行前向传递。在前向传递期间，首先将数据通过模型主体进行馈送。有许多输入变量，但我们现在所需的只是input_ids和attention_mask。随后，将模型主体输出的隐藏状态馈送到dropout和分类层中。如果我们在前向传递还提供了标注，则可以直接计算损失。如果存在注意力掩码，则需要做一些额外的工作以确保我们仅计算未掩码词元的损失。最后，我们将所有输出包装在TokenClassifierOutput对象中，从而允许我们通过前几章中熟悉的命名元组来访问元素。

通过实现一个简单类的两个函数，我们可以构建自己的自定义Transformer模型。由于我们继承自PreTrainedModel，因此能立即获得Hugging FaceTransformers库所有有用的Transformer工具，如from_pretrained()！接下来我们看看如何将预训练的权重加载到我们的自定义模型中。

In [41]:
index2tag = {idx: tag for tag, idx in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)} 

In [42]:
from transformers import AutoConfig
xlmr_model_name = "xlm-roberta-base"  
xlmr_config = AutoConfig.from_pretrained(xlmr_model_name, 
                                         num_labels=tags.num_classes,
                                         id2label=index2tag,
                                         label2id=tag2index,)


loading configuration file config.json from cache at C:\Users\liangzhi_danta\.cache\huggingface\hub\models--xlm-roberta-base\snapshots\e73636d4f797dec63c3081bb6ed5c7b0bb3f2089\config.json
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "abs

In [43]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")   
xlmr_model = (XLMRobertaForTokenClassification
              .from_pretrained(xlmr_model_name, config=xlmr_config)
              .to(device))

loading weights file model.safetensors from cache at C:\Users\liangzhi_danta\.cache\huggingface\hub\models--xlm-roberta-base\snapshots\e73636d4f797dec63c3081bb6ed5c7b0bb3f2089\model.safetensors
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some 

In [None]:
from transformers import AutoTokenizer

# 定义XLM-R模型名称（需与自定义模型配置一致）
xlmr_model_name = "xlm-roberta-base"  # 或其他XLM-R变体
# 初始化分词器
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

inputs_ids = xlmr_tokenizer.encode(text, return_tensors="pt")
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

NameError: name 'xlmr_tokenizer' is not defined