In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install evaluate
!pip install seqeval

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.5.1
    Uninstalling fsspec-2025.5.1:
      Successfully uninstalled fsspec-2025.5.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 req

In [4]:
from transformers import AutoModelForTokenClassification,AutoTokenizer,TrainingArguments,Trainer
import torch
import evaluate  # pip install evaluate
import seqeval   # pip install seqeval
from datasets import load_dataset
from transformers import DataCollatorForTokenClassification
import numpy as np

2025-07-25 10:52:46.297874: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753440766.666950      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753440766.779875      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
def train(rank):
    # 定义标签  O: 表示非实体
    entites = ['O'] + list({'PER', 'ORG', 'LOC'})
    tags = ['O']
    for entity in entites[1:]:
        tags.append('B-'+entity)
        tags.append('I-'+entity)
    print(tags)

    # 加载分词器
    tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

    # 定义回调函数处理数据
    def process_data(items):
        input_data = {} 
        max_length = 512  # 模型支持的最大长度
        # 生成iput_ids, token_type_ids, attention_mask, labels
        input_ids = []
        for tokens in items['tokens']:
            token_ids = tokenizer.convert_tokens_to_ids(tokens)
            # 截断 token_ids 到最大长度
            token_ids = token_ids[:max_length]
            input_ids.append(token_ids)
        input_data['input_ids'] = input_ids
        input_data['token_type_ids'] = [[0]*len(token_ids) for token_ids in input_ids]
        input_data['attention_mask'] = [[1]*len(token_ids) for token_ids in input_ids]
        # 对标签进行同样的截断操作
        input_data['labels'] = [labels[:max_length] for labels in items['ner_tags']]
        return input_data

    # 加载hf中dataset
    ds = load_dataset('doushabao4766/msra_ner_k_V3')
    ds1 = ds.map(process_data, batched=True)  # batched 每次传入自定义方法样本数量多个，加快处理速度
    ds1.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
    num_labels = len(tags)
    id2label = {i: label for i, label in enumerate(tags)}
    label2id = {v: k for k, v in id2label.items()}

    model = AutoModelForTokenClassification.from_pretrained('google-bert/bert-base-chinese',
                                                            num_labels=num_labels,
                                                            id2label=id2label,
                                                            label2id=label2id)

    args = TrainingArguments(
        output_dir="ner_train",  # 模型训练工作目录（tensorboard，临时模型存盘文件，日志）
        num_train_epochs = 3,    # 训练 epoch
        save_safetensors=False,  # 设置False保存文件可以通过torch.load加载
        per_device_train_batch_size=32,  # 训练批次
        per_device_eval_batch_size=32,
        report_to='tensorboard',  # 训练输出记录，不写的话会默认到XX网站里,所以要写上
        eval_strategy="epoch",
        local_rank=rank,   # 当前进程 RANK
        fp16=True,               # 使用混合精度
        lr_scheduler_type='linear',  # 动态学习率
        warmup_steps=100,        # 预热步数
        ddp_find_unused_parameters=False  # 优化DDP性能
    )

    # metric 方法
    def compute_metric(result):
        # result 是一个tuple (predicts, labels)
        
        # 获取评估对象
        seqeval = evaluate.load('seqeval')
        predicts,labels = result
        predicts = np.argmax(predicts, axis=2)
        
        # 准备评估数据
        predicts = [[tags[p] for p,l in zip(ps,ls) if l != -100]
                    for ps,ls in zip(predicts,labels)]
        labels = [[tags[l] for p,l in zip(ps,ls) if l != -100]
                    for ps,ls in zip(predicts,labels)]
        results = seqeval.compute(predictions=predicts, references=labels)

        return results

    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

    # Initialize the Trainer
    trainer = Trainer(
        model,
        args,
        train_dataset=ds1['train'],
        eval_dataset=ds1['test'],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metric,
    )

    # Start training
    trainer.train()

In [5]:
rank = 2
train(rank)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

README.md:   0%|          | 0.00/697 [00:00<?, ?B/s]

(…)-00000-of-00001-42717a92413393f9.parquet:   0%|          | 0.00/13.9M [00:00<?, ?B/s]

(…)-00000-of-00001-8899cab5fdab45bc.parquet:   0%|          | 0.00/946k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45001 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3443 [00:00<?, ? examples/s]

Map:   0%|          | 0/45001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3443 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Loc,Org,Per,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.1128,0.027313,"{'precision': 0.9464221503056455, 'recall': 0.9228611500701263, 'f1': 0.934493165276052, 'number': 2852}","{'precision': 0.809719370294319, 'recall': 0.8962121212121212, 'f1': 0.8507731032002877, 'number': 1320}","{'precision': 0.9368078175895765, 'recall': 0.9567531603459747, 'f1': 0.9466754443712969, 'number': 1503}",0.909295,0.925639,0.917394,0.991777
2,0.0215,0.028254,"{'precision': 0.9540270848182466, 'recall': 0.9386395511921458, 'f1': 0.9462707670554966, 'number': 2852}","{'precision': 0.8798820928518791, 'recall': 0.9045454545454545, 'f1': 0.8920433320881584, 'number': 1320}","{'precision': 0.9582504970178927, 'recall': 0.9620758483033932, 'f1': 0.9601593625498007, 'number': 1503}",0.937412,0.936916,0.937164,0.993116
3,0.0066,0.035411,"{'precision': 0.9583481666073336, 'recall': 0.9438990182328191, 'f1': 0.9510687157745982, 'number': 2852}","{'precision': 0.8806941431670282, 'recall': 0.9227272727272727, 'f1': 0.9012208657047724, 'number': 1320}","{'precision': 0.9572086899275839, 'recall': 0.9673985362608117, 'f1': 0.9622766379880874, 'number': 1503}",0.93924,0.945198,0.94221,0.993275




Downloading builder script: 0.00B [00:00, ?B/s]

Trainer is attempting to log a value of "{'precision': 0.9464221503056455, 'recall': 0.9228611500701263, 'f1': 0.934493165276052, 'number': 2852}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.809719370294319, 'recall': 0.8962121212121212, 'f1': 0.8507731032002877, 'number': 1320}" of type <class 'dict'> for key "eval/ORG" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9368078175895765, 'recall': 0.9567531603459747, 'f1': 0.9466754443712969, 'number': 1503}" of type <class 'dict'> for key "eval/PER" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9540270848182466, 'recall': 0.9386395511921458, 