# 处理数据
1. 使用datasets进行数据的加载
2. 加载tokenizer
3. 写process_function
4. 对DatasetDict进行映射（map）
5. 定义数据处理器

In [None]:
from datasets import load_dataset


# Load the "wnut_17" dataset.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to execute — keep it only for sources you trust.
dataset = load_dataset("wnut_17", trust_remote_code=True)
# Bare expression: shown as the cell output when run in a notebook.
dataset

In [None]:
from transformers import AutoTokenizer
# NOTE: "your/model/name or path" looks like a placeholder — replace it with a
# real checkpoint id or local directory. The same string is passed when the
# model is created further down, so the two should be changed together.
# Later cells call word_ids() on the tokenizer output, which presumably
# requires a fast tokenizer — confirm for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained("your/model/name or path")
# Bare expression: shown as the cell output when run in a notebook.
tokenizer

**下面想要表达的意思是命名实体识别中的标记方式**

In [None]:
# Tag names of the "ner_tags" feature of the train split. Later cells index
# this list by tag id to turn ids back into tag strings.
dataset_labes = dataset["train"].features["ner_tags"].feature.names
# Bare expression: shown as the cell output when run in a notebook.
dataset_labes

**下面想要表达的意思是:**
tokens中的一个字符串可以被拆分成为多个ids,所以需要进行特殊的处理,需要和labels对应上

`注意点1`：如果不加入is_split_into_words=True参数，tokenizer会把example["tokens"]中的每个元素看成一个单独的句子进行切分

`注意点2`FastTokenizer的word_ids使用方法

In [None]:
# Inspect how a single training example is tokenized.
example = dataset["train"][0]
# is_split_into_words=True: the input is already a list of words rather than
# a list of separate sentences.
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
# Token strings for the produced ids (assigned for inspection only; this cell
# does not display them).
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

# For batch item 0: the source-word index of every produced token. Positions
# that belong to no word (special tokens) are expected to be None.
tokenized_input.word_ids(0)

**任务中数据处理的关键**：这里需要将ner_tags和tokens对应上

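`注意点1`：batched=True时，传入process_function的examples是按列组织的一批数据，形状是{"tokens":[[...],[...]],"ner_tags":[[...],[...]], ...}；分词后得到的tokenized_inputs形状是{"input_ids":[[...],[...]],"attention_mask":[[...],[...]]}
`注意点2`：word_ids中包含特殊标记（special tokens）对应的位置（值为None），但是ner_tags中没有这些位置，所以两者不能直接按下标一一对应，需要通过word_id去索引ner_tags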

In [None]:
def process_function(examples):
    """Tokenize a batch of pre-split sentences and align NER tags to tokens.

    Args:
        examples: A batch with a "tokens" column (one list of words per
            sentence) and a "ner_tags" column (one tag id per word).

    Returns:
        The tokenizer output with an added "labels" entry holding one list
        per sentence, as long as that sentence's token sequence. Only the
        first token of each word carries the word's tag; positions that map
        to no word and continuation sub-tokens of a word are set to -100.
    """
    encoded = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True)

    def align(word_ids, word_tags):
        # Pair every position with the word id of the position before it
        # (None for the first position) to detect where a new word starts.
        return [
            -100 if current is None or current == before else word_tags[current]
            for before, current in zip([None, *word_ids], word_ids)
        ]

    # word_ids and the per-word tags are the two sequences being aligned.
    encoded["labels"] = [
        align(encoded.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded
        
    

In [None]:
# Debug variant kept for reference: run the mapping on the first 3 training
# rows only.
# train_dataset = dataset["train"].select(range(3))
# train_dataset.map(process_function, batched=True, remove_columns=dataset["train"].column_names)
# Apply process_function in batches and drop the columns named in the train
# split, so only what process_function returns remains.
tokenized_dataset = dataset.map(process_function,batched=True, remove_columns=dataset["train"].column_names)

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator built from the tokenizer and handed to the Trainer further down.
# NOTE(review): presumably pads inputs and labels to a common length per
# batch — confirm against the transformers documentation.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 准备训练
1. 创建模型（确定创建模型的时候是否要传入特殊的config）
2. 定义评估函数
3. 准备TrainingArguments
4. 准备Trainer

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Two label mappings passed to the model: tag id -> tag name and its inverse,
# both following the order of dataset_labes.
id2label = dict(enumerate(dataset_labes))
label2id = {name: idx for idx, name in id2label.items()}

# NOTE: "your/model/name or path" looks like a placeholder — replace it with a
# real checkpoint id or local directory before running this cell.
model = AutoModelForTokenClassification.from_pretrained(
    "your/model/name or path",
    num_labels=len(dataset_labes),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
import numpy as np
from evaluate import load

# Load the seqeval metric from a local script; the path is relative, so it
# depends on the directory this notebook is run from.
seqeval = load("../../evaluate/seqeval.py")
def compute_metrics(eval_pred):
    """Compute the overall seqeval scores for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is
            arg-maxed over its last axis to pick one label id per position;
            ``labels`` holds the reference ids, with -100 marking positions
            to skip.

    Returns:
        A dict with the overall "precision", "recall", "f1" and "accuracy"
        values reported by seqeval.
    """
    scores_per_label, gold_ids = eval_pred
    predicted_ids = np.argmax(scores_per_label, axis=-1)

    # The inputs arrive batched, hence the two nesting levels. seqeval is fed
    # tag strings here, so ids are mapped to names through dataset_labes.
    # Positions whose reference id is -100 are dropped from both sides so the
    # two sequences stay aligned.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        true_predictions.append([dataset_labes[p] for p, _ in kept])
        true_labels.append([dataset_labes[g] for _, g in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    print(results)

    precision = results["overall_precision"]
    recall = results["overall_recall"]
    f1 = results["overall_f1"]
    accuracy = results["overall_accuracy"]
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}
# Bare expression: shows the loaded metric object as the cell output when run
# in a notebook.
seqeval
    

In [None]:
# Training configuration passed to the Trainer.
training_args = TrainingArguments(
    # Directory the run writes its outputs to.
    output_dir='./checkpoint',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    # Log every 300 steps.
    logging_strategy="steps",
    logging_steps=300,
    # Save once per epoch; save_total_limit presumably caps how many
    # checkpoints are kept on disk — confirm in the transformers docs.
    save_strategy="epoch",
    save_total_limit=3,
    # Combined with per_device_train_batch_size=2 this gives 2 * 2 = 4
    # samples per device for each optimizer step.
    gradient_accumulation_steps=2,
    learning_rate=3e-5,
    warmup_steps=3,
    weight_decay=0.01,
    num_train_epochs=2,
    # Evaluate every 500 steps (independent of the per-epoch save cadence).
    # NOTE(review): older transformers releases spell this argument
    # `evaluation_strategy` — confirm against the installed version.
    eval_strategy="steps",
    eval_steps=500
)

In [None]:
# Wire the model, training arguments, tokenized splits, collator and metric
# function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    # A fixed seed makes the pre-shuffled order of the training split the same
    # on every run; an unseeded shuffle() gives a different order each time.
    train_dataset=tokenized_dataset["train"].shuffle(seed=42),
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate on the "test" split instead of the "validation" split that was
# given to the Trainer as eval_dataset.
trainer.evaluate(tokenized_dataset["test"])